In [1]:
import os
import pickle
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import ShuffleSplit

In [2]:
# Fintech dataset: remember where it lives, then unpickle it.
path_df_fin = "pickles/raw_dataset_fin.pickle"
with open(path_df_fin, 'rb') as handle:
    df_fin = pickle.load(handle)

# Investment dataset: same procedure.
path_df_inv = "pickles/raw_dataset_inv.pickle"
with open(path_df_inv, 'rb') as handle:
    df_inv = pickle.load(handle)

In [3]:
# Investment split, restored from the pickles/ directory.
# Each artefact is read through its own short-lived file handle.

# --- X / y (train, then test) ---
with open('pickles/X_train_inv.pickle', 'rb') as handle:
    X_train_inv = pickle.load(handle)
with open('pickles/X_test_inv.pickle', 'rb') as handle:
    X_test_inv = pickle.load(handle)
with open('pickles/y_train_inv.pickle', 'rb') as handle:
    y_train_inv = pickle.load(handle)
with open('pickles/y_test_inv.pickle', 'rb') as handle:
    y_test_inv = pickle.load(handle)

# --- feature matrices and labels used by the classifier below ---
with open('pickles/features_train_inv.pickle', 'rb') as handle:
    features_train_inv = pickle.load(handle)
with open('pickles/labels_train_inv.pickle', 'rb') as handle:
    labels_train_inv = pickle.load(handle)
with open('pickles/features_test_inv.pickle', 'rb') as handle:
    features_test_inv = pickle.load(handle)
with open('pickles/labels_test_inv.pickle', 'rb') as handle:
    labels_test_inv = pickle.load(handle)

In [4]:
# Fintech split, restored from the pickles/ directory.
# Each artefact is read through its own short-lived file handle.

# --- X / y (train, then test) ---
with open('pickles/X_train_fin.pickle', 'rb') as handle:
    X_train_fin = pickle.load(handle)
with open('pickles/X_test_fin.pickle', 'rb') as handle:
    X_test_fin = pickle.load(handle)
with open('pickles/y_train_fin.pickle', 'rb') as handle:
    y_train_fin = pickle.load(handle)
with open('pickles/y_test_fin.pickle', 'rb') as handle:
    y_test_fin = pickle.load(handle)

# --- feature matrices and labels used by the classifier below ---
with open('pickles/features_train_fin.pickle', 'rb') as handle:
    features_train_fin = pickle.load(handle)
with open('pickles/labels_train_fin.pickle', 'rb') as handle:
    labels_train_fin = pickle.load(handle)
with open('pickles/features_test_fin.pickle', 'rb') as handle:
    features_test_fin = pickle.load(handle)
with open('pickles/labels_test_fin.pickle', 'rb') as handle:
    labels_test_fin = pickle.load(handle)

In [5]:
# Restore the pickled TF-IDF object saved alongside the datasets.
with open('pickles/tfidf.pickle', 'rb') as handle:
    tfidf = pickle.load(handle)

In [6]:
# Sanity check of the feature matrix dimensions: fintech (train, test),
# a blank gap, then investment (train, test).
print(features_train_fin.shape, features_test_fin.shape, sep='\n')
print('\n')
print(features_train_inv.shape, features_test_inv.shape, sep='\n')

(223, 400)
(40, 400)


(238, 400)
(42, 400)


# FINTECH

In [7]:
# Candidate hyperparameter values for the fintech model. Per the original
# note, the ranges follow from an earlier random search (not part of this
# notebook).
bootstrap = [False]
max_depth = [3, 4, 5, 8, 10, 15]
max_features = ['sqrt']
min_samples_leaf = [1, 2]
min_samples_split = [2, 3, 5]
n_estimators = [800]

param_grid = dict(
    bootstrap=bootstrap,
    max_depth=max_depth,
    max_features=max_features,
    min_samples_leaf=min_samples_leaf,
    min_samples_split=min_samples_split,
    n_estimators=n_estimators,
)

# Explicit splitter so the CV partitions are reproducible: GridSearchCV has
# no random_state argument of its own, so the seed is fixed here instead.
cv_sets = ShuffleSplit(n_splits=3, test_size=.33, random_state=8)

# Untuned forest that the search clones for every candidate.
rfc = RandomForestClassifier(random_state=8)

# Exhaustive search over param_grid, ranked by accuracy.
grid_search_fin = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv_sets,
    verbose=1,
)

# Run the search on the fintech training features/labels.
grid_search_fin.fit(features_train_fin, labels_train_fin)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:  1.7min finished


GridSearchCV(cv=ShuffleSplit(n_splits=3, random_state=8, test_size=0.33, train_size=None),
             error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_...
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False, random_state=8,
                                              verbose=0, warm_start=False),
  

In [8]:
# Summarise the fintech search: winning parameter set, a blank line, then
# its mean cross-validated accuracy.
print(
    "The best hyperparameters from Grid Search are:",
    grid_search_fin.best_params_,
    "",
    "The mean accuracy of a model with these hyperparameters is:",
    grid_search_fin.best_score_,
    sep="\n",
)

The best hyperparameters from Grid Search are:
{'bootstrap': False, 'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 800}

The mean accuracy of a model with these hyperparameters is:
0.7792792792792793


In [9]:
# Keep a handle on the estimator selected by the fintech grid search.
# The search was created without a `refit` argument, so scikit-learn's
# default applies and best_estimator_ is available after fit().
best_rfc_fin = grid_search_fin.best_estimator_

In [10]:
# Display the selected fintech model and its full parameter set.
best_rfc_fin

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
                       max_depth=4, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=800,
                       n_jobs=None, oob_score=False, random_state=8, verbose=0,
                       warm_start=False)

In [11]:
# Fit the selected model on the full fintech training set.
# NOTE(review): grid_search_fin was built without a `refit` argument; with
# scikit-learn's default refit=True, best_estimator_ has presumably already
# been fit on this same data, so this call looks redundant - confirm before
# removing it.
best_rfc_fin.fit(features_train_fin, labels_train_fin)

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
                       max_depth=4, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=800,
                       n_jobs=None, oob_score=False, random_state=8, verbose=0,
                       warm_start=False)

In [12]:
# Predictions on the held-out fintech test features; scored in a later cell.
rfc_pred_fin = best_rfc_fin.predict(features_test_fin)

In [13]:
# Accuracy of the fintech model on the very data it was fit on.
print(
    "The training accuracy is: ",
    accuracy_score(
        y_true=labels_train_fin,
        y_pred=best_rfc_fin.predict(features_train_fin),
    ),
    sep="\n",
)

The training accuracy is: 
0.8430493273542601


In [14]:
# Accuracy of the fintech model on the held-out test predictions.
print(
    "The test accuracy is: ",
    accuracy_score(y_true=labels_test_fin, y_pred=rfc_pred_fin),
    sep="\n",
)

The test accuracy is: 
0.825


# INVESTMENT

In [15]:
# Candidate hyperparameter values for the investment model. Per the original
# note, the ranges follow from an earlier random search (not part of this
# notebook).
bootstrap = [False]
max_depth = [3, 4, 5, 6, 7, 10, 15, 30]
max_features = ['sqrt']
min_samples_leaf = [1, 2, 4]
min_samples_split = [2, 3, 5]
n_estimators = [800]

param_grid = dict(
    bootstrap=bootstrap,
    max_depth=max_depth,
    max_features=max_features,
    min_samples_leaf=min_samples_leaf,
    min_samples_split=min_samples_split,
    n_estimators=n_estimators,
)

# Explicit splitter so the CV partitions are reproducible: GridSearchCV has
# no random_state argument of its own, so the seed is fixed here instead.
cv_sets = ShuffleSplit(n_splits=3, test_size=.33, random_state=8)

# Untuned forest that the search clones for every candidate.
rfc = RandomForestClassifier(random_state=8)

# Exhaustive search over param_grid, ranked by accuracy.
grid_search_inv = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv_sets,
    verbose=1,
)

# Run the search on the investment training features/labels.
grid_search_inv.fit(features_train_inv, labels_train_inv)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 216 out of 216 | elapsed:  3.4min finished


GridSearchCV(cv=ShuffleSplit(n_splits=3, random_state=8, test_size=0.33, train_size=None),
             error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_...
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False, random_state=8,
                                              verbose=0, warm_start=False),
  

In [16]:
# Summarise the investment search: winning parameter set, a blank line, then
# its mean cross-validated accuracy.
print(
    "The best hyperparameters from Grid Search are:",
    grid_search_inv.best_params_,
    "",
    "The mean accuracy of a model with these hyperparameters is:",
    grid_search_inv.best_score_,
    sep="\n",
)

The best hyperparameters from Grid Search are:
{'bootstrap': False, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 800}

The mean accuracy of a model with these hyperparameters is:
0.6708860759493671


In [17]:
# Keep a handle on the estimator selected by the investment grid search.
# The search was created without a `refit` argument, so scikit-learn's
# default applies and best_estimator_ is available after fit().
best_rfc_inv = grid_search_inv.best_estimator_

In [18]:
# Display the selected investment model and its full parameter set.
best_rfc_inv

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
                       max_depth=10, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=800,
                       n_jobs=None, oob_score=False, random_state=8, verbose=0,
                       warm_start=False)

In [19]:
# Fit the selected model on the full investment training set.
# NOTE(review): grid_search_inv was built without a `refit` argument; with
# scikit-learn's default refit=True, best_estimator_ has presumably already
# been fit on this same data, so this call looks redundant - confirm before
# removing it.
best_rfc_inv.fit(features_train_inv, labels_train_inv)

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
                       max_depth=10, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=800,
                       n_jobs=None, oob_score=False, random_state=8, verbose=0,
                       warm_start=False)

In [20]:
# Predictions on the held-out investment test features; scored in a later cell.
rfc_pred_inv = best_rfc_inv.predict(features_test_inv)

In [21]:
# Accuracy of the investment model on the very data it was fit on.
print(
    "The training accuracy is: ",
    accuracy_score(
        y_true=labels_train_inv,
        y_pred=best_rfc_inv.predict(features_train_inv),
    ),
    sep="\n",
)

The training accuracy is: 
0.9369747899159664


In [22]:
# Accuracy of the investment model on the held-out test predictions.
print(
    "The test accuracy is: ",
    accuracy_score(y_true=labels_test_inv, y_pred=rfc_pred_inv),
    sep="\n",
)

The test accuracy is: 
0.6904761904761905


In [23]:
# Persist the tuned fintech classifier for later use.
# open(..., 'wb') does not create missing parent directories, so make sure
# models/ exists first; exist_ok=True keeps re-runs of this cell harmless.
os.makedirs('models', exist_ok=True)
with open('models/best_rfc_fin.pickle', 'wb') as output:
    pickle.dump(best_rfc_fin, output)

In [24]:
# Persist the tuned investment classifier for later use.
# open(..., 'wb') does not create missing parent directories, so make sure
# models/ exists first; exist_ok=True keeps re-runs of this cell harmless.
os.makedirs('models', exist_ok=True)
with open('models/best_rfc_inv.pickle', 'wb') as output:
    pickle.dump(best_rfc_inv, output)