In [8]:
import pickle
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import ShuffleSplit
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [9]:
path_df_fin = "pickles/raw_dataset_fin.pickle"
path_df_inv = "pickles/raw_dataset_inv.pickle"

with open(path_df_fin, 'rb') as data:
    df_fin = pickle.load(data)
    
with open(path_df_inv, 'rb') as data:
    df_inv = pickle.load(data)

In [10]:
# X_train
with open('pickles/X_train_inv.pickle', 'rb') as data:
    X_train_inv = pickle.load(data)
    
# X_test    
with open('pickles/X_test_inv.pickle', 'rb') as data:
    X_test_inv = pickle.load(data)
    
# y_train
with open('pickles/y_train_inv.pickle', 'rb') as data:
    y_train_inv = pickle.load(data)
    
# y_test
with open('pickles/y_test_inv.pickle', 'rb') as data:
    y_test_inv = pickle.load(data)
    
# features_train
with open('pickles/features_train_inv.pickle', 'rb') as data:
    features_train_inv = pickle.load(data)

# labels_train
with open('pickles/labels_train_inv.pickle', 'rb') as data:
    labels_train_inv = pickle.load(data)

# features_test
with open('pickles/features_test_inv.pickle', 'rb') as data:
    features_test_inv = pickle.load(data)

# labels_test
with open('pickles/labels_test_inv.pickle', 'rb') as data:
    labels_test_inv = pickle.load(data)

In [11]:
# X_train
with open('pickles/X_train_fin.pickle', 'rb') as data:
    X_train_fin = pickle.load(data)
    
# X_test    
with open('pickles/X_test_fin.pickle', 'rb') as data:
    X_test_fin = pickle.load(data)
    
# y_train
with open('pickles/y_train_fin.pickle', 'rb') as data:
    y_train_fin = pickle.load(data)
    
# y_test
with open('pickles/y_test_fin.pickle', 'rb') as data:
    y_test_fin = pickle.load(data)
    
# features_train
with open('pickles/features_train_fin.pickle', 'rb') as data:
    features_train_fin = pickle.load(data)

# labels_train
with open('pickles/labels_train_fin.pickle', 'rb') as data:
    labels_train_fin = pickle.load(data)

# features_test
with open('pickles/features_test_fin.pickle', 'rb') as data:
    features_test_fin = pickle.load(data)

# labels_test
with open('pickles/labels_test_fin.pickle', 'rb') as data:
    labels_test_fin = pickle.load(data)

In [12]:
# TF-IDF object
with open('pickles/tfidf.pickle', 'rb') as data:
    tfidf = pickle.load(data)

In [13]:
print(features_train_fin.shape)
print(features_test_fin.shape)
print('\n')
print(features_train_inv.shape)
print(features_test_inv.shape)

(223, 300)
(40, 300)


(238, 300)
(42, 300)


# FINTECH

In [14]:
# Create the parameter grid based on the results of random search 
C = [.0001, .001, .01, .1]
degree = [3, 4, 5]
gamma = [1, 10, 100]
probability = [True]

param_grid = [
  {'C': C, 'kernel':['linear'], 'probability':probability},
  {'C': C, 'kernel':['poly'], 'degree':degree, 'probability':probability},
  {'C': C, 'kernel':['rbf'], 'gamma':gamma, 'probability':probability}
]

# Create a base model
svc = svm.SVC(random_state=8)

# Manually create the splits in CV in order to be able to fix a random_state (GridSearchCV doesn't have that argument)
cv_sets = ShuffleSplit(n_splits = 3, test_size = .33, random_state = 8)

# Instantiate the grid search model
grid_search_fin = GridSearchCV(estimator=svc, 
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=cv_sets,
                           verbose=1)

# Fit the grid search to the data
grid_search_fin.fit(features_train_fin, labels_train_fin)

Fitting 3 folds for each of 28 candidates, totalling 84 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Parallel(n_jobs=1)]: Done  84 out of  84 | elapsed:    3.1s finished


GridSearchCV(cv=ShuffleSplit(n_splits=3, random_state=8, test_size=0.33, train_size=None),
             error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=8, shrinking=True,
                           tol=0.001, verbose=False),
             iid='wa...one,
             param_grid=[{'C': [0.0001, 0.001, 0.01, 0.1], 'kernel': ['linear'],
                          'probability': [True]},
                         {'C': [0.0001, 0.001, 0.01, 0.1], 'degree': [3, 4, 5],
                          'kernel': ['poly'], 'probability': [True]},
                         {'C': [0.0001, 0.001, 0.01, 0.1],
                          'gamma': [1, 10, 100], 'kernel': ['rbf'],
                          'probability': [True]}],
             pre_

In [15]:
print("The best hyperparameters from Grid Search are:")
print(grid_search_fin.best_params_)
print("")
print("The mean accuracy of a model with these hyperparameters is:")
print(grid_search_fin.best_score_)

The best hyperparameters from Grid Search are:
{'C': 0.0001, 'kernel': 'linear', 'probability': True}

The mean accuracy of a model with these hyperparameters is:
0.7432432432432432


In [16]:
best_svc_fin = grid_search_fin.best_estimator_

In [17]:
best_svc_fin

SVC(C=0.0001, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=True, random_state=8,
    shrinking=True, tol=0.001, verbose=False)

In [18]:
best_svc_fin.fit(features_train_fin, labels_train_fin)

SVC(C=0.0001, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=True, random_state=8,
    shrinking=True, tol=0.001, verbose=False)

In [19]:
svc_pred_fin = best_svc_fin.predict(features_test_fin)

In [20]:
# Training accuracy
print("The training accuracy is: ")
print(accuracy_score(labels_train_fin, best_svc_fin.predict(features_train_fin)))

The training accuracy is: 
0.6905829596412556


In [21]:
# Test accuracy
print("The test accuracy is: ")
print(accuracy_score(labels_test_fin, svc_pred_fin))

The test accuracy is: 
0.7


# INVESTMENT

In [22]:
# Create the parameter grid based on the results of random search 
C = [.0001, .001, .01, .1]
degree = [3, 4, 5]
gamma = [1, 10, 100]
probability = [True]

param_grid = [
  {'C': C, 'kernel':['linear'], 'probability':probability},
  {'C': C, 'kernel':['poly'], 'degree':degree, 'probability':probability},
  {'C': C, 'kernel':['rbf'], 'gamma':gamma, 'probability':probability}
]

# Create a base model
svc = svm.SVC(random_state=8)

# Manually create the splits in CV in order to be able to fix a random_state (GridSearchCV doesn't have that argument)
cv_sets = ShuffleSplit(n_splits = 3, test_size = .33, random_state = 8)

# Instantiate the grid search model
grid_search_inv = GridSearchCV(estimator=svc, 
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=cv_sets,
                           verbose=1)

# Fit the grid search to the data
grid_search_inv.fit(features_train_inv, labels_train_inv)

Fitting 3 folds for each of 28 candidates, totalling 84 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Parallel(n_jobs=1)]: Done  84 out of  84 | elapsed:    3.6s finished


GridSearchCV(cv=ShuffleSplit(n_splits=3, random_state=8, test_size=0.33, train_size=None),
             error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=8, shrinking=True,
                           tol=0.001, verbose=False),
             iid='wa...one,
             param_grid=[{'C': [0.0001, 0.001, 0.01, 0.1], 'kernel': ['linear'],
                          'probability': [True]},
                         {'C': [0.0001, 0.001, 0.01, 0.1], 'degree': [3, 4, 5],
                          'kernel': ['poly'], 'probability': [True]},
                         {'C': [0.0001, 0.001, 0.01, 0.1],
                          'gamma': [1, 10, 100], 'kernel': ['rbf'],
                          'probability': [True]}],
             pre_

In [23]:
print("The best hyperparameters from Grid Search are:")
print(grid_search_inv.best_params_)
print("")
print("The mean accuracy of a model with these hyperparameters is:")
print(grid_search_inv.best_score_)

The best hyperparameters from Grid Search are:
{'C': 0.0001, 'kernel': 'linear', 'probability': True}

The mean accuracy of a model with these hyperparameters is:
0.6582278481012658


In [24]:
best_svc_inv = grid_search_inv.best_estimator_

In [25]:
best_svc_inv

SVC(C=0.0001, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=True, random_state=8,
    shrinking=True, tol=0.001, verbose=False)

In [26]:
best_svc_inv.fit(features_train_inv, labels_train_inv)

SVC(C=0.0001, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=True, random_state=8,
    shrinking=True, tol=0.001, verbose=False)

In [27]:
svc_pred_inv = best_svc_inv.predict(features_test_inv)

In [28]:
# Training accuracy
print("The training accuracy is: ")
print(accuracy_score(labels_train_inv, best_svc_inv.predict(features_train_inv)))

The training accuracy is: 
0.6512605042016807


In [29]:
# Test accuracy
print("The test accuracy is: ")
print(accuracy_score(labels_test_inv, svc_pred_inv))

The test accuracy is: 
0.6428571428571429


failed