In [None]:
# Classification Analysis (Monthly)
We will model the movement of returns using technical and sentiment analysis. Technical analysis applies statistical methods and trends to historical data, such as the daily total volume or value of a traded stock, and evaluates the historical patterns to predict future stock price movement. Sentiment analysis, on the other hand, utilises textual data to predict the sentiment direction that influences stock price movements. We utilise self-engineered features derived from ESS scores. Reference: https://medium.com/codex/stock-predication-using-regression-algorithm-in-python-fb8b426453b9

import pandas as pd
import pickle
import numpy as np
import math 
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import svm, tree
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from pandas.tseries.offsets import MonthEnd
%matplotlib inline
plt.style.use('fivethirtyeight')

%run import_library.ipynb

# Load the finalised monthly dataframe. The context manager guarantees the
# file handle is closed even if unpickling fails.
# NOTE: pickle can execute arbitrary code while loading; only open trusted,
# locally produced files.
with open('outputs/finalised_df/df_cn_month.pickle', "rb") as file:
    df = pickle.load(file)
df

# Features selected for each ML model. The keys match the model names used
# when the dictionary is filled in and passed to model_performance later on
# ('xgb', 'svm', 'sgd', 'bnb', 'rf').

feature_select_dict = {'xgb': [], 'svm': [], 'sgd': [], 'bnb': [], 'rf': []} # dictionary of feature selection for different models

## Multi-Collinearity Detection
We want to check whether multicollinearity is present among the independent variables. Reference: https://www.analyticsvidhya.com/blog/2021/03/multicollinearity-in-data-science/

### Variance Inflation Factor (VIF)

names = X_clf # classification variables

import copy

# Work on a copy restricted to the candidate features so `df` is left untouched.
vif_df = df.copy()
vif_df = vif_df[names]
high_collinearity = []

for name in names:

    # Regress the current feature on every other feature (no intercept is
    # added here) and turn the resulting R^2 into a variance inflation factor.
    y = vif_df.loc[:, vif_df.columns == name]
    x = vif_df.loc[:, vif_df.columns != name]
    model = sm.OLS(y, x) #Fit ordinary least squares method
    results = model.fit()
    rsq = results.rsquared
    vif = round(1 / (1 - rsq), 2)
    print("R Square value of {} column is {} keeping all other columns as independent features".format(
        name, round(rsq, 2)))
    print("Variance Inflation Factor of {} column is {}".format(name, vif))
    print()

    if vif >= 5: # high multicollinearity
        high_collinearity.append(name)

high_collinearity

# Features removed manually. For the technical indicators only the mid window
# is kept, so the 5- and 50-period variants are listed here alongside
# model_score.

remove_features = [
    'model_score',
    'rsi_5', 'rsi_50',
    'evm_5', 'evm_50',
    'bol_buy_5', 'bol_buy_50',
    'bol_wband_5', 'bol_wband_50',
    'adx_5', 'adx_50',
]
remove_features

# Monthly Modelling

## Train Test Split

MinMaxScaler is used to scale the numerical variables

def returns_movement(s):
    """
    Flag whether a holding outperforms the others one period ahead.

    Returns 1 when ``s['returns_lead_1']`` is strictly greater than
    ``s['others_returns_lead_1']``, and 0 otherwise.
    """
    outperforms = s['returns_lead_1'] > s['others_returns_lead_1']
    return 1 if outperforms else 0

# Binary target (y true): 1 when the holding beats the others in the next period.
df['returns_movement'] = df.apply(returns_movement, axis=1)

X_train, y_train, X_test, y_test = _train_test_split(
    df,
    list(df.sedol.unique()),
    "2019-01-01",
)

# Scaling and Transformation
# The scaler is fitted on the training rows only; the fitted scaler is then
# reused to transform the test rows.
min_max_scaler = MinMaxScaler()
X_train[X_clf] = min_max_scaler.fit_transform(X_train[X_clf])
X_test[X_clf] = min_max_scaler.transform(X_test[X_clf])

df.returns_movement.value_counts()

### Stepwise Regression
To eliminate multicollinearity for certain models, we seek to drop highly correlated variables.

import pandas as pd
import statsmodels.api as sm

def forward_regression(X, y,
                       threshold_in=0.01,
                       verbose=False):
    """
    Forward stepwise feature selection based on OLS p-values.

    Starting from an empty set, each round fits one OLS model (with an added
    constant) per remaining column and adds the column with the smallest
    p-value, provided that p-value is below ``threshold_in``. Selection stops
    as soon as no remaining column qualifies.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features.
    y : array-like
        Target passed to ``sm.OLS``.
    threshold_in : float
        A column is added only if its p-value is strictly below this value.
    verbose : bool
        Print every column as it is added.

    Returns
    -------
    list
        Selected column names, in the order they were added.
    """
    included = []
    while True:
        # Keep X's column order so the candidate order is deterministic
        # (a set difference gives an arbitrary order).
        excluded = [column for column in X.columns if column not in included]
        if not excluded:
            break

        # Explicit float dtype: without it recent pandas builds an object
        # Series, on which idxmin() is not supported.
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]

        best_pval = new_pval.min()
        # A NaN best p-value compares False and therefore also stops the search.
        if not best_pval < threshold_in:
            break

        best_feature = new_pval.idxmin()
        included.append(best_feature)
        if verbose:
            print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

    return included

def backward_regression(X, y,
                           threshold_out=0.05,
                           verbose=False):
    """
    Backward stepwise feature elimination based on OLS p-values.

    Starting from all columns of ``X``, each round fits an OLS model (with an
    added constant) and removes the column with the largest p-value while that
    p-value exceeds ``threshold_out``.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features.
    y : array-like
        Target passed to ``sm.OLS``.
    threshold_out : float
        A column is dropped only if its p-value is strictly above this value.
    verbose : bool
        Print every column as it is dropped.

    Returns
    -------
    list
        Names of the columns that were kept.
    """
    included = list(X.columns)
    # Nothing left to test once every column has been eliminated.
    while included:
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        # Exclude the intercept by name rather than by position: add_constant
        # does not prepend 'const' when the data already holds a constant
        # column, in which case slicing off the first entry would silently
        # hide a real feature from the elimination.
        pvalues = model.pvalues[model.pvalues.index != 'const']
        worst_pval = pvalues.max() # NaN if pvalues is empty
        if not worst_pval > threshold_out:
            break

        worst_feature = pvalues.idxmax()
        included.remove(worst_feature)
        if verbose:
            print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))

    return included

# Forward stepwise selection on the training split; the returned list of
# selected column names is shown as the cell output.
forward_regression(X_train, y_train)

# Backward elimination on the same training split; the returned list of
# retained column names is shown as the cell output.
backward_regression(X_train, y_train)

## XGBoost Classifier

import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error as MSE
from itertools import product
from sklearn.preprocessing import LabelEncoder

#import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from xgboost import XGBRegressor
from xgboost import plot_importance

from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb

def plot_features(booster, figsize):
    """Draw the feature-importance plot of ``booster`` on a new figure of size ``figsize``.

    Returns whatever ``plot_importance`` returns for the newly created axes.
    """
    _, axes = plt.subplots(1, 1, figsize=figsize)
    return plot_importance(booster=booster, ax=axes)

# Baseline XGBoost classifier with default hyper-parameters.
xg = XGBClassifier()
xg.fit(X_train, y_train)

# Predict once per split and reuse the predictions for both metrics.
xg_train_pred = xg.predict(X_train)
print(f"Accuracy Score: {metrics.accuracy_score(y_train, xg_train_pred)}")
print(f"Precision Score: {metrics.precision_score(y_train, xg_train_pred)}")

xg_test_pred = xg.predict(X_test)
print(f"Accuracy Score: {metrics.accuracy_score(y_test, xg_test_pred)}")
print(f"Precision Score: {metrics.precision_score(y_test, xg_test_pred)}")

%%time

# Hyper-parameter grid explored by the XGBoost grid search.
xg_params = {'max_depth': [3, 6, 9],
             'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 1.0],
             'n_estimators': [10, 50, 100, 200, 300],
             # colsample_bytree is a column subsample ratio and must lie in
             # (0, 1]; the former value 1.3 made every fit that used it fail.
             'colsample_bytree': [0.3, 0.7, 1.0],
             'min_child_weight': range(1, 10, 2),
             'gamma': [0, 2, 4, 6, 8, 10]} #2,3 might not want these to prevent overfitting

# xg_clf = GridSearchCV(xg, xg_params, scoring='precision', verbose=5, n_jobs=-1, cv=5)
# Grid search over xg_params, scored on precision with 3-fold cross-validation.
xg_clf = GridSearchCV(
    xg,
    xg_params,
    scoring='precision',
    verbose=5,
    n_jobs=-1,
    cv=3,
)
xg_clf.fit(X_train, y_train)

xg_clf.best_params_

Best parameter for xgboost classification:
{'colsample_bytree': 1.0,
 'gamma': 10,
 'learning_rate': 0.05,
 'max_depth': 6,
 'min_child_weight': 3,
 'n_estimators': 10}

# Refit a classifier with the best parameters found by the grid search.
xg = XGBClassifier(**xg_clf.best_params_)
xg.fit(X_train, y_train)

# train
xg_train_pred = xg.predict(X_train)
print(f"Train Accuracy Score: {metrics.accuracy_score(y_train, xg_train_pred)}")
print(f"Train Precision Score: {metrics.precision_score(y_train, xg_train_pred)}")

# test
xg_test_pred = xg.predict(X_test)
print(f"Test Accuracy Score: {metrics.accuracy_score(y_test, xg_test_pred)}")
print(f"Test Precision Score: {metrics.precision_score(y_test, xg_test_pred)}")

# manual input based on previous gridsearch cv
xg = XGBClassifier(
    colsample_bytree=1.0,
    learning_rate=0.05,
    max_depth=6,
    min_child_weight=3,
    n_estimators=10,
    gamma=10,
)
xg.fit(X_train, y_train)

# train
xg_train_pred = xg.predict(X_train)
print(f"Train Accuracy Score: {metrics.accuracy_score(y_train, xg_train_pred)}")
print(f"Train Precision Score: {metrics.precision_score(y_train, xg_train_pred)}")

# test
xg_test_pred = xg.predict(X_test)
print(f"Test Accuracy Score: {metrics.accuracy_score(y_test, xg_test_pred)}")
print(f"Test Precision Score: {metrics.precision_score(y_test, xg_test_pred)}")

# Gain-based importance per feature, largest gain first.
xg_gain_scores = xg.get_booster().get_score(importance_type='gain')
pd.DataFrame(list(xg_gain_scores.items())).sort_values(1, ascending=False)

### Selecting threshold based on precision score

from sklearn.feature_selection import SelectFromModel
from numpy import sort, array
from sklearn.metrics import precision_score

# Tuned hyper-parameters (manual input based on the previous grid search),
# declared once so the reference model and every refit below share them.
xg_tuned_params = {'colsample_bytree': 1.0,
                   'learning_rate': 0.05,
                   'max_depth': 6,
                   'min_child_weight': 3,
                   'n_estimators': 10,
                   'gamma': 10}

# Fit model using each importance as a threshold
xg_clf_tuned = XGBClassifier(**xg_tuned_params)
xg_clf_tuned.fit(X_train, y_train)
# De-duplicate before sorting: features sharing an importance value (e.g.
# several zeros) select exactly the same columns, so refitting once per
# duplicate is wasted work.
thresholds = sort(list(set(xg_clf_tuned.feature_importances_)))

xg_final_params = {'threshold': 0, 'precision': 0, 'num_features': 0, 'columns': []}

# NOTE(review): the threshold is picked by precision on the test split, so the
# test precision reported for the selected feature set is optimistically
# biased; consider selecting on a separate validation split.
for thresh in thresholds:

    # select features using threshold
    selection = SelectFromModel(xg_clf_tuned, threshold=thresh, prefit=True)
    select_X_train = selection.transform(X_train)
    num_features = select_X_train.shape[1]

    # train model
    selection_model = XGBClassifier(**xg_tuned_params)
    selection_model.fit(select_X_train, y_train)

    # eval model
    select_X_test = selection.transform(X_test)
    predictions = selection_model.predict(select_X_test)
    precision = precision_score(y_test, predictions)
    print("Thresh=%.3f, n=%d, Precision: %.2f%%" % (thresh, num_features, precision*100.0))

    # Keep the best-scoring threshold that still retains at least 5 features.
    if precision > xg_final_params['precision'] and num_features >= 5:
        xg_final_params["threshold"] = thresh
        xg_final_params["precision"] = precision
        xg_final_params["num_features"] = num_features
        xg_final_params["columns"] = X_train.columns[selection.get_support()].tolist()

# from sklearn.feature_selection import SelectFromModel
# from numpy import sort, array
# from sklearn.metrics import precision_score

# # Fit model using each importance as a threshold
# xg_clf_tuned = XGBClassifier(**xg_clf.best_params_)
# xg_clf_tuned.fit(X_train, y_train)
# thresholds = sort(xg_clf_tuned.feature_importances_)

# xg_final_params = {'threshold': 0, 'precision': 0, 'num_features': 0, 'columns': []}

# for thresh in thresholds:
    
#     # select features using threshold
#     selection = SelectFromModel(xg_clf_tuned, threshold=thresh, prefit=True)
#     select_X_train = selection.transform(X_train)
    
#     # train model
#     selection_model = XGBClassifier(**xg_clf.best_params_)
#     selection_model.fit(select_X_train, y_train)
    
#     # eval model
#     select_X_test = selection.transform(X_test)
#     predictions = selection_model.predict(select_X_test)
#     precision = precision_score(y_test, predictions)
#     print("Thresh=%.3f, n=%d, Precision: %.2f%%" % (thresh, select_X_train.shape[1], precision*100.0))
#     num_features = select_X_train.shape[1]
    
#     if precision > xg_final_params['precision'] and num_features >= 5: # must have at least 5 features
#         xg_final_params["threshold"] = thresh
#         xg_final_params["precision"] = precision
#         xg_final_params["num_features"] = num_features
#         xg_final_params["columns"] = X_train.columns[selection.get_support()].tolist()

xg_final_params

# Restrict both splits to the columns chosen by the threshold search.
xg_selected_columns = xg_final_params["columns"]
X_train_xg = X_train[xg_selected_columns]
X_test_xg = X_test[xg_selected_columns]

xg_clf_filtered = XGBClassifier(
    colsample_bytree=1.0,
    learning_rate=0.05,
    max_depth=6,
    min_child_weight=3,
    n_estimators=10,
    gamma=10,
)
xg_clf_filtered.fit(X_train_xg, y_train)

# Predict once and reuse the predictions for both test metrics.
xg_filtered_test_pred = xg_clf_filtered.predict(X_test_xg)
print(f"Test Accuracy Score: {metrics.accuracy_score(y_test, xg_filtered_test_pred)}")
print(f"Test Precision Score: {metrics.precision_score(y_test, xg_filtered_test_pred)}")

# X_train_xg = X_train[xg_final_params["columns"]]
# X_test_xg = X_test[xg_final_params["columns"]]

# xg_clf_filtered = XGBClassifier(**xg_clf.best_params_)
# xg_clf_filtered.fit(X_train_xg, y_train)

# print(f"Test Accuracy Score: {metrics.accuracy_score(y_test, xg_clf_filtered.predict(X_test_xg))}")
# print(f"Test Precision Score: {metrics.precision_score(y_test, xg_clf_filtered.predict(X_test_xg))}")

The accuracy and precision have improved as a result of feature selection.

# Record the columns used by the filtered XGBoost model.
feature_select_dict['xgb'] = xg_final_params["columns"]

import shap

# Refit the filtered classifier and compute SHAP values on the training rows.
xg_clf_filtered = XGBClassifier(
    colsample_bytree=1.0,
    learning_rate=0.05,
    max_depth=6,
    min_child_weight=3,
    n_estimators=10,
    gamma=10,
)
xg_clf_filtered.fit(X_train_xg, y_train)
explainer = shap.Explainer(xg_clf_filtered)
shap_values = explainer(X_train_xg)

# visualise prediction explanation (first training row)
shap.plots.waterfall(shap_values[0])

# import shap

# xg_clf_filtered = XGBClassifier(**xg_clf.best_params_)
# xg_clf_filtered.fit(X_train_xg, y_train)
# explainer = shap.Explainer(xg_clf_filtered)
# shap_values = explainer(X_train_xg)

# # visualise prediction explanation
# shap.plots.waterfall(shap_values[0])

# Beeswarm plot summarising the effects of all the features, using the SHAP
# values computed on X_train_xg.

shap.plots.beeswarm(shap_values)

# Bar plot of the same SHAP values (mean absolute value per feature).

shap.plots.bar(shap_values)

## SVM

%%time

from sklearn.model_selection import GridSearchCV
from sklearn import svm


# Baseline support vector classifier with default settings.
svm_clf = svm.SVC()
svm_clf.fit(X_train, y_train)

# train
svm_train_pred = svm_clf.predict(X_train)
print(f"Train Accuracy Score: {metrics.accuracy_score(y_train, svm_train_pred)}")
print(f"Train Precision Score: {metrics.precision_score(y_train, svm_train_pred)}")

# test
svm_test_pred = svm_clf.predict(X_test)
print(f"Test Accuracy Score: {metrics.accuracy_score(y_test, svm_test_pred)}")
print(f"Test Precision Score: {metrics.precision_score(y_test, svm_test_pred)}")

# svm_params = {'C': [0.1, 1, 10, 100, 1000], 'kernel': ['rbf', 'poly', 'sigmoid', 'linear']}
svm_params = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'poly', 'sigmoid', 'linear'],
}

# From here on svm_clf refers to the fitted grid search, not the bare SVC.
svm_clf = GridSearchCV(
    svm_clf,
    svm_params,
    scoring='precision',
    verbose=1,
    n_jobs=-1,
    cv=3,
)
svm_clf.fit(X_train, y_train)

svm_clf.best_params_

# train
svm_train_pred = svm_clf.predict(X_train)
print(f"Train Accuracy Score: {metrics.accuracy_score(y_train, svm_train_pred)}")
print(f"Train Precision Score: {metrics.precision_score(y_train, svm_train_pred)}")

# test
svm_test_pred = svm_clf.predict(X_test)
print(f"Test Accuracy Score: {metrics.accuracy_score(y_test, svm_test_pred)}")
print(f"Test Precision Score: {metrics.precision_score(y_test, svm_test_pred)}")

from mlxtend.feature_selection import SequentialFeatureSelector as sfs

# Build step forward feature selection around an SVC that uses the best
# grid-search parameters.
sfs1 = sfs(
    svm.SVC(**svm_clf.best_params_),
    k_features=8,
    forward=True,
    floating=False,
    verbose=2,
    scoring='precision',
    cv=5,
)

# Perform sequential forward selection (floating is disabled above)
sfs1 = sfs1.fit(X_train, y_train)

# features: positional indices of the selected columns
feat_cols = list(sfs1.k_feature_idx_)
print(feat_cols)
svm_filtered_df = X_train.iloc[:, feat_cols]

The optimal feature selection is based on 8 features to reach the maximum precision score of 0.618

# Columns chosen by the sequential feature selector.
svm_selected_columns = svm_filtered_df.columns.tolist()
feature_select_dict['svm'] = svm_selected_columns

from sklearn.model_selection import GridSearchCV
from sklearn import svm

# Train an SVC with the best grid-search parameters on the selected columns only.
X_train_svm = X_train[svm_selected_columns]
X_test_svm = X_test[svm_selected_columns]
svm_clf_tuned = svm.SVC(**svm_clf.best_params_)
svm_clf_tuned.fit(X_train_svm, y_train)

svm_filtered_test_pred = svm_clf_tuned.predict(X_test_svm)
print(f"Accuracy Score: {metrics.accuracy_score(y_test, svm_filtered_test_pred)}")
print(f"Precision Score: {metrics.precision_score(y_test, svm_filtered_test_pred)}")

# Build the final SVC parameters on a copy so the fitted grid search's
# best_params_ is not mutated as a side effect. probability=True is added so
# the final SVC is constructed with probability estimates enabled.
svm_final_params = {**svm_clf.best_params_, 'probability': True}

# manual bypass - without feature selection

feature_select_dict['svm'] = X_clf # all columns
svm_clf_tuned = svm.SVC(**svm_final_params)
svm_clf_tuned.fit(X_train, y_train)





## SGD Classifier
Stochastic Gradient Descent (SGD): used for discriminative learning of linear classifiers under convex loss functions such as SVM and Logistic Regression.

# Baseline SGD classifier; shuffling is switched off.
SGD = SGDClassifier(shuffle=False)
SGD.fit(X_train, y_train)

sgd_baseline_test_pred = SGD.predict(X_test)
print(f"Accuracy Score: {metrics.accuracy_score(y_test, sgd_baseline_test_pred)}")
print(f"Precision Score: {metrics.precision_score(y_test, sgd_baseline_test_pred)}")

%%time

from sklearn.model_selection import GridSearchCV
import tqdm
from tqdm import tqdm_notebook as tqdm


# Search grid for the SGD classifier. The "no penalty" option is spelled
# None: recent scikit-learn releases reject the string "none", while None is
# accepted by older releases as well.
sgd_params = {"alpha": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
              "penalty": ["l1", "l2", None]}

# Grid search over sgd_params, scored on precision with 3-fold cross-validation.
sgd_clf = GridSearchCV(
    SGD,
    sgd_params,
    scoring='precision',
    verbose=5,
    n_jobs=-1,
    cv=3,
)
sgd_clf.fit(X_train, y_train)

sgd_clf.best_params_

# Train split metrics
sgd_train_pred = sgd_clf.predict(X_train)
print(f"Accuracy Score: {metrics.accuracy_score(y_train, sgd_train_pred)}")
print(f"Precision Score: {metrics.precision_score(y_train, sgd_train_pred)}")

# Test split metrics
sgd_test_pred = sgd_clf.predict(X_test)
print(f"Accuracy Score: {metrics.accuracy_score(y_test, sgd_test_pred)}")
print(f"Precision Score: {metrics.precision_score(y_test, sgd_test_pred)}")

from mlxtend.feature_selection import SequentialFeatureSelector as sfs

# Build step forward feature selection
# NOTE(review): the selector wraps a default SGDClassifier, not the tuned
# sgd_clf.best_params_ — confirm this is intended.
sfs1 = sfs(
    SGDClassifier(shuffle=False),
    k_features=9,
    forward=True,
    floating=False,
    verbose=2,
    scoring='precision',
    cv=5,
)

# Perform sequential forward selection (floating is disabled above)
sfs1 = sfs1.fit(X_train, y_train)

# features: positional indices of the selected columns
feat_cols = list(sfs1.k_feature_idx_)
print(feat_cols)
X_train.iloc[:, feat_cols]

The optimal number of features is 9, with a precision score of 0.747. The selected columns include cpi_growth, event_sentiment_score, returns, bol_wband_14 and gdp_growth.

# Keep only the columns picked by the sequential selector.
X_train_sgd = X_train.iloc[:, feat_cols]
X_test_sgd = X_test.iloc[:, feat_cols]

# Tuned SGD classifier (best grid-search parameters, shuffling off).
sgd_clf_tuned = SGDClassifier(**sgd_clf.best_params_, shuffle=False)
sgd_clf_tuned.fit(X_train_sgd, y_train)

sgd_filtered_test_pred = sgd_clf_tuned.predict(X_test_sgd)
print(f"Accuracy Score: {metrics.accuracy_score(y_test, sgd_filtered_test_pred)}")
print(f"Precision Score: {metrics.precision_score(y_test, sgd_filtered_test_pred)}")

feature_select_dict['sgd'] = X_train_sgd.columns.tolist()

## Bernoulli NB

# Bernoulli naive Bayes with default settings on all training columns.
bnb = BernoulliNB()
bnb.fit(X_train, y_train)

bnb_test_pred = bnb.predict(X_test)
print(f"Accuracy Score: {metrics.accuracy_score(y_test, bnb_test_pred)}")
print(f"Precision Score: {metrics.precision_score(y_test, bnb_test_pred)}")

# No feature selection for this model: it uses every classification variable.
feature_select_dict['bnb'] = X_clf

## Random Forest Classifier
The GridSearchCV methodology takes quite long to run. Reference link for hyperparameters: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

from sklearn import metrics

# Baseline random forest with default hyper-parameters.
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

rf_baseline_test_pred = rf.predict(X_test)
print(f"Accuracy Score: {metrics.accuracy_score(y_test, rf_baseline_test_pred)}")
print(f"Precision Score: {metrics.precision_score(y_test, rf_baseline_test_pred)}")

Hyperparameters: <br>
1. n_estimators: number of trees in the forest of the model.
2. max_depth: maximum depth of each tree (if unset, each tree expands until every leaf is pure, i.e. contains a single class).
3. min_samples_split: minimum number of samples required to split an internal node (i.e. the number of samples a node must hold before it can be split).
4. min_samples_leaf: minimum number of samples required to be at a leaf node.
5. n_estimators: number of trees; the more trees, the higher the time complexity.

# Hyper-parameter grid for the random forest grid search.
param_grid = {
    'bootstrap': [True, False],
    'max_depth': range(1, 5, 1),
    # 'auto' is deprecated/removed in recent scikit-learn and was an alias of
    # 'sqrt' for classifiers, so it only duplicated work.
    'max_features': ['sqrt', 'log2'],
    'min_samples_leaf': range(1, 10, 2), # prevent overfitting, increase the leaf size
    # An integer min_samples_split must be at least 2; the previous grid
    # started at 1, which makes every fit using that value fail.
    'min_samples_split': [2, 3, 5, 7, 9],
    'n_estimators': [50, 100, 200, 300, 400, 500],
    'criterion': ['gini', 'entropy'],
}

%%time

# Grid search over param_grid, scored on precision with 3-fold cross-validation.
rfc = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='precision',
    cv=3,
    n_jobs=-1,
    verbose=5,
)
rfc.fit(X_train, y_train)

rfc.best_params_

# the best params tuned for rfc

"""
{'bootstrap': True,
 'criterion': 'gini',
 'max_depth': 2,
 'max_features': 'log2',
 'min_samples_leaf': 5,
 'min_samples_split': 9,
 'n_estimators': 200}
"""

# train
rfc_train_pred = rfc.predict(X_train)
print(f"Train Accuracy Score: {metrics.accuracy_score(y_train, rfc_train_pred)}")
print(f"Train Precision Score: {metrics.precision_score(y_train, rfc_train_pred)}")

# test
rfc_test_pred = rfc.predict(X_test)
print(f"Test Accuracy Score: {metrics.accuracy_score(y_test, rfc_test_pred)}")
print(f"Test Precision Score: {metrics.precision_score(y_test, rfc_test_pred)}")

# Standalone forest built from the best grid-search parameters.
rf_clf_tuned = RandomForestClassifier(**rfc.best_params_)
rf_clf_tuned.fit(X_train, y_train)

# import pickle
# import joblib

# # save the output of gridsearch process to pickle file (dont need to keep tuning)
# filename = 'outputs/rfc_gridsearch_monthly.pickle'
# dbfile = open(filename, 'wb')
# pickle.dump(rfc, dbfile)
# dbfile.close()

# # load the output of gridsearch trained process
# file = open('outputs/rfc_gridsearch_monthly.pickle', "rb")
# rfc = pickle.load(file)

from mlxtend.feature_selection import SequentialFeatureSelector as sfs

%time

# Fresh forest with the best grid-search parameters, used as the selector's estimator.
rf = RandomForestClassifier(**rfc.best_params_)
# Build step forward feature selection
sfs1 = sfs(
    rf,
    k_features=9,
    forward=True,
    floating=False,
    verbose=2,
    scoring='precision',
    cv=5,
)

# Perform sequential forward selection (floating is disabled above)
sfs1 = sfs1.fit(X_train, y_train)

# features: positional indices of the selected columns
feat_cols = list(sfs1.k_feature_idx_)
print(feat_cols)
X_train.iloc[:, feat_cols]

# Keep only the columns picked by the sequential selector.
X_train_rf = X_train.iloc[:, feat_cols]
X_test_rf = X_test.iloc[:, feat_cols]

rf_clf_tuned = RandomForestClassifier(**rfc.best_params_)
rf_clf_tuned.fit(X_train_rf, y_train)

rf_filtered_test_pred = rf_clf_tuned.predict(X_test_rf)
print(f"Accuracy Score: {metrics.accuracy_score(y_test, rf_filtered_test_pred)}")
print(f"Precision Score: {metrics.precision_score(y_test, rf_filtered_test_pred)}")

# feature_select_dict['rf'] = X_train_rf.columns.tolist()

# NOTE(review): this records every X_train column although rf_clf_tuned was
# just fitted on the X_train_rf subset — confirm the two are meant to differ.
feature_select_dict['rf'] = X_train.columns.tolist()

from sklearn.ensemble import RandomForestClassifier

# Importance of each selected feature according to the fitted forest.
importances = rf_clf_tuned.feature_importances_

rf_feature_names = pd.DataFrame(X_train_rf).columns
final_df = pd.DataFrame({"Features": rf_feature_names, "Importances": importances})
# Display only: the re-indexed frame is not assigned back to final_df.
final_df.set_index('Importances')

# Ascending by importance, so the most important feature comes last.
final_df = final_df.sort_values("Importances")

final_df.plot.bar(x='Features', y='Importances', color='teal')

final_df

### Selecting threshold based on precision score (additional testing)

rfc

from sklearn.feature_selection import SelectFromModel
from numpy import sort, array
from sklearn.metrics import precision_score

# Fit model using each importance as a threshold
thresholds = sort(rf_clf_tuned.feature_importances_)

rfc_final_params = {'threshold': 0, 'precision': 0, 'num_features': 0, 'columns': []}

# NOTE(review): candidates are ranked by precision on the test split.
for thresh in thresholds:

    # select features using threshold
    selection = SelectFromModel(rf_clf_tuned, threshold=thresh, prefit=True)
    select_X_train = selection.transform(X_train_rf)

    # train model
    selection_model = RandomForestClassifier(**rfc.best_params_)
    selection_model.fit(select_X_train, y_train)

    # eval model
    select_X_test = selection.transform(X_test_rf)
    predictions = selection_model.predict(select_X_test)
    precision = precision_score(y_test, predictions)
    print("Thresh=%.3f, n=%d, Precision: %.2f%%" % (thresh, select_X_train.shape[1], precision*100.0))
    num_features = select_X_train.shape[1]

    # Skip candidates that do not beat the current best or keep fewer than 5 features.
    if not (precision > rfc_final_params['precision'] and num_features >= 5):
        continue

    rfc_final_params["threshold"] = thresh
    rfc_final_params["precision"] = precision
    rfc_final_params["num_features"] = num_features
    rfc_final_params["columns"] = X_train_rf.columns[selection.get_support()].tolist()

rfc_final_params

# Restrict both splits to the columns chosen by the threshold search.
rfc_selected_columns = rfc_final_params["columns"]
X_train_rfc = X_train[rfc_selected_columns]
X_test_rfc = X_test[rfc_selected_columns]

rfc_clf_filtered = RandomForestClassifier(**rfc.best_params_)
rfc_clf_filtered.fit(X_train_rfc, y_train)

rfc_filtered_test_pred = rfc_clf_filtered.predict(X_test_rfc)
print(f"Accuracy Score: {metrics.accuracy_score(y_test, rfc_filtered_test_pred)}")
print(f"Precision Score: {metrics.precision_score(y_test, rfc_filtered_test_pred)}")

feature_select_dict

# hardcode normal random forest
# (default hyper-parameters, trained on the sequentially selected columns)

X_train_rf = X_train.iloc[:, feat_cols]
X_test_rf = X_test.iloc[:, feat_cols]

rf_clf_tuned = RandomForestClassifier()
rf_clf_tuned.fit(X_train_rf, y_train)

rf_default_test_pred = rf_clf_tuned.predict(X_test_rf)
print(f"Accuracy Score: {metrics.accuracy_score(y_test, rf_default_test_pred)}")
print(f"Precision Score: {metrics.precision_score(y_test, rf_default_test_pred)}")

feature_select_dict['rf'] = X_train_rf.columns.tolist()

## Model Performance

# Evaluate the five final models; each name is paired with the model at the
# same position and is also a key of feature_select_dict.
print("Model performance with returns prediction")
model_performance(
    [xg_clf_filtered, svm_clf_tuned, sgd_clf_tuned, bnb, rf_clf_tuned],
    ["xgb", 'svm', 'sgd', 'bnb', 'rf'],
    feature_select_dict,
)

# Ensemble Models

# (name, estimator) pairs used as the first layer of the stacked ensembles.
base_models = [('xgb', xg_clf_filtered), ('svm', svm_clf_tuned), ('sgd', sgd_clf_tuned),
               ('bnb', bnb), ('rf', rf_clf_tuned)]

# SVC used as a final estimator on top of the base models.
meta_model = svm.SVC(C=100, gamma=1, probability=True)

%%time
from sklearn.ensemble import StackingClassifier

# Stacked ensemble whose final estimator is the SVC meta_model.
stacked_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    n_jobs=-1,
    verbose=5,
)
stacked_model.fit(X_train, y_train)
stacked_model.score(X_test, y_test)

%%time

# Stacked ensemble whose final estimator is the filtered XGBoost classifier.
# NOTE(review): despite the `_svm` suffix, the SVC meta-model is the one used
# by `stacked_model`; this variant stacks with xg_clf_filtered.
stacked_model_svm = StackingClassifier(
    estimators=base_models,
    final_estimator=xg_clf_filtered,
    n_jobs=-1,
    verbose=5,
)
stacked_model_svm.fit(X_train, y_train)
stacked_model_svm.score(X_test, y_test)

## for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Confusion matrix of stacked_model_svm on the test split, drawn as a heatmap.
classes = np.unique(y_test)
stacked_test_pred = stacked_model_svm.predict(X_test)
fig, ax = plt.subplots()
cm = metrics.confusion_matrix(y_test, stacked_test_pred, labels=classes)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap=plt.cm.Blues,
    cbar=False,
)
ax.set(xlabel="Pred", ylabel="True", title="Confusion matrix")
ax.set_yticklabels(labels=classes, rotation=0)
plt.show()

# roc curve and auc
from sklearn.metrics import roc_auc_score, roc_curve

def plot_roc_curve(fpr, tpr):
    """Plot ``tpr`` against ``fpr`` with a dashed diagonal for reference, then show the figure."""
    axes = plt.gca()
    axes.plot(fpr, tpr, color='orange', label='ROC')
    # Dashed diagonal from (0, 0) to (1, 1).
    axes.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    axes.set_xlabel('False Positive Rate')
    axes.set_ylabel('True Positive Rate')
    axes.set_title('Receiver Operating Characteristic (ROC) Curve')
    axes.legend()
    plt.show()
    
# Second column of predict_proba for stacked_model_svm on the test split.
probs = stacked_model_svm.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, probs)
print('AUC: %.2f' % auc)
# NOTE: this rebinds the notebook-level name `thresholds`.
fpr, tpr, thresholds = roc_curve(y_test, probs)
plot_roc_curve(fpr, tpr)

# The stacked model is evaluated on every classification variable.
feature_select_dict['stack'] = X_clf

# Same evaluation as before, with the SVC-stacked ensemble added as 'stack'.
print("Model performance with returns prediction")
model_performance(
    [xg_clf_filtered, svm_clf_tuned, sgd_clf_tuned, bnb, rf_clf_tuned, stacked_model],
    ["xgb", 'svm', 'sgd', 'bnb', 'rf', 'stack'],
    feature_select_dict,
)

## Export Model

# # FOR XGBOOST
# # save model to pickle
# import pickle
# import joblib

# # save the model to disk
# filename = 'outputs/xgb_monthly_combined_ess_features.pickle'
# dbfile =  open(filename, 'wb')
# pickle.dump(xg_clf_filtered, dbfile)
# dbfile.close()

# scaler_filename = "outputs/min_max_scaler_monthly_combined_ess_features.save"
# joblib.dump(min_max_scaler, scaler_filename)

# filename = "outputs/feature_select_dict_monthly_ess_features.pickle" # be aware and remember to change accordingly
# dbfile =  open(filename, 'wb')
# pickle.dump(feature_select_dict['xgb'], dbfile)
# dbfile.close()

# # FOR RANDOM FOREST
# # save model to pickle
# import pickle
# import joblib

# # save the model to disk
# filename = 'outputs/rf_monthly_combined_ess_features.pickle'
# dbfile =  open(filename, 'wb')
# pickle.dump(rf_clf_tuned, dbfile)
# dbfile.close()

# scaler_filename = "outputs/min_max_scaler_monthly_combined_ess_features.save"
# joblib.dump(min_max_scaler, scaler_filename)

# filename = "outputs/rf_feature_select_dict_monthly_ess_features.pickle" # be aware and remember to change accordingly
# dbfile =  open(filename, 'wb')
# pickle.dump(feature_select_dict['rf'], dbfile)
# dbfile.close()