## 1. Imports

In [1]:
import pickle
import pandas as pd
import numpy as np
from joblib import dump, load
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

## 2. Loading the data

In [2]:
stock_ticker = "SPY"

In [3]:
df = pd.read_csv(f"../../../data/stock_prediction/stock_prediction_data/stock_prediction_data_{stock_ticker}.csv", index_col=[0])

In [4]:
# log_return_t+1 is to predict -> positive or negative return
target = df["log_return_shift"].apply(lambda x: 1 if x > 0 else 0)

# used features to predict stock return
features = ["Negative", "Neutral", "Positive", "count", "pos_minus_neg","pos_minus_neg_times_count", "Volume", "pos_minus_neg_diff", "count_diff"]
df = df[features]


In [5]:
df

Unnamed: 0,Negative,Neutral,Positive,count,pos_minus_neg,pos_minus_neg_times_count,Volume,pos_minus_neg_diff,count_diff
2021-05-04,0.467318,0.228605,0.304077,374,-0.163241,-61.052021,101591200,0.017058,92.0
2021-05-05,0.501610,0.224523,0.273867,308,-0.227744,-70.145086,60162200,-0.064503,-66.0
2021-05-06,0.466016,0.237503,0.296481,418,-0.169535,-70.865820,74321400,0.058208,110.0
2021-05-07,0.464900,0.232654,0.302446,547,-0.162454,-88.862488,67733800,0.007081,129.0
2021-05-10,0.556672,0.195418,0.247910,466,-0.308762,-143.883209,81852400,-0.146308,-81.0
...,...,...,...,...,...,...,...,...,...
2022-04-22,0.514130,0.252199,0.233671,645,-0.280458,-180.895673,132471800,-0.051080,217.0
2022-04-25,0.424861,0.290111,0.285028,576,-0.139833,-80.543813,119647700,0.140625,-69.0
2022-04-26,0.439146,0.273064,0.287790,602,-0.151355,-91.115755,103996300,-0.011522,26.0
2022-04-27,0.448051,0.286386,0.265564,600,-0.182487,-109.492358,122030000,-0.031132,-2.0


In [6]:
target.value_counts(normalize=True)

1    0.536
0    0.464
Name: log_return_shift, dtype: float64

## 3. Data transformation

In [7]:
# Data normalization
df = (df - df.mean())/df.std()

In [8]:
# Stratified split
X_train, X_test, Y_train, Y_test = train_test_split(df, target, stratify=target, test_size=0.25, random_state=42)

In [9]:
Y_train

2021-11-10    1
2021-11-18    0
2021-08-13    1
2021-06-11    1
2021-06-24    1
             ..
2021-11-01    1
2021-05-24    0
2022-02-23    1
2022-03-31    1
2021-12-21    1
Name: log_return_shift, Length: 187, dtype: int64

In [10]:
Y_test.value_counts(normalize=True)

1    0.539683
0    0.460317
Name: log_return_shift, dtype: float64

## 4. Model training and prediction

In [11]:
# Create LR classifier
clf_lr = LogisticRegression(random_state=42).fit(X_train, Y_train)

In [12]:
train_acc_lr = clf_lr.score(X_train, Y_train)
test_acc_lr = clf_lr.score(X_test, Y_test)

In [13]:
# Create DT classifier
clf_dt = DecisionTreeClassifier().fit(X_train, Y_train)

In [14]:
train_acc_dt = clf_dt.score(X_train, Y_train)
test_acc_dt = clf_dt.score(X_test, Y_test)

In [15]:
# Create RF classifier
clf_rf = RandomForestClassifier(random_state=42).fit(X_train, Y_train)

In [16]:
train_acc_rf = clf_rf.score(X_train, Y_train)
test_acc_rf = clf_rf.score(X_test, Y_test)

In [17]:
# Create XGBoost classifier
cl_xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss').fit(X_train, Y_train)

In [18]:
train_acc_xgb = cl_xgb.score(X_train, Y_train)
test_acc_xgb = cl_xgb.score(X_test, Y_test)

In [19]:
# Create KNN classifier
clf_knn = KNeighborsClassifier().fit(X_train, Y_train)

In [20]:
train_acc_knn = clf_knn.score(X_train, Y_train)
test_acc_knn = clf_knn.score(X_test, Y_test)

In [25]:
# Create MLP classifier
clf_mlp = MLPClassifier(random_state=42, max_iter=100).fit(X_train, Y_train)



In [26]:
train_acc_mlp = clf_mlp.score(X_train, Y_train)
test_acc_mlp = clf_mlp.score(X_test, Y_test)

In [27]:
result_df = pd.DataFrame(
    data={
        "Model": ["Logistic Regression", "Decision Tree", "Random Forest", "XGBoost", "K-Nearest Neighbour", "Neural Network"],
        "Train Accuracy": [train_acc_lr, train_acc_dt, train_acc_rf, train_acc_xgb, train_acc_knn, train_acc_mlp],
        "Test Accuracy": [test_acc_lr, test_acc_dt, test_acc_rf, test_acc_xgb, test_acc_knn, test_acc_mlp]
    }
)

In [28]:
result_df

Unnamed: 0,Model,Train Accuracy,Test Accuracy
0,Logistic Regression,0.561497,0.460317
1,Decision Tree,1.0,0.507937
2,Random Forest,1.0,0.555556
3,XGBoost,1.0,0.492063
4,K-Nearest Neighbour,0.652406,0.47619
5,Neural Network,0.68984,0.492063


In [29]:
result_df.to_excel(f"../../../data/stock_prediction/stock_prediction_model_results/{stock_ticker}.xlsx")

## 5. Saving all models

In [30]:
dump(clf_lr, f'../../../models/stock_prediction/{stock_ticker}/Logistic_Regression.joblib')

['../../../models/stock_prediction/SPY/Logistic_Regression.joblib']

In [31]:
dump(clf_dt, f'../../../models/stock_prediction/{stock_ticker}/Decision_Tree.joblib')

['../../../models/stock_prediction/SPY/Decision_Tree.joblib']

In [32]:
dump(clf_rf, f'../../../models/stock_prediction/{stock_ticker}/Random_Forest.joblib')

['../../../models/stock_prediction/SPY/Random_Forest.joblib']

In [33]:
dump(cl_xgb, f'../../../models/stock_prediction/{stock_ticker}/XGBoost.joblib')

['../../../models/stock_prediction/SPY/XGBoost.joblib']

In [34]:
dump(clf_knn, f'../../../models/stock_prediction/{stock_ticker}/K-Nearest_Neighbour.joblib')

['../../../models/stock_prediction/SPY/K-Nearest_Neighbour.joblib']

In [35]:
dump(clf_mlp, f'../../../models/stock_prediction/{stock_ticker}/Neural_Network.joblib')

['../../../models/stock_prediction/SPY/Neural_Network.joblib']