### Using Logistic Regression Algorithm to calculate the percentage of observations that is correctly predicted by the model (hit rate) for the Top 25 Stocks in the SP500.

In [50]:
# Import the required modules
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression

In [51]:
# Import the top 25 sp500 stocks dataset into a Pandas Dataframe
stocks_df = pd.read_csv(
    Path("../Resources/merged_top25_sp500_stock_data.csv"), 
    index_col='date', 
    infer_datetime_format=True, 
    parse_dates=True
)

stocks_df = stocks_df.sort_values(by=['date'], ascending = True)

# drop un-necessary columns
stocks_df = stocks_df.loc[:, ~stocks_df.columns.str.contains('^Unnamed')]
stocks_df.drop(['open', 'high', 'low', 'close', 'volume', 'date_utc', 'symbol'], axis=1, inplace=True)

stocks_df['adjclose'] = stocks_df['adjclose'].pct_change()
stocks_df = stocks_df.dropna()

# Review the DataFrame
stocks_df.info()
stocks_df.head()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 62882 entries, 2012-10-31 to 2022-10-28
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   adjclose  62882 non-null  float64
dtypes: float64(1)
memory usage: 982.5 KB


Unnamed: 0_level_0,adjclose
date,Unnamed: 1_level_1
2012-10-31,0.774547
2012-10-31,-0.753941
2012-10-31,5.027638
2012-10-31,0.799708
2012-10-31,-0.978228


In [52]:
lags = 5

for i in range(0, lags):
	stocks_df['Lag_' + str(i+1)] = stocks_df['adjclose'].shift(i+1)

stocks_df['Returns'] = stocks_df['adjclose'].pct_change()

stocks_df.head()

Unnamed: 0_level_0,adjclose,Lag_1,Lag_2,Lag_3,Lag_4,Lag_5,Returns
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2012-10-31,0.774547,,,,,,
2012-10-31,-0.753941,0.774547,,,,,-1.973396
2012-10-31,5.027638,-0.753941,0.774547,,,,-7.668475
2012-10-31,0.799708,5.027638,-0.753941,0.774547,,,-0.840938
2012-10-31,-0.978228,0.799708,5.027638,-0.753941,0.774547,,-2.223231


In [53]:
# convert returns to the sign of direction
stocks_df['Direction'] = np.sign(stocks_df['Returns'])

stocks_df = stocks_df.dropna()
stocks_df = stocks_df.drop(stocks_df.index[0:5])

##remove rows with any values that are not infinite
stocks_df = stocks_df[np.isfinite(stocks_df).all(1)]

stocks_df

Unnamed: 0_level_0,adjclose,Lag_1,Lag_2,Lag_3,Lag_4,Lag_5,Returns,Direction
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2012-11-01,0.188723,-0.273411,0.152882,0.371134,0.792042,-0.222468,-1.690252,-1.0
2012-11-01,-0.589351,0.188723,-0.273411,0.152882,0.371134,0.792042,-4.122844,-1.0
2012-11-01,-0.232438,-0.589351,0.188723,-0.273411,0.152882,0.371134,-0.605605,-1.0
2012-11-01,2.337224,-0.232438,-0.589351,0.188723,-0.273411,0.152882,-11.055276,-1.0
2012-11-01,-0.946991,2.337224,-0.232438,-0.589351,0.188723,-0.273411,-1.405178,-1.0
...,...,...,...,...,...,...,...,...
2022-10-28,5.317310,-0.856042,1.613182,-0.448276,1.209844,-0.425436,-7.211509,-1.0
2022-10-28,-0.797217,5.317310,-0.856042,1.613182,-0.448276,1.209844,-1.149929,-1.0
2022-10-28,4.923305,-0.797217,5.317310,-0.856042,1.613182,-0.448276,-7.175618,-1.0
2022-10-28,-0.899472,4.923305,-0.797217,5.317310,-0.856042,1.613182,-1.182697,-1.0


In [54]:
X = stocks_df[["Lag_1", "Lag_2"]]
y = stocks_df["Direction"]

## Split into training and testing windows
#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

start_test = pd.Timestamp("2012-11-02")

X_train = X[X.index < start_test]
X_test = X[X.index >= start_test]
y_train = y[y.index < start_test]
y_test = y[y.index >= start_test]

In [55]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [56]:
lr_accuracy = accuracy_score(y_test, y_pred)
lr_accuracy

0.8345747389865037

In [57]:
pred = (1.0 + (y_pred == y_test))/2.0
hit_rate = np.mean(pred)
print('Logistic Regression - Hit Rate of Top 25 SP500 Stocks: {:.4f}'.format(hit_rate))

Logistic Regression - Hit Rate of Top 25 SP500 Stocks: 0.9173
