In [18]:
import yfinance as yf

"""
Closing Price
Represents Sj(t)
"""
def closing_price(stockDF, T):
    return stockDF.loc[str(T), 'Close']


"""
Daily Rate of Return.
Represents Yj(t)
"""
def daily_rate_of_return(stockDF, T):
    today = closing_price(stockDF, T)
    yesterday = closing_price(stockDF, T - 1)
    return (today - yesterday) / yesterday


"""
Vj
Creates a list of data corresponding to two weeks of trading days before the day T
"""
def vj(stockDF, T):
    rates_of_return = []
    for i in range(0, 15):
        rates_of_return.append(daily_rate_of_return(stockDF, T - i))
    # Invert the elements in rates_of_return
    rates_of_return = rates_of_return[:: -1]
    return rates_of_return


"""
True Class
returns 0, 1, 2 based on the value of the input
Corresponds to STABLE, UP and DOWN
"""
def true_class(targ):
    # if UP
    if targ >= 0.006:
        return 1
    # if DOWN
    elif targ <= -0.006:
        return 2
    # If STABLE
    else:
        return 0


### TARG is defined as tomorrows Yj(T)
def targ(stockDF, T):
    return daily_rate_of_return(stockDF, T+1)


def make_zt(X):
    return X.map(true_class)

def make_xt(stocks, T):
    """
    Xt Represents an array containing all our line vector stock data "Vj(t)"
    """
    xt_list = []
    ticker_list = []
    for stock, ticker in stocks:
        xt_list.append(vj(stock, T))
        ticker_list.append(ticker)
    return xt_list, ticker_list


In [19]:
################### Slide 2 #########################
tickers = ["AAPL", "MSFT", "TSLA", "META", "GOOGL", "AMZN", "NVDA", "AMD", "DIS", "NFLX",
           "JPM", "KO", "BAC", "C", "WFC", "GS", "AXP", "MCD", "DJI", "SPY"]

stocks = []

for ticker in tickers:
    stocks.append((yf.Ticker(ticker).history(start='2016-01-01', end='2022-12-31'), ticker))

for stock in stocks:
    new_index = [str(i + 1) for i in range(len(stock[0]))]
    stock[0].index = new_index


In [20]:
################### Slide 4 #########################
Y = []
for stock, ticker in stocks:
    for day in range(2, len(stock) - 1):
        Y.append((ticker, day, daily_rate_of_return(stock, day)))

In [21]:
################### Slide 4 cont #########################

import matplotlib.cm as cm

colors = cm.get_cmap('rainbow', 3)  # using 'rainbow' colormap here, you can choose any
unique_tickers = list(set([x[0] for x in Y]))  # gets unique tickers


# Chunking function
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]


# Dividing tickers into chunks of 5
chunks_of_tickers = list(chunks(unique_tickers, 3))
"""
# Now plotting
for chunk in chunks_of_tickers:
    for i, ticker in enumerate(chunk):
        ticker_data = [x for x in Y if x[0] == ticker]  # filter out data for this ticker only
        days = [x[1] for x in ticker_data]  # x-axis data
        rates_of_return = [x[2] for x in ticker_data]  # y-axis data
        plt.plot(days, rates_of_return, color=colors(i), label=ticker)
    plt.xlabel('Day')
    plt.ylabel('Rate of Return')
    plt.title('Rate of Return Over Days for Each Ticker')
    plt.legend()  # add a legend
    plt.show()
"""



  colors = cm.get_cmap('rainbow', 3)  # using 'rainbow' colormap here, you can choose any


"\n# Now plotting\nfor chunk in chunks_of_tickers:\n    for i, ticker in enumerate(chunk):\n        ticker_data = [x for x in Y if x[0] == ticker]  # filter out data for this ticker only\n        days = [x[1] for x in ticker_data]  # x-axis data\n        rates_of_return = [x[2] for x in ticker_data]  # y-axis data\n        plt.plot(days, rates_of_return, color=colors(i), label=ticker)\n    plt.xlabel('Day')\n    plt.ylabel('Rate of Return')\n    plt.title('Rate of Return Over Days for Each Ticker')\n    plt.legend()  # add a legend\n    plt.show()\n"

In [22]:
################### Slide 5 #########################
Xt, tickers = make_xt(stocks, 97)

# extract vj values and convert to dataframe

import numpy as np
import pandas as pd

# reshape it into 1D and convert into a DataFrame
X_data = pd.DataFrame(Xt, index=tickers).T

print(X_data)

        AAPL      MSFT      TSLA      META     GOOGL      AMZN      NVDA  \
0  -0.001067  0.014839  0.004320  0.008420  0.009225  0.036780  0.013229   
1   0.016446 -0.016400 -0.039206 -0.009615 -0.008357 -0.018323 -0.011945   
2  -0.010401  0.001808 -0.042011  0.005365  0.004136 -0.000626 -0.022772   
3  -0.004059  0.001404 -0.049560 -0.002118  0.004695 -0.017603  0.006041   
4  -0.005577  0.009011  0.016073  0.014260  0.014649  0.022546  0.010294   
5   0.000755 -0.006351 -0.027963 -0.002092  0.005447  0.008606 -0.001415   
6   0.006789  0.018974 -0.001101  0.010567  0.014058  0.034307  0.019274   
7  -0.009741  0.000588  0.001294 -0.008133 -0.011942  0.014451  0.002781   
8  -0.023457  0.009011 -0.008040  0.006359 -0.003395  0.006590 -0.013589   
9   0.001992 -0.008348  0.001592 -0.003908 -0.004450 -0.011157  0.152095   
10  0.037119  0.014683  0.003275 -0.009515  0.007547  0.001042  0.029527   
11 -0.004154 -0.018652 -0.017428 -0.011123 -0.013844 -0.021656  0.002133   
12  0.011445

In [23]:
################### Slide 6 #########################
Z_data = make_zt(X_data)
print(Z_data)


    AAPL  MSFT  TSLA  META  GOOGL  AMZN  NVDA  AMD  DIS  NFLX  JPM  KO  BAC  \
0      0     1     0     1      1     1     1    1    1     1    1   0    1   
1      1     2     2     2      2     2     2    2    0     2    2   0    2   
2      2     0     2     0      0     0     2    0    0     2    2   0    2   
3      0     0     2     0      0     2     1    1    1     2    0   0    0   
4      0     1     1     1      1     1     1    0    0     1    0   0    0   
5      0     2     2     0      0     1     0    2    0     0    2   0    2   
6      1     1     0     1      1     1     1    0    1     1    1   1    1   
7      2     0     0     2      2     1     0    0    2     2    0   2    2   
8      2     1     2     1      0     1     2    2    0     2    0   1    0   
9      0     2     0     0      0     2     1    1    2     0    2   2    2   
10     1     1     0     2      1     0     1    1    0     1    1   0    0   
11     0     2     2     2      2     2     0    0  

In [24]:
################### Slide 7 #########################
# Slide 7 simply describes the goal of the next few slides


In [25]:
################### Slide 8 #########################

CL0 = Z_data.apply(lambda x: (x == 0))
CL1 = Z_data.apply(lambda x: (x == 1))
CL2 = Z_data.apply(lambda x: (x == 2))

s0 = CL0.sum().sum()
s1 = CL1.sum().sum()
s2 = CL2.sum().sum()

N = s0 + s1 + s2

print(N)
print(s0)
print(s1)
print(s2)

R0 = s0/N
R1 = s1/N
R2 = s2/N
print(f"Initial Stable Ratio: {R0}\nInitial Up Ratio: {R1}\nInitial Down Ratio: {R2}")

"""
Due to the data set already containing a fairly balanced set of classes, there is no need to clone the data in X_data
"""


300
112
90
98
Initial Stable Ratio: 0.37333333333333335
Initial Up Ratio: 0.3
Initial Down Ratio: 0.32666666666666666


'\nDue to the data set already containing a fairly balanced set of classes, there is no need to clone the data in X_data\n'

In [39]:
################### Slide 9 part 1 #########################
# Take all but the last day of trading in X_train
X_train = X_data.iloc[:-1, :]
# Select stock 20 as our target for Random Forest Classifier
target = Z_data.iloc[:,19]
# Take all but the first days rate of return from Y_Train
Y_train = target.iloc[1:]
M = X_train.shape[1]
print(M)

# X_train and Y_train are now dataframes whose elements are offset by one day, producing a training set of a stock's daily value and tomorrow's price action

20


In [40]:
################### Slide 9 part 2 #########################
# 2M/3 proves that M must be a number less than 100, as max_samples must be a number smaller than n_estimators
# I.E., M cannot be a number in the range 300, for a random forest with 100 trees

from sklearn.ensemble import RandomForestClassifier
import time
# Create our random forest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42, max_features=18, max_samples=int((2*M)/3), oob_score=True)
start_time = time.time()
clf.fit(X_train, Y_train)
end_time = time.time()

oob_score = clf.oob_score_
elapsed_time = end_time - start_time

print(f"Training took {elapsed_time} seconds.")

print(f"OOB : {round(oob_score*100,1)}%")
z=clf.predict(X_train)

Training took 0.11841726303100586 seconds.
OOB : 100.0%


  warn(


In [28]:
################### Slide 10 part 1 #########################
# Create a reuseable Train Random Forest function to be used for the remaining slides

import time

def TrainRandomForest(num_tree, num_select_features, X_train, Y_train):
    from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
    # Create our random forest classifier
    clf = RandomForestClassifier(n_estimators=num_tree, random_state=42, oob_score=True,
                                 bootstrap=True, max_samples=0.87,
                                 max_features=num_select_features)
    # Calculate computation start time
    start_time = time.time()
    # Train the model
    clf.fit(X_train, Y_train)
    # Calculate computation end time
    end_time = time.time()
    # Total time is end - start
    elapsed_time = end_time - start_time
    # Get out of bag score
    oob_score = clf.oob_score_
    return clf, oob_score, elapsed_time, clf.feature_importances_


In [29]:
################### Slide 10 part 2 #########################
# Compute one iteration of the Random Forest

clf, oob_score, elapsed_time, features = TrainRandomForest(100, 18, X_train, Y_train)
print(f"Training time was {elapsed_time} seconds")

print(f"OOB : {round(oob_score*100,1)}%")


Training time was 0.09115386009216309 seconds
OOB : 35.7%


In [36]:
################### Slide 10 part 3 #########################
# Compute the confusion matrix
from sklearn.metrics import confusion_matrix

# Confusion Matrix for Training Set
predicted_z_train = clf.predict(X_train)
predicted_z_train = pd.DataFrame(predicted_z_train)
cmatrix_train = confusion_matrix(Y_train, predicted_z_train, labels=[0, 1, 2])
print("Confusion Matrix for Training Set: ")
print(cmatrix_train)

# Confusion Matrix for Test Set
"""
predicted_z_test = clf.predict(X_test)
predicted_z_test = pd.DataFrame(predicted_z_test)
cmatrix_test = confusion_matrix(Y_test, predicted_z_test, labels=[0, 1, 2])
print("Confusion Matrix for Test Set: ")
print(cmatrix_test)
"""

[[7 0 0]
 [0 3 0]
 [0 0 4]]
Confusion Matrix for Training Set: 
[[7 0 0]
 [0 3 0]
 [0 0 4]]


In [ ]:
################### Slide 11 #########################
trees = [100,200,300,400,500]
selected_features = [18,36,72,140,200]
OOB_accuracies = []
computation_times = []
data = []
for tree in trees:
    for SF in selected_features:
        clf, oob_score, elapsed_time, features = TrainRandomForest(tree, SF, X_train, Y_train)
        data.append((tree, SF, f"{round(oob_score*100,2)}%", elapsed_time))

# print(data)
df = pd.DataFrame(data, columns=['Tree', 'SF', 'oob_score', 'computation_time'])
df_pivot_oob = df.pivot(index='Tree', columns='SF', values='oob_score')
df_pivot_time = df.pivot(index='Tree', columns='SF', values='computation_time')

print(f"Out of Box scores\n {df_pivot_oob}")
print(f"Computation times (s)\n {df_pivot_time}")
# Best TR* = 100; Best SF* = 36

In [None]:
################### Slide 12-14 #########################
from sklearn.inspection import permutation_importance
RF_prime = RandomForestClassifier(n_estimators=500, max_features=36, criterion='gini', n_jobs=-1, oob_score=True)

RF_prime.fit(X_train, Y_train)

oob_prime = RF_prime.oob_score_
features = RF_prime.n_features_in_
importances = RF_prime.feature_importances_

# Print the feature ranking
print(oob_prime)



In [ ]:
################### Slide 15 part 1 #########################
print("Feature ranking:")
print(features)

for feature_name, importance in zip(X_train.columns, RF_prime.feature_importances_):
    print(f"{feature_name}: {importance}")


In [None]:
################### Slide 15 part 2 #########################
############# Top 10 most important features ################

# Create tuples of importance, feature
feature_importances = zip(RF_prime.feature_importances_, X_train.columns)

# Sort them by the importance
sorted_feature_importances = sorted(feature_importances, reverse=True)

# Get the ten most important features
top_ten_features = sorted_feature_importances[:10]

# Print the ten most important features
for importance, feature_name in top_ten_features:
    print(f"{feature_name}: {importance}")

In [None]:
################### Slide 15 part 3 #########################
############# Top 10 least important features ################
# Create tuples of (importance, feature)
feature_importances = zip(RF_prime.feature_importances_, X_train.columns)

# Sort them by the importance in ascending order
sorted_feature_importances = sorted(feature_importances, key=lambda x: x[0])

# Get the ten least important features
least_important_features = sorted_feature_importances[:10]

# Print the ten least important features
for importance, feature_name in least_important_features:
    print(f"{feature_name}: {importance}")

In [None]:
################### Slide 16 part 1 ########################
### Create a new dataset of the most important features ####

# Create tuples of importance, feature
feature_importances = zip(RF_prime.feature_importances_, X_train.columns)

# Sort them by the importance
sorted_feature_importances = sorted(feature_importances, reverse=True)

# Get the feature names of ten most important features
top_ten_features = [feature for importance, feature in sorted_feature_importances[:10]]

# Create a new dataset with these features
X_most = X_train[top_ten_features]

In [None]:
################### Slide 16 part 2 ########################
################### Train the new Random Forest model ########################
RF_prime_most = RandomForestClassifier(n_estimators=500, max_features=20, oob_score=True)
RF_prime_most.fit(X_most, Y_train)
print(RF_prime_most.oob_score_)

from sklearn.metrics import confusion_matrix
cmatrix = confusion_matrix(Y_train, predicted_z, labels=[0, 1, 2])
print(cmatrix)
