In [1]:
import yfinance as yf
import matplotlib.pyplot as plt
import pandas as pd

"""
Closing Price
Represents Sj(t)
"""


def closing_price(stockDF, T):
    return stockDF.loc[str(T), 'Close']


def daily_rate_of_return(stockDF, T):
    today = closing_price(stockDF, T)
    yesterday = closing_price(stockDF, T - 1)
    return (today - yesterday) / yesterday


def vj(stockDF, T):
    rates_of_return = []
    for i in range(0, 15):
        rates_of_return.append(daily_rate_of_return(stockDF, T - i))
    # Invert the elements in rates_of_return
    rates_of_return = rates_of_return[:: -1]
    return rates_of_return


# Try this both ways before submitting
def true_class(targ):
    # if UP
    if targ >= 0.006:
        return 1
    # if DOWN
    elif targ <= -0.006:
        return 2
    # If STABLE
    else:
        return 0


def targ(stockDF, T):
    return daily_rate_of_return(stockDF, T+1)


def make_zt(X):
    return X.map(true_class)

def make_xt(stocks, T):
    """
    Xt Represents an array containing all our line vector stock data "Vj(t)"
    """
    xt_list = []
    ticker_list = []
    for stock, ticker in stocks:
        xt_list.append(vj(stock, T))
        ticker_list.append(ticker)
    return xt_list, ticker_list


In [ ]:
tickers = ["AAPL", "MSFT", "TSLA", "META", "GOOGL", "AMZN", "NVDA", "AMD", "DIS", "NFLX",
           "JPM", "KO", "BAC", "C", "WFC", "GS", "AXP", "MCD", "DJI", "SPY"]

stocks = []

for ticker in tickers:
    stocks.append((yf.Ticker(ticker).history(start='2016-01-01', end='2022-12-31'), ticker))

for stock in stocks:
    new_index = [str(i + 1) for i in range(len(stock[0]))]
    stock[0].index = new_index


In [ ]:
Y = []
for stock, ticker in stocks:
    for day in range(2, len(stock) - 1):
        Y.append((ticker, day, daily_rate_of_return(stock, day)))

In [ ]:
import matplotlib.cm as cm

colors = cm.get_cmap('rainbow', 3)  # using 'rainbow' colormap here, you can choose any
unique_tickers = list(set([x[0] for x in Y]))  # gets unique tickers


# Chunking function
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]


# Dividing tickers into chunks of 5
chunks_of_tickers = list(chunks(unique_tickers, 3))
"""
# Now plotting
for chunk in chunks_of_tickers:
    for i, ticker in enumerate(chunk):
        ticker_data = [x for x in Y if x[0] == ticker]  # filter out data for this ticker only
        days = [x[1] for x in ticker_data]  # x-axis data
        rates_of_return = [x[2] for x in ticker_data]  # y-axis data
        plt.plot(days, rates_of_return, color=colors(i), label=ticker)
    plt.xlabel('Day')
    plt.ylabel('Rate of Return')
    plt.title('Rate of Return Over Days for Each Ticker')
    plt.legend()  # add a legend
    plt.show()
"""



In [65]:
Xt, tickers = make_xt(stocks, 97)

# extract vj values and convert to dataframe

import numpy as np
import pandas as pd

# reshape it into 1D and convert into a DataFrame
X_data = pd.DataFrame(Xt, index=tickers).T

print(X_data)
Z_data = make_zt(X_data)
print(Z_data)

        AAPL      MSFT      TSLA      META     GOOGL      AMZN      NVDA  \
0  -0.001067  0.014839  0.004320  0.008420  0.009225  0.036780  0.013228   
1   0.016446 -0.016400 -0.039206 -0.009615 -0.008357 -0.018323 -0.011944   
2  -0.010402  0.001808 -0.042011  0.005365  0.004136 -0.000626 -0.022772   
3  -0.004059  0.001404 -0.049560 -0.002118  0.004695 -0.017603  0.006042   
4  -0.005577  0.009011  0.016073  0.014260  0.014649  0.022546  0.010295   
5   0.000755 -0.006351 -0.027963 -0.002092  0.005447  0.008606 -0.001415   
6   0.006789  0.018974 -0.001101  0.010567  0.014058  0.034307  0.019274   
7  -0.009741  0.000588  0.001294 -0.008133 -0.011942  0.014451  0.002781   
8  -0.023457  0.009011 -0.008040  0.006359 -0.003395  0.006590 -0.013588   
9   0.001993 -0.008347  0.001592 -0.003908 -0.004450 -0.011157  0.152094   
10  0.037119  0.014683  0.003275 -0.009515  0.007547  0.001042  0.029527   
11 -0.004154 -0.018652 -0.017428 -0.011123 -0.013844 -0.021656  0.002133   
12  0.011445

In [66]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import time
# Create our random forest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42, oob_score=True)

start_time = time.time()
#X_train = X_data.iloc[:-1, :]
target = Z_data['SPY']
#target = target.iloc[1:]
clf.fit(X_data, target)
end_time = time.time()

elapsed_time = end_time - start_time

print(f"Training took {elapsed_time} seconds.")
print(f"OOB score: {clf.oob_score_}")

Training took 0.08655667304992676 seconds.
OOB score: 0.5333333333333333


In [67]:
CL0 = Z_data.apply(lambda x: (x == 0))
CL1 = Z_data.apply(lambda x: (x == 1))
CL2 = Z_data.apply(lambda x: (x == 2))

s0 = CL0.sum().sum()
s1 = CL1.sum().sum()
s2 = CL2.sum().sum()

N = s0 + s1 + s2

print(N)
print(s0)
print(s1)
print(s2)

R0 = s0/N
R1 = s1/N
R2 = s2/N
print(f"Initial Stable Ratio: {R0}\nInitial Up Ratio: {R1}\nInitial Down Ratio: {R2}")


300
112
90
98
Initial Stable Ratio: 0.37333333333333335
Initial Up Ratio: 0.3
Initial Down Ratio: 0.32666666666666666


In [61]:
import numpy as np
## Increase the number of Stable and Up days
import numpy as np

percent_increase = 0.65
percent_increase_stable = 1.00

# Applying condition on all columns and count True results, then scale
n_up = int(s1 * percent_increase)
n_stable = int(s0 * percent_increase_stable)
print(f"Number of 'UP' entries to add {n_up}")
print(f"Number of 'STABLE' entries to add {n_stable}")



Number of 'UP' entries to add 50
Number of 'STABLE' entries to add 114


In [27]:
eligible_rows_up = X_data.apply(lambda x: (x.max() >= 0.006), axis=1)
filtered_X_data_up = X_data.loc[eligible_rows_up, :]
sampled_rows_up = filtered_X_data_up.sample(n=n_up, replace=True, axis=0)
print(sampled_rows_up)

"""
# Sample n_up columns with replacement 
sampled_columns = filtered_X_data.sample(n=n_up, replace=True, axis=1)

# Select the first row and transpose it to be 1 x n_up
X_up = sampled_columns.iloc[0].T

# Make it dataframe and transpose to align to original form
X_up = pd.DataFrame(X_up).T
# Rename the columns
X_up.columns = [N + (i + 1) for i in range(X_up.shape[1])]
# Update the row labels
N = N + X_up.shape[1]
X_up = X_up.reset_index(drop=True)
print(X_up.shape)
"""

        AAPL      MSFT      TSLA      META     GOOGL      AMZN      NVDA  \
8  -0.004839 -0.008433 -0.001317  0.003054  0.012090  0.007541  0.008115   
3   0.016192 -0.000573 -0.014929  0.001849  0.002955  0.017610  0.001688   
3   0.016192 -0.000573 -0.014929  0.001849  0.002955  0.017610  0.001688   
4   0.014513  0.009178  0.010200  0.019075  0.016738  0.000243  0.016846   
0  -0.019570 -0.018165 -0.019648  0.002336 -0.002889 -0.001799 -0.041350   
11  0.053167  0.035856  0.012902  0.040144  0.025858  0.037147  0.023381   
3   0.016192 -0.000573 -0.014929  0.001849  0.002955  0.017610  0.001688   
9   0.001345  0.004549 -0.029406 -0.009553 -0.000723 -0.004717  0.005489   
0  -0.019570 -0.018165 -0.019648  0.002336 -0.002889 -0.001799 -0.041350   
13  0.005531  0.007337 -0.014360  0.003402  0.000232  0.007912  0.009852   
11  0.053167  0.035856  0.012902  0.040144  0.025858  0.037147  0.023381   
10 -0.005063 -0.006103  0.006392 -0.002014  0.011286  0.005684  0.011645   
4   0.014513

'\n# Sample n_up columns with replacement \nsampled_columns = filtered_X_data.sample(n=n_up, replace=True, axis=1)\n\n# Select the first row and transpose it to be 1 x n_up\nX_up = sampled_columns.iloc[0].T\n\n# Make it dataframe and transpose to align to original form\nX_up = pd.DataFrame(X_up).T\n# Rename the columns\nX_up.columns = [N + (i + 1) for i in range(X_up.shape[1])]\n# Update the row labels\nN = N + X_up.shape[1]\nX_up = X_up.reset_index(drop=True)\nprint(X_up.shape)\n'

In [513]:

# Identify columns where values are between -0.006 and 0.006
eligible_columns = X_data.apply(lambda col: -0.006 < col.max() < 0.006)

# Filter those columns
filtered_X_data = X_data.loc[:, eligible_columns]

# Sample n_stable columns with replacement
sampled_columns = filtered_X_data.sample(n=n_stable, replace=True, axis=1)

# Select the first row and transpose it to be 1 x n_stable
X_stable = sampled_columns.iloc[0].T

# Make it dataframe and transpose to align to original form
X_stable = pd.DataFrame(X_stable).T

X_stable.columns = [N + (i + 1) for i in range(X_stable.shape[1])]
print(X_stable.shape)

ValueError: a must be greater than 0 unless no samples are taken

In [514]:
X_extended = pd.concat([X_data, X_up, X_stable], axis=1)
print(X_extended.shape)


(15, 145)


In [515]:
M = X_extended.shape[1]
print(M)
Z_extended = make_zt(X_extended)

# Slide 8
CL0 = Z_data.apply(lambda x: (x == 0).sum()).sum()
CL1 = Z_data.apply(lambda x: (x == 1).sum()).sum()
CL2 = Z_data.apply(lambda x: (x == 2).sum()).sum()

print(CL0)
print(CL1)
print(CL2)

M = CL0 + CL1 + CL2


print(N)
print(f"Altered Stable Ratio: {ratio0}\nAltered Up Ratio: {ratio1}\nAltered Down Ratio: {ratio2}")


145
68
88
144
300
Altered Stable Ratio: 0.0
Altered Up Ratio: 0.0
Altered Down Ratio: 0.0


In [36]:
"""
X_train = X_extended.drop(X_extended.columns[1], axis=1)
Z_train = Z_extended.drop(Z_extended.columns[-1], axis=1)
"""
X_train = X_data.iloc[:-1, :]
Y_train = target.iloc[1:]
M = X_train.size
print(M)


280


In [52]:
from sklearn.ensemble import RandomForestClassifier
import time
# Create our random forest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42, max_features=int((2*M)/3), oob_score=True)
start_time = time.time()
clf.fit(X_train, Y_train)
end_time = time.time()

oob_score = clf.oob_score_
elapsed_time = end_time - start_time

print(f"Training took {elapsed_time} seconds.")

print(f"OOB : {oob_score}")
z=clf.predict(X_train)

Training took 0.1153116226196289 seconds.
OOB : 0.5


In [38]:
import time

def TrainRandomForest(num_tree, num_select_features):
    from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
    # Create our random forest classifier
    clf = RandomForestClassifier(n_estimators=num_tree, random_state=42, oob_score=True,
                                 bootstrap=True, max_samples=0.87,
                                 max_features=num_select_features)
    start_time = time.time()
    # Drop the first X and last Z so that we have a list of rates of return for 299 days correlated with
    # the 299 up, down, stable labels corresponding to the next day's return
    clf.fit(X_train, Y_train)
    end_time = time.time()
    oob_score = clf.oob_score_
    elapsed_time = end_time - start_time
    return clf, oob_score, elapsed_time


In [43]:
clf, oob_score, elapsed_time = TrainRandomForest(100, 32)
print(f"Training time was {elapsed_time} seconds")
predicted_z = clf.predict(X_train)
predicted_z = pd.DataFrame(predicted_z)
print(f"OOB Score: {oob_score}")
print(predicted_z)

Training time was 0.10720396041870117 seconds
OOB Score: 0.42857142857142855
    0
0   2
1   2
2   0
3   1
4   2
5   1
6   2
7   0
8   2
9   0
10  1
11  2
12  1
13  2


In [44]:
from sklearn.metrics import confusion_matrix
cmatrix = confusion_matrix(Y_train, predicted_z, labels=[0, 1, 2])
print(cmatrix)


[[3 0 0]
 [0 4 0]
 [0 0 7]]


In [45]:
trees = [100,200,300,400,500]
selected_features = [18,36,72,140,200]
OOB_accuracies = []
computation_times = []
data = []
for tree in trees:
    for SF in selected_features:
        clf, oob_score, elapsed_time = TrainRandomForest(tree, SF)
        data.append((tree, SF, oob_score, elapsed_time))

# print(data)
df = pd.DataFrame(data, columns=['Tree', 'SF', 'oob_score', 'computation_time'])
df_pivot_oob = df.pivot(index='Tree', columns='SF', values='oob_score')
df_pivot_time = df.pivot(index='Tree', columns='SF', values='computation_time')

print(f"OOB\n {df_pivot_oob}")
print(f"Computation times (s)\n {df_pivot_time}")
# Best TR* = 100; Best SF* = 36

OOB
 SF         18        36        72        140       200
Tree                                                  
100   0.428571  0.428571  0.428571  0.428571  0.428571
200   0.428571  0.428571  0.428571  0.428571  0.428571
300   0.500000  0.500000  0.500000  0.500000  0.500000
400   0.500000  0.428571  0.428571  0.428571  0.428571
500   0.500000  0.500000  0.500000  0.500000  0.500000
Computation times (s)
 SF         18        36        72        140       200
Tree                                                  
100   0.117942  0.087714  0.084721  0.084286  0.108509
200   0.212884  0.182407  0.172620  0.173774  0.173813
300   0.254383  0.278475  0.251826  0.252055  0.248673
400   0.329934  0.352914  0.333705  0.333898  0.342823
500   0.422747  0.424908  0.422900  0.421166  0.421869


In [50]:
RF_prime = RandomForestClassifier(n_estimators=500, min_samples_split=18, min_impurity_decrease=0, max_leaf_nodes=12,max_features=16, criterion='gini', n_jobs=-1, oob_score=True)

RF_prime.fit(X_train, Y_train)

oob_prime = RF_prime.oob_score_
importances = RF_prime.feature_importances_
features = RF_prime.n_features_in_

# Print the feature ranking
print(oob_prime)

print("Feature ranking:")
print(features)
print(importances)


0.5
Feature ranking:
20
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [75]:
import numpy as np

# Assuming 'rfModel' is your trained Random Forest Model

k = 10  # you can change this value depending on how many top features you want

"""
# Get the most important features 'k'
for i in range(k):
    print(f"{X_data.iloc[indices[i]]} ({importances[indices[i]]})")
"""

Feature ranking:


'\n# Get the most important features \'k\'\nfor i in range(k):\n    print(f"{X_data.iloc[indices[i]]} ({importances[indices[i]]})")\n'