#Random Forest Regression



##Objective: Construct a Random Forest Regression to predict stock prices based on raw and enguneered features.

##Import Libraries

In [1]:
import numpy as np
import pandas as pd
import pandas_datareader.data as web

import yfinance as yf

import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
from IPython import display
display.set_matplotlib_formats('svg') # Improve matplotlib visual quality

  display.set_matplotlib_formats('svg') # Improve matplotlib visual quality


##Import Dataset

In [3]:
# Financial Companies: ['BAC', 'WFC', 'JPM', 'GS', 'MS', 'UBS']
ticker = ['GS']
period = 'max'

stock_data = yf.download(ticker, period = period)[['Adj Close', 'Volume']].copy()
stock_data

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1999-05-04,50.231976,22320900
1999-05-05,49.339729,7565700
1999-05-06,48.492142,2905700
1999-05-07,52.908619,4862300
1999-05-10,50.455006,2589400
...,...,...
2024-07-01,463.660004,1850500
2024-07-02,465.609985,1491800
2024-07-03,467.920013,993900
2024-07-05,464.750000,1593800


##Data Preprocessing

###Feature Engineering

Features Engineered:


* **Lagged values:** stock prices/returns often exhibit autocorrelation, past returns can influence future returns.
*   **Moving Averages:** they identify trends or cycles & crossovers of a short Simple Moving Average and long Simple Moving Averages often signal buy/sell periods.
* **Exponential Moving Average:** similar to a moving average but higher weight is placed on recent prices/returns which makes the indicator more responsive to information.
* **Volatility:** the degree of variation in stock prices, which is regarded as the riskiness of an asset.
* **Volume Moving Average:** high volume often indicates significant price movement and strong investor interest.
* **Treasury Rate:** the interest rates on debt securities issued by the U.S. Treasury. The relationship between Treasury rates and stock prices are generally considered inversely related.
* **Federal Funds Rate:** rate ar which depository institutions lend reserve balances to other depository institutions overnight. The rate influences other interest rates throughout the economy, such as, mortgages, credit card, and business loans.


In [4]:


# Lagged value
for lag in [1, 5, 10, 15]:
  stock_data[f'Lagged {lag}'] = stock_data['Adj Close'].shift(lag)

# Moving Averages
for window in [5, 14, 30]:
  stock_data[f'MA_{window}'] = stock_data['Adj Close'].rolling(window).mean()

# Exponential Moving Averages
for span in [12, 26]:
  stock_data[f'EMA_{span}'] = stock_data['Adj Close'].ewm(span = span, adjust=False).mean()

# Volatility Averages
for vol in [5, 14, 30]:
  stock_data[f'Volatility_{vol}'] = stock_data['Adj Close'].rolling(vol).std()

# Volume Averages
for window in [5, 14, 30]:
  stock_data[f'Volume_{window}_MA'] = stock_data['Volume'].rolling(window).mean()

# # Fetch interest rate data from FRED
# start_date = stock_data.index.min().strftime('%Y-%m-%d')
# end_date = stock_data.index.max().strftime('%Y-%m-%d')

# interest_rate = web.DataReader('DGS10', 'fred', start_date, end_date)
# interest_rate.columns = ['10Y_Treasury_Rate']
# stock_data = stock_data.join(interest_rate, how='left')
# stock_data['10Y_Treasury_Rate'].fillna(method='ffill', inplace=True)

# # Fetch Federal Funds Rate data from FRED
# fed_rate = web.DataReader('FEDFUNDS', 'fred', start_date, end_date)
# fed_rate.columns = ['Fed_Rate']
# stock_data = stock_data.join(fed_rate, how='left')
# stock_data['Fed_Rate'].fillna(method='ffill', inplace=True)

In [5]:
stock_data

Unnamed: 0_level_0,Adj Close,Volume,Lagged 1,Lagged 5,Lagged 10,Lagged 15,MA_5,MA_14,MA_30,EMA_12,EMA_26,Volatility_5,Volatility_14,Volatility_30,Volume_5_MA,Volume_14_MA,Volume_30_MA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1999-05-04,50.231976,22320900,,,,,,,,50.231976,50.231976,,,,,,
1999-05-05,49.339729,7565700,50.231976,,,,,,,50.094707,50.165883,,,,,,
1999-05-06,48.492142,2905700,49.339729,,,,,,,49.848158,50.041902,,,,,,
1999-05-07,52.908619,4862300,48.492142,,,,,,,50.318998,50.254252,,,,,,
1999-05-10,50.455006,2589400,52.908619,,,,50.285494,,,50.339923,50.269122,1.660043,,,8048800.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-07-01,463.660004,1850500,452.320007,462.089996,446.459991,454.910004,455.035999,452.777854,455.388833,454.829077,452.473649,6.524900,6.298291,5.819654,2458100.0,2.184421e+06,2.133507e+06
2024-07-02,465.609985,1491800,463.660004,457.380005,450.179993,453.549988,456.681995,454.302139,455.412280,456.487678,453.446711,8.109649,6.653176,5.860597,2429780.0,2.152536e+06,2.128037e+06
2024-07-03,467.920013,993900,465.609985,455.859985,457.429993,444.269989,459.094000,455.674997,455.671104,458.246499,454.518808,9.481484,7.354319,6.236679,2202220.0,2.099743e+06,2.074783e+06
2024-07-05,464.750000,1593800,467.920013,445.959991,458.049988,448.700012,462.852002,456.989282,455.576758,459.247038,455.276674,6.092401,7.202219,6.069475,1953940.0,2.081507e+06,2.035823e+06


In [6]:
# stock_data.drop(columns='Adj Close', inplace = True)
stock_data.dropna(inplace = True)

In [7]:
stock_data

Unnamed: 0_level_0,Adj Close,Volume,Lagged 1,Lagged 5,Lagged 10,Lagged 15,MA_5,MA_14,MA_30,EMA_12,EMA_26,Volatility_5,Volatility_14,Volatility_30,Volume_5_MA,Volume_14_MA,Volume_30_MA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1999-06-15,44.789429,831700,43.718754,45.503201,45.815468,46.618473,45.128463,46.089513,47.965725,45.803437,47.050936,1.089818,1.371866,2.430475,779540.0,1.032486e+06,2.374037e+06
1999-06-16,46.573841,690300,44.789429,46.752289,45.146317,46.038528,45.092773,46.048086,47.843787,45.921961,47.015596,1.024299,1.345764,2.404481,795320.0,9.673000e+05,1.653017e+06
1999-06-17,46.395428,955700,46.573841,45.235527,45.413975,47.153812,45.324754,45.907881,47.745644,45.994802,46.969657,1.183661,1.178407,2.401403,853620.0,9.483857e+05,1.432683e+06
1999-06-18,46.261578,613900,46.395428,45.146317,46.663078,48.358303,45.547806,45.748555,47.671291,46.035844,46.917207,1.245115,0.925854,2.412001,782540.0,9.278143e+05,1.356290e+06
1999-06-21,49.607399,1811000,46.261578,43.718754,47.064571,48.492142,46.725535,46.019407,47.561251,46.585314,47.116481,1.760745,1.386829,2.233524,980520.0,9.890357e+05,1.254580e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-07-01,463.660004,1850500,452.320007,462.089996,446.459991,454.910004,455.035999,452.777854,455.388833,454.829077,452.473649,6.524900,6.298291,5.819654,2458100.0,2.184421e+06,2.133507e+06
2024-07-02,465.609985,1491800,463.660004,457.380005,450.179993,453.549988,456.681995,454.302139,455.412280,456.487678,453.446711,8.109649,6.653176,5.860597,2429780.0,2.152536e+06,2.128037e+06
2024-07-03,467.920013,993900,465.609985,455.859985,457.429993,444.269989,459.094000,455.674997,455.671104,458.246499,454.518808,9.481484,7.354319,6.236679,2202220.0,2.099743e+06,2.074783e+06
2024-07-05,464.750000,1593800,467.920013,445.959991,458.049988,448.700012,462.852002,456.989282,455.576758,459.247038,455.276674,6.092401,7.202219,6.069475,1953940.0,2.081507e+06,2.035823e+06


###Splitting the Dataset into Train & Test Sets

In [8]:
x = stock_data.drop(columns = 'Adj Close')
y = stock_data['Adj Close']

In [9]:
x.shape, y.shape

((6306, 16), (6306,))

In [10]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.05, random_state=1, shuffle=False)

In [11]:
x_train.shape, y_train.shape

((5990, 16), (5990,))

In [12]:
x_test

Unnamed: 0_level_0,Volume,Lagged 1,Lagged 5,Lagged 10,Lagged 15,MA_5,MA_14,MA_30,EMA_12,EMA_26,Volatility_5,Volatility_14,Volatility_30,Volume_5_MA,Volume_14_MA,Volume_30_MA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2023-04-04,1536800,314.608521,306.823273,305.705597,310.397949,311.822021,305.686329,319.847321,310.273876,316.201480,2.894199,6.182523,18.199520,1711780.0,2.686236e+06,2.978687e+06
2023-04-05,1329300,310.879700,309.020142,302.227325,300.801270,311.978101,306.129547,318.665584,310.201055,315.727336,2.721196,6.243221,17.635257,1682460.0,2.466964e+06,2.949537e+06
2023-04-06,1311000,309.800537,309.424774,303.364258,303.595490,312.220905,307.427545,317.427284,310.268399,315.350407,2.479875,4.936425,16.804546,1653400.0,2.105621e+06,2.928857e+06
2023-04-10,1114400,310.638794,315.176971,301.167419,292.466827,311.806598,308.489482,316.261320,310.704867,315.184113,1.986513,4.374720,15.786757,1420060.0,1.902100e+06,2.885050e+06
2023-04-11,1559600,313.105438,314.608521,306.910004,298.238312,311.970398,309.183905,315.119185,311.431429,315.202143,2.286433,4.661062,14.469241,1370220.0,1.781900e+06,2.842290e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-07-01,1850500,452.320007,462.089996,446.459991,454.910004,455.035999,452.777854,455.388833,454.829077,452.473649,6.524900,6.298291,5.819654,2458100.0,2.184421e+06,2.133507e+06
2024-07-02,1491800,463.660004,457.380005,450.179993,453.549988,456.681995,454.302139,455.412280,456.487678,453.446711,8.109649,6.653176,5.860597,2429780.0,2.152536e+06,2.128037e+06
2024-07-03,993900,465.609985,455.859985,457.429993,444.269989,459.094000,455.674997,455.671104,458.246499,454.518808,9.481484,7.354319,6.236679,2202220.0,2.099743e+06,2.074783e+06
2024-07-05,1593800,467.920013,445.959991,458.049988,448.700012,462.852002,456.989282,455.576758,459.247038,455.276674,6.092401,7.202219,6.069475,1953940.0,2.081507e+06,2.035823e+06


##Train the Random Forest Regression Model

In [13]:
# bootstrap = False
# Time series data have a temporal order that should be preserved. Bootstrapping can disrupt this order because it involves random sampling with replacement,
# potentially mixing data points from different time periods and breaking the time dependency.

rf = RandomForestRegressor(n_estimators = 1000,bootstrap=False, random_state = 0)
rf.fit(x_train, y_train)

##Predict returns with model

In [14]:
pred = rf.predict(x_test)

In [15]:
pred.shape, y_test.shape

((316,), (316,))

In [16]:
data = {
        'True Value': y_test.values.flatten(),
        'predicted': pred.flatten(),
        }

test_results = pd.DataFrame(data, index = y_test.index)
train_values = pd.DataFrame({'Trained Value': y_train})

combined = pd.concat([train_values, test_results])
combined

Unnamed: 0_level_0,Trained Value,True Value,predicted
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1999-06-15,44.789429,,
1999-06-16,46.573841,,
1999-06-17,46.395428,,
1999-06-18,46.261578,,
1999-06-21,49.607399,,
...,...,...,...
2024-07-01,,463.660004,391.605174
2024-07-02,,465.609985,391.605174
2024-07-03,,467.920013,391.605174
2024-07-05,,464.750000,392.987793


##Visualize results

In [17]:
px.line(combined, title = f'{ticker[0]} True vs. Predicted Price')

In [18]:
rmse = np.sqrt(mean_squared_error(y_test, pred))
print('Root Mean Squared Error is:', rmse)

Root Mean Squared Error is: 25.063242706573146
