In [31]:
import numpy as np
import pandas as pd

In [32]:
df = pd.read_csv('Data/price/clean_price_data')

In [33]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Week Day,Day,Next_High,Next_Low
0,2017-09-11,308.644989,329.451996,307.056,320.884003,893249984,1,0,324.717987,294.541992
1,2017-10-11,320.67099,324.717987,294.541992,299.252991,885985984,1,2,319.453003,298.191986
2,2017-11-11,298.585999,319.453003,298.191986,314.681,842300992,0,5,319.153015,298.513
3,2017-12-11,314.690002,319.153015,298.513,307.90799,1613479936,1,0,328.415009,307.024994
4,2017-11-13,307.024994,328.415009,307.024994,316.716003,1041889984,1,0,340.177002,316.763


In [34]:
df.set_index('Date', inplace=True)
df = df.drop(index=df.index[-1])
df.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Week Day,Day,Next_High,Next_Low
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2023-04-05,1905.168945,1915.871704,1868.629761,1877.704102,6578438233,1,2,1998.413574,1876.946655
2023-05-05,1877.83313,1998.413574,1876.946655,1995.060913,9498261360,1,4,2017.559448,1870.605591
2023-06-05,1995.479126,2017.559448,1870.605591,1900.221802,9623243037,1,0,1933.998413,1873.076416
2023-07-05,1899.917603,1933.998413,1873.076416,1873.076416,6883942236,1,2,1886.162109,1818.450562
2023-08-05,1872.475098,1886.162109,1818.450562,1849.042725,9895102899,0,5,1855.824951,1833.420288


In [35]:
train_size = 0.7
train_index = int(len(df) * train_size)

train_data = df.iloc[:train_index]
test_data = df.iloc[train_index:]

In [36]:
X_train = train_data.drop(['Next_High', 'Next_Low'], axis=1)
y_train = train_data['Next_High']

X_test = test_data.drop(['Next_High', 'Next_Low'], axis=1)
y_test = test_data['Next_High']

In [37]:
from sklearn.preprocessing import RobustScaler

In [38]:
scaler = RobustScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [39]:
from sklearn.linear_model import ElasticNet

In [40]:
linear_model = ElasticNet()

In [41]:
linear_model.fit(X_train_scaled, y_train)

ElasticNet()

In [42]:
linear_pred = linear_model.predict(X_test_scaled)

In [43]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [44]:
RMSE = np.sqrt(mean_squared_error(y_test, linear_pred))
RMSE

106.5580869555413

RMSE of RandomForest was: 156
RMSE of AdaBoostRegressor was: 146
RMSE of GradientBoostingRegressor was: 137

In [45]:
(mean_absolute_error(y_test, linear_pred)/y_train.mean())*100

10.381443335650133

In [46]:
from sklearn.model_selection import GridSearchCV

In [47]:
param_grid = {'alpha':[0.25, 0.5, 0.6, 0.75], 'l1_ratio':[0.5, 0.6, 0.75]}

In [48]:
grid = GridSearchCV(linear_model, param_grid=param_grid, cv=5)

In [49]:
grid.fit(X_train_scaled, y_train)

GridSearchCV(cv=5, estimator=ElasticNet(),
             param_grid={'alpha': [0.25, 0.5, 0.6, 0.75],
                         'l1_ratio': [0.5, 0.6, 0.75]})

In [50]:
grid.best_params_

{'alpha': 0.25, 'l1_ratio': 0.75}

In [51]:
best_linear = ElasticNet(alpha=0.25, l1_ratio=0.75)

In [52]:
best_linear.fit(X_train_scaled, y_train)

ElasticNet(alpha=0.25, l1_ratio=0.75)

In [53]:
best_pred = best_linear.predict(X_test_scaled)

In [54]:
RMSE = np.sqrt(mean_squared_error(y_test, best_pred))
RMSE

74.50322942237855

In [55]:
best_linear = ElasticNet(alpha=0.25, l1_ratio=0.75)

In [56]:
best_linear.fit(X_train_scaled, y_train)

ElasticNet(alpha=0.25, l1_ratio=0.75)

In [57]:
best_pred = best_linear.predict(X_test_scaled)

In [58]:
RMSE = np.sqrt(mean_squared_error(y_test, best_pred))
RMSE

74.50322942237855

In [59]:
import joblib

In [60]:
joblib.dump(value=best_linear, filename='models/linear_model.pkl')

['models/linear_model.pkl']

Second Model