In [1]:
import os
from pathlib import Path
from statistics import mean

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [2]:
# Directory that holds the price history and the per-model prediction CSVs.
# Set the BTC_DATA_DIR environment variable to point elsewhere; the default is
# the original location, so existing setups keep working unchanged.
DATA_DIR = Path(os.environ.get("BTC_DATA_DIR", "/Users/mz195"))

# Each file is read with pandas defaults; the tables are later joined on their
# shared `time_` column.
actual = pd.read_csv(DATA_DIR / "BTC_price_db.csv")
arima = pd.read_csv(DATA_DIR / "arima_predictions.csv")
varmax = pd.read_csv(DATA_DIR / "varmax_predictions.csv")
ses = pd.read_csv(DATA_DIR / "ses_predictions.csv")

In [3]:
# Report the (rows, columns) shape of every loaded table, one line per table.
for template, frame in (
    ("Actual:\t{}", actual),
    ("ARIMA:\t{}", arima),
    ("VARMAX:\t{}", varmax),
    ("SES:\t{}", ses),
):
    print(template.format(frame.shape))

Actual:	(11333, 2)
ARIMA:	(5675, 2)
VARMAX:	(3506, 2)
SES:	(3023, 2)


In [4]:
# Join the actual prices with each model's predictions on the shared `time_`
# key. The default inner join keeps only timestamps present in every table.
arima_df = pd.merge(actual, arima, on="time_")
arima_varmax_df = pd.merge(arima_df, varmax, on="time_")
all_models_df = pd.merge(arima_varmax_df, ses, on="time_")
all_models_df.shape

(3017, 5)

In [5]:
# Preview the first five rows of the merged table.
all_models_df.head(n=5)

Unnamed: 0,time_,price_avg,arima,varmax,ses
0,2021-06-27 22:03:30,32981.63915,32966.545305,32973.673844,32970.854879
1,2021-06-27 22:04:00,32973.2575,32984.204069,32978.171274,32971.348904
2,2021-06-27 22:04:30,32973.2575,32979.936616,32966.950351,32962.978442
3,2021-06-27 22:05:00,32965.21585,32975.250152,32967.764012,32965.962593
4,2021-06-27 22:05:30,32965.21585,32971.040485,32968.27105,32969.270639


In [6]:
# Features: every column except the timestamp and the target.
# Target: the observed average price.
X = all_models_df.drop(columns=["time_", "price_avg"])
y = all_models_df["price_avg"]

In [7]:
# Repeated random train/test resampling of a degree-2 polynomial regression
# that combines the individual model predictions in X to estimate y.
# Per-split metrics are collected in the lists below and averaged afterwards.
#
# NOTE(review): the rows look like consecutive 30-second observations. A
# shuffled split lets the model train on timestamps adjacent to the test rows,
# so these scores are probably optimistic compared with a chronological
# split - confirm whether that is acceptable for this evaluation.
N_SPLITS = 1000        # number of independent random splits
TRAIN_FRACTION = 0.75  # share of rows used for fitting in each split
POLY_DEGREE = 2        # degree of the polynomial feature expansion

MAE_list = []
RMSE_list = []
MSE_list = []
test_score_list = []
train_score_list = []

for split_seed in range(N_SPLITS):
    # Seed every split with its own index: each split is still different, but
    # the whole experiment is reproducible on Restart & Run All.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=TRAIN_FRACTION, random_state=split_seed
    )

    # Fit the scaler on the training rows only so that no test-set statistics
    # leak into the model.
    scaler = StandardScaler()
    scaled_X_train = scaler.fit_transform(X_train)
    scaled_X_test = scaler.transform(X_test)

    # PolynomialFeatures accepts the scaled arrays directly.
    poly_reg = PolynomialFeatures(degree=POLY_DEGREE)
    poly_train = poly_reg.fit_transform(scaled_X_train)
    poly_test = poly_reg.transform(scaled_X_test)

    lr = LinearRegression()
    lr.fit(poly_train, y_train)
    # LinearRegression.score returns the coefficient of determination (R^2).
    train_score_list.append(lr.score(poly_train, y_train))
    test_score_list.append(lr.score(poly_test, y_test))

    preds = lr.predict(poly_test)
    mse = mean_squared_error(y_test, preds)
    MAE_list.append(mean_absolute_error(y_test, preds))
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the square root of the MSE works on every version.
    RMSE_list.append(mse ** 0.5)
    MSE_list.append(mse)

In [8]:
# Average each per-split metric gathered by the resampling loop and print it.
# The two "Score" rows are R^2 values; the remaining rows are error metrics.
print("---- Cost functions for LR ----")
for label, values in (
    ("Avg Training Score:\t\t", train_score_list),
    ("Avg Testing Score:\t\t", test_score_list),
    ("Avg MAE:\t\t\t", MAE_list),
    ("Avg RMSE:\t\t\t", RMSE_list),
    ("Avg MSE:\t\t\t", MSE_list),
):
    print(label, mean(values))

---- Cost functions for LR ----
Avg Training Score:		 0.9462228586593273
Avg Testing Score:		 0.944983571396139
Avg MAE:			 46.373390304247295
Avg RMSE:			 88.27513585396663
Avg MSE:			 7860.049369869503
