In [None]:
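# NOTE(review): `df` is used below but is not defined in the visible cells; it presumably
# comes from the CMC_Webscraping_Final notebook via the import below -- confirm and re-enable.
# from ipynb.fs.full.CMC_Webscraping_Final import *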

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
%matplotlib inline

In [None]:
# Normalise the scraped price table: give the columns stable names, then turn the
# formatted strings (e.g. "$1,234.56") in every non-date column into numbers.
df.columns = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Market Cap']
for col in df.columns[1:7]:
    # regex=False is required: as a regular expression '$' is the end-of-string
    # anchor, so on pandas >= 2.0 `regex=True` leaves the dollar sign in place and
    # the float conversion below raises ValueError.
    df[col] = (
        df[col]
        .str.replace(',', '', regex=False)
        .str.replace('$', '', regex=False)
        .astype(float)
    )
# Volume and market cap are stored as 64-bit integers.
df['Volume'] = df['Volume'].astype(np.int64)
df['Market Cap'] = df['Market Cap'].astype(np.int64)

In [None]:
# Use the 'Date' column as the row index (it is removed from the regular columns).
# NOTE(review): the index values are not converted to datetimes here -- confirm
# that later label-based date slicing behaves as intended.
df = df.set_index(keys='Date')

In [None]:
def SMA(df, column='Close', windows=(21, 50, 100)):
    """Add simple-moving-average columns to ``df`` in place.

    For every window ``w`` in ``windows`` a column named ``SMA_{w}days`` is
    written, holding the rolling mean of ``df[column]`` over ``w`` rows. The
    first ``w - 1`` rows of each column are NaN.

    The price column is selected by name rather than by position: once 'Date'
    has been moved into the index, position 4 is 'Volume', not 'Close', so the
    previous ``df.iloc[:, 4]`` silently averaged the wrong series.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``; modified in place.
    column : str, default 'Close'
        Name of the column to average.
    windows : iterable of int, default (21, 50, 100)
        Rolling-window lengths, in rows.

    Returns
    -------
    None
    """
    prices = df[column]
    for window in windows:
        df[f'SMA_{window}days'] = prices.rolling(window=window).mean()

In [None]:
def EMA(df):
    """Add exponential-moving-average columns of 'Close' to ``df`` in place.

    Writes 'EMA_21days', 'EMA_50days' and 'EMA_100days', each computed with
    ``ewm(span=<n>, adjust=False)`` over ``df['Close']``. Returns None.
    """
    closes = df['Close']
    spans = (('EMA_21days', 21), ('EMA_50days', 50), ('EMA_100days', 100))
    for label, span in spans:
        df[label] = closes.ewm(span=span, adjust=False).mean()

In [None]:
def RSIs(df):
    """Add RSI and Stochastic RSI columns (plus their helper columns) to ``df``.

    Columns written, in this order:
      * 'diff' - row-to-row change of 'Close'
      * 'pos'  - the change with negative values clipped to 0
      * 'neg'  - the change with positive values clipped to 0, sign flipped
      * 'RSI'  - 100 - 100 / (1 + RS), where RS is the ratio of the
                 exponentially weighted means (com=13, adjust=False) of
                 'pos' and 'neg'
      * 'Stochastic_RSI' - RSI rescaled by its own 14-row rolling min and max

    The frame is modified in place; nothing is returned.
    """
    delta = df['Close'].diff()
    df['diff'] = delta
    df['pos'] = delta.clip(lower=0)
    df['neg'] = -1 * delta.clip(upper=0)

    smoothing = dict(com=13, adjust=False)
    avg_gain = df['pos'].ewm(**smoothing).mean()
    avg_loss = df['neg'].ewm(**smoothing).mean()
    df['RSI'] = 100 - (100 / (1 + avg_gain / avg_loss))

    # Same 14-row window serves both the minimum and the maximum.
    rsi_window = df['RSI'].rolling(14)
    rsi_low = rsi_window.min()
    df['Stochastic_RSI'] = (df['RSI'] - rsi_low) / (rsi_window.max() - rsi_low)

In [None]:
# Add every technical indicator to the price frame; each helper writes its
# columns into `df` in place.
for add_indicator in (SMA, EMA, RSIs):
    add_indicator(df)

In [None]:
# Build the modelling frame: a new frame without the RSI helper columns and the
# raw OHL prices, keeping only rows where every remaining column is populated
# (the rolling indicators are NaN until their windows fill).
# errors='ignore': 'Log Return' is not created by any cell shown in this notebook,
# so a strict drop raises KeyError whenever that column is absent.
unused_columns = ['pos', 'neg', 'Log Return', 'diff', 'Open', 'High', 'Low']
df1 = df.drop(columns=unused_columns, errors='ignore').dropna()

Machine Learning:

In [None]:
import numpy as np
import pandas as pd
import pandas_datareader.data as web
from matplotlib import pyplot
import seaborn as sns
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor

#Libraries for Deep Learning Models
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.optimizers import SGD
from keras.layers import LSTM
from keras.wrappers.scikit_learn import KerasRegressor

#Libraries for Statistical Models
import statsmodels.api as sm

#Libraries for Saving the Model
from pickle import dump
from pickle import load

# Time series Models
from statsmodels.tsa.arima.model import ARIMA
import statsmodels.api as sm

from statsmodels.tsa.statespace.sarimax import SARIMAX

# Error Metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Feature Selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_regression


#Plotting 
from pandas.plotting import scatter_matrix
from statsmodels.graphics.tsaplots import plot_acf

In [None]:
# Correlation of every column with 'Close', strongest positive first.
close_corr = df1.corr()['Close']
close_corr.sort_values(ascending=False)

In [None]:
# Keep only the rows from the start label onwards (label-based slice on the index).
# NOTE(review): assumes the index labels are comparable with this date string -- confirm.
start_date = '2021-02-01'
df1 = df1.loc[start_date:]

In [None]:
# Number of rows left after the date filter.
df1.shape[0]

In [None]:
# Features are every column except the target; the target is the closing price.
# (The stray opening parenthesis in the original made this cell a SyntaxError.)
X = df1.drop('Close', axis=1)
Y = df1['Close'].copy()

In [None]:
# Annotated heatmap of the pairwise correlations between all modelling columns.
correlation = df1.corr()
fig, ax = plt.subplots(figsize=(20, 20))
ax.set_title('Correlation Matrix')
sns.heatmap(correlation, vmax=1, square=True, annot=True, cmap='cubehelix', ax=ax)

In [None]:
# Pairwise scatter plots of all modelling columns.
# scatter_matrix creates its own figure (sized via `figsize`), so no separate
# plt.figure() call is made -- that call only produced an extra, empty figure.
scatter_matrix(df1, figsize=(20, 20))
plt.show()

In [None]:
# Univariate feature ranking: score each feature against the target with
# f_regression and list the features by descending score.
bestfeatures = SelectKBest(score_func=f_regression, k=5)
fit = bestfeatures.fit(X, Y)

# Pair feature names with their scores in a single two-column table.
dfcolumns = pd.DataFrame(X.columns)
dfscores = pd.DataFrame(fit.scores_)
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']

featureScores.set_index('Specs').nlargest(20, 'Score')

In [None]:
# Ordered (unshuffled) split: the first 70% of rows train the models and the
# remaining 30% are held out for validation.
validation_size = 0.3
train_size = int(len(X) * (1 - validation_size))
X_train, X_validation = X.iloc[:train_size], X.iloc[train_size:]
Y_train, Y_validation = Y.iloc[:train_size], Y.iloc[train_size:]

In [None]:
# Cross-validation settings shared by the model-comparison cells.
scoring = 'neg_mean_squared_error'  # scikit-learn reports MSE negated
num_folds, seed = 10, 7

In [None]:
# Candidate regressors as (short name, estimator) pairs.
# The ensemble estimators are randomised; they receive `random_state=seed` so that
# repeated runs of the notebook give the same scores (`seed` was previously
# defined but never used).
models = [
    ('KNN', KNeighborsRegressor()),
    # Boosting methods
    ('ABR', AdaBoostRegressor(random_state=seed)),
    ('GBR', GradientBoostingRegressor(random_state=seed)),
    # Bagging methods
    ('RFR', RandomForestRegressor(random_state=seed)),
]

# Candidates currently disabled:
# models.append(('LR', LinearRegression()))
# models.append(('LASSO', Lasso()))
# models.append(('EN', ElasticNet()))
# models.append(('CART', DecisionTreeRegressor()))
# models.append(('SVR', SVR()))
# models.append(('MLP', MLPRegressor()))  # author's note: gives a significant error
# models.append(('ETR', ExtraTreesRegressor()))

In [None]:
# For every candidate model collect: the per-fold cross-validation MSE on the
# training set, the MSE after fitting on the full training set, and the MSE on
# the validation set. One summary line per model is printed as
# "name: cv mean (cv std) train MSE validation MSE".
names, kfold_results, train_results, test_results = [], [], [], []

# The splitter does not depend on the model, so it is built once.
kfold = KFold(n_splits=num_folds, random_state=None)

for name, model in models:
    names.append(name)

    # The scorer returns negated MSE; flip the sign so that lower is better.
    cv_results = -1 * cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    kfold_results.append(cv_results)

    # Fit on the whole training period, then score both partitions.
    res = model.fit(X_train, Y_train)
    train_result = mean_squared_error(Y_train, res.predict(X_train))
    test_result = mean_squared_error(Y_validation, res.predict(X_validation))
    train_results.append(train_result)
    test_results.append(test_result)

    msg = "%s: %f (%f) %f %f" % (name, cv_results.mean(), cv_results.std(), train_result, test_result)
    print(msg)

In [None]:
# Box plot of the per-fold cross-validation MSE, one box per model.
fig, ax = plt.subplots(figsize=(15, 8))
fig.suptitle('Kfold analysis between various algorithms')
ax.boxplot(kfold_results)
ax.set_xticklabels(names)
plt.show()

In [None]:
# Grouped bar chart: training vs. validation MSE for every model.
ind = np.arange(len(names))  # the x locations for the groups
width = 0.35  # the width of the bars

fig, ax = plt.subplots(figsize=(15, 8))
fig.suptitle('Algorithm Comparison')
ax.bar(ind - width/2, train_results, width=width, label='Train Error')
ax.bar(ind + width/2, test_results, width=width, label='Test Error')
ax.set_xticks(ind)
ax.set_xticklabels(names)
# The bars are mean squared errors; label the axis so the figure stands alone.
ax.set_ylabel('Mean Square Error')
ax.legend()
plt.show()

In [None]:
# Grouped bar chart: training vs. validation MSE for every model.
ind = np.arange(len(names))  # the x locations for the groups
width = 0.35  # the width of the bars

fig, ax = plt.subplots(figsize=(15, 8))
# Title spelling corrected ("algorthims" -> "algorithms").
fig.suptitle('Comparing the performance of various algorithms on the Train and Test Dataset')
ax.bar(ind - width/2, train_results, width=width, label='Train Error')
ax.bar(ind + width/2, test_results, width=width, label='Test Error')
ax.set_xticks(ind)
ax.set_xticklabels(names)
ax.set_ylabel('Mean Square Error')
ax.legend()
plt.show()

In [None]:
# Final model: k-nearest-neighbours regressor with 3 neighbours, fitted on the
# training partition.
model = KNeighborsRegressor(n_neighbors=3)
model.fit(X=X_train, y=Y_train)

In [None]:
# Predict the validation partition and print its MSE followed by its R^2.
predictions = model.predict(X_validation)
for metric in (mean_squared_error, r2_score):
    print(metric(Y_validation, predictions))

In [None]:
# Table of validation predictions. The column is named explicitly because the
# plotting cell selects it as 'Prediction'; without a name the column is labelled 0
# and that selection raises KeyError.
df2 = pd.DataFrame(predictions, columns=['Prediction'])

In [None]:
# df2.columns = ['Prediction']
# Attach the actual validation targets by position; going through a plain list
# discards the Series index so values are not aligned on index labels.
y_val = list(Y_validation)
df2['Y_validation'] = y_val

In [None]:
# Line plot of predicted vs. actual values over the validation rows.
fig, ax = plt.subplots(figsize=(12, 8))
compared = df2.loc[:, ['Prediction', 'Y_validation']]
compared.plot(ax=ax)
plt.show()

In [None]:
# Predictions reshaped to one dimension and wrapped in a Series.
flat_predictions = np.reshape(predictions, (len(predictions),))
test_prediction = pd.Series(flat_predictions)

In [None]:
# Mean squared error of the model's predictions on the validation partition.
result = mean_squared_error(y_true=Y_validation, y_pred=predictions)

In [None]:
# Display the validation mean squared error computed in the previous cell.
result