In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
import plotly as py
import plotly.io as pio
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [None]:
# First input table, read from a CSV in the working directory.
# NOTE(review): presumably the Amazon price history (Open/High/Low/Close/Volume
# columns are referenced further down) -- confirm against the file.
df1 = pd.read_csv(filepath_or_buffer='Amazon.csv')

In [None]:
# Second input table, read from a CSV in the working directory.
# NOTE(review): presumably per-date tweet sentiment (an 'Avg_Compound' column is
# used later); the file name is spelled exactly as in the original -- confirm.
df2 = pd.read_csv(filepath_or_buffer='Aamazontweet.csv')

In [None]:
# Left join on 'Date': every row of df1 is kept, df2 columns are attached where
# the date matches and left as NaN otherwise.
data = df1.merge(df2, on='Date', how='left')

In [None]:
# Preview only the first rows of the merged frame instead of dumping all of it.
data.head()

In [None]:
# Discard the columns that are not used as features or target. The two
# 'Unnamed: 0_*' columns are the suffixed leftovers of the merge.
# `columns=` is used because a positional axis argument to DataFrame.drop is
# deprecated in pandas 1.3 and rejected by pandas 2.0.
data = data.drop(
    columns=['Volume', 'Open', 'Low', 'High', 'Unnamed: 0_x', 'Unnamed: 0_y', 'Adj Close']
)

In [None]:
# Trailing 3-row and 7-row means of 'Avg_Compound', each moved down by one row
# so that a row only sees averages computed from the rows before it.
data['Sen_Avr_3'] = data['Avg_Compound'].rolling(window=3).mean().shift(periods=1)
data['Sen_Avr_7'] = data['Avg_Compound'].rolling(window=7).mean().shift(periods=1)

In [None]:
from sklearn.impute import KNNImputer

# Missing-value imputer shared by every pipeline below: each gap is filled from
# the 2 nearest neighbours, all weighted equally.
imputer = KNNImputer(weights="uniform", n_neighbors=2)

In [None]:
# Pull 'Close' up by one row: each row now carries the following row's close,
# which is the value the models are trained to predict.
data['Close'] = data['Close'].shift(periods=-1)

In [None]:
# Drop the first 25 rows as a warm-up window and the last row, whose 'Close' is
# NaN after the upward shift; then renumber the rows from 0.
# NOTE(review): the original comment attributes the 25 to "moving averages and
# MACD line", but no MACD is computed in this notebook and the longest rolling
# window visible here is 7 -- confirm the offset is still intended.
data = data.iloc[25:-1].reset_index(drop=True)

In [None]:
# Fraction of the (chronologically last) rows held out for validation.
validation_size = 0.01

# Number of held-out rows. Slicing with an explicit positive end keeps the
# split correct when this count is 0 (fewer than 1 / validation_size rows):
# the former `data[:-0]` would have produced an empty training frame.
n_validation = int(len(data) * validation_size)
data_train = data.iloc[:len(data) - n_validation]

In [None]:
# Last `validation_size` share of the rows. The start position is computed
# explicitly so that a hold-out count of 0 yields an empty frame; the former
# `data[-0:]` would have returned the entire data set instead.
data_validation = data.iloc[len(data) - int(len(data) * validation_size):]

In [None]:
# 'Date' is not a model input; remove it from both splits.
# `columns=` replaces the positional axis argument, which pandas 2.0 rejects.
data_validation = data_validation.drop(columns=['Date'])
data_train = data_train.drop(columns=['Date'])

In [None]:
# Target is the 'Close' column; features are every remaining column.
# `columns=` replaces the positional axis argument, which pandas 2.0 rejects.
y_data_train = data_train['Close'].copy()
X_data_train = data_train.drop(columns=['Close'])

y_data_validation = data_validation['Close'].copy()
X_data_validation = data_validation.drop(columns=['Close'])

In [None]:
from sklearn.model_selection import  GridSearchCV,TimeSeriesSplit, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Standardising step placed before the regressor in every pipeline below.
sc = StandardScaler()
# Cross-validation splitter with 5 time-ordered train/test splits.
tscv = TimeSeriesSplit(n_splits=5)

In [None]:
# NumPy views of the training split: all columns except 'Close' as the feature
# matrix, 'Close' as the target vector.
x = data_train.drop(columns='Close').to_numpy()
y = data_train['Close'].to_numpy()

In [None]:
from sklearn import linear_model

# Lasso regression behind the shared imputer and scaler steps.
ll = linear_model.Lasso()
pipe = Pipeline(steps=[('imputer', imputer), ('scaler', sc), ('regressor', ll)])
param_grid = {
    'regressor__alpha': [0.0000001, 0.000001, 0.0001, 0.001, 0.01, 0.1],
}
# The inner search uses the same time-ordered splitter as the outer evaluation.
# GridSearchCV's default is an unshuffled K-fold, which for most folds trains
# on rows that come after the rows it validates on.
search_lasso = GridSearchCV(pipe, param_grid, cv=tscv, n_jobs=-1)
# Nested evaluation: the whole grid search is re-run inside each outer split.
scores_lasso = cross_validate(
    search_lasso, x, y, scoring=['r2', 'neg_mean_squared_error'], cv=tscv
)

In [None]:
# Per-fold and mean test scores of the Lasso search (NMSE = negated MSE).
for label, key in (('r2', 'test_r2'), ('NMSE', 'test_neg_mean_squared_error')):
    fold_values = scores_lasso[key]
    print('Fold ' + label, fold_values)
    print('Average ' + label, np.mean(fold_values))

In [None]:
# Run the grid search on the full training arrays, then show the chosen alpha.
search_lasso.fit(X=x, y=y)
search_lasso.best_params_

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares behind the shared imputer and scaler steps.
lr = LinearRegression()
pipe = Pipeline(steps=[('imputer', imputer), ('scaler', sc), ('regressor', lr)])
param_grid = {
    'regressor__fit_intercept': [True, False],
}
# The inner search uses the same time-ordered splitter as the outer evaluation.
# GridSearchCV's default is an unshuffled K-fold, which for most folds trains
# on rows that come after the rows it validates on.
search_linear = GridSearchCV(pipe, param_grid, cv=tscv, n_jobs=-1)
# Nested evaluation: the whole grid search is re-run inside each outer split.
scores_linear = cross_validate(
    search_linear, x, y, scoring=['r2', 'neg_mean_squared_error'], cv=tscv
)

In [None]:
# Per-fold and mean test scores of the linear-regression search (NMSE = negated MSE).
for label, key in (('r2', 'test_r2'), ('NMSE', 'test_neg_mean_squared_error')):
    fold_values = scores_linear[key]
    print('Fold ' + label, fold_values)
    print('Average ' + label, np.mean(fold_values))

In [None]:
# Run the grid search on the full training arrays, then show the chosen setting.
search_linear.fit(X=x, y=y)
search_linear.best_params_

In [None]:
from hyperopt import tpe, hp, fmin, STATUS_OK,Trials, space_eval
from hyperopt.pyll.base import scope

## SVR

In [None]:
from sklearn.model_selection import cross_val_score, cross_validate

In [None]:
# Scorers passed to cross_validate; each result key reuses the scorer's own name.
scoring = {scorer: scorer for scorer in ('r2', 'neg_mean_squared_error')}

In [None]:
def objective(params, random_state=42, cv=tscv, x=x, y=y):
    """Loss minimised by hyperopt: mean cross-validated MSE of one candidate.

    Parameters
    ----------
    params : dict
        One sampled point of the search space, keyed by pipeline parameter
        names (e.g. ``regressor__C``). Integer-only hyperparameters must
        arrive as ints, since the values are handed to ``set_params`` as-is.
    random_state : int
        Unused; kept so existing callers that pass it keep working.
    cv : cross-validation splitter
        Splitter forwarded to ``cross_val_score``.
    x, y : array-like
        Training features and target.

    Returns
    -------
    float
        Mean squared error averaged over the folds (lower is better).
    """
    from sklearn.base import clone

    # Work on a copy of the module-level `pipe` so the sampled settings are
    # actually evaluated without altering the shared pipeline between trials.
    candidate = clone(pipe).set_params(**params)
    score = cross_val_score(
        candidate, x, y, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1
    ).mean()
    # The scorer yields the negated MSE (higher is better) while fmin
    # minimises, so the sign is flipped back to a plain MSE.
    return -score

In [None]:
def metric(params, random_state=42, cv=tscv, x=x, y=y):
    """Cross-validate the pipeline configured with the winning hyperparameters.

    Parameters
    ----------
    params : dict
        Result of ``fmin`` for the module-level ``space``. ``fmin`` reports
        ``hp.choice`` entries as option indices, so the dict is translated
        back to real values with ``space_eval`` before it is applied.
    random_state : int
        Unused; kept so existing callers that pass it keep working.
    cv : cross-validation splitter
        Splitter forwarded to ``cross_validate``.
    x, y : array-like
        Training features and target.

    Returns
    -------
    dict
        Output of ``cross_validate`` with one ``test_<name>`` entry for each
        key of the module-level ``scoring`` mapping.
    """
    from sklearn.base import clone

    # Apply the tuned settings to a copy of the module-level `pipe`; without
    # this the reported scores would belong to the untuned pipeline.
    candidate = clone(pipe).set_params(**space_eval(space, params))
    score = cross_validate(candidate, x, y, cv=cv, scoring=scoring, n_jobs=-1)
    return score

In [None]:
# Search space for the SVR pipeline: every entry is a categorical choice whose
# hyperopt label equals the pipeline parameter name it tunes.
space = {
    param_name: hp.choice(param_name, options)
    for param_name, options in (
        ('regressor__C', [10, 100, 1000, 10000]),
        ('regressor__gamma', [0.0000001, 0.000001, 0.0001, 0.001, 0.01, 0.1]),
        ('regressor__kernel', ['linear', 'poly', 'rbf', 'sigmoid']),
    )
}


In [None]:
from sklearn.svm import SVR

# Support-vector regressor behind the shared imputer and scaler steps.
SV = SVR()
pipe = Pipeline(steps=[('imputer', imputer), ('scaler', sc), ('regressor', SV)])

# `trials` records every evaluation made by the search.
trials = Trials()
best_param_SVC = fmin(
    fn=objective,                      # function hyperopt minimises
    space=space,                       # SVR search space defined above
    algo=tpe.suggest,                  # Tree-structured Parzen Estimator
    trials=trials,
    max_evals=100,                     # number of evaluations
    rstate=np.random.default_rng(42),  # seeded generator for a repeatable search
)
# Cross-validated r2 / negated MSE reported for the search result.
metrics = metric(best_param_SVC, cv=tscv, x=x, y=y)


In [None]:
# Per-fold and mean test scores from the latest `metric` call (NMSE = negated MSE).
for label, key in (('r2', 'test_r2'), ('NMSE', 'test_neg_mean_squared_error')):
    fold_values = metrics[key]
    print('Fold ' + label, fold_values)
    print('Average ' + label, np.mean(fold_values))

In [None]:
# Translate the fmin result (hp.choice entries are option indices) back into
# the actual hyperparameter values and display them.
space_eval(space=space, hp_assignment=best_param_SVC)

In [None]:
# Configure the SVR pipeline with the selected hyperparameters. set_params
# returns the pipeline itself, so pipeSVR is the same object as `pipe` here.
best_svr_settings = space_eval(space, best_param_SVC)
pipeSVR = pipe.set_params(**best_svr_settings)

In [None]:
# Fit the configured SVR pipeline on the full training arrays.
pipeSVR.fit(X=x, y=y)

## Random forest (RF)

In [None]:
# Search space for the random-forest pipeline; each hyperopt label equals the
# pipeline parameter name it tunes.
space = {
    'regressor__n_estimators': hp.choice('regressor__n_estimators', range(200, 10000, 100)),
    # hp.quniform produces floats (5.0, 6.0, ...); scope.int converts them to
    # the integers that a tree depth has to be.
    'regressor__max_depth': scope.int(hp.quniform('regressor__max_depth', 1, 30, 1)),
    # NOTE(review): max_features may not exceed the number of feature columns
    # in x -- confirm that the upper bound of 7 fits the data.
    'regressor__max_features': hp.choice('regressor__max_features', range(1, 8, 1)),
    'regressor__min_samples_leaf': hp.choice('regressor__min_samples_leaf', range(1, 8, 1)),
    'regressor__min_samples_split': hp.choice('regressor__min_samples_split', range(2, 40, 1)),
}

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest behind the shared imputer and scaler steps. The estimator is
# seeded as well: a seeded search over an unseeded forest is still not
# repeatable, because every trial would be scored on differently grown trees.
regr = RandomForestRegressor(random_state=42, verbose=True)
pipe = Pipeline(steps=[('imputer', imputer), ('scaler', sc), ('regressor', regr)])

# `trials` records every evaluation made by the search.
trials = Trials()

best_param_RF = fmin(
    fn=objective,                      # function hyperopt minimises
    space=space,                       # random-forest search space defined above
    algo=tpe.suggest,                  # Tree-structured Parzen Estimator
    trials=trials,
    max_evals=100,                     # number of evaluations
    rstate=np.random.default_rng(42),  # seeded generator for a repeatable search
)

# Cross-validated r2 / negated MSE reported for the search result.
metrics = metric(best_param_RF, cv=tscv, x=x, y=y)

In [None]:
# Per-fold and mean test scores from the latest `metric` call (NMSE = negated MSE).
for label, key in (('r2', 'test_r2'), ('NMSE', 'test_neg_mean_squared_error')):
    fold_values = metrics[key]
    print('Fold ' + label, fold_values)
    print('Average ' + label, np.mean(fold_values))

In [None]:
# Translate the fmin result (hp.choice entries are option indices) back into
# the actual hyperparameter values and display them.
space_eval(space=space, hp_assignment=best_param_RF)

In [None]:
# Configure the random-forest pipeline with the selected hyperparameters.
best_rf_settings = space_eval(space, best_param_RF)
# A depth sampled with hp.quniform comes back as a float (e.g. 12.0); a tree
# depth must be an integer, so it is cast before being applied.
best_rf_settings['regressor__max_depth'] = int(best_rf_settings['regressor__max_depth'])
# set_params returns the pipeline itself, so pipeRF is the same object as `pipe` here.
pipeRF = pipe.set_params(**best_rf_settings)

In [None]:
# Fit the configured random-forest pipeline on the full training arrays.
pipeRF.fit(X=x, y=y)

## Gradient boosting (GBM)

In [None]:
# Search space for the gradient-boosting pipeline. Keyword names double as the
# pipeline parameter names; all entries are categorical choices except
# `subsample`, which is drawn uniformly from [0.8, 1].
space = dict(
    regressor__n_estimators=hp.choice('regressor__n_estimators', range(200, 10000, 100)),
    regressor__learning_rate=hp.choice('regressor__learning_rate', np.arange(0.05, 0.99, 0.05)),
    regressor__max_depth=hp.choice('regressor__max_depth', np.arange(2, 16, 1, dtype=int)),
    regressor__subsample=hp.uniform('regressor__subsample', 0.8, 1),
    regressor__min_samples_leaf=hp.choice('regressor__min_samples_leaf', range(1, 40, 1)),
    regressor__min_samples_split=hp.choice('regressor__min_samples_split', range(2, 40, 1)),
)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting behind the shared imputer and scaler steps. The estimator
# is seeded as well: with subsample < 1 each fit draws random row subsets, so a
# seeded search over an unseeded booster is still not repeatable.
regr = GradientBoostingRegressor(random_state=42, verbose=True)
pipe = Pipeline(steps=[('imputer', imputer), ('scaler', sc), ('regressor', regr)])

# `trials` records every evaluation made by the search.
trials = Trials()

best_param_GBM = fmin(
    fn=objective,                      # function hyperopt minimises
    space=space,                       # gradient-boosting search space defined above
    algo=tpe.suggest,                  # Tree-structured Parzen Estimator
    trials=trials,
    max_evals=100,                     # number of evaluations
    rstate=np.random.default_rng(42),  # seeded generator for a repeatable search
)

# Cross-validated r2 / negated MSE reported for the search result.
metrics = metric(best_param_GBM, cv=tscv, x=x, y=y)


In [None]:
# Per-fold and mean test scores from the latest `metric` call (NMSE = negated MSE).
for label, key in (('r2', 'test_r2'), ('NMSE', 'test_neg_mean_squared_error')):
    fold_values = metrics[key]
    print('Fold ' + label, fold_values)
    print('Average ' + label, np.mean(fold_values))

In [None]:
# Translate the fmin result (hp.choice entries are option indices) back into
# the actual hyperparameter values and display them.
space_eval(space=space, hp_assignment=best_param_GBM)

In [None]:
# Configure the gradient-boosting pipeline with the selected hyperparameters.
# set_params returns the pipeline itself, so pipeGBM is the same object as `pipe` here.
best_gbm_settings = space_eval(space, best_param_GBM)
pipeGBM = pipe.set_params(**best_gbm_settings)

In [None]:
# Fit the configured gradient-boosting pipeline on the full training arrays.
pipeGBM.fit(X=x, y=y)

In [None]:
# NumPy views of the validation split: all columns except 'Close' as the
# feature matrix, 'Close' as the target vector.
x_valid = data_validation.drop(columns='Close').to_numpy()
y_valid = data_validation['Close'].to_numpy()

In [None]:
# Rows of `data` that make up the validation window (they still carry 'Date').
# Selecting them through data_validation.index keeps the dates aligned
# one-to-one with y_valid. The former start position
# int(n * (1 - validation_size)) lands one row earlier than the validation
# split whenever n * validation_size is not a whole number, which shifted every
# curve against its dates.
predicted_prices = data.loc[data_validation.index].copy()

fig = make_subplots(rows=1, cols=1)

# (legend name, y values, colour) for every curve, in drawing order.
curves = [
    ('Truth', y_valid, 'LightSkyBlue'),
    ('Prediction_SVR', pipeSVR.predict(x_valid), 'Red'),
    ('Prediction_GBM', pipeGBM.predict(x_valid), 'Purple'),
    ('Prediction_RF', pipeRF.predict(x_valid), 'Green'),
    ('Prediction_lasso', search_lasso.predict(x_valid), 'Black'),
    ('Prediction_Linear_regression', search_linear.predict(x_valid), 'Grey'),
]

for curve_name, curve_values, curve_colour in curves:
    fig.add_trace(
        go.Scatter(
            x=predicted_prices.Date,
            y=curve_values,
            name=curve_name,
            marker_color=curve_colour,
            showlegend=True,
        ),
        row=1,
        col=1,
    )

fig.show()

In [None]:
# Coefficients of the Lasso regressor inside the best pipeline found by the
# grid search, one bar per feature column (every training column but 'Close').
lasso_weights = search_lasso.best_estimator_.named_steps["regressor"].coef_
feature_names = data_train.columns[data_train.columns != 'Close']
coefs = pd.DataFrame(lasso_weights, columns=['Coefficients'], index=feature_names)

ax = coefs.plot.barh(figsize=(9, 7))
ax.set_title('Lasso model')
ax.axvline(x=0, color='.5')             # grey reference line at zero
ax.figure.subplots_adjust(left=.3)      # room for the feature labels

In [None]:
# Coefficients of the linear regressor inside the best pipeline found by the
# grid search, one bar per feature column (every training column but 'Close').
linear_weights = search_linear.best_estimator_.named_steps["regressor"].coef_
feature_names = data_train.columns[data_train.columns != 'Close']
coefs = pd.DataFrame(linear_weights, columns=['Coefficients'], index=feature_names)

ax = coefs.plot.barh(figsize=(9, 7))
ax.set_title('Linear Regression model')
ax.axvline(x=0, color='.5')             # grey reference line at zero
ax.figure.subplots_adjust(left=.3)      # room for the feature labels

In [None]:
# Feature weights of the fitted SVR. scikit-learn exposes `coef_` only for the
# linear kernel, while the search may also select 'poly', 'rbf' or 'sigmoid';
# in that case the plot is skipped instead of raising AttributeError.
svr_model = pipeSVR.named_steps["regressor"]

if svr_model.kernel == 'linear':
    coefs = pd.DataFrame(
        svr_model.coef_[0],  # coef_ has a single row; take it as a flat vector
        columns=['Coefficients'],
        index=data_train.loc[:, data_train.columns != 'Close'].columns,
    )

    coefs.plot(kind='barh', figsize=(9, 7))
    plt.title('SVR Model')
    plt.axvline(x=0, color='.5')
    plt.subplots_adjust(left=.3)
else:
    print(
        f"SVR kernel is '{svr_model.kernel}': coefficients exist only for the "
        "linear kernel, so no coefficient plot is drawn."
    )


In [None]:
# Feature importances of the fitted gradient-boosting regressor, one bar per
# feature column (every training column but 'Close'). The column is labelled
# 'Importance' because these values are not regression coefficients.
coefs = pd.DataFrame(
    pipeGBM.named_steps["regressor"].feature_importances_,
    columns=['Importance'],
    index=data_train.loc[:, data_train.columns != 'Close'].columns,
)

coefs.plot(kind='barh', figsize=(9, 7))
plt.title('Gradient boost Model')
plt.axvline(x=0, color='.5')
plt.subplots_adjust(left=.3)


In [None]:
# Feature importances of the fitted random-forest regressor, one bar per
# feature column (every training column but 'Close'). The column is labelled
# 'Importance' because these values are not regression coefficients.
coefs = pd.DataFrame(
    pipeRF.named_steps["regressor"].feature_importances_,
    columns=['Importance'],
    index=data_train.loc[:, data_train.columns != 'Close'].columns,
)

coefs.plot(kind='barh', figsize=(9, 7))
plt.title('Random Forest Model')  # title typo ("Randon") corrected
plt.axvline(x=0, color='.5')
plt.subplots_adjust(left=.3)

## test