# **Description**
* This colab includes codings required for questions 22-27.
* To avoid messy code and improve the organization, codes for the first part of homework (1-21) are in the following link: https://colab.research.google.com/drive/190aRzS0ULWFs_leintfBwr8DdVSsWkaO?usp=sharing

**LSTM for classification**

In [None]:
import pandas as pd

bitcoin_path = '/content/drive/MyDrive/Data Mining Class (Master)/HW3/Dataset/Bitcoin Historical Data - Investing.com.csv'
bitcoin_df = pd.read_csv(bitcoin_path)

for col in bitcoin_df.columns:
  if col == 'Date':
    bitcoin_df[col] = pd.to_datetime(bitcoin_df[col])
  elif col in ['Price', 'Open', 'High']:
    bitcoin_df[col] = bitcoin_df[col].str.replace(',', '').astype('float64')

bitcoin_df= bitcoin_df.sort_values(by = 'Date', ascending=True).reset_index(drop = True)

In [None]:
bitcoin_df['Change %'] = bitcoin_df['Change %'].str.replace('%', '').astype('float64')
bitcoin_df.loc[bitcoin_df['Change %'] <= 0, 'Change %'] = 0
bitcoin_df.loc[bitcoin_df['Change %'] > 0, 'Change %'] = 1

In [None]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# convert an array of values into a dataset matrix
def create_dataset(dataset, col, look_back=1):
	dataX, dataY = [], []
	for i in range(len(dataset)-look_back-1):
		a = dataset[i:(i+look_back), 0]
		dataX.append(a)
		dataY.append(col[i + look_back])
	return np.array(dataX), np.array(dataY)

#LSTM is sensitive to scale of input data, so we rescale everything between 0 and 1
scaler = MinMaxScaler(feature_range=(0, 1))
bitcoin_df_scaled = scaler.fit_transform(bitcoin_df['Price'].values.reshape(-1,1))

bitcoin_train_df_scaled = bitcoin_df_scaled[bitcoin_df['Date']<='2020-01-01']
bitcoin_test_df_scaled = bitcoin_df_scaled[bitcoin_df['Date']>='2020-01-02']

look_back = 1
trainX, trainY = create_dataset(bitcoin_train_df_scaled, bitcoin_df['Change %'].values.tolist(), look_back)
testX, testY = create_dataset(bitcoin_test_df_scaled, bitcoin_df['Change %'].values.tolist(), look_back)

# reshape input to be [samples, time steps, features]
trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

model = Sequential()
model.add(LSTM(4, input_shape=(1, look_back)))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(trainX, trainY, epochs=50, batch_size=1, verbose=2)

Epoch 1/50
3453/3453 - 6s - loss: 0.6920 - accuracy: 0.5219
Epoch 2/50
3453/3453 - 4s - loss: 0.6920 - accuracy: 0.5274
Epoch 3/50
3453/3453 - 4s - loss: 0.6919 - accuracy: 0.5274
Epoch 4/50
3453/3453 - 4s - loss: 0.6918 - accuracy: 0.5274
Epoch 5/50
3453/3453 - 4s - loss: 0.6917 - accuracy: 0.5274
Epoch 6/50
3453/3453 - 4s - loss: 0.6914 - accuracy: 0.5271
Epoch 7/50
3453/3453 - 4s - loss: 0.6915 - accuracy: 0.5268
Epoch 8/50
3453/3453 - 4s - loss: 0.6911 - accuracy: 0.5251
Epoch 9/50
3453/3453 - 4s - loss: 0.6912 - accuracy: 0.5262
Epoch 10/50
3453/3453 - 4s - loss: 0.6912 - accuracy: 0.5268
Epoch 11/50
3453/3453 - 4s - loss: 0.6911 - accuracy: 0.5198
Epoch 12/50
3453/3453 - 4s - loss: 0.6910 - accuracy: 0.5222
Epoch 13/50
3453/3453 - 4s - loss: 0.6910 - accuracy: 0.5236
Epoch 14/50
3453/3453 - 4s - loss: 0.6907 - accuracy: 0.5259
Epoch 15/50
3453/3453 - 4s - loss: 0.6906 - accuracy: 0.5282
Epoch 16/50
3453/3453 - 4s - loss: 0.6908 - accuracy: 0.5303
Epoch 17/50
3453/3453 - 4s - loss

<keras.callbacks.History at 0x7f57e524c490>

In [None]:
scores = model.evaluate(testX, testY, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 33.26%


In [None]:
y_pred = model.predict(testX)

In [None]:
y_pred_binary = [1 if y>0.5 else 0 for y in y_pred]

In [None]:
from sklearn.metrics import precision_recall_fscore_support

print(precision_recall_fscore_support(testY, y_pred_binary))

(array([0.72413793, 0.16519174]), array([0.27061856, 0.58333333]), array([0.39399625, 0.25747126]), array([388,  96]))


**Add a new feature (Change %) to the LSTM model**

In [None]:
import pandas as pd

bitcoin_path = '/content/drive/MyDrive/Data Mining Class (Master)/HW3/Dataset/Bitcoin Historical Data - Investing.com.csv'
bitcoin_df = pd.read_csv(bitcoin_path)

for col in bitcoin_df.columns:
  if col == 'Date':
    bitcoin_df[col] = pd.to_datetime(bitcoin_df[col])
  elif col in ['Price', 'Open', 'High']:
    bitcoin_df[col] = bitcoin_df[col].str.replace(',', '').astype('float64')

bitcoin_df= bitcoin_df.sort_values(by = 'Date', ascending=True).reset_index(drop = True)

In [None]:
bitcoin_df['Change %'] = bitcoin_df['Change %'].str.replace('%', '').astype('float64')
bitcoin_df.loc[bitcoin_df['Change %'] <= 0, 'Change %'] = 0
bitcoin_df.loc[bitcoin_df['Change %'] > 0, 'Change %'] = 1

In [None]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# convert an array of values into a dataset matrix
def create_dataset(dataset, col, look_back=1):
  dataX, dataY = [], []
  for i in range(len(dataset)-look_back-1):
    a = dataset[i:(i+look_back), 0]
    a = np.append(a, col[i + look_back])
    dataX.append(a)
    dataY.append(dataset[i + look_back, 0])
  return np.array(dataX), np.array(dataY)

#LSTM is sensitive to scale of input data, so we rescale everything between 0 and 1
scaler = MinMaxScaler(feature_range=(0, 1))
bitcoin_df_scaled = scaler.fit_transform(bitcoin_df['Price'].values.reshape(-1,1))

bitcoin_train_df_scaled = bitcoin_df_scaled[bitcoin_df['Date']<='2020-01-01']
bitcoin_test_df_scaled = bitcoin_df_scaled[bitcoin_df['Date']>='2020-01-02']

look_back = 1
trainX, trainY = create_dataset(bitcoin_train_df_scaled, bitcoin_df['Change %'].values.tolist(), look_back)
testX, testY = create_dataset(bitcoin_test_df_scaled, bitcoin_df['Change %'].values.tolist(), look_back)

# reshape input to be [samples, time steps, features]
trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

# create and fit the LSTM network
model = Sequential()
model.add(LSTM(4, input_shape=(1, look_back+1)))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(trainX, trainY, epochs=50, batch_size=1, verbose=2)

Epoch 1/50
3453/3453 - 16s - loss: 0.0015
Epoch 2/50
3453/3453 - 3s - loss: 9.2016e-06
Epoch 3/50
3453/3453 - 3s - loss: 9.9398e-06
Epoch 4/50
3453/3453 - 3s - loss: 9.8092e-06
Epoch 5/50
3453/3453 - 3s - loss: 1.0257e-05
Epoch 6/50
3453/3453 - 3s - loss: 8.9668e-06
Epoch 7/50
3453/3453 - 3s - loss: 8.6970e-06
Epoch 8/50
3453/3453 - 3s - loss: 9.0792e-06
Epoch 9/50
3453/3453 - 3s - loss: 8.8467e-06
Epoch 10/50
3453/3453 - 3s - loss: 9.1233e-06
Epoch 11/50
3453/3453 - 3s - loss: 9.5246e-06
Epoch 12/50
3453/3453 - 3s - loss: 9.4978e-06
Epoch 13/50
3453/3453 - 3s - loss: 8.9310e-06
Epoch 14/50
3453/3453 - 3s - loss: 8.6985e-06
Epoch 15/50
3453/3453 - 3s - loss: 9.0593e-06
Epoch 16/50
3453/3453 - 3s - loss: 8.7955e-06
Epoch 17/50
3453/3453 - 3s - loss: 8.6561e-06
Epoch 18/50
3453/3453 - 3s - loss: 8.5585e-06
Epoch 19/50
3453/3453 - 3s - loss: 8.4886e-06
Epoch 20/50
3453/3453 - 3s - loss: 8.3426e-06
Epoch 21/50
3453/3453 - 3s - loss: 8.4539e-06
Epoch 22/50
3453/3453 - 3s - loss: 8.7447e-06


<keras.callbacks.History at 0x7f6207049e90>

In [None]:
def accuracy_reg(y_true, y_pred, error_bound = 0.05):
  y_ratio = [x/y for x, y in zip(y_true , y_pred)]
  count = 0
  for x in y_ratio:
    if x >= 1-error_bound and x<= 1+ error_bound:
      count+=1
  #count = len(y_ratio[(y_ratio>= 1 - error_bound) & (y_ratio<= 1 + error_bound)])
  print(count, len(y_true), len(y_pred))
  return (count/len(y_true)*100.0)

In [None]:
# make predictions
trainPredict = model.predict(trainX)
testPredict = model.predict(testX)
# invert predictions
trainPredict = scaler.inverse_transform(trainPredict)
trainY = scaler.inverse_transform([trainY])
testPredict = scaler.inverse_transform(testPredict)
testY = scaler.inverse_transform([testY])

In [None]:
import math
from sklearn.metrics import mean_squared_error

# calculate root mean squared error
trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))
print('Train Score: %.2f RMSE' % (trainScore))
testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:,0]))
print('Test Score: %.2f RMSE' % (testScore))


train_rmse = mean_squared_error(trainY[0], trainPredict[:,0], squared=False)
test_rmse = mean_squared_error(testY[0], testPredict[:,0], squared=False)
print('train error: {}, test error: {}'.format(train_rmse, test_rmse))


train_acc = accuracy_reg(trainY[0], trainPredict[:,0], error_bound=0.05)
test_acc = accuracy_reg(testY[0], testPredict[:,0], error_bound=0.05)
print('train accuracy 5%: {}, test accuracy 5%: {}'.format(train_acc, test_acc))

Train Score: 149.36 RMSE
Test Score: 6750.50 RMSE
train error: 149.36408023926046, test error: 6750.497141738015
1218 3453 3453
277 484 484
train accuracy 5%: 35.27367506516073, test accuracy 5%: 57.231404958677686


**Improvements**

In [None]:
import pandas as pd

bitcoin_path = '/content/drive/MyDrive/Data Mining Class (Master)/HW3/Dataset/Bitcoin Historical Data - Investing.com.csv'
bitcoin_df = pd.read_csv(bitcoin_path)

for col in bitcoin_df.columns:
  if col == 'Date':
    bitcoin_df[col] = pd.to_datetime(bitcoin_df[col])
  elif col in ['Price', 'Open', 'High']:
    bitcoin_df[col] = bitcoin_df[col].str.replace(',', '').astype('float64')

bitcoin_df= bitcoin_df.sort_values(by = 'Date', ascending=True).reset_index(drop = True)

In [None]:
bitcoin_df['Change %'] = bitcoin_df['Change %'].str.replace('%', '').astype('float64')
bitcoin_df.loc[bitcoin_df['Change %'] <= 0, 'Change %'] = 0
bitcoin_df.loc[bitcoin_df['Change %'] > 0, 'Change %'] = 1

In [None]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# convert an array of values into a dataset matrix
def create_dataset(dataset, col, look_back=1):
  dataX, dataY = [], []
  for i in range(len(dataset)-look_back-1):
    a = dataset[i:(i+look_back), 0]
    a = np.append(a, col[i + look_back-1])
    dataX.append(a)
    dataY.append(dataset[i + look_back, 0])
  return np.array(dataX), np.array(dataY)

#LSTM is sensitive to scale of input data, so we rescale everything between 0 and 1
scaler = MinMaxScaler(feature_range=(0, 1))
bitcoin_df_scaled = scaler.fit_transform(bitcoin_df['Price'].values.reshape(-1,1))

bitcoin_train_df_scaled = bitcoin_df_scaled[bitcoin_df['Date']<='2020-01-01']
bitcoin_test_df_scaled = bitcoin_df_scaled[bitcoin_df['Date']>='2020-01-02']

look_back = 1
trainX, trainY = create_dataset(bitcoin_train_df_scaled, bitcoin_df['Change %'].values.tolist(), look_back)
testX, testY = create_dataset(bitcoin_test_df_scaled, bitcoin_df['Change %'].values.tolist(), look_back)

# reshape input to be [samples, time steps, features]
trainX = np.reshape(trainX, (trainX.shape[0], trainX.shape[1]))
testX = np.reshape(testX, (testX.shape[0], testX.shape[1]))

In [None]:
# Model Training
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.svm import SVR

def model_train(X, y, X_test, cf):
  cf.fit(X, y)
  y_pred = cf.predict(X_test)
  return y_pred


def error_calc(y_pred, y_true):
  return mean_squared_error(y_true, y_pred), mean_squared_error(y_true, y_pred, squared=False) # MSE, RMSE


methods = {"XGB": XGBRegressor(learning_rate=.1), 
           "LR": LinearRegression(), \
           "DT": DecisionTreeRegressor(), \
           "RF": RandomForestRegressor(), \
           "BAG": BaggingRegressor(base_estimator=SVR()), \
           "VOT": VotingRegressor([('lr', LinearRegression()), ('rf', RandomForestRegressor())]), \
           "EXT": ExtraTreesRegressor(), \
           "ADA": AdaBoostRegressor()}

results = {method: [] for method in methods}

for method in methods:
  y_pred = model_train(trainX, trainY, testX, methods[method])
  results[method] = y_pred
  print("MSE, RMSE error for {}: {}".format(method, error_calc(y_pred, testY)))

MSE, RMSE error for XGB: (0.059588748609196364, 0.2441080674807704)
MSE, RMSE error for LR: (0.00029374816679205833, 0.017139083020747008)
MSE, RMSE error for DT: (0.05878280480005465, 0.2424516545624192)
MSE, RMSE error for RF: (0.059816032088914214, 0.24457316305947024)
MSE, RMSE error for BAG: (0.058607247891527235, 0.2420893386572966)
MSE, RMSE error for VOT: (0.015373209626986763, 0.12398874798539891)
MSE, RMSE error for EXT: (0.058669735695529225, 0.24221836366289246)
MSE, RMSE error for ADA: (0.06184227682186922, 0.2486810745148678)


**Indicators**
# https://towardsdatascience.com/building-a-comprehensive-set-of-technical-indicators-in-python-for-quantitative-trading-8d98751b5fb

In [None]:
import pandas as pd

bitcoin_path = '/content/drive/MyDrive/Data Mining Class (Master)/HW3/Dataset/Bitcoin Historical Data - Investing.com.csv'
bitcoin_df = pd.read_csv(bitcoin_path)

for col in bitcoin_df.columns:
  if col == 'Date':
    bitcoin_df[col] = pd.to_datetime(bitcoin_df[col])
  elif col in ['Price', 'Open', 'High']:
    bitcoin_df[col] = bitcoin_df[col].str.replace(',', '').astype('float64')

bitcoin_df= bitcoin_df.sort_values(by = 'Date', ascending=True).reset_index(drop = True)

**(1) Simple Moving Average (SMA)**

In [None]:
# Simple Moving Average
bitcoin_df['SMA_5'] = bitcoin_df['Price'].transform(lambda x: x.rolling(window = 5).mean())
bitcoin_df['SMA_15'] = bitcoin_df['Price'].transform(lambda x: x.rolling(window = 15).mean())
bitcoin_df['SMA_Ratio'] = bitcoin_df['SMA_15'] / bitcoin_df['SMA_5']

**(2) Simple Moving Average Volume (SMA-Volume)**

In [None]:
bitcoin_df['Vol.'] = bitcoin_df['Vol.'].str.replace('K', '')
bitcoin_df['Vol.'] = bitcoin_df['Vol.'].str.replace('M', '')
bitcoin_df['Vol.'] = bitcoin_df['Vol.'].str.replace('-', '')
bitcoin_df['Vol.'] = bitcoin_df['Vol.'].str.replace('', '0')

bitcoin_df['Vol.'] = bitcoin_df['Vol.'].astype('float64')

In [None]:
bitcoin_df['SMA5_Volume'] = bitcoin_df['Vol.'].transform(lambda x: x.rolling(window = 5).mean())
bitcoin_df['SMA15_Volume'] = bitcoin_df['Vol.'].transform(lambda x: x.rolling(window = 15).mean())
bitcoin_df['SMA_Volume_Ratio'] = bitcoin_df['SMA15_Volume'] / bitcoin_df['SMA5_Volume']

**(3) Average True Range (ATR)**

In [None]:
import numpy as np

def Wilder(data, periods):
    start = np.where(~np.isnan(data))[0][0] #Check if nans present in beginning
    Wilder = np.array([np.nan]*len(data))
    Wilder[start+periods-1] = data[start:(start+periods)].mean() #Simple Moving Average
    for i in range(start+periods,len(data)):
        Wilder[i] = (Wilder[i-1]*(periods-1) + data[i])/periods #Wilder Smoothing
    return(Wilder)

bitcoin_df['prev_close'] = bitcoin_df['Price'].shift(1)

In [None]:
bitcoin_df['High'] = bitcoin_df['High'].astype('float64')
bitcoin_df['Low'] = bitcoin_df['Low'].str.replace(',', '').astype('float64')

In [None]:
bitcoin_df['TR'] = np.maximum((bitcoin_df['High'] - bitcoin_df['Low']), 
                     np.maximum(abs(bitcoin_df['High'] - bitcoin_df['prev_close']), 
                     abs(bitcoin_df['prev_close'] - bitcoin_df['Low'])))

TR_data = bitcoin_df.copy(deep=True)
bitcoin_df['ATR_5'] = Wilder(TR_data['TR'], 5)
bitcoin_df['ATR_15'] = Wilder(TR_data['TR'], 15)

bitcoin_df['ATR_Ratio'] = bitcoin_df['ATR_5'] / bitcoin_df['ATR_15']

**(4) Relative Strength Index (RSI)**

In [None]:
bitcoin_df['Diff'] = bitcoin_df['Price'].transform(lambda x: x.diff())
bitcoin_df['Up'] = bitcoin_df['Diff']
bitcoin_df.loc[(bitcoin_df['Up']<0), 'Up'] = 0

bitcoin_df['Down'] = bitcoin_df['Diff']
bitcoin_df.loc[(bitcoin_df['Down']>0), 'Down'] = 0 
bitcoin_df['Down'] = abs(bitcoin_df['Down'])

bitcoin_df['avg_5up'] = bitcoin_df['Up'].transform(lambda x: x.rolling(window=5).mean())
bitcoin_df['avg_5down'] = bitcoin_df['Down'].transform(lambda x: x.rolling(window=5).mean())

bitcoin_df['avg_15up'] = bitcoin_df['Up'].transform(lambda x: x.rolling(window=15).mean())
bitcoin_df['avg_15down'] = bitcoin_df['Down'].transform(lambda x: x.rolling(window=15).mean())

bitcoin_df['RS_5'] = bitcoin_df['avg_5up'] / bitcoin_df['avg_5down']
bitcoin_df['RS_15'] = bitcoin_df['avg_15up'] / bitcoin_df['avg_15down']

bitcoin_df['RSI_5'] = 100 - (100/(1+bitcoin_df['RS_5']))
bitcoin_df['RSI_15'] = 100 - (100/(1+bitcoin_df['RS_15']))

bitcoin_df['RSI_Ratio'] = bitcoin_df['RSI_5']/bitcoin_df['RSI_15']

**(5) Moving Average Convergence Divergence (MACD)**

In [None]:
bitcoin_df['5Ewm'] = bitcoin_df['Price'].transform(lambda x: x.ewm(span=5, adjust=False).mean())
bitcoin_df['15Ewm'] = bitcoin_df['Price'].transform(lambda x: x.ewm(span=15, adjust=False).mean())
bitcoin_df['MACD'] = bitcoin_df['15Ewm'] - bitcoin_df['5Ewm']

In [None]:
bitcoin_df = bitcoin_df.fillna(0)

In [None]:
bitcoin_df

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %,SMA_5,SMA_15,SMA_Ratio,SMA5_Volume,SMA15_Volume,SMA_Volume_Ratio,prev_close,TR,ATR_5,ATR_15,ATR_Ratio,Diff,Up,Down,avg_5up,avg_5down,avg_15up,avg_15down,RS_5,RS_15,RSI_5,RSI_15,RSI_Ratio,5Ewm,15Ewm,MACD
0,2010-07-18,0.1,0.0,0.1,0.1,0.0008,0.00%,0.00,0.000000,0.000000,0.00000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.100000,0.100000,0.000000
1,2010-07-19,0.1,0.1,0.1,0.1,0.0507,0.00%,0.00,0.000000,0.000000,0.00000,0.000000,0.000000,0.1,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.100000,0.100000,0.000000
2,2010-07-20,0.1,0.1,0.1,0.1,0.0206,0.00%,0.00,0.000000,0.000000,0.00000,0.000000,0.000000,0.1,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.100000,0.100000,0.000000
3,2010-07-21,0.1,0.1,0.1,0.1,0.0508,0.00%,0.00,0.000000,0.000000,0.00000,0.000000,0.000000,0.1,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.100000,0.100000,0.000000
4,2010-07-22,0.1,0.1,0.1,0.1,20.0106,0.00%,0.10,0.000000,0.000000,4.02670,0.000000,0.000000,0.1,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.100000,0.100000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3936,2021-04-27,55036.5,54011.1,55427.8,53345.0,8040.0008,1.88%,51850.62,56286.540000,1.085552,64856.03050,83784.041160,1.291847,54020.5,2082.8,3606.277931,3368.405753,1.070619,1016.0,1016.0,0.0,1214.58,553.18,721.513333,1043.333333,2.195633,0.691546,68.707291,40.882494,1.680604,53138.969817,54826.378969,1687.409152
3937,2021-04-28,54841.4,55036.0,56419.9,53876.4,8060.0906,-0.35%,52590.18,55706.573333,1.059258,26260.04050,77517.377160,2.951914,55036.5,2543.5,3393.722345,3313.412036,1.024238,-195.1,0.0,195.1,1214.58,475.02,476.373333,1056.340000,2.556903,0.450966,71.885653,31.080393,2.312894,53706.446545,54828.256598,1121.810053
3938,2021-04-29,53560.8,54838.6,55173.7,52400.0,8030.0900,-2.34%,53284.56,55078.600000,1.033669,26262.05440,71186.047140,2.710605,54841.4,2773.7,3269.717876,3277.431233,0.997647,-1280.6,0.0,1280.6,1214.58,520.20,476.373333,1104.346667,2.334833,0.431362,70.013489,30.136478,2.323214,53657.897697,54669.824523,1011.926826
3939,2021-04-30,57720.3,53562.3,57925.6,53088.7,100030.0704,7.77%,55035.90,54712.220000,0.994119,45250.06448,77384.045787,1.710142,53560.8,4836.9,3583.154300,3381.395818,1.059667,4159.5,4159.5,0.0,2046.48,295.14,737.966667,1104.346667,6.933930,0.668238,87.395905,40.056523,2.181815,55012.031798,55051.133958,39.102160


In [None]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# convert an array of values into a dataset matrix
def create_dataset(dataset, col, look_back=1):
  dataX, dataY = [], []
  for i in range(len(dataset)-look_back-1):
    a = dataset[i:(i+look_back), 0]
    a = np.append(a, col[i + look_back-1])
    dataX.append(a)
    dataY.append(dataset[i + look_back, 0])
  return np.array(dataX), np.array(dataY)

#LSTM is sensitive to scale of input data, so we rescale everything between 0 and 1
scaler = MinMaxScaler(feature_range=(0, 1))
bitcoin_df_scaled = scaler.fit_transform(bitcoin_df['Price'].values.reshape(-1,1))

bitcoin_train_df_scaled = bitcoin_df_scaled[bitcoin_df['Date']<='2020-01-01']
bitcoin_test_df_scaled = bitcoin_df_scaled[bitcoin_df['Date']>='2020-01-02']

look_back = 1
cols = ['SMA_Ratio', 'ATR_Ratio', 'SMA_Volume_Ratio', 'RSI_Ratio', 'MACD']
# cols = ['SMA_Volume_Ratio']

trainX, trainY = create_dataset(bitcoin_train_df_scaled, bitcoin_df[cols].values.tolist(), look_back)
testX, testY = create_dataset(bitcoin_test_df_scaled, bitcoin_df[cols].values.tolist(), look_back)

# reshape input to be [samples, time steps, features]
trainX = np.reshape(trainX, (trainX.shape[0], trainX.shape[1]))
testX = np.reshape(testX, (testX.shape[0], testX.shape[1]))

In [None]:
trainX[np.any(np.isnan(trainX))] = 0
trainX[np.any(np.isfinite(trainX))] = 0
testX[np.any(np.isnan(testX))] = 0
testX[np.any(np.isfinite(testX))] = 0

In [None]:
testX.shape

(484, 6)

In [None]:
# Model Training

from scipy import stats
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.svm import SVR

def model_train(X, y, X_test, cf):
  cf.fit(X, y)
  y_pred = cf.predict(X_test)
  return y_pred


def error_calc(y_pred, y_true):
  return mean_squared_error(y_true, y_pred), mean_squared_error(y_true, y_pred, squared=False) # MSE, RMSE


methods = {"XGB": XGBRegressor(learning_rate=.1), 
           "LR": LinearRegression(), \
           "DT": DecisionTreeRegressor(), \
           "RF": RandomForestRegressor(), \
           "BAG": BaggingRegressor(base_estimator=SVR()), \
           "VOT": VotingRegressor([('lr', LinearRegression()), ('rf', RandomForestRegressor())]), \
           "EXT": ExtraTreesRegressor(), \
           "ADA": AdaBoostRegressor()}

results = {method: [] for method in methods}

for method in methods:
  y_pred = model_train(trainX, trainY, testX, methods[method])
  results[method] = y_pred
  print("MSE, RMSE error for {}: {}".format(method, error_calc(y_pred, testY)))

MSE, RMSE error for XGB: (0.154615580519115, 0.3932118773881519)
MSE, RMSE error for LR: (0.15462266574612846, 0.3932208867114366)
MSE, RMSE error for DT: (0.1546226657461284, 0.3932208867114365)
MSE, RMSE error for RF: (0.1545972361143889, 0.3931885503348093)
MSE, RMSE error for BAG: (0.15469541891867805, 0.39331338512524344)
MSE, RMSE error for VOT: (0.15465693518026233, 0.39326445959463757)
MSE, RMSE error for EXT: (0.15462266574612854, 0.3932208867114367)
MSE, RMSE error for ADA: (0.1399759097718205, 0.37413354537092836)


**Questions with New dataset Q26, Q27**

In [None]:
# Data
train = '/content/drive/MyDrive/Data Mining Class (Master)/HW3/Dataset/training_data.csv'
tour = '/content/drive/MyDrive/Data Mining Class (Master)/HW3/Dataset/tournament_data.csv'

# Count nrows

def count_lines(path):
  with open(path) as fp:
      for (count, _) in enumerate(fp, 1):
        pass
  return count


print(count_lines(train))
print(count_lines(tour))

# Read Data
import pandas as pd

train_data = pd.read_csv(train, nrows=10000).set_index('id')

tour_data = pd.read_csv(tour, nrows=10000).set_index('id')

features = [f for f in train_data.columns if 'feature' in f]

X = train_data[features]
y = train_data['target']
X_test = tour_data[features]
y_true = tour_data['target']

501809
1725603


In [None]:
# Model Training

from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.svm import SVR

def model_train(X, y, X_test, cf):
  cf.fit(X, y)
  y_pred = cf.predict(X_test)
  return y_pred


def error_calc(y_pred, y_true):
  return mean_squared_error(y_true, y_pred), mean_squared_error(y_true, y_pred, squared=False) # MSE, RMSE


methods = {"XGB": XGBRegressor(learning_rate=.1), 
           "LR": LinearRegression(), \
           "DT": DecisionTreeRegressor(), \
           "RF": RandomForestRegressor(), \
           "BAG": BaggingRegressor(base_estimator=SVR()), \
           "VOT": VotingRegressor([('lr', LinearRegression()), ('rf', RandomForestRegressor())]), \
           "EXT": ExtraTreesRegressor(), \
           "ADA": AdaBoostRegressor()}

results = {method: [] for method in methods}

for method in methods:
  y_pred = model_train(X, y, X_test, methods[method])
  results[method] = y_pred
  print("MSE, RMSE error for {}: {}".format(method, error_calc(y_pred, y_true)))

MSE, RMSE error for XGB: (0.05101390215442374, 0.2258625736026749)
MSE, RMSE error for LR: (0.05278123475952705, 0.2297416696194381)
MSE, RMSE error for DT: (0.11835, 0.34402034823539146)
MSE, RMSE error for RF: (0.050840847499999994, 0.2254791509208778)
MSE, RMSE error for BAG: (0.05596810163949772, 0.23657578413586144)
MSE, RMSE error for VOT: (0.05107260476268298, 0.22599248828817958)
MSE, RMSE error for EXT: (0.051203181874999996, 0.22628120088730305)
MSE, RMSE error for ADA: (0.05005330248051367, 0.223725953971625)


In [None]:
# Spearman between models and y_true
from scipy import stats

for m in results:
  rho, pval = stats.spearmanr(results[m], y_true)
  print("Spearman correlation (rho, pval) between {} and y_true: {}, {}".format(m, rho, pval))

Spearman correlation (rho, pval) between XGB and y_true: -0.015677615078634587, 0.11696006662725852
Spearman correlation (rho, pval) between LR and y_true: -0.0015004768539846427, 0.8807417254055534
Spearman correlation (rho, pval) between DT and y_true: -0.010461672684597542, 0.2955309709091788
Spearman correlation (rho, pval) between RF and y_true: -0.0014734940785052406, 0.882870711258991
Spearman correlation (rho, pval) between BAG and y_true: -0.011644831126105147, 0.24427125665803498
Spearman correlation (rho, pval) between VOT and y_true: -0.0039875420253721455, 0.6901098903154018
Spearman correlation (rho, pval) between EXT and y_true: -0.015566345691622992, 0.11958109830750738
Spearman correlation (rho, pval) between ADA and y_true: -0.01847725125314812, 0.06465272160591216


In [None]:
# Spearman between outputs from different models
from scipy import stats

for i, m1 in enumerate(results):
  for j, m2 in enumerate(results):
    if j > i:
      rho, pval = stats.spearmanr(results[m1], results[m2])
      print("Spearman correlation (rho, pval) between {} and {}: {}, {}".format(m1, m2, rho, pval))