In [1]:
import pandas as pd
import numpy as np 
import yfinance as yf

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression 
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [51]:
data = yf.download("ETH-USD")

[*********************100%***********************]  1 of 1 completed


In [52]:
data

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-11-09 00:00:00+00:00,308.644989,329.451996,307.056000,320.884003,320.884003,893249984
2017-11-10 00:00:00+00:00,320.670990,324.717987,294.541992,299.252991,299.252991,885985984
2017-11-11 00:00:00+00:00,298.585999,319.453003,298.191986,314.681000,314.681000,842300992
2017-11-12 00:00:00+00:00,314.690002,319.153015,298.513000,307.907990,307.907990,1613479936
2017-11-13 00:00:00+00:00,307.024994,328.415009,307.024994,316.716003,316.716003,1041889984
...,...,...,...,...,...,...
2022-11-09 00:00:00+00:00,1333.122437,1335.743530,1083.285645,1100.169800,1100.169800,38864492427
2022-11-10 00:00:00+00:00,1100.107178,1341.791138,1093.122559,1299.464600,1299.464600,28581002122
2022-11-11 00:00:00+00:00,1298.882446,1302.295288,1211.329590,1287.221069,1287.221069,20920539099
2022-11-12 00:00:00+00:00,1287.438354,1288.150879,1242.152222,1255.268311,1255.268311,10964962767


## Simple moving average

In [53]:
data['SMA_5'] = data['Close'].transform(lambda x: x.rolling(window = 5).mean())
data["SMA_15"] = data["Close"].transform(lambda x:x.rolling(window = 15).mean())

## Simple moving average volume



In [54]:
data['SMA_5_volume'] = data['Volume'].transform(lambda x: x.rolling(window = 5).mean())
data['SMA_15_volume'] = data['Volume'].transform(lambda x: x.rolling(window = 15).mean())


## Wilder smoothing

In [55]:
def Wilder(data, periods):
    start = np.where(~np.isnan(data))[0][0] #Check if nans present in beginning
    Wilder = np.array([np.nan]*len(data))
    Wilder[start+periods-1] = data[start:(start+periods)].mean() #Simple Moving Average
    for i in range(start+periods,len(data)):
        Wilder[i] = (Wilder[i-1]*(periods-1) + data[i])/periods #Wilder Smoothing
    return(Wilder)

## Average True Range (ATR)

In [56]:
data['true_range'] = np.maximum((data['High'] - data['Low']), 
                         abs(data['High'] - data['Close'].shift()), 
                         abs(data['Low']-data["Close"].shift()))

In [57]:

data.loc[:,'ATR_5'] = Wilder(data['true_range'], 5)
data.loc[:,'ATR_15'] = Wilder(data['true_range'], 15)

## Stochastic Oscillators


In [58]:

data['Lowest_5D'] = data['Low'].transform(lambda x: x.rolling(window = 5).min())
data['High_5D'] = data['High'].transform(lambda x: x.rolling(window = 5).max())
data['Lowest_15D'] =data['Low'].transform(lambda x: x.rolling(window = 15).min())
data['High_15D'] =data['High'].transform(lambda x: x.rolling(window = 15).max())

data['Stochastic_5'] = ((data['Close'] - data['Lowest_5D'])/(data['High_5D'] - data['Lowest_5D']))*100
data['Stochastic_15'] = ((data['Close'] - data['Lowest_15D'])/(data['High_15D'] - data['Lowest_15D']))*100

data['Stochastic_avg_5'] = data['Stochastic_5'].rolling(window = 5).mean()
data['Stochastic_avg_15'] = data['Stochastic_5'].rolling(window = 15).mean()



## Relative Strength Index (RSI)



In [60]:
data['Diff'] = data['Close'].transform(lambda x: x.diff())
data['Up'] = data['Diff']
data.loc[(data['Up']<0), 'Up'] = 0

data['Down'] = data['Diff']
data.loc[(data['Down']>0), 'Down'] = 0 
data['Down'] = abs(data['Down'])

data['avg_5up'] = data['Up'].transform(lambda x: x.rolling(window=5).mean())
data['avg_5down'] = data['Down'].transform(lambda x: x.rolling(window=5).mean())

data['avg_15up'] = data['Up'].transform(lambda x: x.rolling(window=14).mean())
data['avg_15down'] = data['Down'].transform(lambda x: x.rolling(window=14).mean())

data['RS_5'] = data['avg_5up'] / data['avg_5down']
data['RS_15'] = data['avg_15up'] / data['avg_15down']

data['RSI_5'] = 100 - (100/(1+data['RS_5']))
data['RSI_15'] = 100 - (100/(1+data['RS_15']))


## Moving Average Convergence Divergence (MACD)

In [61]:
data['5Ewm'] = data['Close'].transform(lambda x: x.ewm(span=5, adjust=False).mean())
data['15Ewm'] = data['Close'].transform(lambda x: x.ewm(span=15, adjust=False).mean())
data['MACD'] = data['15Ewm'] - data['5Ewm']

## Bollinger Bands

In [63]:
data['15MA'] = data['Close'].transform(lambda x: x.rolling(window=15).mean())
data['SD'] = data['Close'].transform(lambda x: x.rolling(window=15).std())
data['upperband'] = data['15MA'] + 2*data['SD']
data['lowerband'] = data['15MA'] - 2*data['SD']

## Model

In [65]:
data.dropna(inplace=True)

### Target variable

In [75]:
data["y"] = np.where(data.Open*102/100 <= data.Close, 1, 0)

In [76]:
X = data.iloc[:,:-1]
y = data.iloc[:,-1]

In [77]:
X = X.drop(["RS_5"],axis = 1)

In [79]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [88]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled,y,train_size=0.75)

model = LogisticRegression(max_iter=1000, C=50)
model.fit(X_train,y_train)

y_predict = model.predict(X_test)
confusion_matrix(y_test,y_predict)



array([[325,   0],
       [ 11, 118]], dtype=int64)

In [89]:
accuracy_score(y_test,y_predict)

0.9757709251101322