In [104]:
import pandas as pd
import numpy as np 
import yfinance as yf

In [329]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression 
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [330]:
import talib

In [464]:
aapl = yf.download("TSLA")

[*********************100%***********************]  1 of 1 completed


In [465]:
aapl

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-06-29 00:00:00-04:00,1.266667,1.666667,1.169333,1.592667,1.592667,281494500
2010-06-30 00:00:00-04:00,1.719333,2.028000,1.553333,1.588667,1.588667,257806500
2010-07-01 00:00:00-04:00,1.666667,1.728000,1.351333,1.464000,1.464000,123282000
2010-07-02 00:00:00-04:00,1.533333,1.540000,1.247333,1.280000,1.280000,77097000
2010-07-06 00:00:00-04:00,1.333333,1.333333,1.055333,1.074000,1.074000,103003500
...,...,...,...,...,...,...
2022-11-04 00:00:00-04:00,222.600006,223.800003,203.080002,207.470001,207.470001,98453100
2022-11-07 00:00:00-05:00,208.649994,208.899994,196.660004,197.080002,197.080002,93916500
2022-11-08 00:00:00-05:00,194.020004,195.199997,186.750000,191.300003,191.300003,128803400
2022-11-09 00:00:00-05:00,190.779999,195.889999,177.119995,177.589996,177.589996,127062700


## Simple moving average

In [466]:
aapl['SMA_5'] = aapl['Close'].transform(lambda x: x.rolling(window = 5).mean())
aapl["SMA_15"] = aapl["Close"].transform(lambda x:x.rolling(window = 15).mean())

## Simple moving average volume



In [467]:
aapl['SMA_5_volume'] = aapl['Volume'].transform(lambda x: x.rolling(window = 5).mean())
aapl['SMA_15_volume'] = aapl['Volume'].transform(lambda x: x.rolling(window = 15).mean())


## Wilder smoothing

In [468]:
def Wilder(data, periods):
    start = np.where(~np.isnan(data))[0][0] #Check if nans present in beginning
    Wilder = np.array([np.nan]*len(data))
    Wilder[start+periods-1] = data[start:(start+periods)].mean() #Simple Moving Average
    for i in range(start+periods,len(data)):
        Wilder[i] = (Wilder[i-1]*(periods-1) + data[i])/periods #Wilder Smoothing
    return(Wilder)

## Average True Range (ATR)

In [469]:
aapl['true_range'] = np.maximum((aapl['High'] - aapl['Low']), 
                         abs(aapl['High'] - aapl['Close'].shift()), 
                         abs(aapl['Low']-aapl["Close"].shift()))

In [470]:
TR_data = aapl.copy()
aapl.loc[:,'ATR_5'] = Wilder(aapl['true_range'], 5)
aapl.loc[:,'ATR_15'] = Wilder(aapl['true_range'], 15)

 ## Average Directional Index

## Stochastic Oscillators


In [471]:

aapl['Lowest_5D'] = aapl['Low'].transform(lambda x: x.rolling(window = 5).min())
aapl['High_5D'] = aapl['High'].transform(lambda x: x.rolling(window = 5).max())
aapl['Lowest_15D'] =aapl['Low'].transform(lambda x: x.rolling(window = 15).min())
aapl['High_15D'] =aapl['High'].transform(lambda x: x.rolling(window = 15).max())

aapl['Stochastic_5'] = ((aapl['Close'] - aapl['Lowest_5D'])/(aapl['High_5D'] - aapl['Lowest_5D']))*100
aapl['Stochastic_15'] = ((aapl['Close'] - aapl['Lowest_15D'])/(aapl['High_15D'] - aapl['Lowest_15D']))*100

aapl['Stochastic_avg_5'] = aapl['Stochastic_5'].rolling(window = 5).mean()
aapl['Stochastic_avg_15'] = aapl['Stochastic_5'].rolling(window = 15).mean()



## Relative Strength Index (RSI)

RSI is one of the most common momentum indicator aimed at quantifies price changes and the speed of such change.

In [472]:
aapl['Diff'] = aapl['Close'].transform(lambda x: x.diff())
aapl['Up'] = aapl['Diff']
aapl.loc[(aapl['Up']<0), 'Up'] = 0

aapl['Down'] = aapl['Diff']
aapl.loc[(aapl['Down']>0), 'Down'] = 0 
aapl['Down'] = abs(aapl['Down'])

aapl['avg_5up'] = aapl['Up'].transform(lambda x: x.rolling(window=5).mean())
aapl['avg_5down'] = aapl['Down'].transform(lambda x: x.rolling(window=5).mean())

aapl['avg_15up'] = aapl['Up'].transform(lambda x: x.rolling(window=14).mean())
aapl['avg_15down'] = aapl['Down'].transform(lambda x: x.rolling(window=14).mean())

aapl['RS_5'] = aapl['avg_5up'] / aapl['avg_5down']
aapl['RS_15'] = aapl['avg_15up'] / aapl['avg_15down']

aapl['RSI_5'] = 100 - (100/(1+aapl['RS_5']))
aapl['RSI_15'] = 100 - (100/(1+aapl['RS_15']))


## Moving Average Convergence Divergence (MACD)

In [473]:
aapl['5Ewm'] = aapl['Close'].transform(lambda x: x.ewm(span=5, adjust=False).mean())
aapl['15Ewm'] = aapl['Close'].transform(lambda x: x.ewm(span=15, adjust=False).mean())
aapl['MACD'] = aapl['15Ewm'] - aapl['5Ewm']

## Bollinger Bands

In [474]:
aapl['15MA'] = aapl['Close'].transform(lambda x: x.rolling(window=15).mean())
aapl['SD'] = aapl['Close'].transform(lambda x: x.rolling(window=15).std())
aapl['upperband'] = aapl['15MA'] + 2*aapl['SD']
aapl['lowerband'] = aapl['15MA'] - 2*aapl['SD']

In [475]:
aapl.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3116 entries, 2010-06-29 00:00:00-04:00 to 2022-11-10 00:00:00-05:00
Data columns (total 39 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Open               3116 non-null   float64
 1   High               3116 non-null   float64
 2   Low                3116 non-null   float64
 3   Close              3116 non-null   float64
 4   Adj Close          3116 non-null   float64
 5   Volume             3116 non-null   int64  
 6   SMA_5              3112 non-null   float64
 7   SMA_15             3102 non-null   float64
 8   SMA_5_volume       3112 non-null   float64
 9   SMA_15_volume      3102 non-null   float64
 10  true_range         3115 non-null   float64
 11  ATR_5              3111 non-null   float64
 12  ATR_15             3101 non-null   float64
 13  Lowest_5D          3112 non-null   float64
 14  High_5D            3112 non-null   float64
 15  Lowest_15D         3102 

## Model

In [476]:
aapl.dropna(inplace=True)

In [477]:
aapl["y"] = np.where(aapl.Open*102/100 <= aapl.Close, 1, 0)

In [478]:
X = aapl.iloc[:,:-1]
y = aapl.iloc[:,-1]

In [479]:
X.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Open,3098.0,58.01393,95.65753,1.186667,9.104834,16.16867,24.074,411.47
High,3098.0,59.31263,97.85715,1.193333,9.391666,16.44333,24.44833,414.4967
Low,3098.0,56.58949,93.18274,1.159333,8.915667,15.892,23.57767,405.6667
Close,3098.0,57.97727,95.53833,1.173333,9.176667,16.17567,23.97667,409.97
Adj Close,3098.0,57.97727,95.53833,1.173333,9.176667,16.17567,23.97667,409.97
Volume,3098.0,93382940.0,81989220.0,1777500.0,42088200.0,75544500.0,117205900.0,914082000.0
SMA_5,3098.0,57.85711,95.41469,1.221733,9.036633,16.166,23.87847,403.0967
SMA_15,3098.0,57.51747,95.07281,1.269378,8.760533,16.05989,23.52402,373.0742
SMA_5_volume,3098.0,93310960.0,72034600.0,3412200.0,46853250.0,81411690.0,117919900.0,640115700.0
SMA_15_volume,3098.0,93248290.0,66171770.0,3784300.0,47969980.0,83302350.0,120626800.0,409161300.0


In [480]:
X = X.drop(["RS_5"],axis = 1)

In [481]:
scaler = StandardScaler()

In [482]:
X_scaled = scaler.fit_transform(X)

In [493]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled,y,train_size=0.75)

model1 = LogisticRegression(max_iter=1000, C=50)
model1.fit(X_train,y_train)

y1_predict = model1.predict(X_test)
confusion_matrix(y_test,y1_predict)



array([[620,  19],
       [ 65,  71]], dtype=int64)

In [494]:
accuracy_score(y_test,y1_predict)

0.8916129032258064