In [1]:
# Import
import yfinance as yf
import pandas as pd
import numpy as np
import requests
import json
import talib as talib
from talib import MA_Type, OBV, ATR, WCLPRICE, AVGPRICE, STDDEV
from matplotlib import pyplot as plt
from mplfinance.original_flavor import candlestick_ohlc
from matplotlib.pylab import date2num
from keras.models import Sequential
from keras.layers import Dense,LSTM,BatchNormalization,Flatten
import seaborn as sns

In [2]:
df = pd.read_csv (r'../CSV/oneStockAAPL.csv', sep=',')
df.drop('Dividends', axis=1, inplace=True)
df.drop('Stock Splits', axis=1, inplace=True)
print(df)


        Timestamp        Open        High         Low       Close     Volume  \
0      1980-12-12    0.100453    0.100890    0.100453    0.100453  469033600   
1      1980-12-15    0.095649    0.095649    0.095213    0.095213  175884800   
2      1980-12-16    0.088661    0.088661    0.088224    0.088224  105728000   
3      1980-12-17    0.090408    0.090845    0.090408    0.090408   86441600   
4      1980-12-18    0.093029    0.093466    0.093029    0.093029   73449600   
...           ...         ...         ...         ...         ...        ...   
10357  2022-01-10  169.080002  172.500000  168.169998  172.190002  106765600   
10358  2022-01-11  172.320007  175.179993  170.820007  175.080002   76138300   
10359  2022-01-12  176.119995  177.179993  174.820007  175.529999   74805200   
10360  2022-01-13  175.779999  176.619995  171.789993  172.190002   84505800   
10361  2022-01-14  171.339996  173.779999  171.089996  173.070007   80355000   

      Symbol  
0       AAPL  
1       A

In [3]:
# Functions
def checkNonesString(df):
    # CHECK Object column for "None" strings.
    print(df[df['Symbol'].str.contains('None')].any())
    

def checkNans(df):
    # Check if there are NaNs
    print("-----IS NA per COL-------")
    print("Doesn't seem to be working")
    print(df[df.isna().any(axis=1)].sum())

    print("-----IS NA per COL-------")
    print(df.isna().sum())

    print("------printing sum of Nans for each column------")
    for col in df.columns:
        print(col + ": " + str(df[col].isnull().sum()))
        
        
def checkInfs(df):
    print("-------total amount of Infs per Column-----------")
    print(df.groupby(np.isinf(df["Open"])).count())

    print("-------total amount of Infs-----------")
    count = np.isinf(df["Open"]).values.sum()
    print("It contains " + str(count) + " infinite values")


def printDataframe(df):
    # Prints
    print("-----DATAFRAME-------")
    print(df.head(10))

    print("-----SHAPE-------")
    print(df.shape)

    print("-----COLUMNS-------")
    print(df.columns)

    print("-----DATATYPES-------")
    print(df.dtypes)



In [None]:
# Casting Timestamp to type datetime64[ns]
df['Timestamp'] = df['Timestamp'].astype('datetime64[ns]')
# Adding week number, to be able to derived the Weekly labels.
df.insert(1, 'Week_Number', df['Timestamp'].dt.week)

# Creating labels
for ticker in df["Symbol"]:
    df["Close1d%"] = df["Close"].pct_change(periods=-1).shift(periods=0)
    df["Close5d%"] = df["Close"].pct_change(periods=-5).shift(periods=0)
    df["Close10d%"] = df["Close"].pct_change(periods=-10).shift(periods=0)
    df["Close15d%"] = df["Close"].pct_change(periods=-15).shift(periods=0)
    df["Close20d%"] = df["Close"].pct_change(periods=-20).shift(periods=0)

printDataframe(df)

  df.insert(1, 'Week_Number', df['Timestamp'].dt.week)


In [None]:
# Adding Numerical indicators/features in a for loop, for all stocks.
list_of_stocks = df['Symbol'].unique()
list_to_category = []
list_to_percentage = []
print(list_of_stocks.size)
for ticker in list_of_stocks:
    
    # it neeeds to know how to access the right rows, those beloning to the right ticker.
#-------- Price comparable indicators------------
    
    # BB
    df["BB upper"], df["BB middle"], df["BB lower"] = talib.BBANDS(df["Close"], matype=MA_Type.T3)
    list_to_category.extend(["BB upper", "BB lower"])
    list_to_percentage.extend(["BB upper", "BB lower"])    

    # SMA 0-50
    for i in range(5,25,5):
        tempColName = "SMA" + str(i)
        list_to_category.append(tempColName)
        list_to_percentage.append(tempColName)
        df[tempColName] = talib.SMA(df["Close"], i)

    # SMA 50-300
    for i in range(50,300,50):
        tempColName = "SMA" + str(i)
        list_to_category.append(tempColName)
        list_to_percentage.append(tempColName)
        df[tempColName] = talib.SMA(df["Close"], i)
        
    # Weighted Closed Price
    df["WCLPRICE"] = WCLPRICE(df["High"], df["Low"], df["Close"])
    list_to_category.append("WCLPRICE")
    list_to_percentage.append("WCLPRICE")

    # AVG Price
    df["AVGPRICE"] = AVGPRICE(df["Open"], df["High"], df["Low"], df["Close"])
    list_to_category.append("AVGPRICE")
    list_to_percentage.append("AVGPRICE")
    
    # STD deviation
    df["STD-5/1"] = STDDEV(df["Close"], timeperiod=5, nbdev=1)
    df["STD-5/2"] = STDDEV(df["Close"], timeperiod=5, nbdev=2)
    df["STD-10/1"] = STDDEV(df["Close"], timeperiod=10, nbdev=1)
    df["STD-10/2"] = STDDEV(df["Close"], timeperiod=10, nbdev=2)
    list_to_category.extend(["STD-5/1", "STD-5/2", "STD-10/1", "STD-10/2"])
    list_to_percentage.extend(["STD-5/1", "STD-5/2", "STD-10/1", "STD-10/2"])

#-------- Other indicators------------
    
    # OBV
    df["OBV"] = OBV(df["Close"], df["Volume"])
    
    # ATR
    df["ATR14"] = ATR(df["High"], df["Low"], df["Close"], timeperiod=14)
    df["ATR5"] = ATR(df["High"], df["Low"], df["Close"], timeperiod=5)

    # RSI
    df["RSI"] = talib.RSI(df["Close"])
    
    # Momentum
    df["Momentum"] = talib.MOM(df["Close"], timeperiod=5)

    # MACD
    df["macd"], df["macd_signal"], df["macd_hist"] = talib.MACD(df['Close'])

In [None]:
# A checking of values validity
checkNonesString(df)
checkNans(df)
checkInfs(df)
printDataframe(df)

In [None]:
# Example plotting
df_plot = df[df["Timestamp"] > '2020-12-12']
df_plot.index = df_plot["Timestamp"]

checkNans(df_plot)

fig = plt.figure()
fig.set_size_inches((50, 36))
ax_rsi = fig.add_axes((0, 0.24, 1, 0.2))

# Plotting the RSI boundaries
#ax_rsi.plot(df_plot.index, [70] * len(df_plot.index), label="overbought")
#ax_rsi.plot(df_plot.index, [30] * len(df_plot.index), label="oversold")
#ax_rsi.plot(df_plot.index, df_plot["RSI"], label="rsi")

# Plotting the BB 
#ax_rsi.plot(df_plot.index,df_plot["BB upper"],  label="BBhigh", color ="purple")
#ax_rsi.plot(df_plot.index,df_plot["BB lower"], label="BBlow", color ="purple")
#ax_rsi.plot(df_plot.index,df_plot["BB middle"], label="BBmiddle", color ="purple")

# Plotting Volume
#ax_rsi.plot(df_plot.index,(df_plot["OBV"] / 1000000000), label="OBV", color ="teal")


# Plotting OBV
#ax_rsi.plot(df_plot.index, (df_plot["OBV"] / 1000000000), label="OBV", color ="teal")
ax_rsi.plot(df_plot.index, df_plot["Close1d%"])
ax_rsi.plot(df_plot.index, df_plot["Close5d%"])

# plotting the Price
#ax_rsi.plot(df_plot.index, (df_plot["Volume"] / 10000000), label = "Vol", color= "black")  # density=False would make counts
#ax_rsi.plot(df_plot["Close"], label = "Price")
#ax_rsi.plot((df_plot["Volume"] / 10000000), label="Vol", color ="Black")
ax_rsi.legend()

In [None]:
# Categoriacl feature engineering represented as binary
print(list_to_category)
#print(list_to_category)
for col in list_to_category:
    under = "Under " + col 
    over = "Over " + col 
    df[under] = df[col] <= df["Close"]
    df[over] = df[col] > df["Close"]
    df[under] = df[under].astype(int)
    df[over] = df[over].astype(int)
    
print(df.columns)
print(df.head(30))


In [None]:
# % difference from price Features
print(list_to_percentage)
for col in list_to_percentage:
    col_name = "% " + col 
    df[col_name] = df[col] / df["Close"]
    #df[col_name] = df["Close"] / df[col]
    print(df[col_name])

In [None]:
# Other indicators % feature engineering

In [None]:
# correlation calculation
feature_corr = df.copy().corr()

# Sorting
sortedDesc1d = feature_corr["Close1d%"].sort_values(ascending=False) 
sortedDesc5d = feature_corr["Close5d%"].sort_values(ascending=False) 
sortedDesc10d = feature_corr["Close10d%"].sort_values(ascending=False) 
sortedDesc15d = feature_corr["Close15d%"].sort_values(ascending=False) 
sortedDesc20d = feature_corr["Close20d%"].sort_values(ascending=False) 

# Printing
print(sortedDesc1d.head(10))
print(sortedDesc5d.head(10))
print(sortedDesc10d.head(10))
print(sortedDesc15d.head(10))
print(sortedDesc20d.head(10))

In [None]:
# Exporting as CSV 
oneStockAAPL.to_csv(r'../CSV/FeaturesAAPL.csv', sep= ",", index = False)

In [None]:
#cf_matrix = confusion_matrix(feature_corr)
#plt.figure(figsize = (24,18))
#sns.heatmap(feature_corr, annot=True, cbar=False, )

In [None]:
# LSTM test
# reshaping into NP arrays

#df_model = df.iloc[60:-60]
#print(df_model.shape)
#sma_1 = np.array(df_model["SMA50"]).reshape(len(df_model["SMA50"]),1)
#sma_2 = np.array(df_model["SMA10"]).reshape(len(df_model["SMA10"]),1)
#sma_2 = sma_1[:-60]
#print(len(sma_1),len(sma_2))
#plt.plot(sma_1)
#plt.plot(sma_2)
#
#smas = np.hstack((sma_1,sma_2))
#smas.shape
#X = np.array(smas)
#y = np.array(df_model["Close"])
#X.shape,y.shape

In [None]:
#model = Sequential()
#model.add(LSTM(100, activation='relu', kernel_initializer='he_normal', input_shape=(k[0],1)))
#model.add(Dense(50, activation='relu', kernel_initializer='he_normal'))
#model.add(Dense(50, activation='relu', kernel_initializer='he_normal'))
#model.add(Dense(50, activation='relu', kernel_initializer='he_normal'))
#model.add(Dense(50, activation='relu', kernel_initializer='he_normal'))
#model.add(Dense(50, activation='relu', kernel_initializer='he_normal'))
#model.add(BatchNormalization())
#model.add(Dense(1))

In [None]:
#train_loss = model.evaluate(X, y)
#print(train_loss)

In [None]:
#model.predict(X)