In [1]:
# Mount Google Drive at /content/drive (Colab-only); the CSV inputs read
# further down live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# python_version 3.7.10

### &nbsp;&nbsp;&nbsp; %CLASS% StockClass: used to create the stock dictionary

In [3]:
class StockClass(object):
    """Per-ticker record holding identifiers, fundamentals and model results.

    Every constructor argument except ``ticker`` is optional and is stored
    unchanged on the instance under an attribute of the same name
    (``ticker`` itself is stored as ``name``).
    """

    def __init__(self, ticker, isin=None, exchangeid=None, sector=None, industry=None, country=None, pe=None, eps=None,
                 insiderown=None, shsout=None, shsfloat=None, mktcap=None, income=None, sales=None,
                 booksh=None, pb=None, roa=None, tp=None, roe=None, roi=None, employees=None, debteq=None, 
                 groupby=None, confusion_matrix=None, accuracy_report=None, 
                 confusion_matrix2=None, accuracy_report2=None):
        """Store the ticker symbol and the optional descriptive fields.

        Args:
            ticker: ticker symbol; kept in ``self.name``.
            isin, exchangeid: identifiers, stored as given.
            sector, industry, country: classification fields, stored as given.
            pe, eps, insiderown, shsout, shsfloat, mktcap, income, sales,
            booksh, pb, roa, tp, roe, roi, employees, debteq: fundamentals,
                stored as given.
            groupby: stored as given.
            confusion_matrix, accuracy_report, confusion_matrix2,
            accuracy_report2: slots for model evaluation results.
        """
        self.name = ticker
        self.isin = isin
        self.exchangeid = exchangeid
        # Placeholders for tabular data. They hold the DataFrame *class*, not
        # an empty instance, until real data is assigned.
        # NOTE(review): pd.DataFrame() may have been intended - confirm usage.
        self.history = pd.DataFrame
        self.investing = pd.DataFrame
        self.pickle = pd.DataFrame
        # Header row of a (date, ticker) table
        self.not_found = np.array([['date', 'ticker']])
        self.nanDiv = False
        self.nanSplit = False
        # Fix: the original always stored None here, ignoring the argument.
        self.sector = sector
        self.industry = industry
        self.country = country
        self.pe = pe
        self.eps = eps
        self.insiderown = insiderown
        self.shsout = shsout
        self.shsfloat = shsfloat
        self.mktcap = mktcap
        self.income = income
        self.sales = sales
        self.booksh = booksh
        # Misspelled attribute kept as an alias so existing readers still work.
        self.bookh = booksh
        self.pb = pb
        self.roa = roa
        self.tp = tp
        self.roe = roe
        self.roi = roi
        self.employees = employees
        self.debteq = debteq
        # Same class-as-placeholder convention as history/investing/pickle.
        self.rsi = pd.DataFrame
        self.groupby = groupby
        self.confusion_matrix = confusion_matrix
        self.accuracy_report = accuracy_report
        self.confusion_matrix2 = confusion_matrix2
        self.accuracy_report2 = accuracy_report2

# 1. First part: import data, cleaning and arranging 

### &nbsp;&nbsp;&nbsp; * Main packages import

In [4]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

### &nbsp;&nbsp;&nbsp; * Data import

In [5]:
# Load the five per-ticker tables (one CSV per field, rows indexed by 'date').
data_dir = '/content/drive/MyDrive/Data/20210322/'
(open_df_original,
 high_df_original,
 low_df_original,
 adjclose_df_original,
 volume_df_original) = (
    pd.read_csv(data_dir + field_name, index_col='date')
    for field_name in ('open', 'high', 'low', 'adjclose', 'volume')
)
# The ticker symbols are the column labels of the adjusted-close table.
ticker_list = list(adjclose_df_original.columns)

In [6]:
# Drop, in place, every ticker column that contains at least one NaN
# (how='any'), e.g. series that stop being quoted before the end of the sample.
# Each table is filtered independently and its shape is reported before/after.
# Fix: the adjclose and volume tables used to be reported under the 'high' label.
frames_to_clean = {
    'open': open_df_original,
    'high': high_df_original,
    'low': low_df_original,
    'adjclose': adjclose_df_original,
    'volume': volume_df_original,
}
for frame_label, frame in frames_to_clean.items():
    if frame.isnull().values.any():
        print('{0} shape before: '.format(frame_label), frame.shape)
        # In-place so the *_original names keep pointing at the cleaned tables.
        frame.dropna(axis=1, how='any', inplace=True)
        print('{0} shape after: '.format(frame_label), frame.shape)

open shape before:  (415, 4541)
open shape after:  (415, 4534)
high shape before:  (415, 4541)
high shape after:  (415, 4534)
low shape before:  (415, 4541)
low shape after:  (415, 4534)
high shape before:  (415, 4541)
high shape after:  (415, 4534)
high shape before:  (415, 4541)
high shape after:  (415, 4534)


In [7]:
# Working copies of each table with the row(s) carrying the last index label removed.
open_df = open_df_original.drop(index=open_df_original.index[-1:])
high_df = high_df_original.drop(index=high_df_original.index[-1:])
low_df = low_df_original.drop(index=low_df_original.index[-1:])
adjclose_df = adjclose_df_original.drop(index=adjclose_df_original.index[-1:])
volume_df = volume_df_original.drop(index=volume_df_original.index[-1:])

# 2. Second part: computation and assessing 

### &nbsp;&nbsp;&nbsp; * SKLearn preprocessing import to scale data

In [9]:
from sklearn import preprocessing

### &nbsp;&nbsp;&nbsp; %FEATURE% Relative variation from open to adjusted close price

In [None]:
# (adjusted close - open) / open, element-wise for every date and ticker
adjclose_rel_var_df = adjclose_df.sub(open_df).div(open_df)

### &nbsp;&nbsp;&nbsp; %FEATURE% Absolute variation between high and low price

In [None]:
# high minus low, element-wise for every date and ticker
high_low_var_df = high_df.sub(low_df)

# Min-max rescale the range table to [0, 1], then wrap the resulting array
# back into a DataFrame carrying the original index and column labels.
high_low_var_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
high_low_var_scaled = high_low_var_scaler.fit(high_low_var_df).transform(high_low_var_df)
high_low_var_scaled_df = pd.DataFrame(
    data=high_low_var_scaled,
    index=high_low_var_df.index,
    columns=high_low_var_df.columns,
)

### &nbsp;&nbsp;&nbsp; %FEATURE% High low absolute variation over adjusted close price

In [None]:
# (high - low) divided by the adjusted close, element-wise
high_low_var_df_adjclose = high_low_var_df.div(adjclose_df)

### &nbsp;&nbsp;&nbsp; %FEATURE% Log Return (adjusted close price log return)

In [None]:
# Log return: ln(adjclose at row t / adjclose at row t-1); the first row is NaN
# because it has no previous row to compare with.
previous_adjclose_df = adjclose_df.shift(1)
adjclose_df_log_return = np.log(adjclose_df.div(previous_adjclose_df))

### &nbsp;&nbsp;&nbsp; * Construction of stack dataset with all features and label to classify

In [None]:
# Reshape each date x ticker feature table into one long Series indexed by
# (ticker, date). dropna=False keeps NaN cells at this stage; incomplete rows
# are removed once, after all columns have been concatenated.
adjclose_rel_var_df_t = adjclose_rel_var_df.T
adjclose_rel_var_df_stack = adjclose_rel_var_df_t.stack(dropna=False)

high_low_var_scaled_df_t = high_low_var_scaled_df.T
high_low_var_scaled_df_stack = high_low_var_scaled_df_t.stack(dropna=False)

high_low_var_df_adjclose_t = high_low_var_df_adjclose.T
high_low_var_df_adjclose_stack = high_low_var_df_adjclose_t.stack(dropna=False)

adjclose_df_t = adjclose_df.T
adjclose_df_stack = adjclose_df_t.stack(dropna=False)

volume_df_t = volume_df.T
volume_df_stack = volume_df_t.stack(dropna=False)

# classification variable: 1 where the log return is strictly positive, else 0
classification_df = (adjclose_df_log_return > 0) * 1
classification_df_t = classification_df.T
# Shift the labels one date to the left so each row carries the *next* row's
# outcome (1 on the day before an up move, 0 otherwise). shift() returns a new
# frame, so the result has to be bound to a name; the original also had a bare
# shift call whose result was silently discarded.
classification_df_t_shifted = classification_df_t.shift(periods=-1, axis=1)
classification_df_stack = classification_df_t_shifted.stack(dropna=False)

data = {'adjclose_rel_var': adjclose_rel_var_df_stack, 
        'high_low_var_scaled': high_low_var_scaled_df_stack,
        'high_low_var_adjclose': high_low_var_df_adjclose_stack,
        'adjclose': adjclose_df_stack,
        'volume': volume_df_stack,
        'label': classification_df_stack}  

# One column per feature plus the label; drop any row with a missing value
# (this includes the last date of each ticker, whose shifted label is NaN).
df_concat = pd.concat(data, axis=1).dropna(axis=0, how='any')
df_concat

### &nbsp;&nbsp;&nbsp; * Check that the previous dataframe has the expected number of rows

In [None]:
# Expected row count of the stacked dataframe: one row per (ticker, date)
# pair, minus one date per ticker.
len(open_df.columns) * (len(open_df.index) - 1)

### &nbsp;&nbsp;&nbsp; * Construction of stack dataset with other features and label to classify

In [None]:
# Restrict the raw tables to the index range covered by the log-return table,
# then reshape each one to a long Series indexed by (ticker, date).
first_date = adjclose_df_log_return.index[0]
last_date = adjclose_df_log_return.index[-1]

open_df_t = open_df.loc[first_date:last_date].T
open_df_stack = open_df_t.stack()

high_df_t = high_df.loc[first_date:last_date].T
high_df_stack = high_df_t.stack()

low_df_t = low_df.loc[first_date:last_date].T
low_df_stack = low_df_t.stack()

volume_df_t = volume_df.loc[first_date:last_date].T
volume_df_stack = volume_df_t.stack()

# 'adjclose' and 'label' reuse the stacked Series built for the first dataset.
data = {
    'open': open_df_stack,
    'high': high_df_stack,
    'low': low_df_stack,
    'adjclose': adjclose_df_stack,
    'volume': volume_df_stack,
    'label': classification_df_stack,
}

# Raw OHLCV columns plus the label, keeping only fully populated rows.
df_concat2 = pd.concat(data, axis=1).dropna(axis=0, how='any')
df_concat2

### &nbsp;&nbsp;&nbsp; * Definition of the StockClass dictionary

In [None]:
# Dictionary mapping each ticker symbol (as a string) to its own StockClass
# instance, so per-stock attributes and results can be looked up by ticker
# (take a look at _2_0_stock_dataframe_class.py for more information).
# Fix: each instance now receives its own ticker; previously the whole
# ticker_list was passed, so every instance's `name` was the full list.
stock_object_dictionary = {'{0}'.format(ticker): StockClass(ticker=ticker) for ticker in ticker_list}

##### Remember: pandas `.std()` returns the sample standard deviation (denominator N-1), whereas standardization through `StandardScaler` uses the population standard deviation (denominator N). To compute the population standard deviation with pandas, use `.std(ddof=0)`.

# 3. Third part: Deep Learning

In [None]:
import tensorflow as tf 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Record the TensorFlow version in the notebook output.
print(tf.__version__)

### &nbsp;&nbsp;&nbsp; %FUNCTION% Callback that stops DL training once the accuracy limit is exceeded

In [None]:
# Training accuracy above which training is stopped early.
accuracy_limit=.7
class myCallback(tf.keras.callbacks.Callback):
    """Keras callback that stops training once accuracy exceeds ``accuracy_limit``."""

    def on_epoch_end(self, epoch, logs=None):
        """Request a stop when the epoch's 'accuracy' metric is above the limit.

        Args:
            epoch: index of the epoch that just finished (unused).
            logs: metrics dict supplied by Keras; may be None or lack the
                'accuracy' key, in which case nothing happens.
        """
        # logs=None replaces a shared mutable default; the explicit None check
        # avoids a TypeError when the 'accuracy' metric is not being tracked.
        accuracy = (logs or {}).get('accuracy')
        if accuracy is not None and accuracy > accuracy_limit:
            print('--- accuracy higher than ', accuracy_limit)
            self.model.stop_training = True

In [None]:
ticker = 'AAPL'

# All rows of the stacked dataset that belong to this ticker
ticker_df = df_concat.loc[ticker]

# features dataframe, without classification column.
# drop() returns a new frame; the original dropped the column in place on a
# slice of df_concat, which pandas flags as a chained-assignment hazard.
features_df = ticker_df.drop(columns='label')

# classification array
classification_array = ticker_df['label'].values

# scaling data (zero mean, unit variance per feature, fitted on all rows)
df_sc_scaled = StandardScaler().fit_transform(features_df)

shuffle_value = False # no shuffle, because of this we cannot stratify our label
stratify_value = None
train_size = 0.80
random_state = None

# define train set and test set 
# NOTE(review): the split is done on the unscaled features_df, so the network
# below is trained on raw values; df_sc_scaled is only passed to logistic_reg.
# Confirm whether the scaled features were meant to be used here.
x_train, x_test, y_train, y_test = train_test_split(
    features_df,
    classification_array,
    train_size=train_size,
    shuffle=shuffle_value,
    stratify=stratify_value,
    random_state=random_state,
)

In [None]:
# Early-stopping callback instance handed to model.fit below.
callbacks=myCallback()

In [None]:
# Feed-forward classifier: flatten the input, one 500-unit ReLU hidden layer,
# and a 2-unit softmax output (one probability per label value).
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(units=500, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(units=2, activation=tf.nn.softmax))

In [None]:
# Integer class labels + softmax output -> sparse categorical cross-entropy.
# 'accuracy' must be tracked because the early-stopping callback reads it.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for at most 100 epochs; the callback may stop the run earlier.
model.fit(
    x=x_train,
    y=y_train,
    epochs=100,
    callbacks=[callbacks],
)

In [None]:
# Store the logistic-regression results on this ticker's StockClass entry.
# logistic_reg is not defined in the visible part of the notebook; it is
# expected to return a (confusion_matrix, accuracy_report) pair.
# Fix: the last argument was the undefined name `random_stat` (NameError).
stock_entry = stock_object_dictionary['{0}'.format(ticker)]
stock_entry.confusion_matrix, stock_entry.accuracy_report = logistic_reg(
    df_sc_scaled, classification_array, train_size, shuffle_value, stratify_value, random_state)

### a. Trial 1: No Shuffle (i.e. No Stratify) - 80/20 - scaled features