# Computing Project Research Notebook

---
## 1.0: Data Processing

### 1.1: Retrieving Data

In [1]:
%%javascript
//
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import time
import urllib
import datetime
import csv
import matplotlib
import numpy as np
from operator import itemgetter
from datetime import datetime as dt
from matplotlib import pyplot as plt

In [3]:
#Loading csv files into pandas returns a dictionary with dataframes of tickers
def load_to_df(tickers, path):
    data = {}
    
    for i in range(len(tickers)):
        full_path = path + "/" + tickers[i] + ".csv"
        ticker = tickers[i].replace('^', '')
        data.update({ticker: pd.read_csv(full_path, parse_dates=True, index_col=0)})
        
    return data

In [4]:
# Downlaoding Files to .csv
path = "C:\\Users\\noahd\\Google Drive\\University\\2k20-21\\Personal Project\\Data\\Stocks"
tickers = ["^GSPC", "^DJI", "^FTSE", "^N225", "^BSESN"]
dates = ('2011-04-07', '2021-04-06')

In [5]:
data_raw = load_to_df(tickers, path)

### 1.2: Normalising and Differencing Data

In [6]:
from sklearn import preprocessing

#Applies normalisation and difference transfrom to a pandas dataframe
#Can only normalise and remove Na if passed diff=False
def process_data(data, dates, diff=True):
    data_proc = data.copy()
    scaler = preprocessing.MinMaxScaler()
    
    for key in data_proc:
        df_copy = data_proc[key].copy(deep=True)# So the original data is not overwritten
        df_copy = df_copy.reindex(pd.date_range(start=dates[0], end=dates[1], freq='D')) #For using mutiple time series, all trackers must have the same dates
        
        if diff==True:
            df_copy = df_copy.diff() #Difference transform
        
        df_copy = df_copy.resample('d').mean().interpolate('spline', order=3, s=0) #Iterpolate missing dates
        df_copy = pd.DataFrame(scaler.fit_transform(df_copy.values), columns = df_copy.columns, index=df_copy.index.values) #Scales values
        df_copy = df_copy.drop(df_copy.tail(1).index)
        df_copy = df_copy.dropna(axis=0)
        data_proc[key] = df_copy # Assign new data back to data dictionary    
    
    return data_proc

In [7]:
#Processing all Datasets
data_normal = process_data(data_raw, dates, diff=False)
data_diff = process_data(data_raw, dates)

---
## 3: Machine Learning Modeling

### 3.1: CNN-LSTM Model Configuration

In [23]:
import keras as kr
import tensorflow as tf
import kerastuner as kt
#Model Layers
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Input
from tensorflow.keras import Sequential
from sklearn.metrics import mean_squared_error as mse

# --- Model Fitting Callbacks --- #
modelfit_callbacks = [tf.keras.callbacks.TensorBoard(log_dir=path + "\\logs\\fit\\" + str(datetime.datetime), histogram_freq=1),
                      tf.keras.callbacks.EarlyStopping('val_loss', patience=7)]

# https://gist.github.com/GermanCM/1943a0dc1eac04f848c6fe9b16947ac4
#Contains methods for building, trainig, optimising and validating the model.
def reset_weights(model):
    import keras.backend as K
    session = K.get_session()
    for layer in model.layers: 
        if hasattr(layer, 'kernel_initializer'): 
            layer.kernel.initializer.run(session=session)
        if hasattr(layer, 'bias_initializer'):
            layer.bias.initializer.run(session=session)  


#Takes hyperpameter tuner as input reutns model, to be run with keras tuner
def model_builder(hp, samples=1, time_steps=1, features=30, params='Egg'):
    model = Sequential()
    
    # --- Hypertuner Parameter Variables --- #
    #Dictionary organising all params for easy reading
    params = {
        # Layer Parameters
        'conv1d' : {
            'units' : hp.Int('conv_units', min_value=1, max_value=8, step=1), #Conv1D Units
            'kernal_size' : hp.Int('kernal_size', min_value=2, max_value=10, step=1),
            'filters' : hp.Int('filters', min_value=4, max_value=16, step=1)
        },
        'max_pooling' : {
            'pool_size' : hp.Int('pool_size', min_value=1, max_value=4, step=1)
        },
        'lstm' : {
            'units' : hp.Int('lstm_units', min_value=5, max_value=255, step=5)
        },
        'dense' : {
            'units' : hp.Int('dense_units', min_value=4, max_value=256, step=4)
        },
        # Hyper Parameters
        'hp' : {
            'lr' : hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
        }

    }

    if(params=='Default'):
        paramas = {
        'conv1D' : {'units': 1, 'filters' : 7},
        'maxpooling' : {'pool_size' : 2},
        'ltsm': {'units': 45},
        'dense': {'units': 128},
        'hp': {'lr': 1e-2}
    }
    
    # --- Build Model === #
    # Add time distributed -wrapped CNN layers
    model.add(Conv1D(filters=params['conv1d']['filters'],
                     kernel_size=params['conv1d']['kernal_size'],
                     activation='relu', 
                     padding='causal',))
    #model.add(MaxPooling1D(pool_size=params['max_pooling']['pool_size'], padding='valid'))
    #model.add(Flatten())
    model.add(LSTM(params['lstm']['units'])) #LTSM Layer
    model.add(Dense(params['dense']['units']))
    print(model.summary(line_length=None, positions=None, print_fn=None))

    #Compile Model
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=params['hp']['lr']),
        loss=kr.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy'])

    return model

#Fits the model using keras tuner, hypertuner object. Returns the most model.
def fit_model(train, tuner, epochs, val_split, n_shifts):
    x = train
    y = np.roll(train, -1*n_shifts, axis=0) #Lags the y input
    print(x.shape, y.shape)

    tuner.search(tf.keras.Input(shape=30), y,
        epochs=epochs,
        validation_split=val_split, 
        callbacks=modelfit_callbacks)
    
    
    best_model = tuner.get_best_models(num_models=1)
    model = best_model=[0]

    return model

def y_predict(model, history, n_shifts):
    import numpy as np
    
    # For if using differencing between train and test #
    # prepare data
    correction = 0.0
    #if n_diff > 0:
    #    correction = history[-n_diff]
    #    history = difference(history, n_diff)
    
    # Make Predictions
    x_input = np.array(history[-n_shifts:]).reshape((1, n_shifts, 1))
    yhat = model.predict(x_input, verbose=0)
    return correction + yhat[0]

# evaluate a single model
def walk_forward_validation(data, tuner, n_shifts, epochs, val_split):
    pred = list()
    train, validation = train_test_split(data, test_size=val_split, shuffle=False)
    history = [x for x in train]
    print('Walk Forward Train', train.shape)
    model = fit_model(train, tuner, epochs, val_split, n_shifts)
    
    # Loop through each time period in chronlogiccal order with past data
    # Each period = n_shifts
    for i in range(len(validation)):
        yhat = y_predict(model, history, n_shifts) #Get prediction for y after n_shifts
        pred.append(yhat) #Store predicted value
        history.append(validaton[i]) #Add consecutive validation value to past data
    
    error = mse(validation, pred, squared=False) # Calculate overall error
    #predictions = array(predictions)
    #score, scores = evaluate_forecasts(test[:, :, 0], predictions)
    return error, pred

def evaluate(data, test_split, val_split, n_shifts, epochs, repeats=1, hp_epochs=10):
    #Set tuner 
    tuner = kt.Hyperband(model_builder,
             objective='val_accuracy',
             max_epochs=hp_epochs,
             factor=3,
             directory='C:\\Users\\noahd\\Google Drive\\University\\2k20-21\\Personal Project\\Data\\res',
             project_name='Stock-CNN-LTSM')

    
    predictions_matrix = []
    scores = []
    train, test = train_test_split(data, test_size=test_split, shuffle=False) #Split to train and test
    
    for n in range(repeats):
        score, predictions = walk_forward_validation(train, tuner, n_shifts, epochs, val_split)
        scores.append(score)
        predictions_matrix.append(predictions)
    
    return scores, prediction_matrix
    

In [24]:
#Using tensorboard for network graphs and debugging
%load_ext tensorboard
import tensorboard


The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [31]:
# Creates input Matrix x(n,m,l): Where data is n; step_size, m: samples, l; features.
# Steps is the period between each set of timestep. For steps=1, n[1] = n[2]
# Sample size is the number of samples per timestep
# Features is determined by the number features in the dataset, ie: Tackers*Tracker-Atrributes
# Ouput vector y(a, b, c), where a is the timestep after last sample in m
# b is the size of the forecast horizon, the number of values to predict, and c is the target values for prediction
# Returns this data format, and a shifted version to use for rolling predition as y
def format_data_cnn(data_dict, step_size, sample_size, forecast_horizon):
    data_flat = pd.DataFrame()
    for i, df in enumerate(data_dict.values()):
        data_flat.concat(df, axis=1)
    
    #data = np.asarray(data)
    #data  = np.swapaxes(data, 0, 1) #Get the data in right alignment
    #data = np.reshape(data, (data.shape[0], (data.shape[1]*data.shape[2])), order='C') #Merge 2nd and 3d axis to make 2D array
    print(data.shape)

    return data

In [32]:
data_cnn = format_data_cnn(data_diff, 1, 60, 1)
print(data_cnn.shape)
test = evaluate(data_cnn, 0.3, 0.3, 1, 7)

AttributeError: 'DataFrame' object has no attribute 'concat'

In [None]:
%tensorboard --logdir logs