#Import and Define
Project 2: Time Series Forecasting using NN, LSTM, and CNN

Authors: Jason Phillips and Peeja

In [5]:
# Mount Google Drive at /content/drive (Colab-only API); later cells read and
# write the TensorBoard log folder under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
!pip install tf-explain

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [7]:
#Import and print out the current version of packages.
%load_ext tensorboard

import tensorflow as tf
import tf_explain
import numpy as np
import matplotlib.pyplot as plt
import sys
import sklearn as sk
import pandas as pd
import shutil
import os
import imblearn
import io
import requests
import datetime

from scipy.stats import zscore
from collections.abc import Sequence
from collections import Counter
from numpy import where
from sklearn.datasets import make_classification
from matplotlib import pyplot
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D

from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras import optimizers

from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc

# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    """One-hot encode column ``name`` of ``df`` in place.

    One indicator column is added per distinct value, labelled
    "<name>-<value>", and the original column is then dropped.
    Nothing is returned; ``df`` itself is modified.
    """
    indicators = pd.get_dummies(df[name])
    for category in indicators.columns:
        df["{}-{}".format(name, category)] = indicators[category]
    df.drop(columns=name, inplace=True)


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    """Replace column ``name`` of ``df`` with integer labels, in place.

    Returns the ``classes_`` attribute of the fitted LabelEncoder.
    """
    encoder = preprocessing.LabelEncoder()
    encoded = encoder.fit_transform(df[name])
    df[name] = encoded
    return encoder.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    """Standardise column ``name`` of ``df`` in place as (value - mean) / sd.

    ``mean`` and ``sd`` default to the column's own mean and standard
    deviation (pandas ``Series.std``) when they are not supplied.
    """
    column = df[name]
    center = column.mean() if mean is None else mean
    spread = column.std() if sd is None else sd
    df[name] = (column - center) / spread


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    """Fill missing values in column ``name`` with that column's median, in place."""
    column = df[name]
    df[name] = column.fillna(column.median())


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    """Fill missing values in column ``name`` with ``default_value``, in place."""
    filled = df[name].fillna(default_value)
    df[name] = filled


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    """Split ``df`` into float32 arrays ``(x, y)``.

    ``x`` holds every column except ``target``. When the target dtype is
    int64 or int32 the target is one-hot encoded (classification);
    otherwise ``y`` is the target column itself (regression).
    """
    feature_columns = [column for column in df.columns if column != target]

    # Find out the type of the target column.
    target_type = df[target].dtypes
    if isinstance(target_type, Sequence):
        target_type = target_type[0]

    features = df[feature_columns].values.astype(np.float32)

    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        labels = pd.get_dummies(df[target]).values.astype(np.float32)
    else:
        labels = df[target].values.astype(np.float32)
    return features, labels

# Nicely formatted time string
def hms_string(sec_elapsed):
    """Format a duration given in seconds as "H:MM:SS.ss"."""
    seconds_per_hour = 60 * 60
    hours = int(sec_elapsed / seconds_per_hour)
    minutes = int((sec_elapsed % seconds_per_hour) / 60)
    seconds = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(hours, minutes, seconds)


# Regression chart.
def chart_regression(pred,y,sort=True):
    """Plot expected values ``y`` against predictions ``pred`` as two lines.

    ``y`` is flattened before plotting. With ``sort=True`` the points are
    ordered by the expected value. The figure is shown immediately.
    """
    frame = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        frame = frame.sort_values(by=['y'])
    plt.plot(frame['y'].tolist(), label='expected')
    plt.plot(frame['pred'].tolist(), label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column's value is sd or more standard deviations away from the column mean
def remove_outliers(df, name, sd):
    """Drop, in place, rows whose ``name`` value is an outlier.

    A row is dropped when its absolute distance from the column mean is
    at least ``sd`` times the column's standard deviation.
    """
    column = df[name]
    distance = np.abs(column - column.mean())
    outlier_labels = df.index[distance >= (sd * column.std())]
    df.drop(outlier_labels, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    """Rescale column ``name`` of ``df`` in place to a target range.

    Values are mapped linearly so that ``data_low`` becomes
    ``normalized_low`` and ``data_high`` becomes ``normalized_high``.

    Args:
        df: DataFrame to modify.
        name: Column to rescale.
        normalized_low: Lower bound of the output range.
        normalized_high: Upper bound of the output range.
        data_low: Input value mapped to ``normalized_low``; defaults to the
            column minimum.
        data_high: Input value mapped to ``normalized_high``; defaults to
            the column maximum.

    Each bound is defaulted independently, so a caller may supply only one
    of them. Note that the mapping divides by ``data_high - data_low``.
    """
    column = df[name]
    # Resolve each bound on its own: previously data_high was only filled in
    # when data_low was missing, so passing just data_low failed and passing
    # just data_high was overwritten by the column maximum.
    if data_low is None:
        data_low = column.min()
    if data_high is None:
        data_high = column.max()

    span = data_high - data_low
    df[name] = ((column - data_low) / span) * (normalized_high - normalized_low) + normalized_low

# Plot a confusion matrix.
# cm is the confusion matrix, names are the names of the classes.
def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw confusion matrix ``cm`` as a colour-mapped image on the current figure.

    ``names`` supplies the class labels for the tick marks on both axes
    (x labels rotated 45 degrees). The y axis is labelled 'True label' and
    the x axis 'Predicted label'. The function does not call ``plt.show()``.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    # One tick per class, in the order given by names.
    tick_marks = np.arange(len(names))
    plt.xticks(tick_marks, names, rotation=45)
    plt.yticks(tick_marks, names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    

# Plot an ROC. pred - the predictions, y - the expected output.
def plot_roc(pred,y):
    """Plot the ROC curve for predictions ``pred`` against expected output ``y``.

    The area under the curve is shown in the legend, a dashed diagonal is
    drawn from (0, 0) to (1, 1), and the figure is shown immediately.
    """
    false_positive_rate, true_positive_rate, _ = roc_curve(y, pred)
    area_under_curve = auc(false_positive_rate, true_positive_rate)

    plt.figure()
    plt.plot(false_positive_rate, true_positive_rate,
             label='ROC curve (area = %0.2f)' % area_under_curve)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.legend(loc="lower right")
    plt.show()

# Print the interpreter version and the version of each main library so the
# environment that produced the outputs below is recorded with the notebook.
print('Python {}'.format(sys.version))
print('Pandas {}'.format(pd.__version__))
print('Numpy {}'.format(np.__version__))
print('Scikit-Learn {}'.format(sk.__version__))
print()
print('Tensor Flow {}'.format(tf.__version__))
print('Keras {}'.format(tf.keras.__version__))
print('Imblearn {}'.format(imblearn.__version__))

#Remove any files in the log folder for tensorboard
#shutil.rmtree replaces the shell call so the cleanup does not depend on a shell;
#ignore_errors=True keeps the "rm -rf" behaviour of succeeding silently when the folder does not exist
shutil.rmtree("/content/drive/MyDrive/Colab Notebooks/logs/", ignore_errors=True)

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard
Python 3.8.10 (default, Nov 14 2022, 12:59:47) 
[GCC 9.4.0]
Pandas 1.3.5
Numpy 1.22.4
Scikit-Learn 1.2.1

Tensor Flow 2.11.0
Keras 2.11.0
Imblearn 0.8.1


#Data Preprocessing
Import dataset and preprocess it.

In [8]:
#Load the TSLA price history into a dataframe; the strings 'NA' and '?' are also parsed as missing values
TSLA_CSV_URL = 'https://raw.githubusercontent.com/JasonTPhillipsJr/CSC215/main/TSLA.csv'
df = pd.read_csv(TSLA_CSV_URL, na_values=['NA', '?'])
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2010-06-29,1.266667,1.666667,1.169333,1.592667,1.592667,281494500
1,2010-06-30,1.719333,2.028000,1.553333,1.588667,1.588667,257806500
2,2010-07-01,1.666667,1.728000,1.351333,1.464000,1.464000,123282000
3,2010-07-02,1.533333,1.540000,1.247333,1.280000,1.280000,77097000
4,2010-07-06,1.333333,1.333333,1.055333,1.074000,1.074000,103003500
...,...,...,...,...,...,...,...
3186,2023-02-24,196.330002,197.669998,192.800003,196.880005,196.880005,142228100
3187,2023-02-27,202.029999,209.419998,201.259995,207.630005,207.630005,161028300
3188,2023-02-28,210.589996,211.229996,203.750000,205.710007,205.710007,153144900
3189,2023-03-01,206.210007,207.199997,198.520004,202.770004,202.770004,156852800


In [9]:
#Show every row that has at least one missing value (an empty table means nothing is missing)
rows_with_missing = df.isna().any(axis=1)
df.loc[rows_with_missing]

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume


In [10]:
#Display the data type pandas inferred for each column
df.dtypes

Date          object
Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume         int64
dtype: object

In [11]:
#Show some statistics for each column.
#numeric_only=True is passed to mean/var/std because 'Date' is a string column:
#without it pandas 1.x emits a FutureWarning and pandas 2.x raises a TypeError.
#max/min are left unrestricted so the 'Date' row still shows the last and first dates.
data = {'Mean': df.mean(numeric_only=True),
        'Max': df.max(),
        'Min': df.min(),
        'Variaence': df.var(numeric_only=True),
        'STD': df.std(numeric_only=True)}

dataframe = pd.DataFrame(data)
dataframe

  data = {'Mean': df.mean(),
  'Variaence': df.var(),
  'STD': df.std()}


Unnamed: 0,Mean,Max,Min,Variaence,STD
Adj Close,60.20587,409.970001,1.053333,9175.041,95.78644
Close,60.20587,409.970001,1.053333,9175.041,95.78644
Date,,2023-03-02,2010-06-29,,
High,61.60542,414.496674,1.108667,9626.803,98.11627
Low,58.74202,405.666656,0.998667,8724.17,93.40326
Open,60.23675,411.470001,1.076,9195.453,95.89293
Volume,94856380.0,914082000,1777500,6723681000000000.0,81998050.0


In [None]:
#Drop columns that aren't needed
unused_columns = ['Date', 'Adj Close']
df = df.drop(columns=unused_columns)
df

Unnamed: 0,Open,High,Low,Close,Volume
0,1.266667,1.666667,1.169333,1.592667,281494500
1,1.719333,2.028000,1.553333,1.588667,257806500
2,1.666667,1.728000,1.351333,1.464000,123282000
3,1.533333,1.540000,1.247333,1.280000,77097000
4,1.333333,1.333333,1.055333,1.074000,103003500
...,...,...,...,...,...
3186,196.330002,197.669998,192.800003,196.880005,142228100
3187,202.029999,209.419998,201.259995,207.630005,161028300
3188,210.589996,211.229996,203.750000,205.710007,153144900
3189,206.210007,207.199997,198.520004,202.770004,156852800


In [None]:
#Create a copy of the Close column to use for the output feature.
df = df.assign(Close_Output=df['Close'])
df

Unnamed: 0,Open,High,Low,Close,Volume,Close_Output
0,1.266667,1.666667,1.169333,1.592667,281494500,1.592667
1,1.719333,2.028000,1.553333,1.588667,257806500,1.588667
2,1.666667,1.728000,1.351333,1.464000,123282000,1.464000
3,1.533333,1.540000,1.247333,1.280000,77097000,1.280000
4,1.333333,1.333333,1.055333,1.074000,103003500,1.074000
...,...,...,...,...,...,...
3186,196.330002,197.669998,192.800003,196.880005,142228100,196.880005
3187,202.029999,209.419998,201.259995,207.630005,161028300,207.630005
3188,210.589996,211.229996,203.750000,205.710007,153144900,205.710007
3189,206.210007,207.199997,198.520004,202.770004,156852800,202.770004


In [None]:
#Normalize the numeric input features with a z-score
#NOTE(review): each z-score uses the statistics of the full column, i.e. it is computed before the train/test split below - confirm this is intended
for feature_name in ['Open', 'High', 'Low', 'Close', 'Volume']:
    df[feature_name] = zscore(df[feature_name])

#Never normalize the output feature when training any regression models, else RMSE will also be normalized
#(Close_Output is therefore deliberately left in its original units)
df

Unnamed: 0,Open,High,Low,Close,Volume,Close_Output
0,-0.615054,-0.610991,-0.616485,-0.612011,2.276485,1.592667
1,-0.610333,-0.607308,-0.612373,-0.612053,1.987555,1.588667
2,-0.610882,-0.610366,-0.614536,-0.613355,0.346716,1.464000
3,-0.612273,-0.612282,-0.615650,-0.615276,-0.216617,1.280000
4,-0.614359,-0.614389,-0.617706,-0.617427,0.099373,1.074000
...,...,...,...,...,...,...
3186,1.419443,1.386986,1.435485,1.427087,0.577808,196.880005
3187,1.478894,1.506761,1.526074,1.539333,0.807120,207.630005
3188,1.568174,1.525211,1.552737,1.519285,0.710964,205.710007
3189,1.522491,1.484131,1.496735,1.488587,0.756190,202.770004


#Splitting the Data
Create the training and test sets with a 70/30 split (70% training, 30% test).

In [None]:
#Create the x(inputs) and y(outputs)
#Close_Output is a float column, so to_xy takes its regression branch and returns y as a 1-D float32 array
Close_Output = df["Close_Output"]
x,y = to_xy(df,"Close_Output")

#Split the train and test sets
#test_size=0.30 holds out 30% of the rows; random_state=42 makes the split reproducible
#NOTE(review): train_test_split shuffles rows by default, so the chronological order of the series is not preserved - confirm this is intended for forecasting
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=42)

In [None]:
#Report the shape of the full arrays, then of the feature splits, then of the target splits
for label, array in (("x shape: ", x), ("y shape: ", y)):
    print(label + str(array.shape))
print()
for label, array in (("x_train shape: ", x_train), ("x_test shape: ", x_test)):
    print(label + str(array.shape))
print()
for label, array in (("y_train shape: ", y_train), ("y_test shape: ", y_test)):
    print(label + str(array.shape))

x shape: (3191, 5)
y shape: (3191,)

x_train shape: (2233, 5)
x_test shape: (958, 5)

y_train shape: (2233,)
y_test shape: (958,)


In [None]:
#Define the log folder for tensorboard: one timestamped sub-folder per run
run_timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = "/content/drive/MyDrive/Colab Notebooks/logs/" + run_timestamp