# Modeling with Regression models

In [1]:
import gc
import sys
sys.path.append('../')

import pandas as pd
import numpy as np


from sklearn.model_selection import TimeSeriesSplit

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("darkgrid")
%matplotlib inline


from src.fetch_data import DataLoader
from src.cleaning import CleanDataFrame
from src.visualization import Plotters
from src.processing import FeatureEngineering
from src.modeling_pipeline import (
    run_train_pipeline, 
    get_pipeline,
    )


import mlflow


# Helper objects from the project's `src` package, instantiated once and
# shared by the cells below.
cleaner = CleanDataFrame()
# Used further down to add the engineered columns to `train_df`.
feature_engineering = FeatureEngineering()
# NOTE(review): w/h are presumably the default figure width and height — confirm in src.visualization.
plotters = Plotters(w=6, h=4)

# Getting the data

In [2]:
# Load the merged training data through the project's DVC-backed loader.
# `data_path` is the file to read, `version` the data version to fetch and
# `repo` the repository location (the notebook sits one level below it).
# NOTE(review): `version` is presumably a DVC/git revision or tag — confirm in src.fetch_data.
data_path = 'data/merged/train.csv'
version = 'merged_v3'
repo = '../'

train_df = DataLoader.dvc_get_data(data_path, version, repo)
# Parse `Date` into datetimes so rows can be ordered chronologically later on.
train_df['Date'] = pd.to_datetime(train_df['Date'])

# The merged test set is deliberately not loaded in this notebook; the lines
# below are kept, disabled, as the recipe for loading it the same way.
# data_path = 'data/merged/test.csv'
# version = 'merged_v3'
# repo = '../'

# test_df = DataLoader.dvc_get_data(data_path, version, repo)
# test_df['Date'] = pd.to_datetime(test_df['Date'])

  df = pd.read_csv(io.StringIO(content), sep=",")
DataLoaderLogger - INFO - DVC: CSV file read with path: data/merged/train.csv | version: merged_v3 | from: ../


Next, add the extra columns produced by the feature-engineering step.

In [3]:
# Add the engineered columns to the training frame.
# NOTE(review): this rebinds `train_df`, so re-running the cell feeds the
# already-transformed frame back into `transform` — confirm that is safe.
train_df = feature_engineering.transform(train_df)


FeatureEngineeringLogger - INFO - 9 new columns added to the dataframe
FeatureEngineeringLogger - INFO - Feature enginerring completed


To split the data chronologically we first need to know the time range it covers, so I will collect all the unique dates in the `Date` column.

In [4]:
# Distinct dates present in the data, in ascending order.
timeframe = np.sort(train_df['Date'].unique())

# Report the first and last date and how many distinct dates there are.
print(f"Starts at {timeframe[0]} --- ends at: {timeframe[-1]}")
print(f"It spans for {len(timeframe)} days")

Starts at 2013-01-01T00:00:00.000000000 --- ends at: 2015-07-31T00:00:00.000000000
It spans for 942 days


In [5]:
# Order the rows chronologically so the hold-out portion is the most recent data.
train_df.sort_values(by='Date', inplace=True)

# The first 80% of the rows are used for training, the remaining 20% for testing.
# The cut is made by row count, so rows sharing one date may land on both sides.
split_at = int(len(train_df) * .8)
train_split = train_df.iloc[:split_at, :]
test_split = train_df.iloc[split_at:, :]

In [6]:
n_future = 1   # rows after each window that make up the target
n_past = 32    # rows in each input window

# Keep only the first 20,000 rows of each split.
train_split = train_split.iloc[:20_000, :]
test_split = test_split.iloc[:20_000, :]


def _sliding_windows(frame, n_past, n_future):
    """Cut `frame` into overlapping (input window, target) pairs.

    For every position `stop` from `n_past` up to `len(frame) - n_future`,
    the input is the `n_past` rows before `stop` (all columns) and the target
    is the next `n_future` values of the column at position 0.

    Returns:
        (windows, targets): two equally long lists of DataFrame / Series slices.
    """
    # NOTE(review): column 0 is presumably the sales target — confirm the column order.
    # NOTE(review): the windows keep every column, including column 0 and `Date`;
    # confirm all of them are meant to be model inputs.
    stops = range(n_past, len(frame) - n_future + 1)
    windows = [frame.iloc[stop - n_past:stop, :] for stop in stops]
    targets = [frame.iloc[stop:stop + n_future, 0] for stop in stops]
    return windows, targets


X_train, y_train = _sliding_windows(train_split, n_past, n_future)
X_test, y_test = _sliding_windows(test_split, n_past, n_future)


In [7]:
# Stack the lists of window slices into NumPy arrays:
# inputs become (samples, n_past, columns), targets (samples, n_future).
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

# Show the resulting shapes as a quick sanity check.
print("X_train data shape:", X_train.shape)
print("y_train data shape:", y_train.shape)
print("X_test data shape:", X_test.shape)
print("y_test data shape:", y_test.shape)


X_train data shape: (19968, 32, 29)
y_train data shape: (19968, 1)
X_test data shape: (19968, 32, 29)
y_test data shape: (19968, 1)


In [9]:
# Run a garbage-collection pass before the model is built; the value displayed
# is the number of unreachable objects the collector found.
# `gc` is already imported in the first cell; the import is repeated here only
# so this cell can run on its own.
import gc 
gc.collect()

0

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM, Bidirectional, Embedding
from tensorflow.keras.layers import Dropout
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, TensorBoard



# Initializing the LSTM


In [None]:

def get_model():
    """Build and compile a fresh stacked-LSTM regression network.

    The input shape is read from the notebook-level `X_train` (every axis
    after the sample axis) and the width of the output layer from
    `y_train.shape[1]`, so both must already be NumPy arrays when this is
    called.

    Returns:
        A compiled `Sequential` model using Adam (learning rate 0.01) and
        mean-squared-error loss.
    """
    layer_stack = [
        # Recurrent layers that keep the full sequence, with a dense layer
        # after the first one.
        LSTM(units=32, return_sequences=True, input_shape=(X_train.shape[1:])),
        Dense(units=64, activation='relu'),
        LSTM(units=128, return_sequences=True),
        Dropout(0.2),
        LSTM(units=256, return_sequences=True),
        LSTM(units=128, return_sequences=True),
        # The last recurrent layer returns only its final step.
        LSTM(units=32, return_sequences=False),
        Dropout(0.2),
        # Output layer: one unit per target value.
        Dense(units=y_train.shape[1]),
    ]
    network = Sequential(layer_stack)
    network.compile(optimizer=Adam(learning_rate=0.01), loss='mse')
    return network

# tb = TensorBoard('logs')



# Train with cross-validation

The `n_splits` parameter can be modified to increase or decrease the number of folds to train with.


In [None]:

from sklearn.model_selection import cross_val_score, KFold, TimeSeriesSplit

# Cross-validate the LSTM on the training windows.
#
# Fixes compared with the previous version of this cell:
#   * The fold indices produced by the splitter are now actually used. Before,
#     every fold trained on the whole of X_train with `validation_split=0.2`,
#     so all folds were the same experiment.
#   * The windows are ordered by date, so a TimeSeriesSplit is used: each fold
#     trains on an initial stretch of the data and validates on the block that
#     follows it, never on data that precedes the training rows.
#   * The test score of a fold is computed with the weights saved by the
#     checkpoint (lowest `val_loss`), i.e. the same weights that are reloaded
#     later from `weights-fold-<n>.h5`, not with the last-epoch weights.
#
# NOTE(review): newer Keras releases require weight files to end in
# `.weights.h5`; the file name is kept unchanged because the cell that reloads
# the best fold builds the same name.
# NOTE(review): `loss_per_fold` is measured on the test set and is later used to
# pick the best fold, so the test set also acts as a model-selection set.
splitter = TimeSeriesSplit(n_splits=5)
fold = 1
train_histories = []   # one Keras History object per fold
loss_per_fold = []     # test-set loss of each fold's best checkpoint
for train, valid in splitter.split(X_train, y_train):
    # Get a fresh model from the factory :)
    model = get_model()
    fold_weights_path = f'weights-fold-{fold}.h5'
    # Training utilities
    es = EarlyStopping(monitor='val_loss', min_delta=1e-10, patience=10, verbose=0)
    rlr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=0)
    mcp = ModelCheckpoint(filepath=fold_weights_path, monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=True)
    # Start training on this fold's rows only, validating on its held-out block
    history = model.fit(X_train[train],
                        y_train[train],
                        shuffle=True,
                        epochs=100,
                        callbacks=[es, rlr, mcp],
                        validation_data=(X_train[valid], y_train[valid]),
                        verbose=0,
                        batch_size=64)
    train_histories.append(history)
    print(f"Fold number {fold} complete")
    print("-"*100)
    # Evaluate the best checkpoint of this fold on the test windows
    model.load_weights(fold_weights_path)
    score = model.evaluate(X_test, y_test, verbose=0)
    loss_per_fold.append(score)
    print(f'Score for fold {fold}: {score}')
    fold += 1



# Load the best weights


In [None]:
# Folds are numbered from 1; pick the one with the smallest recorded loss
# (the earliest fold wins a tie).
best_fold = min(enumerate(loss_per_fold, start=1), key=lambda numbered: numbered[1])[0]
print("The best fold is: ", best_fold)

# Restore that fold's checkpoint into `model`, which is the model object
# created for the last fold; its current weights are overwritten.
checkpoint_path = f"weights-fold-{best_fold}.h5"
model.load_weights(checkpoint_path)