# <span Style='Color:purple'> Recurrent Neural Networks

# <span Style='Color:orange'> I. Exploratory Data Analysis

## <span Style='Color:red'> 1. Import library data and functions

In [481]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch import nn

In [482]:
# Import the data
df_train = pd.read_csv('train.csv', index_col = 'date')
df_train.index = pd.to_datetime(df_train.index)
df_test = pd.read_csv('test.csv', index_col = 'date')
df_test.index = pd.to_datetime(df_test.index)
df_train = pd.concat([df_train, df_test], axis = 0)
df_train

Unnamed: 0_level_0,id,country,store,product,num_sold
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-01,0,Canada,Discount Stickers,Holographic Goose,
2010-01-01,1,Canada,Discount Stickers,Kaggle,973.0
2010-01-01,2,Canada,Discount Stickers,Kaggle Tiers,906.0
2010-01-01,3,Canada,Discount Stickers,Kerneler,423.0
2010-01-01,4,Canada,Discount Stickers,Kerneler Dark Mode,491.0
...,...,...,...,...,...
2019-12-31,328675,Singapore,Premium Sticker Mart,Holographic Goose,
2019-12-31,328676,Singapore,Premium Sticker Mart,Kaggle,
2019-12-31,328677,Singapore,Premium Sticker Mart,Kaggle Tiers,
2019-12-31,328678,Singapore,Premium Sticker Mart,Kerneler,


In [483]:
# Engineer the the variables
def feature_engineer(df):
    df['weekday_sin'] = np.sin(2 * np.pi * df.index.weekday/7)
    df['weekday_cos'] = np.cos(2 * np.pi * df.index.weekday/7)
    df['month_sin'] = np.sin(2 * np.pi * df.index.month/12)
    df['month_cos'] = np.cos(2 * np.pi * df.index.month/12)
    return df

In [484]:
# Function to make sequences
def create_sequence(df, sequence_length, target_idx):
    seq = []
    tar_seq = []
    indices = []
    for i in range(len(df) - sequence_length):
        sequences = df[i : i+sequence_length].values
        target_sequences = df.iloc[i + sequence_length, target_idx]
        target_indices = df.index[i + sequence_length]
        
        seq.append(sequences)
        tar_seq.append(target_sequences)
        indices.append(target_indices)
    return np.array(seq), np.array(tar_seq), pd.to_datetime(indices)

In [485]:
# Function to One hot encode the categorical features for traning data
def one_hot_encode_train(train_data, categ_columns):
    encoders = {}
    for col in categ_columns:
        encoder = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore')
        encoded_data = pd.DataFrame(np.array(encoder.fit_transform(train_data[[col]])), columns = encoder.get_feature_names_out([col]), index = train_data.index)
        train_data = pd.concat((train_data.drop(columns = col), encoded_data), axis = 1)
        encoders[col] = encoder
    return train_data, encoders

In [486]:
# Function to One hot encode the categorical features for testing data
def one_hot_encode_test(test_data, categ_columns, encoders):
    for col in categ_columns:
        encoder = encoders[col]
        encoded_data = pd.DataFrame(np.array(encoder.transform(test_data[[col]])), columns= encoder.get_feature_names_out([col]), index = test_data.index)
        test_data = pd.concat((test_data.drop(columns = col), encoded_data), axis = 1)
    return test_data

## <span Style='Color:red'> 2. Data cleaning and Feature engineering

In [487]:
# Check for missing data
df_train.isna().sum()

id               0
country          0
store            0
product          0
num_sold    107421
dtype: int64

In [488]:
# Fill the missing data with mean values
df_train['num_sold'] = df_train['num_sold'].fillna(df_train['num_sold'].median())

In [489]:
# Feature engineer the training data
df_train = feature_engineer(df_train)

## <span Style='Color:red'> 3. Data Visualization

## <span Style="color:red"> 4. Data Preparation

In [490]:
# Separate the data by year
train = df_train[df_train.index.year < 2016]  # Training data is before 2016
val = df_train[df_train.index.year == 2016]  # Validation data is for 2016
test = df_train[df_train.index.year >= 2017]  # Test data is 2017 and above


# Encode the data
categorical_columns = df_train.select_dtypes(exclude = 'number')
train, encoders = one_hot_encode_train(train, categorical_columns)
val = one_hot_encode_test(val, categorical_columns, encoders)
test = one_hot_encode_test(test, categorical_columns, encoders)
encoded_df = pd.concat([train, val, test], axis = 0)

# Make sequences of the data
sequence_length = 30
target_idx = train.columns.get_indexer(['num_sold'])
features, targets, index = create_sequence(encoded_df, sequence_length, target_idx)

In [508]:
# Separate the sequences
train_features = features[index.year < 2016]
val_features = features[index.year == 2016]
test_features = features[index.year > 2017]

targets = targets[index.u]

array([[[ 0.00000000e+00,  6.05000000e+02, -4.33883739e-01, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
        [ 1.00000000e+00,  9.73000000e+02, -4.33883739e-01, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
        [ 2.00000000e+00,  9.06000000e+02, -4.33883739e-01, ...,
          1.00000000e+00,  0.00000000e+00,  0.00000000e+00],
        ...,
        [ 2.70000000e+01,  2.07000000e+03, -4.33883739e-01, ...,
          1.00000000e+00,  0.00000000e+00,  0.00000000e+00],
        [ 2.80000000e+01,  1.13400000e+03, -4.33883739e-01, ...,
          0.00000000e+00,  1.00000000e+00,  0.00000000e+00],
        [ 2.90000000e+01,  1.04700000e+03, -4.33883739e-01, ...,
          0.00000000e+00,  0.00000000e+00,  1.00000000e+00]],

       [[ 1.00000000e+00,  9.73000000e+02, -4.33883739e-01, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
        [ 2.00000000e+00,  9.06000000e+02, -4.33883739e-01, ...,
          1.00000000e+00,  0.00000000e

In [512]:
pd.Index(index)

DatetimeIndex(['2010-01-01', '2010-01-01', '2010-01-01', '2010-01-01',
               '2010-01-01', '2010-01-01', '2010-01-01', '2010-01-01',
               '2010-01-01', '2010-01-01',
               ...
               '2019-12-31', '2019-12-31', '2019-12-31', '2019-12-31',
               '2019-12-31', '2019-12-31', '2019-12-31', '2019-12-31',
               '2019-12-31', '2019-12-31'],
              dtype='datetime64[ns]', length=328650, freq=None)

In [521]:
len(index[index.year == 2010])

32820

In [523]:
len(df_train[df_train.index.year ==2010])

32850

In [493]:
threshold_val = len(df_train[(df_train.index.year == 2016)])
threshold_val

32940

In [494]:
# Scale the features
feature_scaler = StandardScaler()
train_features_scaled = feature_scaler.fit_transform(train_features.reshape(-1, train_features.shape[-1]))
val_features_scaled = feature_scaler.transform(val_features.reshape(-1, val_features.shape[-1]))

# Scale the targets
target_scaler = StandardScaler()
train_targets_scaled = target_scaler.fit_transform(train_targets)
val_targets_scaled = target_scaler.transform(val_targets)

# Reshape the features
train_features_scaled = train_features_scaled.reshape(train_features.shape)
val_features_scaled = val_features_scaled.reshape(val_features.shape)

# Turn the data into tensors
X_train_tensor = torch.tensor(train_features_scaled, dtype = torch.float32)
y_train_tensor = torch.tensor(val_features_scaled, dtype = torch.float32)
X_val_tensor = torch.tensor(val_targets, dtype = torch.float32)
y_val_tensor = torch.tensor(val_targets, dtype = torch.float32)

In [495]:
train_index

array([Timestamp('2010-01-01 00:00:00'), Timestamp('2010-01-01 00:00:00'),
       Timestamp('2010-01-01 00:00:00'), ...,
       Timestamp('2015-12-31 00:00:00'), Timestamp('2015-12-31 00:00:00'),
       Timestamp('2015-12-31 00:00:00')], dtype=object)

In [496]:
train

Unnamed: 0_level_0,id,num_sold,weekday_sin,weekday_cos,month_sin,month_cos,country_Canada,country_Finland,country_Italy,country_Kenya,country_Norway,country_Singapore,store_Discount Stickers,store_Premium Sticker Mart,store_Stickers for Less,product_Holographic Goose,product_Kaggle,product_Kaggle Tiers,product_Kerneler,product_Kerneler Dark Mode
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2010-01-01,0,605.0,-0.433884,-0.900969,5.000000e-01,0.866025,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2010-01-01,1,973.0,-0.433884,-0.900969,5.000000e-01,0.866025,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2010-01-01,2,906.0,-0.433884,-0.900969,5.000000e-01,0.866025,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2010-01-01,3,423.0,-0.433884,-0.900969,5.000000e-01,0.866025,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2010-01-01,4,491.0,-0.433884,-0.900969,5.000000e-01,0.866025,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-12-31,197185,387.0,0.433884,-0.900969,-2.449294e-16,1.000000,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2015-12-31,197186,2224.0,0.433884,-0.900969,-2.449294e-16,1.000000,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2015-12-31,197187,1995.0,0.433884,-0.900969,-2.449294e-16,1.000000,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2015-12-31,197188,1110.0,0.433884,-0.900969,-2.449294e-16,1.000000,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [497]:
val

Unnamed: 0_level_0,id,num_sold,weekday_sin,weekday_cos,month_sin,month_cos,country_Canada,country_Finland,country_Italy,country_Kenya,country_Norway,country_Singapore,store_Discount Stickers,store_Premium Sticker Mart,store_Stickers for Less,product_Holographic Goose,product_Kaggle,product_Kaggle Tiers,product_Kerneler,product_Kerneler Dark Mode
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2016-01-01,197190,605.0,-0.433884,-0.900969,5.000000e-01,0.866025,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2016-01-01,197191,706.0,-0.433884,-0.900969,5.000000e-01,0.866025,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2016-01-01,197192,634.0,-0.433884,-0.900969,5.000000e-01,0.866025,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2016-01-01,197193,316.0,-0.433884,-0.900969,5.000000e-01,0.866025,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2016-01-01,197194,404.0,-0.433884,-0.900969,5.000000e-01,0.866025,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12-31,230125,466.0,-0.974928,-0.222521,-2.449294e-16,1.000000,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2016-12-31,230126,2907.0,-0.974928,-0.222521,-2.449294e-16,1.000000,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2016-12-31,230127,2299.0,-0.974928,-0.222521,-2.449294e-16,1.000000,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2016-12-31,230128,1242.0,-0.974928,-0.222521,-2.449294e-16,1.000000,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [498]:
val_index.shape

(32910,)

In [499]:
val.index.shape

(32940,)

In [500]:
train_features_scaled.reshape(train_features.shape).shape

(197160, 30, 20)

In [501]:
# Turn the data into batches
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_val_tensor, y_val_tensor)

AssertionError: Size mismatch between tensors

In [None]:
X_train_tensor

In [None]:
val

Unnamed: 0_level_0,id,store,product,num_sold,weekday_sin,weekday_cos,month_sin,month_cos,country_Canada,country_Finland,country_Italy,country_Kenya,country_Norway,country_Singapore
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2016-01-01,197190,Discount Stickers,Holographic Goose,605.0,-0.433884,-0.900969,5.000000e-01,0.866025,1.0,0.0,0.0,0.0,0.0,0.0
2016-01-01,197191,Discount Stickers,Kaggle,706.0,-0.433884,-0.900969,5.000000e-01,0.866025,1.0,0.0,0.0,0.0,0.0,0.0
2016-01-01,197192,Discount Stickers,Kaggle Tiers,634.0,-0.433884,-0.900969,5.000000e-01,0.866025,1.0,0.0,0.0,0.0,0.0,0.0
2016-01-01,197193,Discount Stickers,Kerneler,316.0,-0.433884,-0.900969,5.000000e-01,0.866025,1.0,0.0,0.0,0.0,0.0,0.0
2016-01-01,197194,Discount Stickers,Kerneler Dark Mode,404.0,-0.433884,-0.900969,5.000000e-01,0.866025,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12-31,230125,Premium Sticker Mart,Holographic Goose,466.0,-0.974928,-0.222521,-2.449294e-16,1.000000,0.0,0.0,0.0,0.0,0.0,1.0
2016-12-31,230126,Premium Sticker Mart,Kaggle,2907.0,-0.974928,-0.222521,-2.449294e-16,1.000000,0.0,0.0,0.0,0.0,0.0,1.0
2016-12-31,230127,Premium Sticker Mart,Kaggle Tiers,2299.0,-0.974928,-0.222521,-2.449294e-16,1.000000,0.0,0.0,0.0,0.0,0.0,1.0
2016-12-31,230128,Premium Sticker Mart,Kerneler,1242.0,-0.974928,-0.222521,-2.449294e-16,1.000000,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
train_features.reshape(-1, train_features.shape[-1])

array([[ 0.00000000e+00,  6.05000000e+02, -4.33883739e-01, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 1.00000000e+00,  9.73000000e+02, -4.33883739e-01, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 2.00000000e+00,  9.06000000e+02, -4.33883739e-01, ...,
         1.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       ...,
       [ 1.97186000e+05,  2.22400000e+03,  4.33883739e-01, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 1.97187000e+05,  1.99500000e+03,  4.33883739e-01, ...,
         1.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 1.97188000e+05,  1.11000000e+03,  4.33883739e-01, ...,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00]])