In [1]:
import pandas as pd
import numpy as np
import math

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
PATH = "data/"

In [4]:
df_stores = pd.read_csv(PATH + "store/store.csv")
df_stores.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620.0,9.0,2009.0,0,,,
4,5,a,a,29910.0,4.0,2015.0,0,,,


In [5]:
# Low number of NaN, simply replace with median
df_stores["CompetitionDistance"].iloc[[df_stores["CompetitionDistance"].isna()]] = df_stores["CompetitionDistance"].median()

In [6]:
many_nan_cols = ["CompetitionOpenSinceMonth", "CompetitionOpenSinceYear", "Promo2SinceWeek", "Promo2SinceYear"]

# Replace NaN with median and add feature which indicates that the feature is missing
for col in many_nan_cols:
    df_stores[col + "isNaN"] = df_stores[col].isna().astype(int)
    df_stores[col][df_stores[col].isna()] = df_stores[col].median()
    df_stores[col] = df_stores[col].astype(int)

In [7]:
# Columns that should be factorize
factorize_cols = [
    "StoreType",
    "Assortment",
    "Promo2SinceYear",
    "Promo2SinceWeek",
    "CompetitionOpenSinceMonth",
    "Assortment",
    "CompetitionOpenSinceYear"
]

for col in factorize_cols:
    temp_df = pd.factorize(df_stores[col])[0]
    temp_df = pd.DataFrame({col: temp_df})
    df_stores[col] = temp_df

In [8]:
df_stores.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,CompetitionOpenSinceMonthisNaN,CompetitionOpenSinceYearisNaN,Promo2SinceWeekisNaN,Promo2SinceYearisNaN
0,1,0,0,1270.0,0,0,0,0,0,,0,0,1,1
1,2,1,0,570.0,1,1,1,1,1,"Jan,Apr,Jul,Oct",0,0,0,0
2,3,1,0,14130.0,2,2,1,2,2,"Jan,Apr,Jul,Oct",0,0,0,0
3,4,0,1,620.0,0,3,0,0,0,,0,0,1,1
4,5,1,0,29910.0,3,4,0,0,0,,0,0,1,1


In [9]:
df = pd.read_csv(PATH + "train/train.csv")

# Some zeros where strings, some where int; convering them to the same
df["StateHoliday"] = df["StateHoliday"].astype(str)

# Converting to cat features
df_holiday = pd.factorize(df["StateHoliday"])[0]
df_holiday = pd.DataFrame({"StateHoliday": df_holiday})
df["StateHoliday"] = df_holiday

df["Date"] = pd.to_datetime(df["Date"])
df.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1


In [10]:
df["date_year"] = df["Date"].dt.year
df["date_year"] = df["date_year"] - np.min(df["date_year"].values)
df["date_month"] = df["Date"].dt.month - 1
df["date_week"] = df["Date"].dt.week - 1
df["DayOfWeek"] = df["DayOfWeek"] - 1

df = df.drop("Date", axis=1)

In [11]:
df = df.merge(df_stores, on="Store")
df.head()

Unnamed: 0,Store,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,date_year,date_month,...,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,CompetitionOpenSinceMonthisNaN,CompetitionOpenSinceYearisNaN,Promo2SinceWeekisNaN,Promo2SinceYearisNaN
0,1,4,5263,555,1,1,0,1,2,6,...,0,0,0,0,0,,0,0,1,1
1,1,3,5020,546,1,1,0,1,2,6,...,0,0,0,0,0,,0,0,1,1
2,1,2,4782,523,1,1,0,1,2,6,...,0,0,0,0,0,,0,0,1,1
3,1,1,5011,560,1,1,0,1,2,6,...,0,0,0,0,0,,0,0,1,1
4,1,0,6102,612,1,1,0,1,2,6,...,0,0,0,0,0,,0,0,1,1


In [12]:
ind_to_month = {
    0: "Jan",
    1: "Feb",
    2: "Mar",
    3: "Apr",
    4: "May",
    5: "Jun",
    6: "Jul",
    7: "Aug",
    8: "Sept",
    9: "Oct",
    10: "Nov",
    11: "Dec"
}

df_promo = pd.DataFrame({"PromoInterval": np.zeros(len(df))})

for idx in range(len(df)):
    month = df["date_month"].iloc[idx]

    if type(df["PromoInterval"].iloc[idx]) == str:
        promo = set(df["PromoInterval"].iloc[idx].split(","))

        if ind_to_month[month] in promo:
            df_promo.iloc[idx] = 1

df["PromoInterval"] = df_promo
df = df.rename(columns={"PromoInterval": "isPromotion"})

In [13]:
# If the store is not open, simply predict 0
df = df.query("Open == 1")
df = df.drop("Open", axis=1)

# Need to factorize Stores later since store id = 0 is never open
temp_df = pd.factorize(df["Store"])[0]
temp_df = pd.DataFrame({"Store": temp_df})
df["Store"] = temp_df

In [14]:
target = "Sales"
cont_features = ["Customers", "CompetitionDistance"]
cat_features = [col for col in df.columns if col not in cont_features and col != target]

In [15]:
# Splitting based on time, i.e. validation set is in the "future"
train_df = df[~((df["date_year"] >= 2) & (df["date_month"] >= 5))]
val_df = df[(df["date_year"] >= 2) & (df["date_month"] >= 5)]

In [16]:
for col in cont_features:
    mean = train_df[col].mean()
    std = train_df[col].std()

    train_df[col] = (train_df[col] - mean) / std
    val_df[col] = (val_df[col] - mean) / std

In [18]:
train_df.head()

Unnamed: 0,Store,DayOfWeek,Sales,Customers,Promo,StateHoliday,SchoolHoliday,date_year,date_month,date_week,...,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,isPromotion,CompetitionOpenSinceMonthisNaN,CompetitionOpenSinceYearisNaN,Promo2SinceWeekisNaN,Promo2SinceYearisNaN
62,0.0,5,5592,-0.457203,0,0,0,2,4,21,...,0,0,0,0,0,0.0,0,0,1,1
63,0.0,4,4656,-0.564193,0,0,0,2,4,21,...,0,0,0,0,0,0.0,0,0,1,1
64,0.0,3,4111,-0.720946,0,0,0,2,4,21,...,0,0,0,0,0,0.0,0,0,1,1
65,0.0,2,4083,-0.663719,0,0,0,2,4,21,...,0,0,0,0,0,0.0,0,0,1,1
66,0.0,1,4211,-0.708505,0,0,0,2,4,21,...,0,0,0,0,0,0.0,0,0,1,1


In [19]:
device = "cuda"
batch_size = 256
seed = 42

In [20]:
# Log since log(x) - log(y) = log(x / y), i.e. emulating RMPSE.
# Also add 1 since all sales are positive and we dont have to worry about log(0)
train_df["Sales"] = np.log(train_df["Sales"].values + 1)
val_df["Sales"] = np.log(val_df["Sales"].values + 1)

y_max = np.max(train_df["Sales"].values) * 1.2

In [21]:
class StoreData(Dataset):
    def __init__(self, df_, cont_features_, cat_features_, target_):
        self.df = df_
        self.cont_features = cont_features_
        self.cat_features = cat_features_
        self.target = target_

        self.X_cont = torch.from_numpy(self.df.drop([self.target] + self.cat_features, axis=1).values).float().to(device)
        self.X_cat = torch.from_numpy(self.df[self.cat_features].values).long().to(device)
        self.Y = torch.from_numpy(self.df[self.target].values).float().to(device)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        return self.X_cont[idx, :], self.X_cat[idx, :], self.Y[idx]

In [22]:
train_dataset = StoreData(train_df, cont_features, cat_features, target)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)

val_dataset = StoreData(val_df, cont_features, cat_features, target)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=0)

In [23]:
class BatchNorm(nn.Module):
    def __init__(self, num_features):
        super().__init__()
        self.bn = nn.BatchNorm1d(num_features)

    def forward(self, x):
        # Transforming [N, D] to [N, D, 1] in-order to use batch-norm 1D
        x = x.unsqueeze(-1)
        x = self.bn(x)
        return x.squeeze(-1)

class Model(nn.Module):
    def __init__(self, cols_, cont_features_, cat_features_, num_hidden=[128, 64], p=0.2):
        super().__init__()
        self.cols = cols_
        self.cont_features = cont_features_
        self.cat_features = cat_features_

        self.embedding_layer = nn.ModuleList([nn.Embedding(*self.calc_embedding_size(col)) for col in self.cat_features])
        self.entity_dim = sum([self.calc_embedding_size(col)[1] for col in self.cat_features])
        
        self.embedding_bn = BatchNorm(self.entity_dim)
        self.dropout_embedding = nn.Dropout(p=p)

        self.fc_1 = nn.Linear(len(self.cont_features) + self.entity_dim, num_hidden[0], bias=False)
        self.bn_1 = BatchNorm(num_hidden[0])
        self.dropout_1 = nn.Dropout(p=p)

        self.fc_2 = nn.Linear(num_hidden[0], num_hidden[1], bias=False)
        self.bn_2 = BatchNorm(num_hidden[1])
        self.dropout_2 = nn.Dropout(p=p)

        self.output = nn.Linear(num_hidden[1], 1)
        
    def calc_embedding_size(self, col):
        # Using the heuristic dim = min(50, (c+1)/2), where c is the cardinality of the feature
        c = df[col].nunique()
        return (c, min(50, math.ceil( (c+1) / 2 )))

    def forward(self, x_cont, x_cat):
        x_emb = torch.cat([layer(x_cat[:, i]) for i, layer in enumerate(self.embedding_layer)], axis=1)
        x_emb = self.embedding_bn(x_emb)
        x_emb = F.relu(x_emb)
        x_emb = self.dropout_embedding(x_emb)

        x = torch.cat([x_emb, x_cont], axis=1)
        x = self.fc_1(x)
        x = self.bn_1(x)
        x = F.relu(x)
        x = self.dropout_1(x)

        x = self.fc_2(x)
        x = self.bn_2(x)
        x = F.relu(x)
        x = self.dropout_2(x)

        x = self.output(x).squeeze(1)
        return F.sigmoid(x) * y_max

In [24]:
epochs = 20
lr = 1e-2
weight_decay = 1e-3
dropout_rate = 0.5

torch.manual_seed(seed)
model = Model(train_df.columns, cont_features, cat_features, p=dropout_rate).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

In [25]:
loss_fct = nn.MSELoss()

In [26]:
torch.manual_seed(seed)
for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    for idx, (x_cont, x_cat, y) in enumerate(train_loader):
        model.zero_grad()
        y_hat = model(x_cont, x_cat)
        batch_loss = loss_fct(y_hat, y)

        batch_loss.backward()
        optimizer.step()
        train_loss += batch_loss.cpu().detach().numpy()

    train_loss = np.round(train_loss / len(train_loader), 6)

    val_loss = 0.0
    val_RMSPE = 0.0
    model.eval()
    for idx, (x_cont, x_cat, y) in enumerate(val_loader):
        y_hat = model(x_cont, x_cat)
        batch_loss = loss_fct(y_hat, y)

        # Calculating RMSPE
        y_hat = torch.exp(y_hat) - 1
        y = torch.exp(y) - 1

        val_loss += batch_loss.cpu().detach().numpy()

        val = torch.sqrt(torch.mean(torch.pow((y_hat - y) / (y + 1e-8), 2)))
        val_RMSPE += val.cpu().detach().numpy()

    val_loss = np.round(val_loss / len(val_loader), 6)
    val_RMSPE = np.round(val_RMSPE / len(val_loader), 6)

    print(f"------------ Epoch {epoch} ------------")
    print(f"Train loss: {train_loss}")
    print(f"Val loss: {val_loss}")
    print(f"Val RMSPE: {val_RMSPE}")

------------ Epoch 0 ------------
Train loss: 0.053582
Val loss: 0.027768
Val RMSPE: 0.164258
------------ Epoch 1 ------------
Train loss: 0.035151
Val loss: 0.0234
Val RMSPE: 0.163955
------------ Epoch 2 ------------
Train loss: 0.036593
Val loss: 0.039617
Val RMSPE: 0.184238


KeyboardInterrupt: 

In [None]:
"""
Train loss: 0.04876
Train RMSPE: 0.0
Val loss: 0.016414
Val RMSPE: 0.13567
------------ Epoch 1 ------------
Train loss: 0.028863
Train RMSPE: 0.0
Val loss: 0.01524
Val RMSPE: 0.128551
------------ Epoch 2 ------------
Train loss: 0.029587
Train RMSPE: 0.0
Val loss: 0.013793
Val RMSPE: 0.122036
------------ Epoch 3 ------------
Train loss: 0.028463
Train RMSPE: 0.0
Val loss: 0.014834
Val RMSPE: 0.120304
------------ Epoch 4 ------------
Train loss: 0.029222
Train RMSPE: 0.0
Val loss: 0.023989
Val RMSPE: 0.144714
"""