# The Singularity

In [187]:
# Notes
# Only Use mean for number of customers i.e. do not use features you don't know ahead of time
# comment the code 
# Drop all columns with sales = o or sales = null

**Things to do differently on second iteration**
1. Maybe keep Sales = 0 
- - Because sales might be 0 on holidays therefore we might need this information to predict holidays accurately
2. For Train We want to investigate further into which rows should be dropped and don't just drop all the null values
3. Maybe merge the datasets before cleaning them
4. Keep non-numeric columns and find a way to encode them
5. How to impute customer number
- - Currentlt using the mean for each store

## 1) Create the Hypothesis

## 2) Exploratory Data Analysis

### 2.1) Import libraries

In [188]:
# Basic
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline

In [189]:
# Advanced 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [190]:
# Super advanced 
from xgboost import XGBRegressor

### 2.2) Locate and Access Data

In [191]:
# Read in store data
df_store = pd.read_csv('./data/store.csv', low_memory=False)

# Read in store data
df_train = pd.read_csv('./data/train.csv', low_memory=False)

### 2.3) Subset to Time Period

### 2.4) Explore Data - Graphs and Correlations

## 3) Clean Data

### 3.1) Cut Features with too many missing values

#### 3.1.1) Store

In [192]:
df_store.isnull().sum() / df_store.shape[0] * 100

Store                         0.000000
StoreType                     0.000000
Assortment                    0.000000
CompetitionDistance           0.269058
CompetitionOpenSinceMonth    31.748879
CompetitionOpenSinceYear     31.748879
Promo2                        0.000000
Promo2SinceWeek              48.789238
Promo2SinceYear              48.789238
PromoInterval                48.789238
dtype: float64

In [193]:
# drop columns with lots of nulls 
columns_to_drop = ['CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval']

df_store_cleaned = df_store.drop(columns_to_drop, axis=1)

In [194]:
df_store_cleaned.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,Promo2
0,1,c,a,1270.0,0
1,2,a,a,570.0,1
2,3,a,a,14130.0,1
3,4,c,c,620.0,0
4,5,a,a,29910.0,0


#### 3.1.2) Train

In [195]:
# Train
df_train.head()

Unnamed: 0,Date,Store,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,2013-01-01,1115.0,2.0,0.0,0.0,0.0,0.0,a,1.0
1,2013-01-01,379.0,2.0,0.0,0.0,0.0,0.0,a,1.0
2,2013-01-01,378.0,2.0,0.0,0.0,0.0,0.0,a,1.0
3,2013-01-01,377.0,2.0,0.0,0.0,0.0,0.0,a,1.0
4,2013-01-01,376.0,2.0,0.0,0.0,0.0,0.0,a,1.0


In [196]:
df_train.shape

(637774, 9)

In [197]:
df_train.isnull().sum() / df_train.shape[0] * 100

Date             0.000000
Store            3.026307
DayOfWeek        2.981777
Sales            2.983345
Customers        2.993380
Open             3.008276
Promo            3.009530
StateHoliday     3.018938
SchoolHoliday    3.031952
dtype: float64

In [198]:
df_train.isnull().any()

Date             False
Store             True
DayOfWeek         True
Sales             True
Customers         True
Open              True
Promo             True
StateHoliday      True
SchoolHoliday     True
dtype: bool

In [199]:
# STORE - Drop columns - Store, Assortment, 

### 3.2) Clean Numeric data

#### 3.1.1) Store

In [200]:
df_store_cleaned.isnull().sum()

Store                  0
StoreType              0
Assortment             0
CompetitionDistance    3
Promo2                 0
dtype: int64

In [201]:
# impute the mean for CompetitionDistance
df_store_cleaned['CompetitionDistance'] = df_store_cleaned['CompetitionDistance'].fillna(df_store_cleaned['CompetitionDistance'].mean())


In [202]:
# No nulls
df_store_cleaned.isnull().sum()

Store                  0
StoreType              0
Assortment             0
CompetitionDistance    0
Promo2                 0
dtype: int64

#### 3.1.1) Train

In [203]:
# Sales 
# 1) Drop where sales are null
df_train_cleaned = df_train.loc[df_train.loc[:, 'Sales'].notnull()]

In [204]:
# 2) Drop where sales are 0 

# 16.59% where sales = 0
(df_train_cleaned.loc[:, 'Sales'].eq(0).sum() / df_train.shape[0]) * 100

# drop where sales = 0
df_train_cleaned = df_train_cleaned.loc[~df_train_cleaned.loc[:, 'Sales'].eq(0)]

In [205]:
df_train_cleaned.isnull().sum() / df_train_cleaned.shape[0] * 100

Date             0.000000
Store            3.037298
DayOfWeek        2.982517
Sales            0.000000
Customers        3.012734
Open             3.012929
Promo            3.009810
StateHoliday     3.033399
SchoolHoliday    3.030864
dtype: float64

In [206]:
# Store - drop the nulls because we can't impute this 
df_train_cleaned = df_train_cleaned.loc[df_train_cleaned.loc[:, 'Store'].notnull()]


In [207]:
df_train_cleaned.isnull().sum() / df_train_cleaned.shape[0] * 100

Date             0.000000
Store            0.000000
DayOfWeek        2.977627
Sales            0.000000
Customers        3.012610
Open             3.017838
Promo            3.018240
StateHoliday     3.022060
SchoolHoliday    3.034525
dtype: float64

In [208]:
# Initial Run through - drop all the nulls 
df_train_cleaned = df_train_cleaned.dropna()

In [209]:
# No nulls
df_train_cleaned.isnull().sum()

Date             0
Store            0
DayOfWeek        0
Sales            0
Customers        0
Open             0
Promo            0
StateHoliday     0
SchoolHoliday    0
dtype: int64

### 3.3) Clean Categorical Data

### 3.4) Clean Ordinal Data

### 3.5) Drop Superfluous Columns

### 3.6) Remove Highly Correlated Variables

### 3.7) Merge the datasets 


In [210]:
# START HERE 
df_full = pd.merge(df_train_cleaned, df_store_cleaned, on=["Store"])

In [211]:
df_full.head()

Unnamed: 0,Date,Store,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,Promo2
0,2013-01-01,353.0,2.0,3139.0,820.0,1.0,0.0,a,1.0,b,b,900.0,1
1,2013-01-02,353.0,3.0,2686.0,835.0,1.0,0.0,0,1.0,b,b,900.0,1
2,2013-01-03,353.0,4.0,2628.0,815.0,1.0,0.0,0,1.0,b,b,900.0,1
3,2013-01-04,353.0,5.0,2677.0,856.0,1.0,0.0,0,1.0,b,b,900.0,1
4,2013-01-05,353.0,6.0,2224.0,719.0,1.0,0.0,0,0.0,b,b,900.0,1


In [212]:
# No nulls
df_full.isnull().sum()

Date                   0
Store                  0
DayOfWeek              0
Sales                  0
Customers              0
Open                   0
Promo                  0
StateHoliday           0
SchoolHoliday          0
StoreType              0
Assortment             0
CompetitionDistance    0
Promo2                 0
dtype: int64

### 3.8) Convert non-numeric variables to numeric

In [213]:
df_full.dtypes

Date                    object
Store                  float64
DayOfWeek              float64
Sales                  float64
Customers              float64
Open                   float64
Promo                  float64
StateHoliday            object
SchoolHoliday          float64
StoreType               object
Assortment              object
CompetitionDistance    float64
Promo2                   int64
dtype: object

In [214]:
# drop non-numeric columns for now
df_full = df_full.select_dtypes(np.number)

### 3.9) Convert Customer Number to mean for store

In [215]:
df_full['Customers'] = df_full.groupby('Store')['Customers'].transform('mean').astype('int')

In [216]:
df_full.tail()

Unnamed: 0,Store,DayOfWeek,Sales,Customers,Open,Promo,SchoolHoliday,CompetitionDistance,Promo2
413959,1081.0,3.0,5042.0,965,1.0,0.0,1.0,400.0,0
413960,1081.0,7.0,4974.0,965,1.0,0.0,0.0,400.0,0
413961,1081.0,1.0,7900.0,965,1.0,1.0,1.0,400.0,0
413962,1081.0,2.0,7563.0,965,1.0,1.0,1.0,400.0,0
413963,1081.0,4.0,6895.0,965,1.0,1.0,1.0,400.0,0


## 4) Train, Test, Split

In [217]:
X_columns = list(df_full.columns)
X_columns.remove("Sales")

# create the X, y dataframes
X = df_full.loc[:, X_columns]
y = df_full.loc[:, 'Sales']

In [218]:
X.head()

Unnamed: 0,Store,DayOfWeek,Customers,Open,Promo,SchoolHoliday,CompetitionDistance,Promo2
0,353.0,2.0,1370,1.0,0.0,1.0,900.0,1
1,353.0,3.0,1370,1.0,0.0,1.0,900.0,1
2,353.0,4.0,1370,1.0,0.0,1.0,900.0,1
3,353.0,5.0,1370,1.0,0.0,1.0,900.0,1
4,353.0,6.0,1370,1.0,0.0,0.0,900.0,1


In [219]:
X.shape

(413964, 8)

In [220]:
y.head()

0    3139.0
1    2686.0
2    2628.0
3    2677.0
4    2224.0
Name: Sales, dtype: float64

In [221]:
y.shape

(413964,)

In [222]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [223]:
X_train.head()

Unnamed: 0,Store,DayOfWeek,Customers,Open,Promo,SchoolHoliday,CompetitionDistance,Promo2
326034,366.0,4.0,599,1.0,0.0,0.0,6470.0,0
78686,481.0,6.0,444,1.0,0.0,1.0,7470.0,1
142898,254.0,1.0,254,1.0,0.0,0.0,330.0,1
330671,342.0,5.0,855,1.0,1.0,0.0,15770.0,1
58621,549.0,4.0,535,1.0,1.0,1.0,2330.0,0


In [224]:
X_test.head()

Unnamed: 0,Store,DayOfWeek,Customers,Open,Promo,SchoolHoliday,CompetitionDistance,Promo2
273821,624.0,2.0,685,1.0,1.0,0.0,6920.0,0
308601,723.0,3.0,539,1.0,0.0,0.0,5650.0,1
233565,1012.0,3.0,520,1.0,0.0,1.0,6330.0,1
395961,10.0,1.0,593,1.0,0.0,0.0,3160.0,0
6238,733.0,4.0,3399,1.0,1.0,0.0,860.0,0


In [225]:
y_train.head()

326034     5630.0
78686      3366.0
142898     1560.0
330671    10935.0
58621      4712.0
Name: Sales, dtype: float64

In [226]:
y_test.head()

273821     7611.0
308601     4356.0
233565     4484.0
395961     4543.0
6238      14382.0
Name: Sales, dtype: float64

## 5) Balance Positive and Negative Distribution

## 6) Select Model Features

## 7) Normalise and Scale

***

Data should be model ready by now

***

## 8) Kfolds Cross Validation 

## 9) Build Models

In [227]:
def metric(preds, actuals):
    preds = preds.reshape(-1)
    actuals = actuals.reshape(-1)
    assert preds.shape == actuals.shape
    return 100 * np.linalg.norm((actuals - preds) / actuals) / np.sqrt(preds.shape[0])

In [228]:
def pretty_metric(predictions, actuals, model):
    prediction = metric(predictions, actuals)
    print(f"The prediction for {model} is: {round(prediction, 2)}%")

### 9.1) Baseline - Mean and Median of Sales

#### 9.1.1) Mean prediction 

In [229]:
y_train

326034     5630.0
78686      3366.0
142898     1560.0
330671    10935.0
58621      4712.0
           ...   
259178     5564.0
365838     6670.0
131932    11175.0
146867     7444.0
121958     5562.0
Name: Sales, Length: 277355, dtype: float64

In [230]:
# broadcast the mean predictions 
mean_predictions = [y_train.mean()]
mean_predictions = np.array(mean_predictions * y_test.shape[0])

In [231]:
pretty_metric(mean_predictions, y_test.to_numpy(), "Mean Regressor")

The prediction for Mean Regressor is: 62.34%


### 9.2) Linear Regression

In [232]:
regressor = LinearRegression().fit(X_train, y_train)

In [233]:
linear_regression_predictions = regressor.predict(X_test)

In [234]:
pretty_metric(linear_regression_predictions, y_test.to_numpy(), "Linear Regressor")

The prediction for Linear Regressor is: 40.77%


### 9.3) Random Forest 

In [235]:
regressor_random_forest = RandomForestRegressor(max_depth=2, random_state=0)
regressor_random_forest.fit(X_train, y_train)
random_forest_predictions = regressor_random_forest.predict(X_test)

In [236]:
pretty_metric(random_forest_predictions, y_test.to_numpy(), "Random Forest")

The prediction for Random Forest is: 47.1%


#### 9.3.1) What Features did Random Forest use?

### 9.4) Xgboost 

In [237]:
regressor_xgboost = XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8)
regressor_xgboost.fit(X_train, y_train)
xgboost_predictions = regressor_xgboost.predict(X_test)

In [238]:
pretty_metric(xgboost_predictions, y_test.to_numpy(), "XGBoost")

The prediction for XGBoost is: 28.89%


#### 9.4.1) What Features did Xgboost use?

### 9.5) Catboost

In [241]:
from catboost import CatBoostRegressor

In [244]:
regressor_catboost = CatBoostRegressor(iterations=2, depth=2, learning_rate=1, loss_function='RMSE', verbose=True)
regressor_catboost.fit(X_train, y_train)
catboost_predictions = regressor_catboost.predict(X_test)

0:	learn: 2399.2326847	total: 72.5ms	remaining: 72.5ms
1:	learn: 2163.8129825	total: 80.5ms	remaining: 0us


In [245]:
pretty_metric(catboost_predictions, y_test.to_numpy(), "CatBoost")

The prediction for CatBoost is: 41.64%


### 9.6) LightGBM

In [247]:
from lightgbm import LGBMRegressor

In [248]:
lgbm_regressor = LGBMRegressor(learning_rate=0.1,n_estimators=100,max_depth=5,num_leaves=50)
lgbm_regressor.fit(X_train, y_train)
lgbm_predictions = lgbm_regressor.predict(X_test)

In [249]:
pretty_metric(catboost_predictions, y_test.to_numpy(), "LGBM")

The prediction for LGBM is: 41.64%


## 10) Hyper parameter tuning 

### 10.1) Random Hyperparameter Grid

In [239]:
from sklearn.model_selection import RandomizedSearchCV