# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_squared_log_error

# Getting Data Ready

## Downloading Data

### The Dataset without reducing the number of families

In [3]:
!wget https://h31416-ml-datasets.s3.ap-south-1.amazonaws.com/store_sales/test_final_large.csv
!wget https://h31416-ml-datasets.s3.ap-south-1.amazonaws.com/store_sales/train_final_large.csv
!wget https://h31416-ml-datasets.s3.ap-south-1.amazonaws.com/store_sales/sample_submission.csv

--2022-06-26 06:47:26--  https://h31416-ml-datasets.s3.ap-south-1.amazonaws.com/store_sales/test_final_large.csv
Resolving h31416-ml-datasets.s3.ap-south-1.amazonaws.com (h31416-ml-datasets.s3.ap-south-1.amazonaws.com)... 52.219.158.122
Connecting to h31416-ml-datasets.s3.ap-south-1.amazonaws.com (h31416-ml-datasets.s3.ap-south-1.amazonaws.com)|52.219.158.122|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2036293 (1.9M) [text/csv]
Saving to: ‘test_final_large.csv’


2022-06-26 06:47:29 (1.20 MB/s) - ‘test_final_large.csv’ saved [2036293/2036293]

--2022-06-26 06:47:30--  https://h31416-ml-datasets.s3.ap-south-1.amazonaws.com/store_sales/train_final_large.csv
Resolving h31416-ml-datasets.s3.ap-south-1.amazonaws.com (h31416-ml-datasets.s3.ap-south-1.amazonaws.com)... 52.219.158.122
Connecting to h31416-ml-datasets.s3.ap-south-1.amazonaws.com (h31416-ml-datasets.s3.ap-south-1.amazonaws.com)|52.219.158.122|:443... connected.
HTTP request sent, awaiting response..

### The Dataset after reducing the number of families

In [2]:
!wget https://h31416-ml-datasets.s3.ap-south-1.amazonaws.com/store_sales/test_final.csv
!wget https://h31416-ml-datasets.s3.ap-south-1.amazonaws.com/store_sales/train_final.csv

--2022-06-26 08:29:04--  https://h31416-ml-datasets.s3.ap-south-1.amazonaws.com/store_sales/test_final.csv
Resolving h31416-ml-datasets.s3.ap-south-1.amazonaws.com (h31416-ml-datasets.s3.ap-south-1.amazonaws.com)... 52.219.156.50
Connecting to h31416-ml-datasets.s3.ap-south-1.amazonaws.com (h31416-ml-datasets.s3.ap-south-1.amazonaws.com)|52.219.156.50|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2421657 (2.3M) [text/csv]
Saving to: ‘test_final.csv’


2022-06-26 08:29:06 (1.52 MB/s) - ‘test_final.csv’ saved [2421657/2421657]

--2022-06-26 08:29:07--  https://h31416-ml-datasets.s3.ap-south-1.amazonaws.com/store_sales/train_final.csv
Resolving h31416-ml-datasets.s3.ap-south-1.amazonaws.com (h31416-ml-datasets.s3.ap-south-1.amazonaws.com)... 52.219.156.50
Connecting to h31416-ml-datasets.s3.ap-south-1.amazonaws.com (h31416-ml-datasets.s3.ap-south-1.amazonaws.com)|52.219.156.50|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 268664907 (

## Model on *Large* Dataset

In [47]:
# Load the full-family training set and discard the raw date and store-number columns.
# NOTE(review): the "Unnamed: 0" and "id" columns stay in the frame, so the cells
# below hand them to the models as ordinary numeric features - confirm that is intended.
train = pd.read_csv("train_final_large.csv").drop(columns=["date", "store_nbr"])

In [48]:
# Preview the first five rows to check which columns remain.
train.head(n=5)

Unnamed: 0,id,family,sales,onpromotion,day,week,month,year,isweekend,cluster_new,type,oil_price,holiday
0,0,AUTOMOTIVE,0.0,0,1,1,1,2013,False,E,D,93.055,True
1,1,BABY CARE,0.0,0,1,1,1,2013,False,E,D,93.055,True
2,2,BEAUTY,0.0,0,1,1,1,2013,False,E,D,93.055,True
3,3,BEVERAGES,0.0,0,1,1,1,2013,False,E,D,93.055,True
4,4,BOOKS,0.0,0,1,1,1,2013,False,E,D,93.055,True


In [49]:
# Columns whose dtype is object (strings) or bool are one-hot encoded; every
# other column is passed through unchanged.
one_hot_cols = train.columns[(train.dtypes == "object") | (train.dtypes == "bool")]

# log1p wrapper; not referenced by the cells visible in this part of the notebook.
log_transformer = FunctionTransformer(np.log1p, validate=False)

# The encoder is left at its default (sparse) output instead of passing
# `sparse=False`: that keyword was renamed to `sparse_output` in scikit-learn 1.2
# and later removed, so either spelling ties the cell to one version range.
# drop='first' omits the first category of each encoded column;
# handle_unknown='error' makes transform() raise on a category unseen during fit.
one_hot_encoder = OneHotEncoder(handle_unknown='error', drop='first')
column_transformer = ColumnTransformer(
    [
        ("categorical_cols", one_hot_encoder, one_hot_cols),
    ],
    remainder="passthrough",
    # 0 => the stacked result is always a dense array, which is what the
    # previous `sparse=False` setting produced.
    sparse_threshold=0,
)

In [50]:
# Separate the predictors from the target column.
X = train.drop(columns=["sales"])
y = train["sales"]

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

# The transformer is fitted on the training rows only and then applied,
# as fitted, to the held-out rows.
X_train = column_transformer.fit_transform(X_train)
X_test = column_transformer.transform(X_test)


In [62]:
# Baseline: ordinary least squares on the encoded training split.
# fit() returns the estimator itself, so it can be bound in the same statement.
lr = LinearRegression().fit(X_train, y_train)
lr

LinearRegression()

In [64]:
# Hold-out scores for the linear model. Predictions go through np.abs so that
# none of them is negative when the log-based metric is computed.
preds = np.abs(lr.predict(X_test))
mae, mse, msle = (
    metric(y_test, preds)
    for metric in (mean_absolute_error, mean_squared_error, mean_squared_log_error)
)
# Displayed as (MAE, MSE, MSLE, sqrt(MSLE)).
mae, mse, msle, np.sqrt(msle)

(292.37325154338197, 516128.5117900585, 10.57820020164479, 3.2524145187298608)

In [65]:
# Same hold-out scores for the linear model, but negative predictions are
# floored at zero instead of being mirrored with np.abs.
preds = lr.predict(X_test).clip(min=0)
mae, mse, msle = (
    metric(y_test, preds)
    for metric in (mean_absolute_error, mean_squared_error, mean_squared_log_error)
)
# Displayed as (MAE, MSE, MSLE, sqrt(MSLE)).
mae, mse, msle, np.sqrt(msle)

(251.93726839594524, 509019.4340603845, 6.201331107555636, 2.490247198082077)

In [67]:
# Gradient-boosted trees with the library's default hyperparameters.
# fit() returns the estimator itself, so it can be bound in the same statement.
xgbr = XGBRegressor().fit(X_train, y_train)
xgbr

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [68]:
# Hold-out scores for the XGBoost model; negative predictions are floored at zero.
preds = xgbr.predict(X_test).clip(min=0)
mae, mse, msle = (
    metric(y_test, preds)
    for metric in (mean_absolute_error, mean_squared_error, mean_squared_log_error)
)
# Displayed as (MAE, MSE, MSLE, sqrt(MSLE)).
mae, mse, msle, np.sqrt(msle)

(122.20677198973628,
 161328.68391090914,
 3.0516569509022187,
 1.7468992389093936)

## Model on *Short* Dataset

In [4]:
# Load the reduced-family training set. The original "family" column is dropped
# along with the raw date and store number.
# NOTE(review): "Unnamed: 0" and "id" are still present and will reach the model
# as numeric features in the cells below - confirm that is intended.
train = pd.read_csv("train_final.csv").drop(columns=["date", "store_nbr", "family"])
train.head(n=5)

Unnamed: 0,id,sales,onpromotion,family_new,day,week,month,year,isweekend,wage_day,cluster_new,type,oil_price,holiday
0,0,0.0,0,MISC,1,1,1,2013,False,True,E,D,93.055,True
1,1,0.0,0,MISC,1,1,1,2013,False,True,E,D,93.055,True
2,2,0.0,0,LADIES,1,1,1,2013,False,True,E,D,93.055,True
3,3,0.0,0,BEVERAGE,1,1,1,2013,False,True,E,D,93.055,True
4,4,0.0,0,STATIONARY,1,1,1,2013,False,True,E,D,93.055,True


In [5]:
# Columns whose dtype is object (strings) or bool are one-hot encoded; every
# other column is passed through unchanged.
one_hot_cols = train.columns[(train.dtypes == "object") | (train.dtypes == "bool")]

# log1p wrapper; not referenced by the cells visible in this part of the notebook.
log_transformer = FunctionTransformer(np.log1p, validate=False)

# The encoder is left at its default (sparse) output instead of passing
# `sparse=False`: that keyword was renamed to `sparse_output` in scikit-learn 1.2
# and later removed, so either spelling ties the cell to one version range.
# drop='first' omits the first category of each encoded column;
# handle_unknown='error' makes transform() raise on a category unseen during fit.
one_hot_encoder = OneHotEncoder(handle_unknown='error', drop='first')
column_transformer = ColumnTransformer(
    [
        ("categorical_cols", one_hot_encoder, one_hot_cols),
    ],
    remainder="passthrough",
    # 0 => the stacked result is always a dense array, which is what the
    # previous `sparse=False` setting produced.
    sparse_threshold=0,
)

In [6]:
# Separate the predictors from the target column.
X = train.drop(columns=["sales"])
y = train["sales"]

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

# The transformer is fitted on the training rows only and then applied,
# as fitted, to the held-out rows.
X_train = column_transformer.fit_transform(X_train)
X_test = column_transformer.transform(X_test)


In [11]:
# XGBoost with 1000 boosting rounds; n_jobs=-1 requests all available cores.
# fit() returns the estimator itself, so it can be bound in the same statement.
xgbr = XGBRegressor(n_estimators=1000, n_jobs=-1).fit(X_train, y_train)
xgbr

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=1000,
             n_jobs=-1, num_parallel_tree=1, predictor='auto', random_state=0,
             reg_alpha=0, reg_lambda=1, ...)

In [12]:
# Hold-out scores for the 1000-round XGBoost model; negative predictions are floored at zero.
preds = xgbr.predict(X_test).clip(min=0)
mae, mse, msle = (
    metric(y_test, preds)
    for metric in (mean_absolute_error, mean_squared_error, mean_squared_log_error)
)
# Displayed as (MAE, MSE, MSLE, sqrt(MSLE)).
mae, mse, msle, np.sqrt(msle)

(236.41789636575717, 383341.49541807, 6.133862107258354, 2.4766635030335378)

In [13]:
# The same metrics computed on the training split, for comparison with the
# hold-out scores from the previous cell.
preds = xgbr.predict(X_train).clip(min=0)
mae, mse, msle = (
    metric(y_train, preds)
    for metric in (mean_absolute_error, mean_squared_error, mean_squared_log_error)
)
# Displayed as (MAE, MSE, MSLE, sqrt(MSLE)).
mae, mse, msle, np.sqrt(msle)

(226.7137249595685, 326047.570325953, 6.07785161007406, 2.4653299191130706)

It seems that models trained on the raw training data do not perform well. We will have to make predictions based on individual families and clusters.