In [1]:
import sys

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import xgboost
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model

In [2]:
# Put the project-local './modules/' directory first on the module search path
# so the two helper modules below can be imported. The path is relative, so it
# resolves against the kernel's current working directory.
sys.path.insert(0, './modules/')
import cleaning as cln  # project helpers, referenced as `cln` in later cells
import feature_eng as feng  # project helpers, referenced as `feng` in later cells

In [3]:
# Load the three raw CSV inputs from ./data/, in the order train, store, holdout.
# NOTE(review): the saved output of this cell ends with the tail of a warning —
# possibly a pandas DtypeWarning about mixed-type columns; confirm, and consider
# pinning an explicit dtype if so.
full_df_train, full_df_store, full_df_holdout = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/train.csv", "./data/store.csv", "./data/holdout.csv")
)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Combine the raw 'train' rows with the 'store' table through the project
# helper cln.merge; the result is the frame used for training / CV.
# NOTE(review): join key and join type are decided inside cln.merge and are
# not visible from this notebook.

full_df_train_cv = cln.merge(full_df_train, full_df_store)

In [5]:
# Combine the raw 'holdout' rows with the same 'store' table through the
# project helper cln.merge; the result is the frame used for the final test.
# NOTE(review): join key and join type are decided inside cln.merge and are
# not visible from this notebook.

full_df_test = cln.merge(full_df_holdout, full_df_store)

In [6]:
# Cleaning: every step is applied to the train/CV frame first and then to the
# test frame, so any console output produced by the helpers keeps that order.

# Remove the 'Customers' column from both merged frames.
df_train_cv, df_test = (
    cln.drop_column(full_df_train_cv, column='Customers'),
    cln.drop_column(full_df_test, column='Customers'),
)

# Clean the rows of each frame with respect to the 'Sales' target.
df_train_cv, df_test = (
    cln.clean_targets(df_train_cv, target='Sales'),
    cln.clean_targets(df_test, target='Sales'),
)

# Coarse feature cleaning with a 0.10 threshold and column dropping enabled.
# NOTE(review): the helper runs independently on each frame, so the two frames
# could in principle end up with different columns — confirm inside the helper.
df_train_cv, df_test = (
    cln.rough_features_cleaning(df_train_cv, threshold=0.10, drop_columns=True, verbose=False),
    cln.rough_features_cleaning(df_test, threshold=0.10, drop_columns=True, verbose=False),
)

# Remove the 'Open' column from both frames.
df_train_cv, df_test = (
    cln.drop_column(df_train_cv, column='Open'),
    cln.drop_column(df_test, column='Open'),
)

Total number of rows before cleaning:  531983
Total number of rows after cleaning:  425689
Total number of rows before cleaning:  315540
Total number of rows after cleaning:  314760


In [7]:
# Feature engineering (train/CV frame first, then the test frame, at each step).

# Date-based features.
df_train_cv, df_test = (
    feng.dates_features(df_train_cv),
    feng.dates_features(df_test),
)

# One-hot encode the 'StateHoliday' column of each frame.
df_train_cv, df_test = (
    feng.one_hot_encoding(df_train_cv, 'StateHoliday'),
    feng.one_hot_encoding(df_test, 'StateHoliday'),
)

# Mean-encode each categorical column (both frames are handed to the helper
# together), then drop the raw column from both frames before moving on.
# NOTE(review): 'day_of_week' is presumably created by feng.dates_features — confirm.
for encoded_column in ('Store', 'Assortment', 'StoreType', 'day_of_week'):
    df_train_cv, df_test = feng.mean_encoding(df_train_cv, df_test, encoded_column)
    df_train_cv = cln.drop_column(df_train_cv, column=encoded_column)
    df_test = cln.drop_column(df_test, column=encoded_column)

In [8]:
def metric(preds, actuals):
    """Root mean squared percentage error (RMSPE), expressed in percent.

    Computes ``100 * sqrt(mean(((actuals - preds) / actuals) ** 2))`` over the
    entries whose actual value is non-zero. Entries with a zero actual have no
    defined percentage error and are left out of the score instead of turning
    the whole result into ``inf``/``nan``.

    Parameters
    ----------
    preds : array-like
        Predicted values (NumPy array, pandas Series, list, ...). Any shape;
        flattened before use.
    actuals : array-like
        Ground-truth values with the same number of elements as ``preds``.

    Returns
    -------
    float
        RMSPE in percent, or ``nan`` when there is no entry with a non-zero
        actual value (including empty input).

    Raises
    ------
    AssertionError
        If the flattened inputs do not have the same length.
    """
    # np.asarray lets callers pass lists or pandas Series, which have no
    # usable .reshape; float dtype avoids unsigned-integer wrap-around.
    preds = np.asarray(preds, dtype=float).reshape(-1)
    actuals = np.asarray(actuals, dtype=float).reshape(-1)
    assert preds.shape == actuals.shape

    # Only rows with a non-zero actual can be scored.
    scored = actuals != 0
    n_scored = np.count_nonzero(scored)
    if n_scored == 0:
        return float('nan')

    pct_error = (actuals[scored] - preds[scored]) / actuals[scored]
    # ||e||_2 / sqrt(n) == sqrt(mean(e ** 2))
    return 100 * np.linalg.norm(pct_error) / np.sqrt(n_scored)

In [9]:
# Model inputs: every column except 'Date' and 'Sales' is a feature;
# the 'Sales' column on its own is the target.

X_train = df_train_cv.drop(columns=['Date', 'Sales'])
y_train = df_train_cv['Sales']

X_test = df_test.drop(columns=['Date', 'Sales'])
y_test = df_test['Sales']

In [10]:
# Restore the saved XGBoost regressor from disk.
best_model = xgboost.XGBRegressor()
best_model.load_model('./models/xgb_team1.model')

# Predict on the test features; the reference values are the 'Sales' column
# of the same test frame.
preds_test = best_model.predict(X_test)
actuals_test = df_test['Sales'].to_numpy()

# Report the percentage error on the test frame with two decimals.
rmspe_test = metric(preds_test, actuals_test)
print(f'Final test RMSPE = {rmspe_test:.2f}%')

Final test RMSPE = 27.65%
