In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

# Load scripts from parent path
import sys, os
sys.path.insert(0, os.path.abspath('..'))

## Load Data

In [2]:
from scripts.processing import load_train_data, process_data

train_raw = load_train_data()
train = process_data(train_raw)
train.head()

Unnamed: 0,Date,Store,DayOfWeek,Sales,Open,Promo,StateHoliday,SchoolHoliday
27,2013-01-01,353.0,2.0,3139.0,1.0,0.0,0,1.0
115,2013-01-01,335.0,2.0,2401.0,1.0,0.0,0,1.0
147,2013-01-01,512.0,2.0,2646.0,1.0,0.0,0,1.0
162,2013-01-01,494.0,2.0,3113.0,1.0,0.0,0,1.0
199,2013-01-01,530.0,2.0,2907.0,1.0,0.0,0,1.0


## Prepare train/test data

In [3]:
X_train = train.copy(deep=True).drop(columns=["Sales"])
y_train = train.loc[:, "Sales"]

## Simple Models

In [4]:
def metric(preds, actuals):    
    preds = preds.reshape(-1)
    actuals = actuals.reshape(-1)
    assert preds.shape == actuals.shape
    return 100 * np.linalg.norm((actuals - preds) / actuals) / np.sqrt(preds.shape[0])

#### Predict mean

In [5]:
lazy_predictor = pd.DataFrame(y_train.copy())

lazy_predictor.loc[:, 'y_pred'] = lazy_predictor.mean()['Sales']
lazy_predictor.head()

Unnamed: 0,Sales,y_pred
27,3139.0,6837.740902
115,2401.0,6837.740902
147,2646.0,6837.740902
162,3113.0,6837.740902
199,2907.0,6837.740902


In [6]:
metric(lazy_predictor.loc[:, 'y_pred'].values, lazy_predictor.loc[:, 'Sales'].values)

61.73132119104066

### Random Forest

In [8]:
from sklearn.ensemble import RandomForestRegressor

# Drop date column too, makes no sense for random forest
X_train = train.copy(deep=True).drop(columns=["Sales", "Date"])
y_train = train.loc[:, "Sales"].values

reg = RandomForestRegressor(random_state=42)

reg.fit(X_train, y_train)
y_pred = reg.predict(X_train)

metric(y_train, y_pred)

15.677881334149104

In [7]:
X_train

Unnamed: 0,Date,Store,DayOfWeek,Open,Promo,StateHoliday,SchoolHoliday
27,2013-01-01,353.0,2.0,1.0,0.0,0,1.0
115,2013-01-01,335.0,2.0,1.0,0.0,0,1.0
147,2013-01-01,512.0,2.0,1.0,0.0,0,1.0
162,2013-01-01,494.0,2.0,1.0,0.0,0,1.0
199,2013-01-01,530.0,2.0,1.0,0.0,0,1.0
...,...,...,...,...,...,...,...
637766,2014-07-31,748.0,4.0,1.0,1.0,3,1.0
637768,2014-07-31,743.0,4.0,1.0,1.0,3,1.0
637769,2014-07-31,752.0,4.0,1.0,1.0,3,1.0
637772,2014-07-31,755.0,4.0,1.0,1.0,3,1.0
