In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
def assign_time_ft(df):
    """Add calendar-derived feature columns to ``df`` in place and return it.

    Reads the datetime column ``date`` and writes:

    * ``payday``     - 1 on the 15th and on the last day of the month, else 0
    * ``weekday``    - day of week as returned by ``Series.dt.weekday``
    * ``day``, ``month``, ``year`` - the corresponding date components
    * ``is_weekday`` - 1 when ``weekday < 5``, else 0
    * ``season``     - int8 code: 0 for months 12/1/2, 1 for 3/4/5,
      2 for 6/7/8 and 3 for 9/10/11

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime64 ``date`` column. Any index is accepted.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the columns above added.
    """
    dates = df['date'].dt

    df['payday'] = ((dates.day == 15) | dates.is_month_end).astype(int)
    df['weekday'] = dates.weekday
    df['day'] = dates.day
    df['month'] = dates.month
    df['year'] = dates.year

    df['is_weekday'] = (df['weekday'] < 5).astype(int)

    # np.select returns a plain ndarray, so the assignment is positional.
    # Wrapping the result in a new pd.Series would attach a fresh 0..n-1 index
    # and the values would be aligned by label, producing NaN (or misplaced
    # seasons) whenever df does not carry a default RangeIndex.
    month = df['month']
    season = np.select(
        [month.isin([12, 1, 2]), month.isin([6, 7, 8]), month.isin([9, 10, 11])],
        [0, 2, 3],
        default=1,
    )
    df['season'] = season.astype('int8')

    return df

In [None]:
def fill_na(df):
    """Fill missing values in the merged sales frame, in place, and return it.

    * ``transactions`` - missing values become 0
    * ``transferred``  - missing values become False, then the column is
      converted to 0/1 integers
    * ``dcoilwtico``   - missing values take the next available value
      further down the frame (backward fill)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``transactions``, ``transferred`` and
        ``dcoilwtico`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the three columns filled.
    """
    # Fill the column from itself; sourcing it from 'transferred' would
    # replace every transaction count with the holiday-transfer flag.
    df['transactions'] = df['transactions'].fillna(0)

    # Go through bool before int so an object column holding True/False
    # converts cleanly once the NaNs are gone.
    df['transferred'] = df['transferred'].fillna(False).astype(bool).astype(int)

    # .bfill() is the supported spelling of fillna(method='backfill').
    df['dcoilwtico'] = df['dcoilwtico'].bfill()

    return df

In [None]:
def lag_ft(df, cols, lags, group_cols=None):
    """Add lagged copies of ``cols`` to ``df`` in place and return it.

    For every column ``c`` in ``cols`` and every ``lag`` in ``lags`` a new
    column named ``f'{c}_{lag}'`` is written, holding ``c`` shifted by ``lag``
    rows. Rows without a predecessor at that distance receive NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. Row order defines what "previous" means.
    cols : iterable of str
        Names of the columns to lag.
    lags : iterable of int
        Row offsets passed to ``shift``.
    group_cols : str or list of str, optional
        When given, the shift is computed independently inside each group
        formed by these columns, so values never cross from one group into
        another. With the default ``None`` the shift runs over the whole
        frame.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the lag columns added.
    """
    for c in cols:
        # Build the (optionally grouped) source once per column; it is reused
        # for every lag of that column.
        source = df[c] if group_cols is None else df.groupby(group_cols, sort=False)[c]
        for lag in lags:
            df[f'{c}_{lag}'] = source.shift(lag)

    return df

In [None]:
def unify_types(df):
    """Normalise numeric column widths in place and return ``df``.

    Every ``int64`` column is cast to ``int32`` and every ``float32`` column
    is cast to ``float64``. Columns of any other dtype are left untouched.
    """
    conversions = (
        (np.int64, np.int32),
        (np.float32, np.float64),
    )

    for src_dtype, dst_dtype in conversions:
        for col in df.select_dtypes(src_dtype).columns:
            df[col] = df[col].astype(dst_dtype)

    return df

In [None]:
def format_sales(df, data_path):
    """Enrich a raw sales frame with store, oil, transaction and holiday data.

    Steps, in order:

    1. read ``stores.csv``, ``oil.csv``, ``holidays_events.csv`` and
       ``transactions.csv`` from ``data_path + '/store-sales/'``;
    2. left-merge stores (on ``store_nbr``), oil (on ``date``) and
       transactions (on ``date`` and ``store_nbr``) into ``df``;
    3. label-encode ``family``, ``city``, ``state`` and ``type``;
    4. left-merge a per-date holiday flag and ``transferred`` flag;
    5. run ``fill_na``, ``lag_ft``, ``assign_time_ft`` and ``unify_types``;
    6. move ``date`` into the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Sales rows with at least ``date``, ``store_nbr``, ``family`` and
        ``sales`` columns.
    data_path : str
        Prefix prepended to ``'/store-sales/<file>.csv'``.

    Returns
    -------
    pandas.DataFrame
        The enriched frame, indexed by ``date``, with one row per input row.
    """
    stores_df = pd.read_csv(data_path + '/store-sales/stores.csv')
    oil_df = pd.read_csv(data_path + '/store-sales/oil.csv', parse_dates=['date'])
    transactions_df = pd.read_csv(data_path + '/store-sales/transactions.csv', parse_dates=['date'])

    holidays_df = pd.read_csv(data_path + '/store-sales/holidays_events.csv', parse_dates=['date'])
    holidays_df['holiday'] = 1
    # Collapse to exactly one row per date before merging. A left merge against
    # a right-hand frame with repeated dates would emit one output row per
    # match and silently duplicate sales rows. 'transferred' ends up True when
    # any event on that date is flagged as transferred.
    holidays_by_date = holidays_df.groupby('date', as_index=False)[['holiday', 'transferred']].max()

    df = df.merge(stores_df, on='store_nbr', how='left')
    df = df.merge(oil_df, on='date', how='left')
    df = df.merge(transactions_df, on=['date', 'store_nbr'], how='left')

    # A fresh encoder per column keeps the fits independent of each other.
    for col in ('family', 'city', 'state', 'type'):
        df[col] = LabelEncoder().fit_transform(df[col])

    df = df.merge(holidays_by_date, on='date', how='left')

    # Dates without a holiday entry come back as NaN from the left merge.
    # Plain assignment is used because an in-place fillna on a column
    # selection is not guaranteed to write back into the frame.
    df['holiday'] = df['holiday'].fillna(0).astype(int)

    df = fill_na(df)
    # NOTE(review): lag_ft shifts by row position over the whole frame, so a
    # "lag" here is a number of rows, not a number of days within one
    # store/family series - confirm this is the intended feature.
    df = lag_ft(df, ['dcoilwtico', 'sales'], [1, 2, 3, 7, 14, 21, 364])
    df = assign_time_ft(df)

    df = unify_types(df)
    df = df.set_index('date')

    return df

In [None]:
def read_sales(data_path):
    """Load train and test sales, engineer features, and split them again.

    ``train.csv`` and ``test.csv`` are read from
    ``data_path + '/store-sales/'``, stacked, and passed through
    ``format_sales`` as one frame. The result is split on its date index at
    2017-08-15: rows on or before that date form the training frame (rows
    containing any NaN are dropped), rows after it form the test frame
    (returned as is).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, test_df)``
    """
    base_dir = data_path + '/store-sales/'
    raw_frames = [
        pd.read_csv(base_dir + file_name, parse_dates=['date'])
        for file_name in ('train.csv', 'test.csv')
    ]

    full_df = format_sales(pd.concat(raw_frames, axis=0), data_path)

    cutoff = pd.to_datetime("2017-08-15")
    is_train = full_df.index <= cutoff
    is_test = full_df.index > cutoff

    train_df = full_df[is_train].dropna()
    test_df = full_df[is_test]

    return train_df, test_df

In [None]:
# Prefix prepended to '/store-sales/<file>.csv' by the loading helpers.
DATAPATH = ""

train_data, test_data = read_sales(DATAPATH)

# Hold-out split by position: the first 80% of the training rows are used for
# fitting, the remaining 20% for validation. No shuffling is done.
train_size = int(len(train_data) * 0.80)
val_size = len(train_data) - train_size

train_df = train_data.iloc[:train_size]
val_df = train_data.iloc[train_size:]

In [None]:
def plot_predictions(nb_samples, y_val, y_pred):
    """Show two charts comparing expected and predicted values.

    The first figure overlays ``y_val`` ('Expected') and ``y_pred``
    ('Predicted') against the sample positions ``0 .. nb_samples - 1``.
    The second chart shows their absolute difference.

    Parameters
    ----------
    nb_samples : int
        Number of points on the x axis; must match the length of both series.
    y_val, y_pred : array-like
        Values to compare; they must support element-wise subtraction.
    """
    x_axis = list(range(nb_samples))

    plt.figure(figsize=(10, 6))
    for values, legend_label in ((y_val, 'Expected'), (y_pred, 'Predicted')):
        plt.plot(x_axis, values, label=legend_label, alpha=0.5)
    plt.legend(loc="upper right")
    plt.show()

    abs_error = abs(y_val - y_pred)
    plt.plot(x_axis, abs_error, label='Difference')
    plt.legend(loc="upper right")
    plt.show()


class BaseModel:
    """Shared preprocessing, prediction and reporting for the regressors.

    Subclasses set ``self.model`` to an estimator exposing ``fit`` and
    ``predict``. This class handles dropping columns, optional scaling,
    optional PCA, clipping negative predictions, printing metrics and
    plotting.

    Parameters
    ----------
    show_fip : bool
        Whether feature importances are collected after training and plotted
        by ``resume_training``.
    use_pca : bool
        Whether ``process_data`` and ``predict`` project features through PCA.
    scaler : object or None
        Transformer exposing ``fit_transform`` and ``transform``. ``None``
        disables scaling.
    """

    def __init__(self, show_fip=True, use_pca=False, scaler=None):
        self.use_pca = use_pca
        self.show_fip = show_fip

        self.model = None
        self.pca = None
        self.dropped = []
        self.ft_values = []
        self.feature_names = None

        self.scaler = scaler
        self.nb_train_samples = 0
        self.nb_val_samples = 0

    def _scale(self, x, fit=False):
        """Scale ``x`` with the configured scaler; pass through as an array when there is none."""
        if self.scaler is None:
            return np.asarray(x)
        if fit:
            return self.scaler.fit_transform(x)
        return self.scaler.transform(x)

    def apply_pca(self, x_train, x_val):
        """Fit PCA on ``x_train`` and project both sets onto its components.

        PCA is created with ``n_components=0.95``. ``self.feature_names`` is
        replaced by one label per component, ``'PC<k>_<feature>'``, where
        ``<feature>`` is the input feature with the largest absolute loading
        on that component.

        Returns
        -------
        tuple
            ``(x_train, x_val)`` after projection.
        """
        self.pca = PCA(n_components=0.95)
        x_train = self.pca.fit_transform(x_train)
        x_val = self.pca.transform(x_val)

        # Index of the strongest input feature on each component.
        dominant = np.abs(self.pca.components_).argmax(axis=1)
        self.feature_names = [
            'PC{}_{}'.format(i + 1, self.feature_names[j])
            for i, j in enumerate(dominant)
        ]

        return x_train, x_val

    def process_data(self, train_df, val_df, drop_cols):
        """Turn the train/validation frames into model-ready matrices.

        ``drop_cols`` are removed from both frames to form the features; the
        ``sales`` column is the target. The scaler, when one is configured,
        is fitted on the training features only and applied to both sets;
        PCA follows when ``use_pca`` is set. The dropped columns, sample
        counts and feature names are remembered on the instance.

        Returns
        -------
        tuple
            ``(x_train, x_val, y_train, y_val)``
        """
        self.dropped = drop_cols
        x_train = train_df.drop(drop_cols, axis=1)
        y_train = train_df['sales']

        x_val = val_df.drop(drop_cols, axis=1)
        y_val = val_df['sales']

        self.nb_train_samples = len(x_train)
        self.nb_val_samples = len(x_val)
        self.feature_names = list(x_train.columns)

        x_train = self._scale(x_train, fit=True)
        x_val = self._scale(x_val)

        if self.use_pca:
            x_train, x_val = self.apply_pca(x_train, x_val)

        return x_train, x_val, y_train, y_val

    def assign_ftip(self, ft_type):
        """Return the fitted model's per-feature weights.

        ``"linear"`` reads ``model.coef_``, ``"tree"`` reads
        ``model.feature_importances_``. An empty list is returned for any
        other ``ft_type`` and whenever ``show_fip`` is False.
        """
        if not self.show_fip:
            return []
        if ft_type == "linear":
            return self.model.coef_
        if ft_type == "tree":
            return self.model.feature_importances_
        return []

    def train(self, x_train, y_train, x_val, y_val, ft_type="linear"):
        """Fit ``self.model`` on the training data and store feature weights.

        ``x_val`` and ``y_val`` are accepted so every model shares one call
        signature; this base implementation does not use them.
        """
        self.model.fit(x_train, y_train)
        self.ft_values = self.assign_ftip(ft_type)

    def predict(self, val, new_data=False, neg_to_zero=True):
        """Predict with the fitted model.

        Parameters
        ----------
        val : array-like or pandas.DataFrame
            Already-preprocessed features, or a raw frame when ``new_data``
            is True.
        new_data : bool
            When True, ``val`` first goes through the same column drop,
            scaling and (optional) PCA that ``process_data`` applied.
        neg_to_zero : bool
            When True, negative predictions are replaced by 0.

        Returns
        -------
        numpy.ndarray
            The predictions.
        """
        if new_data:
            # errors='ignore': a frame to score may legitimately lack some of
            # the dropped columns, e.g. the target.
            val = val.drop(self.dropped, axis=1, errors='ignore')
            val = self._scale(val)

            if self.use_pca:
                val = self.pca.transform(val)

        y_pred = self.model.predict(val)

        if neg_to_zero:
            y_pred[y_pred < 0] = 0

        return y_pred

    def show_feature_importance(self, ft_val):
        """Plot ``ft_val`` as a horizontal bar chart labelled with the feature names."""
        plt.figure(figsize=(12, 6))

        (pd.Series(ft_val, index=self.feature_names)
         .sort_values(ascending=True)
         .plot(kind='barh'))

        plt.show()

    def resume_training(self, y_val, y_pred):
        """Print MAE / MSE / R2 and plot predictions against the truth.

        The feature-importance chart is drawn only when ``show_fip`` is set
        and importances were actually collected.
        """
        print("\nRegression metrics")
        print('MAE: {:.2f}'.format(mean_absolute_error(y_val, y_pred)))
        print('MSE: {:.2f}'.format(mean_squared_error(y_val, y_pred)))
        print('R2: {:.2f}'.format(r2_score(y_val, y_pred)))

        print("\nPlotting truth vs predictions")
        # Size the x axis from the data being plotted rather than from the
        # count remembered by the last process_data call.
        plot_predictions(len(y_val), y_val, y_pred)

        if self.show_fip and self.ft_values is not None and len(self.ft_values) > 0:
            print("Plotting feature importance")
            self.show_feature_importance(self.ft_values)


In [None]:
class XGBModel(BaseModel):
    """``BaseModel`` backed by ``xgboost.XGBRegressor``.

    Parameters
    ----------
    show_fip, use_pca
        Forwarded to ``BaseModel``.
    scaler : object or None
        Feature scaler; a new ``MinMaxScaler`` is created when omitted.
    ft_attr : str
        Passed to the regressor as ``importance_type``.
    nb_estimators : int
        Passed to the regressor as ``n_estimators``.
    """

    def __init__(self, show_fip=True, use_pca=False, scaler=None, ft_attr='gain', nb_estimators=100):
        # Build the default scaler per instance. A scaler object written in
        # the signature is created once at definition time and would be shared
        # (and re-fitted) by every model that relies on the default.
        if scaler is None:
            scaler = MinMaxScaler()
        super().__init__(show_fip, use_pca, scaler)

        self.model = xgb.XGBRegressor(n_estimators=nb_estimators, importance_type=ft_attr, eval_metric='rmse',
                                      early_stopping_rounds=10)

    def train(self, x_train, y_train, x_val, y_val, ft_type="tree"):
        """Fit the regressor, using ``(x_val, y_val)`` as the evaluation set.

        After fitting, the booster's feature names are set from
        ``self.feature_names`` and the importances are stored in
        ``self.ft_values``.
        """
        self.model.fit(x_train, y_train, eval_set=[(x_val, y_val)])
        self.model.get_booster().feature_names = self.feature_names
        self.ft_values = self.assign_ftip(ft_type)


class LGBMModel(BaseModel):
    """``BaseModel`` backed by ``lightgbm.LGBMRegressor``.

    Parameters
    ----------
    show_fip, use_pca
        Forwarded to ``BaseModel``.
    scaler : object or None
        Feature scaler; a new ``MinMaxScaler`` is created when omitted.
    ft_attr : str
        Passed to the regressor as ``importance_type``.
    nb_estimators : int
        Passed to the regressor as ``n_estimators``.
    """

    def __init__(self, show_fip=True, use_pca=False, scaler=None, ft_attr='gain', nb_estimators=100):
        # Build the default scaler per instance. A scaler object written in
        # the signature is created once at definition time and would be shared
        # (and re-fitted) by every model that relies on the default.
        if scaler is None:
            scaler = MinMaxScaler()
        super().__init__(show_fip, use_pca, scaler)

        # The evaluation metric is supplied to fit() below: in the
        # scikit-learn wrapper eval_metric is an argument of fit(), not a
        # constructor parameter.
        self.model = LGBMRegressor(n_estimators=nb_estimators, importance_type=ft_attr,
                                   early_stopping_rounds=5)

    def train(self, x_train, y_train, x_val, y_val, ft_type="tree"):
        """Fit the regressor, evaluating RMSE on ``(x_val, y_val)``.

        Feature names are passed to LightGBM through ``feature_name`` and the
        importances are stored in ``self.ft_values``.
        """
        self.model.fit(x_train, y_train, eval_set=[(x_val, y_val)], eval_metric='rmse',
                       feature_name=self.feature_names)
        self.ft_values = self.assign_ftip(ft_type)


class RandomForestModel(BaseModel):
    """``BaseModel`` backed by ``sklearn.ensemble.RandomForestRegressor``.

    Parameters
    ----------
    show_fip, use_pca
        Forwarded to ``BaseModel``.
    scaler : object or None
        Feature scaler; a new ``StandardScaler`` is created when omitted.
    nb_estimators : int
        Passed to the regressor as ``n_estimators``.
    """

    def __init__(self, show_fip=True, use_pca=False, scaler=None, nb_estimators=100):
        # Build the default scaler per instance. A scaler object written in
        # the signature is created once at definition time and would be shared
        # (and re-fitted) by every model that relies on the default.
        if scaler is None:
            scaler = StandardScaler()
        super().__init__(show_fip, use_pca, scaler)

        self.model = RandomForestRegressor(n_estimators=nb_estimators, verbose=2)

    def train(self, x_train, y_train, x_val, y_val, ft_type="tree"):
        """Fit the forest on the training data and store its importances.

        ``x_val`` and ``y_val`` are accepted for signature compatibility with
        the other models and are not used.
        """
        self.model.fit(x_train, y_train)
        self.ft_values = self.assign_ftip(ft_type)

In [None]:
# Columns removed from the frames before they reach the model. 'sales' is the
# target (process_data reads it separately as y); the others are excluded
# from the feature set.
drop_cols = ['id', 'sales', 'dcoilwtico', 'dcoilwtico_21', 'is_weekday', 'transferred']

# Model under evaluation: XGBoost regressor on min-max scaled features, no PCA.
forecast_model = XGBModel(show_fip=True, use_pca=False, scaler=MinMaxScaler(), nb_estimators=200)

In [None]:
# Build the scaled train/validation matrices, then fit. XGBModel.train passes
# the validation pair to the regressor as its eval_set.
x_train, x_val, y_train, y_val = forecast_model.process_data(train_df, val_df, drop_cols)
forecast_model.train(x_train, y_train, x_val, y_val)

In [None]:
# x_val already went through process_data, so new_data keeps its default (False).
y_pred = forecast_model.predict(x_val)
# Prints MAE / MSE / R2 and draws the prediction and feature-importance plots.
forecast_model.resume_training(y_val, y_pred)

In [None]:
# test_data is the raw engineered frame, so new_data=True makes predict() drop
# the excluded columns and apply the fitted scaler before scoring.
test_sales = forecast_model.predict(test_data, new_data=True)

# NOTE(review): predictions are assigned positionally - this assumes test_data
# has the same number of rows, in the same order, as sample_submission.csv.
submission_df = pd.read_csv(DATAPATH + '/store-sales/sample_submission.csv')
submission_df['sales'] = test_sales
submission_df.to_csv('submission.csv', index=False)

print("Submission saved!")
