In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from statistics import mean
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_squared_log_error
from statsmodels.tsa.deterministic import DeterministicProcess, CalendarFourier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [2]:
DATAPATH = "data/store-sales"

split_size = 0.7
drop_cols = ['id', 'sales', 'transferred']
scaler = MinMaxScaler()
separator = 'family'
frag = True  # set False for single model

In [3]:
train_df = pd.read_csv(DATAPATH + '/train.csv', parse_dates=['date'])
test_df = pd.read_csv(DATAPATH + '/test.csv', parse_dates=['date'])

stores_df = pd.read_csv(DATAPATH + '/stores.csv')
stores_df = stores_df.rename(columns={'type': 'store_type'})

transactions_df = pd.read_csv(DATAPATH + '/transactions.csv', parse_dates=['date'])
oil_df = pd.read_csv(DATAPATH + '/oil.csv', parse_dates=['date'])

holidays_df = pd.read_csv(DATAPATH + '/holidays_events.csv', parse_dates=['date'])
holidays_df = holidays_df.rename(columns={'type': 'holiday_type'})
holidays_df['is_holiday'] = 1

In [4]:
def assign_time_ft(df):
    df['payday'] = ((df['date'].dt.day == 15) | df['date'].dt.is_month_end).astype(int)
    df["dayofyear"] = df['date'].dt.dayofyear
    df["weekofyear"] = df['date'].dt.isocalendar().week
    df['weekday'] = df['date'].dt.weekday
    df['day'] = df['date'].dt.day
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year

    df['is_weekday'] = 0
    df.loc[df['weekday'] < 5, 'is_weekday'] = 1

    df["season"] = df["month"] % 12 // 3
    df["school_season"] = df["month"].isin([4, 5, 8, 9])

    df['earthquake_rl'] = np.where(df['description'].str.contains('Terremoto Manabi'), 1, 0)

    return df

In [5]:
def handle_na(df):
    df = df.copy()
    df['transferred'] = df['transferred'].fillna(False).astype(int)
    df['is_holiday'] = df['is_holiday'].fillna(0).astype(int)

    df['description'] = df['description'].fillna('None')

    df['holiday_type'] = df['holiday_type'].fillna('Common')
    df['locale'] = df['locale'].fillna('Common')
    df['locale_name'] = df['locale_name'].fillna('Ecuador')  # NEED TO BE CHECKED

    df['dcoilwtico'] = df['dcoilwtico'].fillna(method='backfill')

    df['transactions'] = df['transactions'].fillna(0).astype(int)

    return df

In [6]:
def lag_ft(df, lag_infos):
    df = df.copy()
    for col_name, lags in lag_infos.items():
        for lag in lags:
            df[f'{col_name}_{lag}'] = df[col_name].shift(lag)
            df[f'{col_name}_{lag}'] = df[f'{col_name}_{lag}'].fillna(0).astype(float)

    return df

In [7]:
def window_ft(df):
    df = df.copy()
    times = {'week': 7, '2weeks': 14, 'month': 28}

    for k, v in times.items():
        df[f'oil_{k}_avg'] = df['dcoilwtico'].rolling(v).mean()
        df[f'oil_{k}_avg'] = df[f'oil_{k}_avg'].fillna(0).astype(float)

        df[f'oil_{k}_min'] = df['dcoilwtico'].rolling(v).min()
        df[f'oil_{k}_min'] = df[f'oil_{k}_min'].fillna(0).astype(float)

        df[f'oil_{k}_max'] = df['dcoilwtico'].rolling(v).max()
        df[f'oil_{k}_max'] = df[f'oil_{k}_max'].fillna(0).astype(float)

    df['avg_transactions'] = df['transactions'].rolling(15, min_periods=10).mean()
    df['avg_transactions'] = df['avg_transactions'].fillna(0).astype(float)

    df['min_transactions'] = df['transactions'].rolling(15, min_periods=10).min()
    df['min_transactions'] = df['min_transactions'].fillna(0).astype(float)

    df['max_transactions'] = df['transactions'].rolling(15, min_periods=10).max()
    df['max_transactions'] = df['max_transactions'].fillna(0).astype(float)

    return df

In [8]:
def weird(df):
    fourier = CalendarFourier(freq='D', order=3)
    dp = DeterministicProcess(index=df.index,
                              order=1,
                              seasonal=False,
                              constant=False,
                              additional_terms=[fourier],
                              drop=True)
    x = dp.in_sample()
    x = x.join(df)
    return x

In [9]:
def format_sales(df):
    df = df.merge(stores_df, on='store_nbr', how='left')
    df = df.merge(oil_df, on='date', how='left')
    df = df.merge(transactions_df, on=['date', 'store_nbr'], how='left')
    df = df.merge(holidays_df, on='date', how='left')

    df = handle_na(df)
    df.loc[df['transferred'] == 1, 'is_holiday'] = 0
    df.loc[df['holiday_type'] == 'Bridge', 'is_holiday'] = 1
    df.loc[df['holiday_type'] == 'Transfer', 'is_holiday'] = 1

    lb = LabelEncoder()
    df['family'] = lb.fit_transform(df['family'])
    df['city'] = lb.fit_transform(df['city'])
    df['state'] = lb.fit_transform(df['state'])
    df['store_type'] = lb.fit_transform(df['store_type'])
    df['holiday_type'] = lb.fit_transform(df['holiday_type'])
    df['locale'] = lb.fit_transform(df['locale'])
    df['locale_name'] = lb.fit_transform(df['locale_name'])

    df = assign_time_ft(df)
    df = df.drop(['description'], axis=1)

    lag_features = {
        'dcoilwtico': [1, 3, 5, 7],
        'transactions': [1, 3, 5, 7]
    }

    for uval in df[separator].unique():
        sub_df = df.loc[df[separator] == uval]

        sub_df = lag_ft(sub_df, lag_features)
        sub_df = window_ft(sub_df)
        df.loc[sub_df.index, sub_df.columns] = sub_df

    df[df.select_dtypes(np.int32).columns] = df.select_dtypes(np.int32).astype(np.int64)
    df[df.select_dtypes(np.float32).columns] = df.select_dtypes(np.float32).astype(np.float64)

    return df.set_index('date')

In [10]:
train_data = format_sales(train_df)
scaled = train_data.drop(['id', 'sales'], axis=1)
train_data[scaled.columns] = scaler.fit_transform(scaled[scaled.columns])

In [21]:
zeroes_df = train_data.groupby(["store_nbr", "family"], as_index=False).sales.sum()
zeroes_df = zeroes_df[zeroes_df.sales == 0]

In [22]:
test_data = format_sales(test_df)
scaled = test_data.drop(['id'], axis=1)
test_data[scaled.columns] = scaler.transform(scaled[scaled.columns])

In [23]:
feature_names = list(filter(lambda i: i not in drop_cols, train_data.columns))

In [24]:
if frag:
    train_dict = {}
    test_dict = {}

    for idx, val in enumerate(train_data[separator].unique()):
        train_dict[idx] = train_data.loc[train_data[separator] == val]
        test_dict[idx] = test_data.loc[test_data[separator] == val]

    train_data = train_dict
    test_data = test_dict

In [25]:
def show_metrics(y_val, y_pred, mdict):
    mae = mean_absolute_error(y_val, y_pred)
    mse = mean_squared_error(y_val, y_pred, squared=True)
    rmsle = mean_squared_log_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)

    print("\nRegression metrics")
    print('MAE: {:.2f}'.format(mae))
    print('MSE: {:.2f}'.format(mse))
    print('RMSLE: {:.2f}'.format(rmsle))
    print('R2: {:.2f}'.format(r2))

    if mdict is not None:
        mdict["mae"].append(mae)
        mdict["mse"].append(mse)
        mdict["rmsle"].append(rmsle)
        mdict["r2"].append(r2)


def plot_predictions(nb_samples, y_val, y_pred):
    sp_list = list(range(0, nb_samples))
    plt.figure(figsize=(10, 6))

    plt.plot(sp_list, y_val, label='Expected', alpha=0.5)
    plt.plot(sp_list, y_pred, label='Predicted', alpha=0.5)
    plt.legend(loc="upper right")
    plt.show()

    plt.plot(sp_list, abs(y_val - y_pred), label='Difference')
    plt.legend(loc="upper right")
    plt.show()

In [26]:
def create_submission(df):
    sales_df = df.drop(drop_cols, axis=1, errors='ignore')

    sales_values = model.predict(sales_df)
    sales_values[sales_values < 0] = 0

    submission_df = pd.read_csv(DATAPATH + '/sample_submission.csv')
    submission_df['sales'] = sales_values

    temp_df = df.copy()
    temp_df['sales'] = sales_values
    zeroed_df = pd.merge(temp_df, zeroes_df, left_on=['store_nbr', 'family'],
                         right_on=['store_nbr', 'family'], how='left')
    zeroed_df.loc[zeroed_df['sales_y'] == 0, 'sales_x'] = zeroed_df['sales_y']

    zeroed_df.rename(columns={'sales_x': 'sales'}, inplace=True)
    zeroed_df = zeroed_df.drop('sales_y', axis=1)
    zeroed_df = zeroed_df[['id', 'sales']]

    submission_df = submission_df.merge(zeroed_df, on='id', how='left')
    submission_df['sales'] = submission_df['sales_y'].fillna(submission_df['sales_x'])
    submission_df = submission_df.drop(['sales_x', 'sales_y'], axis=1)

    submission_df.to_csv('submission.csv', index=False)

    print("Submission saved!")


def create_frag_submission(models_dict, df):
    submission_df = pd.read_csv(DATAPATH + '/sample_submission.csv')

    for fam, split_model in models_dict.items():
        current_df = df[fam]
        fam_df = current_df.drop(drop_cols, axis=1, errors='ignore')

        sales_values = split_model.predict(fam_df)

        # zeroing
        sales_values[sales_values < 0] = 0

        temp_df = current_df.copy()
        temp_df['sales'] = sales_values

        zeroed_df = pd.merge(temp_df, zeroes_df, left_on=['store_nbr', 'family'],
                             right_on=['store_nbr', 'family'], how='left')
        zeroed_df.loc[zeroed_df['sales_y'] == 0, 'sales_x'] = zeroed_df['sales_y']

        zeroed_df.rename(columns={'sales_x': 'sales'}, inplace=True)
        zeroed_df = zeroed_df.drop('sales_y', axis=1)
        zeroed_df = zeroed_df[['id', 'sales']]

        submission_df = submission_df.merge(zeroed_df, on='id', how='left')
        submission_df['sales'] = submission_df['sales_y'].fillna(submission_df['sales_x'])
        submission_df = submission_df.drop(['sales_x', 'sales_y'], axis=1)

    submission_df.to_csv('submission.csv', index=False)

    print("Submission saved!")


In [27]:
def format_data(data_df):
    train_size = int(len(data_df) * split_size)

    train_df, val_df = data_df[:train_size], data_df[train_size:]

    x_train, y_train = train_df.drop(drop_cols, axis=1), train_df['sales']
    x_val, y_val = val_df.drop(drop_cols, axis=1), val_df['sales']

    return x_train, x_val, y_train, y_val

In [28]:
model = xgb.XGBRegressor(n_estimators=250, importance_type='gain', eval_metric='rmse',
                         early_stopping_rounds=20, verbosity=1)

In [29]:
if frag:

    metrics_dict = {"mae": [], "mse": [], "rmsle": [], "r2": []}
    family_models = {}

    for family, split_df in train_data.items():
        x_train, x_val, y_train, y_val = format_data(split_df)

        fit_model = model.fit(x_train, y_train, eval_set=[(x_val, y_val)])

        print(f"\nEvaluating family model {family}")
        y_pred = fit_model.predict(x_val)
        y_pred[y_pred < 0] = 0

        show_metrics(y_val, y_pred, metrics_dict)

        family_models[family] = fit_model

    print(f"\nTraining summary")

    for key, val in metrics_dict.items():
        print("Average {} : {:.2f}".format(key.upper(), mean(val)))

    create_frag_submission(family_models, test_data)

[0]	validation_0-rmse:7.15138
[1]	validation_0-rmse:5.99039
[2]	validation_0-rmse:5.38370
[3]	validation_0-rmse:5.00453
[4]	validation_0-rmse:4.79099
[5]	validation_0-rmse:4.70562
[6]	validation_0-rmse:4.66435
[7]	validation_0-rmse:4.64545
[8]	validation_0-rmse:4.63263
[9]	validation_0-rmse:4.62957
[10]	validation_0-rmse:4.63641
[11]	validation_0-rmse:4.63992
[12]	validation_0-rmse:4.64868
[13]	validation_0-rmse:4.64011
[14]	validation_0-rmse:4.64600
[15]	validation_0-rmse:4.64463
[16]	validation_0-rmse:4.64447
[17]	validation_0-rmse:4.64455
[18]	validation_0-rmse:4.64256
[19]	validation_0-rmse:4.64700
[20]	validation_0-rmse:4.64867
[21]	validation_0-rmse:4.65192
[22]	validation_0-rmse:4.65717
[23]	validation_0-rmse:4.65875
[24]	validation_0-rmse:4.66085
[25]	validation_0-rmse:4.66099
[26]	validation_0-rmse:4.65847
[27]	validation_0-rmse:4.65935
[28]	validation_0-rmse:4.66111

Evaluating family model 0

Regression metrics
MAE: 3.09
MSE: 21.43
RMSLE: 0.33
R2: 0.43
[0]	validation_0-rmse:

In [30]:
if not frag:
    x_train, x_val, y_train, y_val = format_data(train_data)

    model = model.fit(x_train, y_train, eval_set=[(x_val, y_val)])

    print(f"\nEvaluating model")
    y_pred = model.predict(x_val)
    y_pred[y_pred < 0] = 0

    show_metrics(y_val, y_pred, None)
    plot_predictions(len(x_val), y_val, y_pred)

    features_val = model.feature_importances_

    plt.figure(figsize=(12, 6))

    (pd.Series(features_val, index=feature_names)
     .sort_values(ascending=True)
     .plot(kind='barh'))

    plt.show()

    create_submission(test_data)



End of program
