In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb

In [None]:
def prepare_date(train_val_test_dates=(None, None, None),
                 extra_features=None,
                 only_normal_shops=False,
                 only_normal_items=False,
                 items_csv='items.csv'):
    """
    Prepares the data in "train_clean.csv" for xgboost, with options for feature engineering and other manipulation.

    Args:
        train_val_test_dates: sequence of three dates ('YYYY-MM-DD' strings or anything pd.Timestamp accepts):
            train start date, val start date, test start date. The train start date may be None, in which
            case every row before the val start date is used for training. The other two are required.
        extra_features=None: if truthy, item_category_id and an assortment of time-based features are added
        only_normal_shops=False: some shops are not present in "test.csv" and tend to have unusual distributions. if True, exclude these.
        only_normal_items=False: some items are not present in "test.csv". if True, exclude these.
        items_csv='items.csv': csv with 'item_id' and 'item_category_id' columns, read only when
            extra_features is truthy. NOTE(review): the default file name is an assumption, confirm it.

    Returns:
        dtrain, dvalidation, dtest, train, validation, test, df

    Raises:
        ValueError: if train_val_test_dates does not hold exactly three entries, if the val or test
            start date is None, or if the dates are not in chronological order.
    """
    def parse_split_dates(dates):
        """
        Validate the three split dates and convert them to pd.Timestamp (train start may stay None).
        """
        train_start, val_start, test_start = dates
        if val_start is None or test_start is None:
            raise ValueError("train_val_test_dates needs a validation start date and a test start date")

        train_start = None if train_start is None else pd.Timestamp(train_start)
        val_start, test_start = pd.Timestamp(val_start), pd.Timestamp(test_start)

        # out-of-order boundaries would silently produce empty or overlapping splits
        if val_start > test_start or (train_start is not None and train_start > val_start):
            raise ValueError("train_val_test_dates must be in chronological order")
        return train_start, val_start, test_start

    def only_normal(df, shops=False, items=False):
        """
        Returns df with only shops, items found in "test.csv" depending on if shops, items = True is set (defaults: False). Does not mutate df.
        """
        normal = pd.read_csv('test.csv')

        if shops:
            df = df[df['shop_id'].isin(normal['shop_id'].unique())]
        if items:
            df = df[df['item_id'].isin(normal['item_id'].unique())]
        return df

    def add_features(df):
        """
        Returns a copy of df with the item category and assorted date features (taken from the index) added. Does not mutate df.
        """
        # item_id -> item_category_id lookup, loaded here so no notebook-level `items` frame is required
        category_by_item = pd.read_csv(items_csv).set_index('item_id')['item_category_id']

        dates = df.index
        # assign() returns a new frame, so a filtered slice of the original is never written to
        return df.assign(
            item_category_id=df['item_id'].map(category_by_item),
            dayofweek=dates.dayofweek,
            month=dates.month,
            year=dates.year,
            dayofyear=dates.dayofyear,
            dayofmonth=dates.day,
        )

    def train_val_test_split(df, train_val_test_dates):
        """
        Returns train, val, test split of df, as dataframes including labels, according to date splits. Does not mutate df.

        Expects the boundaries produced by parse_split_dates. Each start date is inclusive and each
        split ends just before the next start date.
        """
        train_start_date, val_start_date, test_start_date = train_val_test_dates
        dates = df.index

        in_train = dates < val_start_date
        if train_start_date is not None:
            in_train = in_train & (dates >= train_start_date)
        in_validation = (dates >= val_start_date) & (dates < test_start_date)
        in_test = dates >= test_start_date

        return df[in_train].copy(), df[in_validation].copy(), df[in_test].copy()

    def make_dmatrix(df):
        """
        Convert pandas dataframe to DMatrix for xgboost, using 'item_cnt_day' as the label and every other column as a feature. Does not mutate df.
        """
        features = df.drop(columns='item_cnt_day')
        return xgb.DMatrix(features, label=df['item_cnt_day'])

    # validate the dates before touching any file so bad arguments fail fast
    split_dates = parse_split_dates(train_val_test_dates)

    # import and use dates as index, sort by date
    df = pd.read_csv("train_clean.csv", index_col=[0], parse_dates=[0], dayfirst=True)
    df = df.sort_index()

    # if requested, use only shops and/or items found in "test.csv" (the file is read once)
    if only_normal_shops or only_normal_items:
        df = only_normal(df, shops=only_normal_shops, items=only_normal_items)

    # if truthy, add item category and various time features
    if extra_features:
        df = add_features(df)

    train, validation, test = train_val_test_split(df, split_dates)

    # dmatrices for xgboost
    dtrain, dvalidation, dtest = make_dmatrix(train), make_dmatrix(validation), make_dmatrix(test)

    return dtrain, dvalidation, dtest, train, validation, test, df


In [None]:
# Hold out the final two months: September 2015 for validation, October 2015 for testing.
val_start_date = '2015-09-01'
test_start_date = '2015-10-01'


In [None]:
# Mean of item_cnt_day per date in the training split.
# NOTE(review): `train` must already exist in the kernel; no visible cell assigns it at notebook level.
(
    train
    .groupby('date')['item_cnt_day']
    .mean()
    .plot(figsize=(20, 5))
)

In [None]:
# Sum of item_cnt_day per date in the validation split (the per-date mean is a possible alternative).
# NOTE(review): `validation` must already exist in the kernel; no visible cell assigns it at notebook level.
(
    validation
    .groupby('date')['item_cnt_day']
    .sum()
    .plot(figsize=(20, 5))
)


In [None]:
# make model
def make_model(train_matrix=None, validation_matrix=None):
    """
    Train an xgboost booster with early stopping and return it.

    Args:
        train_matrix=None: DMatrix to fit on. If None, the notebook-level `dtrain` is used.
        validation_matrix=None: DMatrix monitored during training. If None, the notebook-level
            `dvalidation` is used.

    Returns:
        The booster returned by xgb.train.
    """
    if train_matrix is None:
        train_matrix = dtrain
    if validation_matrix is None:
        validation_matrix = dvalidation

    params = {
        'eval_metric': ['mae', 'mape', 'rmse'],  # last metric is used for early stopping
        'nthread': 4,
        'eta': .3,  # learning rate, default = 0.3
        'max_depth': 6,  # default = 6
    }

    # gpu (needs CUDA compute >= 3.5)
    # params['gpu_id'] = 0
    # params['tree_method'] = 'gpu_hist'

    # track performance on this data; the LAST entry drives early stopping, so the
    # validation set goes last (with the training set last, stopping would follow train rmse)
    evallist = [(train_matrix, 'train'), (validation_matrix, 'eval')]

    num_boost_round = 200  # upper bound on the number of trees
    early_stopping_rounds = 10

    model = xgb.train(params=params,
                      dtrain=train_matrix,
                      num_boost_round=num_boost_round,
                      evals=evallist,
                      early_stopping_rounds=early_stopping_rounds)
    # hand the booster back so later cells can use it instead of relying on a leaked global
    return model

In [None]:
# Score the validation split, limiting the booster to the trees up to its best iteration.
# NOTE(review): `model`, `dvalidation` and `validation` must already exist in the kernel; no visible
# cell assigns them at notebook level. `ntree_limit` / `best_ntree_limit` may be unavailable in newer
# xgboost releases, so confirm against the installed version.
val_predictions = model.predict(dvalidation, ntree_limit=model.best_ntree_limit)

# this isn't so bad! the real surprise is why adding time features doesn't seem to make a difference...
# Put predictions next to the true counts (same row order as `validation`) and plot both daily totals.
val_compare = pd.DataFrame({'item_cnt_pred': val_predictions}, index=validation.index)
val_compare['item_cnt_day'] = validation['item_cnt_day']
(
    val_compare
    .groupby('date')
    .sum()
    .plot(figsize=(20, 5))
)