In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import gc
import sys
import time
import gc
import itertools
import tqdm
import time
import copy

from sklearn.preprocessing import LabelEncoder
import scipy.stats as ss

if os.name=='nt':
    # On Windows, put the MinGW-w64 bin directory at the front of PATH before xgboost is
    # imported below (presumably so its runtime DLLs can be found -- confirm for your setup).
    mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
    # Read PATH with a default rather than hiding a possible KeyError behind a bare
    # `except: pass`, which would also swallow unrelated errors such as KeyboardInterrupt.
    os.environ['PATH'] = mingw_path + os.pathsep + os.environ.get('PATH', '')
    
import xgboost as xgb
from sklearn.metrics import mean_squared_error 
from numba import jit

%matplotlib inline

## Helper Functions

In [2]:
def clip_rmse(preds, dtrain):
    '''Evaluation metric passed to xgb.train as `feval`: RMSE on clipped values.

    Both the labels read from `dtrain.get_label()` and the predictions are
    limited to the range [0, 20] before the error is computed.

    Returns the tuple ('clip-rmse', value).
    '''
    lower, upper = 0, 20
    clipped_labels = np.clip(np.array(dtrain.get_label()), lower, upper)
    clipped_preds = np.clip(np.array(preds), lower, upper)
    score = np.sqrt(mean_squared_error(clipped_preds, clipped_labels))
    return 'clip-rmse', score

In [5]:
def ensemble_level1(n_model, param, n_tree, x_train, y_train, month_in_cv, sub_name, verbose=True, random_state=42):
    '''Train several boosters that differ only in their random seed and average them.

    Every model is trained on month `month_in_cv[-2]` and predicts the month after it.
    The averaged predictions are limited to [0, 20] and written to `sub_name` as CSV,
    using the index of the notebook-level `cnt` DataFrame as the `ID` column.

    Parameters
    ----------
    n_model : number of models to train (must be at least 1).
    param : dict of xgboost parameters; it is copied, so the caller's dict is not modified.
    n_tree : number of boosting rounds for each model.
    x_train, y_train : containers indexed by month holding features and targets.
    month_in_cv : sequence of months; only its second-to-last entry is used here.
    sub_name : path of the CSV file to write.
    verbose : forwarded to xgb.train as `verbose_eval`.
    random_state : seed for numpy's global generator, from which the model seeds are drawn.

    Returns
    -------
    (bst, preds, preds_mean): dict of boosters keyed by model number, list of the raw
    per-model predictions, and the submission DataFrame that was written.

    Raises
    ------
    ValueError if `n_model` is smaller than 1.
    '''
    if n_model < 1:
        raise ValueError('n_model must be at least 1, got {}'.format(n_model))

    # Work on a private copy: writing 'seed' below must not leak into the caller's dict.
    param = dict(param)

    np.random.seed(random_state)

    bst = {}
    preds = []
    m = month_in_cv[-2]

    # The train and prediction matrices are identical for every model, so build them once.
    dtrain = xgb.DMatrix(x_train[m], y_train[m], nthread=-1)
    dtest = xgb.DMatrix(x_train[m+1])
    watchlist = [(dtrain, 'train')]

    for n in range(n_model):
        # Train the model with a different random seed
        param['seed'] = np.random.randint(100000)

        print('Start training model {} on month {}'.format(n, m))
        time_start = time.time()

        # NOTE(review): newer xgboost releases deprecate `feval` in favour of
        # `custom_metric` -- confirm against the installed version.
        bst[n] = xgb.train(param,
                           dtrain,
                           evals=watchlist,
                           verbose_eval=verbose,
                           num_boost_round=n_tree,
                           feval=clip_rmse)

        time_end = time.time()
        print('Train of model {} on month {} is finished, uses {:.2f} sec.\n'.format(n, m, time_end-time_start))

        # Prediction
        print('Start predicting model {} on month {}.'.format(n, m+1))
        preds.append(bst[n].predict(dtest))
        print('Prediction of model {} on month {} is finished.\n'.format(n, m+1))

        print('---------------------------------------\n')

    # Average over the models, then limit each value to [0, 20].
    preds_mean = np.array(preds).mean(axis=0)
    preds_mean = [min(20, max(x, 0)) for x in preds_mean]
    preds_mean = pd.DataFrame({'ID': cnt.index, 'item_cnt_month': preds_mean})
    preds_mean.to_csv(sub_name, index=False)

    return bst, preds, preds_mean

In [4]:
def cv_predict(param, n_tree, x_train, y_train, month_in_cv, sub_name, verbose=False):
    '''Validate month by month, fit the final model, then predict and write a submission.

    Months before `month_in_cv[-2]` are each trained on and evaluated against the
    following month. The month equal to `month_in_cv[-2]` is trained without a
    validation set. Any other month is predicted with the booster stored for the
    month before it; the predictions are limited to [0, 20] and written to `sub_name`
    with the index of the notebook-level `cnt` DataFrame as `ID`.

    Returns (progress, bst_cv, sub_df): a DataFrame of clip-rmse curves with
    (month, 'train'/'val') columns, the dict of boosters keyed by month, and the
    submission DataFrame.
    '''
    eval_history = {}
    boosters = {}

    def _fit(dtrain, watchlist, **extra):
        # Single place for the xgb.train call shared by the CV and final-fit branches.
        return xgb.train(param,
                         dtrain,
                         evals=watchlist,
                         verbose_eval=verbose,
                         num_boost_round=n_tree,
                         feval=clip_rmse,
                         **extra)

    for month in month_in_cv:
        final_fit_month = month_in_cv[-2]

        if month < final_fit_month:
            # Validation round: train on this month, evaluate on the next one.
            print('CV of month {} is started.'.format(month))
            started_at = time.time()
            eval_history[month] = {}
            dtrain = xgb.DMatrix(x_train[month], y_train[month], nthread=-1)
            dval = xgb.DMatrix(x_train[month+1], y_train[month+1], nthread=-1)

            boosters[month] = _fit(dtrain,
                                   [(dtrain, 'train'), (dval, 'val')],
                                   evals_result=eval_history[month])

            elapsed = time.time() - started_at
            last_val_score = eval_history[month]['val']['clip-rmse'][-1]
            print('CV of month {} is finished, uses {:.2f} sec, clip-rmse on val {:.2f}\n'.format(
                month, elapsed, last_val_score))

        elif month == final_fit_month:
            # Final fit: no later labelled month is used for validation here.
            print('Training of month {} is started.'.format(month))
            started_at = time.time()
            dtrain = xgb.DMatrix(x_train[month], y_train[month], nthread=-1)

            boosters[month] = _fit(dtrain, [(dtrain, 'train')])

            elapsed = time.time() - started_at
            print('Train of month {} is finished, uses {:.2f} sec.\n'.format(month, elapsed))

        else:
            # Prediction with the booster stored for the preceding month.
            print('Predicting of month {} is started.'.format(month))
            raw_preds = boosters[month-1].predict(xgb.DMatrix(x_train[month]))
            clipped = [min(20, max(value, 0)) for value in raw_preds]
            submission = pd.DataFrame({'ID': cnt.index, 'item_cnt_month': clipped})
            submission.to_csv(sub_name, index=False)
            print('Prediction of month {} is finished.\n'.format(month))

    # Flatten the nested evaluation results into {(month, split): curve}.
    curves = {}
    for month, history in eval_history.items():
        per_split = {'train': history['train']['clip-rmse'],
                     'val': history['val']['clip-rmse']}
        for split, curve in per_split.items():
            curves[(month, split)] = curve
    curve_table = pd.DataFrame(curves)

    return curve_table, boosters, submission

In [6]:
def cv_predict_repeat(param, n_tree, n_repetition, x_train, y_train, month_in_cv, sub_name, random_state=42, verbose=False):
    '''Repeat the CV / final-fit / predict cycle with different seeds and average the predictions.

    In every repetition a new `seed` is drawn and the months in `month_in_cv` are processed:
    months before `month_in_cv[-2]` are trained and evaluated on the following month, the
    month equal to `month_in_cv[-2]` is trained without validation, and any other month is
    predicted with the booster of the preceding month. The predictions of all repetitions
    are averaged, limited to [0, 20] and written to `sub_name`, using the index of the
    notebook-level `cnt` DataFrame as `ID`.

    Parameters
    ----------
    param : dict of xgboost parameters; it is copied, so the caller's dict is not modified.
    n_tree : number of boosting rounds.
    n_repetition : number of repetitions (must be at least 1).
    x_train, y_train : containers indexed by month holding features and targets.
    month_in_cv : sequence of months to process.
    sub_name : path of the CSV file to write.
    random_state : seed for numpy's global generator, from which the model seeds are drawn.
    verbose : forwarded to xgb.train as `verbose_eval`.

    Returns
    -------
    (progress, bst_cv, preds_mean): a DataFrame of clip-rmse curves whose columns are a
    ('repetition', 'month', 'data') MultiIndex and whose index is named 'iteration', the
    dict of boosters keyed by (repetition, month), and the submission DataFrame.

    Raises
    ------
    ValueError if `n_repetition` is smaller than 1.
    '''
    if n_repetition < 1:
        raise ValueError('n_repetition must be at least 1, got {}'.format(n_repetition))

    # Work on a private copy: writing 'seed' below must not leak into the caller's dict.
    param = dict(param)

    history = dict()
    bst_cv = dict()
    preds = []

    np.random.seed(random_state)

    for n in range(n_repetition):
        param['seed'] = np.random.randint(1000000)
        print('Repetition {} starts.'.format(n))
        for m in month_in_cv:
            if m<month_in_cv[-2]:
                # CV
                print('CV of month {} is started.'.format(m))
                time_start = time.time()
                history[n, m] = {}
                dtrain = xgb.DMatrix(x_train[m], y_train[m], nthread=-1)
                dval = xgb.DMatrix(x_train[m+1], y_train[m+1], nthread=-1)
                watchlist = [(dtrain, 'train'), (dval, 'val')]

                bst_cv[n, m] = xgb.train(param,
                                         dtrain,
                                         evals=watchlist,
                                         evals_result=history[n, m],
                                         verbose_eval=verbose,
                                         num_boost_round=n_tree,
                                         feval=clip_rmse)

                time_end = time.time()

                print('CV of month {} is finished, uses {:.2f} sec, clip-rmse on val {:.4f}\n'.format(m,
                    time_end-time_start, history[n, m]['val']['clip-rmse'][-1]))

            elif m==month_in_cv[-2]:
                # Train the last cv model for prediction
                print('Training of month {} is started.'.format(m))
                time_start = time.time()
                dtrain = xgb.DMatrix(x_train[m], y_train[m], nthread=-1)
                watchlist = [(dtrain, 'train')]

                bst_cv[n, m] = xgb.train(param,
                                         dtrain,
                                         evals=watchlist,
                                         verbose_eval=verbose,
                                         num_boost_round=n_tree,
                                         feval=clip_rmse)

                time_end = time.time()
                print('Train of month {} is finished, uses {:.2f} sec.\n'.format(m, time_end-time_start))

            else:
                # Prediction
                print('Predicting of month {} is started.'.format(m))

                preds_current = bst_cv[n, m-1].predict(xgb.DMatrix(x_train[m]))
                preds.append(preds_current)
                print('Prediction of month {} is finished.\n'.format(m))
        print('Repetition {} finishes.'.format(n))
        print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n\n\n')

    # Prediction for testing data set: average over repetitions, then limit to [0, 20]
    preds_mean = np.array(preds).mean(axis=0)
    preds_mean = [min(20, max(x, 0)) for x in preds_mean]
    preds_mean = pd.DataFrame({'ID': cnt.index, 'item_cnt_month': preds_mean})
    preds_mean.to_csv(sub_name, index=False)

    # CV results: key every curve directly by (repetition, month, data) so each column
    # label is tied to its own data instead of being re-assigned by position afterwards.
    curves = {}
    for (n, m), result in history.items():
        for k in ('train', 'val'):
            curves[(n, m, k)] = result[k]['clip-rmse']
    progress = pd.DataFrame(curves)
    progress.columns = pd.MultiIndex.from_tuples(list(curves.keys()),
                                                 names=['repetition', 'month', 'data'])
    progress.index.name = 'iteration'

    return progress, bst_cv, preds_mean

In [7]:
def create_train_data(feature_fixed_list, feature_rolling_dict, month_in_model=31, month_test=34, 
                      month_in_cv=None, features=None):
    '''
    Build the per-month feature matrices and targets.

    For every month m in `month_in_cv` the feature matrix is the column-wise concatenation
    of the frames in `feature_fixed_list` and, for each frame in `feature_rolling_dict`,
    its columns at positions m-month_in_model .. m-1. Targets are read from column
    position m of the notebook-level `cnt` DataFrame and limited to [0, 40]; they are only
    produced for months before `month_test`.

    Parameters
    ----------
    feature_fixed_list : list of DataFrames used unchanged for every month (not modified).
    feature_rolling_dict : dict of DataFrames from which a window of columns is taken.
    month_in_model : number of preceding columns in each rolling window.
    month_test : the month to be tested; it gets features but no target.
    month_in_cv : None (use month_in_model..month_test), a single month such as 33,
        or a sequence of months. Every month must be >= month_in_model.
    features : optional column positions to keep in each feature matrix.

    Returns
    -------
    (x_train, y_train, feature_names, month_in_model, month_test, month_in_cv, df_list)
    where `feature_names` and `df_list` belong to the last month processed and
    `month_in_cv` is the list of months that was actually used.

    Raises
    ------
    ValueError if `month_in_cv` is empty or contains a month below `month_in_model`.
    AssertionError if month_in_model+len(month_in_cv)-1 > month_test.
    '''
    if month_in_cv is None:
        month_in_cv = list(range(month_in_model, month_test+1))
    elif isinstance(month_in_cv, (int, np.integer)):
        # Accept the single-month form as well; it is normalised to a one-element list.
        month_in_cv = [int(month_in_cv)]

    if len(month_in_cv) == 0:
        raise ValueError('month_in_cv must contain at least one month.')
    if min(month_in_cv) < month_in_model:
        # A negative window start would make iloc silently wrap around to the last columns.
        raise ValueError('Every month in month_in_cv must be >= month_in_model.')

    assert (month_in_model+len(month_in_cv)-1<=month_test), 'Months are not correct!'

    x_train = {}
    y_train = {}
    for m in month_in_cv:
        print(m)
        window = list(range(m-month_in_model, m))
        df_list = copy.copy(feature_fixed_list)
        df_list.extend([feature_rolling_dict[k].iloc[:, window]
                        for k in feature_rolling_dict.keys()])
        # Concatenate once per month and reuse the result for values and column names.
        month_frame = pd.concat(df_list, axis=1, sort=False)
        if features is None:
            x_train[m] = month_frame.values
        else:
            x_train[m] = month_frame.values[:, features]
        if m<month_test:
            # Only months before month_test have a target.
            # NOTE(review): targets are limited to [0, 40] while clip_rmse in this notebook
            # limits to [0, 20] -- confirm the wider training bound is intentional.
            y_train[m] = np.maximum(np.minimum(cnt.iloc[:, m].values, 40.0), 0.0)
        # Kept from the last month processed; not reduced by `features`.
        feature_names = month_frame.columns

    return x_train, y_train, feature_names, month_in_model, month_test, month_in_cv, df_list

## Feature Engineering

In [8]:
# Read the input tables from the local 'all/' directory
train = pd.read_csv('all/sales_train.csv.gz')
test = pd.read_csv('all/test.csv.gz')
shop = pd.read_csv('all/shops-translated.csv')
item = pd.read_csv('all/item_category.csv')

# Index the test rows by 'ID' and discard the name columns of the lookup tables
test = test.set_index('ID')
item = item.drop(columns=['item_name_translated'])
shop = shop.drop(columns=['Name'])

# Integer-encode the categorical columns; one encoder object is refit for every column,
# so after this cell `le` holds the classes of shop['Type'] only
le = LabelEncoder()
item['item_cat1'] = le.fit_transform(item['item_cat1'].astype(str))
item['item_cat2'] = le.fit_transform(item['item_cat2'].astype(str))
shop['City'] = le.fit_transform(shop['City'])
shop['Type'] = le.fit_transform(shop['Type'])

# Disabled: attaching the item and shop attributes to the test rows
# test = test.merge(item, on='item_id', how='left')
# test = test.merge(shop, on='shop_id', how='left')

In [17]:
# Sum of item_cnt_day per (shop_id, item_id) row and date_block_num column;
# combinations that do not occur in `train` are filled with 0.0
cnt = (
    train
    .pivot_table(index=['shop_id', 'item_id'],
                 columns='date_block_num',
                 values='item_cnt_day',
                 aggfunc='sum')
    .fillna(0.0)
)
# Rename the columns from the date_block_num value to 'cnt_<value>'
cnt.columns = ['cnt_{}'.format(block) for block in cnt.columns]

In [25]:
# Relative change from one column to the next: (current - previous) / previous,
# where a previous value of 0 is replaced by 1 so the division never hits zero
cnt_diff = cnt.diff(axis=1).div(
    cnt.replace(0, 1).shift(periods=1, axis=1)
)

In [26]:
# Preview the first rows of the column-to-column relative change table
cnt_diff.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,cnt_0,cnt_1,cnt_2,cnt_3,cnt_4,cnt_5,cnt_6,cnt_7,cnt_8,cnt_9,...,cnt_24,cnt_25,cnt_26,cnt_27,cnt_28,cnt_29,cnt_30,cnt_31,cnt_32,cnt_33
shop_id,item_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,30,,31.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,31,,11.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,32,,0.666667,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,33,,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,35,,13.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Inspect the denominator used for cnt_diff: counts with 0 replaced by 1,
# shifted one column to the right so each column holds the previous column's value
cnt.replace(0, 1).shift(axis=1, periods=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,cnt_0,cnt_1,cnt_2,cnt_3,cnt_4,cnt_5,cnt_6,cnt_7,cnt_8,cnt_9,...,cnt_24,cnt_25,cnt_26,cnt_27,cnt_28,cnt_29,cnt_30,cnt_31,cnt_32,cnt_33
shop_id,item_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,30,,1.0,31.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,31,,1.0,11.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,32,,6.0,10.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,33,,3.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,35,,1.0,14.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,36,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,40,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,42,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,43,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,49,,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [22]:
# Preview the first rows of the (shop_id, item_id) x date_block_num count table
cnt.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,cnt_0,cnt_1,cnt_2,cnt_3,cnt_4,cnt_5,cnt_6,cnt_7,cnt_8,cnt_9,...,cnt_24,cnt_25,cnt_26,cnt_27,cnt_28,cnt_29,cnt_30,cnt_31,cnt_32,cnt_33
shop_id,item_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,30,0.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,31,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,32,6.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,33,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,35,1.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
