In [1]:
import os
if os.name == 'nt':
    # On Windows, put the MinGW-w64 bin directory at the front of PATH before
    # the remaining imports run (presumably so that xgboost can locate the
    # MinGW runtime DLLs -- confirm).
    mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
    # .get() covers a missing PATH variable explicitly instead of hiding every
    # possible error behind a bare `except: pass`.
    current_path = os.environ.get('PATH', '')
    # Only prepend once, so re-running this cell does not keep growing PATH.
    if mingw_path not in current_path.split(os.pathsep):
        os.environ['PATH'] = mingw_path + os.pathsep + current_path
    
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tqdm
import gc
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# Register tqdm's pandas integration (adds the progress_* variants of apply);
# none of the cells visible here call them.
tqdm.tqdm.pandas()

%matplotlib inline

In [2]:
# Load the combined frame stored under the 'train_test_converted' key
# (presumably train and test rows together -- confirm against the writer).
df = pd.read_hdf('../input/data_all.hdf', key='train_test_converted')

In [9]:
# prepare_data() selects months by position in this array, so keep the
# distinct fecha_dato values in ascending order rather than relying on the row
# order of df (Series.unique() returns values in order of first appearance).
month_cols = np.sort(df['fecha_dato'].unique())

In [6]:
# Show the distinct fecha_dato values; prepare_data() indexes this array by position.
month_cols

array([  0,  31,  59,  90, 120, 151, 181, 212, 243, 273, 304, 334, 365,
       396, 425, 456, 486, 517], dtype=int64)

In [144]:
# Load the array saved in train_X.npy (presumably a precomputed training
# feature matrix -- confirm where it is produced; no visible cell uses it).
train_X = np.load('train_X.npy')

In [145]:
# Load the array saved in train_y.npy (presumably the labels matching train_X
# -- confirm; no visible cell uses it).
train_y = np.load('train_y.npy')

In [146]:
# Load the array saved in test_X.npy (presumably a precomputed test feature
# matrix -- confirm; no visible cell uses it).
test_X = np.load('test_X.npy')

In [19]:
# Product indicator columns that prepare_data() treats as targets. The list is
# sorted so that a column's position is stable and can serve as its class id.
target_cols = sorted([
    'ind_cco_fin_ult1',
    'ind_cder_fin_ult1',
    'ind_cno_fin_ult1',
    'ind_ctju_fin_ult1',
    'ind_ctma_fin_ult1',
    'ind_ctop_fin_ult1',
    'ind_ctpp_fin_ult1',
    'ind_deco_fin_ult1',
    'ind_dela_fin_ult1',
    'ind_deme_fin_ult1',
    'ind_ecue_fin_ult1',
    'ind_fond_fin_ult1',
    'ind_hip_fin_ult1',
    'ind_nom_pens_ult1',
    'ind_nomina_ult1',
    'ind_plan_fin_ult1',
    'ind_pres_fin_ult1',
    'ind_reca_fin_ult1',
    'ind_recibo_ult1',
    'ind_tjcr_fin_ult1',
    'ind_valo_fin_ult1',
    'ind_viv_fin_ult1',
])

In [20]:
# '<target column>_t' -> integer class id (its position in target_cols).
target_cols_map = {name + '_t': idx for idx, name in enumerate(target_cols)}
# Integer class id -> target column name (without the '_t' suffix).
target_cols_map_reverse = dict(enumerate(target_cols))

In [48]:
# Customer attribute columns that prepare_data() copies from the later month.
# Sorted so the column order of the feature frame is deterministic.
feature_cols = sorted([
    'age',
    'antiguedad',
    'canal_entrada',
    'conyuemp',
    'ind_actividad_cliente',
    'ind_empleado',
    'ind_nuevo',
    'indext',
    'indfall',
    'indrel',
    'indrel_1mes',
    'indresi',
    'pais_residencia',
    'renta',
    'segmento',
    'sexo',
    'tipodom',
    'tiprel_1mes',
])

In [49]:
# Sanity check: number of customer attribute columns used as features.
len(feature_cols)

18

In [32]:
# Display month_cols again for reference: prepare_data() takes positions in this array.
month_cols

array([  0,  31,  59,  90, 120, 151, 181, 212, 243, 273, 304, 334, 365,
       396, 425, 456, 486, 517], dtype=int64)

In [152]:
def prepare_data(m1, m2, train=True):
    '''Build model inputs for the month pair (m1, m2).

    Every customer row of month m2 becomes one feature row made of the m2
    attribute columns (`feature_cols`) plus the customer's m1 product holdings
    (`target_cols`; 0.0 when the customer has no m1 row or the value is
    missing).

    Parameters
    ----------
    m1, m2 : int
        Positions in the module-level `month_cols` array selecting the earlier
        and the later month. Negative positions count from the end.
    train : bool, default True
        When True, also derive labels from the products added between m1 and
        m2.

    Returns
    -------
    x_train : pandas.DataFrame
        Columns are `ncodpers`, then `feature_cols`, then `target_cols`.
        With ``train=False`` there is one row per m2 customer row. With
        ``train=True`` there is one row per (customer, newly added product)
        pair, so a customer contributes zero, one or several rows.
    y_train : pandas.Series
        Only returned when ``train=True``. Integer class id of the added
        product as given by `target_cols_map`, aligned with `x_train`.

    Notes
    -----
    Reads the module-level `df`, `month_cols`, `feature_cols`, `target_cols`
    and `target_cols_map`; none of them is modified.

    NOTE(review): the returned frame keeps the `ncodpers` id column, so callers
    that pass ``x_train.values`` to a model feed the customer id as a feature
    -- confirm this is intended.
    '''
    first_month = month_cols[m1]
    second_month = month_cols[m2]
    in_first = df.fecha_dato == first_month
    in_second = df.fecha_dato == second_month

    # m1 holdings, restricted to customers that are also present in m2
    customers_m2 = df.loc[in_second, 'ncodpers']
    sales_m1 = df.loc[in_first & df.ncodpers.isin(customers_m2),
                      ['ncodpers'] + target_cols]

    # m2 attributes of every m2 customer, extended with their m1 holdings;
    # customers without an m1 row get 0.0 for every product
    feature_m2 = df.loc[in_second, ['ncodpers'] + feature_cols]
    x_train = pd.merge(feature_m2, sales_m1, how='left', on='ncodpers')
    x_train[target_cols] = x_train[target_cols].fillna(0.0)

    # targets are only generated for the train dataset
    if not train:
        return x_train

    # holdings in m2 and in m1, both indexed by customer id
    sales_m2 = df.loc[in_second, ['ncodpers'] + target_cols].fillna(0.0)
    sales_m2 = sales_m2.set_index('ncodpers')
    # x_train already carries the zero-filled m1 holdings of every m2 customer
    held_m1 = x_train.set_index('ncodpers')[target_cols]

    # positive entries mark products the customer holds in m2 but not in m1
    added = sales_m2.subtract(held_m1)

    # one sample per newly added product: the overlapping product columns of
    # `added` receive the '_t' suffix and are the only columns melted
    samples = pd.merge(x_train, added, how='left', left_on='ncodpers',
                       right_index=True, suffixes=('', '_t'))
    samples = samples.melt(id_vars=list(x_train.columns))
    samples = samples.loc[samples['value'] > 0]
    samples = samples.drop('value', axis=1).reset_index(drop=True)

    # Explicit mapping instead of `samples.variable.replace(..., inplace=True)`:
    # an in-place update through a column accessor is chained assignment and
    # has no effect under pandas Copy-on-Write, leaving string labels behind.
    y_train = samples['variable'].map(target_cols_map).astype('int64')
    x_train = samples.drop('variable', axis=1)

    return x_train, y_train

In [158]:
# Arguments are positions in month_cols (negative positions count from the end).
# Training pair: months 4 -> 5; validation pair: months 5 -> 6.
x_train_june, y_train_june = prepare_data(m1=4, m2=5)
x_train_july, y_train_july = prepare_data(m1=5, m2=6)
# Features only for the last month, using the month before it as history.
x_test = prepare_data(m1=-2, m2=-1, train=False)

In [None]:
# Multi-class XGBoost settings: one class per entry in target_cols.
param = {
    'objective': 'multi:softprob',
    'eta': 0.5,
    'max_depth': 8,
    'silent': 0,
    'num_class': len(target_cols),
    'eval_metric': 'mlogloss',
    'min_child_weight': 1,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'seed': 1,
}
num_rounds = 12

# The June samples are fitted; the July samples are only evaluated each round.
dtrain = xgb.DMatrix(x_train_june.values, label=y_train_june.values)
dval = xgb.DMatrix(x_train_july.values, label=y_train_july.values)
model = xgb.train(
    param,
    dtrain,
    num_boost_round=num_rounds,
    evals=[(dtrain, 'train'), (dval, 'val')],
    verbose_eval=True,
)