In [1]:
import os
if os.name=='nt':
    try:
        mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
        os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']
    except:
        pass
    
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tqdm
import gc
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

tqdm.tqdm.pandas()

%matplotlib inline

In [2]:
df = pd.read_hdf('../input/data_all.hdf', 'train_test')
df2 = df.copy()

In [None]:
# Convert data types
for c in tqdm.tqdm_notebook(df.columns):
    if df[c].dtype.name=='category':
        le = LabelEncoder()
        df[c] = le.fit_transform(df[c])
    elif df[c].dtype.name=='datetime64[ns]':
        df[c] = (df[c] - df[c].min()).dt.days

Use 2015-05 and 2015-06 data to train a model, and predict on 2016-05 for 2016-06

1. Use two months data to create one dataset
    1. For each user in 2015-06, extract all the history data

In [4]:
target_cols = ['ind_ahor_fin_ult1','ind_aval_fin_ult1',
               'ind_cco_fin_ult1','ind_cder_fin_ult1',
               'ind_cno_fin_ult1','ind_ctju_fin_ult1',
               'ind_ctma_fin_ult1','ind_ctop_fin_ult1',
               'ind_ctpp_fin_ult1',
               #'ind_deco_fin_ult1',
               #'ind_deme_fin_ult1',
               'ind_dela_fin_ult1',
               'ind_ecue_fin_ult1','ind_fond_fin_ult1',
               'ind_hip_fin_ult1','ind_plan_fin_ult1',
               'ind_pres_fin_ult1','ind_reca_fin_ult1',
               'ind_tjcr_fin_ult1','ind_valo_fin_ult1',
               #'ind_viv_fin_ult1',
               'ind_nomina_ult1',
               'ind_nom_pens_ult1','ind_recibo_ult1']

In [5]:
feature_cols = ['fecha_dato', 'ncodpers', 'ind_empleado', 'pais_residencia', 'sexo',
       'age', 'fecha_alta', 'ind_nuevo', 'antiguedad', 'indrel',
       'ult_fec_cli_1t', 'indrel_1mes', 'tiprel_1mes', 'indresi', 'indext',
       'conyuemp', 'canal_entrada', 'indfall', 'tipodom', 'cod_prov',
       'nomprov', 'ind_actividad_cliente', 'renta', 'segmento',]

In [6]:
df_pivot = df.pivot_table(index='ncodpers', columns='fecha_dato', 
                                values=target_cols)

In [7]:
month_cols = df.fecha_dato.unique()

In [8]:
def calculate_max_score(df, df_pivot, m1, m2):
    '''Calculate the maximum possible score for each month'''
    
    # sales of m2
    tmp2 = df_pivot.loc[:, (slice(None), m2)].copy()
    # sales of m1
    tmp1 = df_pivot.loc[:, (slice(None), m1)].copy()

    # customer ids in m2
    tmp2_ncodpers = df.loc[df.fecha_dato==m2].ncodpers
    # keep customers existing in m2, and remove irrelavent customers
    tmp2 = tmp2.loc[tmp2.index.isin(tmp2_ncodpers)]    
    # replace NAN with 0.0
    tmp2.fillna(0.0, inplace=True)
    
    # customers in m2 should also in tmp1, even though some of them
    # are new in m2 and do not exist in m1 (they will be NAN)
    tmp1 = tmp1.loc[tmp1.index.isin(tmp2.index)]
    # replace NAN with 0.0
    tmp1.fillna(0.0, inplace=True)

    # change column names so that subtract works
    tmp1.columns = target_cols
    tmp2.columns = target_cols

    # sales in m2 - sales in m1
    tmp_diff = tmp2.subtract(tmp1)

    # replace negative values with 0.0
    tmp_diff[tmp_diff<0] = 0
    # replace NAN with 0.0
    tmp_diff.fillna(0.0, inplace=True)
    
    # maximum possible score
    score_max = tmp_diff.max(axis=1).sum()/tmp_diff.shape[0]
    
    return tmp_diff, score_max

In [None]:
#month_cols = df.fecha_dato.unique()
#best_score = {}
#for n, m in enumerate(month_cols[:-2]):
#    m1 = m
#    m2 = month_cols[n+1]
#    tmp_diff, score_max = calculate_max_score(df, df_pivot, m1, m2)
#    print(m1, m2, score_max)

Train on 2015-05 to 2015-06

In [122]:
# the two months
m1 = month_cols[4] # numpy.datetime64('2015-05-28T00:00:00.000000000')
m2 = month_cols[5] # numpy.datetime64('2015-06-28T00:00:00.000000000')

# target is the increment between 2015-06 and 2015-05
target, score_max = calculate_max_score(df, df_pivot, m1, m2)

# use customer features in 2015-06 
x_train = df.loc[df.fecha_dato==m2, feature_cols].copy()
# set customer id as row index
x_train.set_index('ncodpers', inplace=True)
# customer ids in 2015-06
customer_id = x_train.index.tolist()

# add sales in the previous month (2015-05)
prev_target = df_pivot.loc[df_pivot.index.isin(customer_id), (slice(None), m1)]
# change column names
prev_target.columns = target_cols
# join features
x_train = x_train.join(prev_target)
# replace NAN in previous target_cols with 0.0
x_train.loc[:, target_cols].fillna(0.0, inplace=True)

run XGBoost

In [123]:
# x_train.drop(['fecha_dato', 'fecha_alta', 'ult_fec_cli_1t'], axis=1, inplace=True)

In [124]:
param = {'objective': 'multi:softprob', 
         'eta': 0.05, 
         'max_depth': 8, 
         'silent': 0, 
         'num_class': len(target_cols),
         'eval_metric': 'mlogloss',
         'min_child_weight': 1,
         'subsample': 0.7,
         'colsample_bytree': 0.7,
         'seed': 1}
num_rounds = 50

dtrain = xgb.DMatrix(x_train.values, target.values)
model = xgb.train(param, dtrain, num_rounds, evals=[(dtrain, 'train')], verbose_eval=True)

[0]	train-mlogloss:2.52771
[1]	train-mlogloss:2.22013
[2]	train-mlogloss:1.995
[3]	train-mlogloss:1.81635
[4]	train-mlogloss:1.66806
[5]	train-mlogloss:1.54136
[6]	train-mlogloss:1.4309
[7]	train-mlogloss:1.33315
[8]	train-mlogloss:1.24565
[9]	train-mlogloss:1.16664
[10]	train-mlogloss:1.09475
[11]	train-mlogloss:1.02897
[12]	train-mlogloss:0.968483
[13]	train-mlogloss:0.912626
[14]	train-mlogloss:0.860867
[15]	train-mlogloss:0.812761
[16]	train-mlogloss:0.767935
[17]	train-mlogloss:0.726072
[18]	train-mlogloss:0.6869
[19]	train-mlogloss:0.650183
[20]	train-mlogloss:0.615715
[21]	train-mlogloss:0.583317
[22]	train-mlogloss:0.552829
[23]	train-mlogloss:0.524108
[24]	train-mlogloss:0.497026
[25]	train-mlogloss:0.47147
[26]	train-mlogloss:0.447335
[27]	train-mlogloss:0.424527
[28]	train-mlogloss:0.402961
[29]	train-mlogloss:0.382559
[30]	train-mlogloss:0.363247
[31]	train-mlogloss:0.344961
[32]	train-mlogloss:0.327638
[33]	train-mlogloss:0.311223
[34]	train-mlogloss:0.295663
[35]	train-ml

In [None]:
# the two months
m1 = month_cols[-2] # numpy.datetime64('2015-05-28T00:00:00.000000000')
m2 = month_cols[-1] # numpy.datetime64('2015-06-28T00:00:00.000000000')

# use customer features in 2015-06 
x_train = df.loc[df.fecha_dato==m2, feature_cols].copy()
# set customer id as row index
x_train.set_index('ncodpers', inplace=True)
# customer ids in 2015-06
customer_id = x_train.index.tolist()

# add sales in the previous month (2015-05)
prev_target = df_pivot.loc[df_pivot.index.isin(customer_id), (slice(None), m1)]
# change column names
prev_target.columns = target_cols
# join features
x_train = x_train.join(prev_target)
# replace NAN in previous target_cols with 0.0
x_train.loc[:, target_cols].fillna(0.0, inplace=True)

In [82]:
# the two months
m1 = month_cols[-2] # numpy.datetime64('2016-05-28T00:00:00.000000000')
m2 = month_cols[-1] # numpy.datetime64('2016-06-28T00:00:00.000000000')

# use customer features in 2016-06
x_train = df.loc[df.fecha_dato==m2, feature_cols].copy()
# set customer id as row index
x_train.set_index('ncodpers', inplace=True)

# add sales in the previous month (2015-05)
prev_target = df_pivot.loc[:, (slice(None), m1)]
# change column names
prev_target.columns = target_cols
# join features
x_train = x_train.join(prev_target)
# replace NAN in previous target_cols with 0.0
x_train.loc[:, target_cols].fillna(0.0, inplace=True)

In [101]:
prev_target.shape

(956645, 21)

In [102]:
x_train.shape

(929615, 44)

In [84]:
preds = model.predict(xgb.DMatrix(x_train.values))

In [94]:
preds = np.argsort(preds, axis=1)
preds = np.fliplr(preds)[:, :7]

In [95]:
test_id = df.loc[df.fecha_dato==m2, 'ncodpers'].values
final_preds = [' '.join([target_cols[k] for k in pred]) for pred in preds]
out_df = pd.DataFrame({'ncodpers': test_id, 'added_products': final_preds})
out_df.to_csv('../input/eda_4_1.csv.gz', compression='gzip', index=False)

In [98]:
model.get_score(importance_type='gain')

{}