In [1]:
import os
if os.name=='nt':
    try:
        mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
        os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']
    except:
        pass
    
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tqdm
import gc
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

tqdm.tqdm.pandas()

%matplotlib inline

In [2]:
df = pd.read_hdf('../input/data_all.hdf', 'train_test')

In [3]:
# Convert data types
for c in tqdm.tqdm_notebook(df.columns):
    if df[c].dtype.name=='category':
        le = LabelEncoder()
        df[c] = le.fit_transform(df[c])
    elif df[c].dtype.name=='datetime64[ns]':
        df[c] = (df[c] - df[c].min()).dt.days

HBox(children=(IntProgress(value=0, max=48), HTML(value='')))




In [4]:
df.to_hdf('../input/data_all.hdf', 'train_test_converted', complib='blosc:lz4', complevel=9, format='t')

  expected_mb = (expectedrows * rowsize) // MB


Use 2015-05 and 2015-06 data to train a model, and predict on 2016-05 for 2016-06

1. Use two months data to create one dataset
    1. For each user in 2015-06, extract all the history data

In [5]:
target_cols = ['ind_ahor_fin_ult1','ind_aval_fin_ult1',
               'ind_cco_fin_ult1','ind_cder_fin_ult1',
               'ind_cno_fin_ult1','ind_ctju_fin_ult1',
               'ind_ctma_fin_ult1','ind_ctop_fin_ult1',
               'ind_ctpp_fin_ult1',
               #'ind_deco_fin_ult1',
               #'ind_deme_fin_ult1',
               'ind_dela_fin_ult1',
               'ind_ecue_fin_ult1','ind_fond_fin_ult1',
               'ind_hip_fin_ult1','ind_plan_fin_ult1',
               'ind_pres_fin_ult1','ind_reca_fin_ult1',
               'ind_tjcr_fin_ult1','ind_valo_fin_ult1',
               #'ind_viv_fin_ult1',
               'ind_nomina_ult1',
               'ind_nom_pens_ult1','ind_recibo_ult1']
target_cols = sorted(target_cols)

In [6]:
feature_cols = ['fecha_dato', 'ncodpers', 'ind_empleado', 'pais_residencia', 'sexo',
                'age', 'fecha_alta', 'ind_nuevo', 'antiguedad', 'indrel',
                'ult_fec_cli_1t', 'indrel_1mes', 'tiprel_1mes', 'indresi', 'indext',
                'conyuemp', 'canal_entrada', 'indfall', 'tipodom', 'cod_prov',
                'nomprov', 'ind_actividad_cliente', 'renta', 'segmento']
feature_cols = sorted(feature_cols)

In [7]:
month_cols = df.fecha_dato.unique()

In [8]:
df_pivot = df.pivot_table(index='ncodpers', columns='fecha_dato', 
                          values=target_cols, aggfunc=np.sum)

In [9]:
df_pivot.to_hdf('../input/data_all.hdf_all.hdf', 'train_test_converted_pivot', complib='blosc:lz4', complevel=9, format='t')

  expected_mb = (expectedrows * rowsize) // MB


In [10]:
def calculate_max_score(df, df_pivot, m1, m2):
    '''Calculate the maximum possible score for each month, also return the new products in this month
    m2 is the late month, m1 is the month before it
    '''
    
    # sales of m2
    tmp2 = df_pivot.loc[:, (slice(None), m2)].copy()
    # sales of m1
    tmp1 = df_pivot.loc[:, (slice(None), m1)].copy()

    # customer ids in m2
    tmp2_ncodpers = df.loc[df.fecha_dato==m2].ncodpers
    # keep customers existing in m2, and remove irrelavent customers
    tmp2 = tmp2.loc[tmp2.index.isin(tmp2_ncodpers)]    
    # replace NAN with 0.0
    tmp2.fillna(0.0, inplace=True)

    # customers in m2 should also in tmp1, even though some of them
    # are new in m2 and do not exist in m1 (they will be NAN)
    tmp1 = tmp1.loc[tmp1.index.isin(tmp2.index)]
    # replace NAN with 0.0
    tmp1.fillna(0.0, inplace=True)

    # change column names so that subtract works
    tmp1.columns = target_cols
    tmp2.columns = target_cols

    # sales in m2 - sales in m1
    tmp_diff = tmp2.subtract(tmp1)

    # replace negative values with 0.0
    tmp_diff[tmp_diff<0] = 0
    # replace NAN with 0.0
    tmp_diff.fillna(0.0, inplace=True)
    
    # maximum possible score
    score_max = tmp_diff.max(axis=1).sum()/tmp_diff.shape[0]
    
    return tmp_diff, score_max

In [11]:
month_cols = df.fecha_dato.unique()
best_score = {}
for n, m in enumerate(month_cols[:-2]):
    m1 = m
    m2 = month_cols[n+1]
    tmp_diff, score_max = calculate_max_score(df, df_pivot, m1, m2)
    print(m1, m2, score_max)

0 31 0.0438368871873177
31 59 0.04537284113863597
59 90 0.045397363757937834
90 120 0.03886656845323337
120 151 0.05748050181139359
151 181 0.046959751366867634
181 212 0.041536952636441374
212 243 0.05104686633388796
243 273 0.0539904130115853
273 304 0.04038035159125447
304 334 0.04067340554658281
334 365 0.0331016328174368
365 396 0.04224001633177834
396 425 0.03301350375536713
425 456 0.030922981792013995
456 486 0.0319006970829446


Train on 2015-05 to 2015-06

In [12]:
# the two months
m1 = month_cols[4] # numpy.datetime64('2015-05-28T00:00:00.000000000')
m2 = month_cols[5] # numpy.datetime64('2015-06-28T00:00:00.000000000')

# target is the increment between 2015-06 and 2015-05
target, score_max = calculate_max_score(df, df_pivot, m1, m2)

# use customer features in 2015-06
x_train = df.loc[df.fecha_dato==m2, feature_cols].copy()
# set customer id as row index
x_train.set_index('ncodpers', inplace=True)
# customer ids in 2015-06
customer_id = x_train.index.tolist()

# add sales in the previous month (2015-05)
prev_target = df_pivot.loc[df_pivot.index.isin(customer_id), (slice(None), m1)]
# change column names
prev_target.columns = target_cols
# join features
x_train = x_train.join(prev_target)
# replace NAN in previous target_cols with 0.0
x_train.loc[:, target_cols].fillna(0.0, inplace=True)
# sort x_train by ncodpers
x_train.sort_index(inplace=True)

In [13]:
# set ncodpers as both the index and a column
x_train.reset_index(inplace=True)
x_train.set_index('ncodpers', drop=False, inplace=True)
x_train.sort_index(inplace=True)

In [14]:
# Convert target from multiple columns to single column, with value equals the column index

# convert 0/1 target to column index, starting from 1 since 0 cannot differentiate 0 and 1
u = target.values
u = u*np.arange(1, target.shape[1]+1)
target[target.columns] = u

# join target with train data
u = x_train.join(target, rsuffix='_t')
# get train data column name
x_train_col = x_train.columns.tolist()
# melt the joined data
u = u.melt(id_vars=x_train_col)
# remove zero rows and keep new products
u = u.loc[u.value>0]
# drop the variable column
u.drop(['variable'], inplace=True, axis=1)

In [44]:
# Prepare results for xgboost
x_train = u.iloc[:, :-1].copy()
# xgboost requires values of y_train in [0, num_class)
y_train = pd.DataFrame(u.iloc[:, -1].copy())
y_train['all_zero'] = np.zeros(y_train.shape)
y_train['value'] = y_train['value']-1
y_train = y_train.max(axis=1)

run XGBoost

In [25]:
# x_train.drop(['fecha_dato', 'fecha_alta', 'ult_fec_cli_1t'], axis=1, inplace=True)

In [46]:
param = {'objective': 'multi:softprob', 
         'eta': 0.05, 
         'max_depth': 8, 
         'silent': 0, 
         'num_class': len(target_cols),
         'eval_metric': 'mlogloss',
         'min_child_weight': 1,
         'subsample': 0.7,
         'colsample_bytree': 0.7,
         'seed': 1}
num_rounds = 50

dtrain = xgb.DMatrix(x_train.values, y_train.values-1)
model = xgb.train(param, dtrain, num_rounds, evals=[(dtrain, 'train')], verbose_eval=True)

[0]	train-mlogloss:2.88034
[1]	train-mlogloss:2.72684
[2]	train-mlogloss:2.61254
[3]	train-mlogloss:2.51241
[4]	train-mlogloss:2.42313
[5]	train-mlogloss:2.35193
[6]	train-mlogloss:2.28651
[7]	train-mlogloss:2.22772
[8]	train-mlogloss:2.17241
[9]	train-mlogloss:2.12307
[10]	train-mlogloss:2.07692
[11]	train-mlogloss:2.02938
[12]	train-mlogloss:1.98781
[13]	train-mlogloss:1.94965
[14]	train-mlogloss:1.9163
[15]	train-mlogloss:1.88056
[16]	train-mlogloss:1.84806
[17]	train-mlogloss:1.81657
[18]	train-mlogloss:1.78765
[19]	train-mlogloss:1.76201
[20]	train-mlogloss:1.73461
[21]	train-mlogloss:1.71128
[22]	train-mlogloss:1.68732
[23]	train-mlogloss:1.66754
[24]	train-mlogloss:1.64663
[25]	train-mlogloss:1.62556
[26]	train-mlogloss:1.60527
[27]	train-mlogloss:1.58546
[28]	train-mlogloss:1.56728
[29]	train-mlogloss:1.54993
[30]	train-mlogloss:1.53337
[31]	train-mlogloss:1.51793
[32]	train-mlogloss:1.50347
[33]	train-mlogloss:1.48873
[34]	train-mlogloss:1.47609
[35]	train-mlogloss:1.46256
[36

In [54]:
# the two months
m1 = month_cols[-2] # numpy.datetime64('2015-05-28T00:00:00.000000000')
m2 = month_cols[-1] # numpy.datetime64('2015-06-28T00:00:00.000000000')

# use customer features in 2015-06 
x_train = df.loc[df.fecha_dato==m2, feature_cols].copy()
# set customer id as row index
x_train.set_index('ncodpers', inplace=True)
# customer ids in 2015-06
customer_id = x_train.index.tolist()

# add sales in the previous month (2015-05)
prev_target = df_pivot.loc[df_pivot.index.isin(customer_id), (slice(None), m1)]
# change column names
prev_target.columns = target_cols
# join features
x_train = x_train.join(prev_target)
# replace NAN in previous target_cols with 0.0
x_train.loc[:, target_cols].fillna(0.0, inplace=True)
# reset index to include ncodpers as one column
x_train.reset_index(inplace=True)
x_train.set_index('ncodpers', drop=False, inplace=True)
x_train.sort_index(inplace=True)

In [65]:
preds = model.predict(xgb.DMatrix(x_train.values))
preds = np.argsort(preds, axis=1)
preds = np.fliplr(preds)[:, :7]

In [59]:
test_id = df.loc[df.fecha_dato==m2, 'ncodpers'].values
final_preds = [' '.join([target_cols[k] for k in pred]) for pred in preds]
out_df = pd.DataFrame({'ncodpers': test_id, 'added_products': final_preds})
out_df.to_csv('../input/eda_4_1.csv.gz', compression='gzip', index=False)

In [69]:
test_id = df.loc[df.fecha_dato==m2, 'ncodpers'].values

In [71]:
out_df

Unnamed: 0,ncodpers,added_products
0,15889,ind_reca_fin_ult1 ind_pres_fin_ult1 ind_nom_pe...
1,1170544,ind_pres_fin_ult1 ind_ctpp_fin_ult1 ind_aval_f...
2,1170545,ind_hip_fin_ult1 ind_cder_fin_ult1 ind_nom_pen...
3,1170547,ind_aval_fin_ult1 ind_reca_fin_ult1 ind_recibo...
4,1170548,ind_cder_fin_ult1 ind_fond_fin_ult1 ind_tjcr_f...
5,1170550,ind_hip_fin_ult1 ind_nom_pens_ult1 ind_cder_fi...
6,1170552,ind_reca_fin_ult1 ind_recibo_ult1 ind_pres_fin...
7,1170553,ind_aval_fin_ult1 ind_nom_pens_ult1 ind_ctpp_f...
8,1170555,ind_aval_fin_ult1 ind_reca_fin_ult1 ind_recibo...
9,1170557,ind_recibo_ult1 ind_pres_fin_ult1 ind_hip_fin_...
