In [1]:
import os
if os.name=='nt':
    try:
        mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
        os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']
    except:
        pass
    
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tqdm
import gc
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

tqdm.tqdm.pandas()

%matplotlib inline

In [2]:
df = pd.read_hdf('../input/data_all.hdf', 'train_test_converted')

In [3]:
target_cols = [
               #'ind_ahor_fin_ult1','ind_aval_fin_ult1',
               'ind_cco_fin_ult1','ind_cder_fin_ult1',
               'ind_cno_fin_ult1','ind_ctju_fin_ult1',
               'ind_ctma_fin_ult1','ind_ctop_fin_ult1',
               'ind_ctpp_fin_ult1',
               #'ind_deco_fin_ult1',
               #'ind_deme_fin_ult1',
               'ind_dela_fin_ult1',
               'ind_ecue_fin_ult1','ind_fond_fin_ult1',
               'ind_hip_fin_ult1','ind_plan_fin_ult1',
               'ind_pres_fin_ult1','ind_reca_fin_ult1',
               'ind_tjcr_fin_ult1','ind_valo_fin_ult1',
               #'ind_viv_fin_ult1',
               'ind_nomina_ult1',
               'ind_nom_pens_ult1','ind_recibo_ult1']
target_cols = sorted(target_cols)

In [4]:
target_cols_map = {c+'_t':n for n, c in enumerate(target_cols)}
target_cols_map_reverse = {n:c for n, c in enumerate(target_cols)}

In [5]:
feature_cols = ['ncodpers', 'ind_empleado', 'pais_residencia', 'sexo',
                'age', 'fecha_alta', 'ind_nuevo', 'antiguedad', 'indrel',
                'ult_fec_cli_1t', 'indrel_1mes', 'tiprel_1mes', 'indresi', 'indext',
                'conyuemp', 'canal_entrada', 'indfall', 'tipodom', 'cod_prov',
                'nomprov', 'ind_actividad_cliente', 'renta', 'segmento']
feature_cols = sorted(feature_cols)

In [6]:
month_cols = df.fecha_dato.unique()

In [7]:
def prepare_data(m1, m2, train=True):
    '''m1 is the first month, m2 is the second month
    '''    
    m1 = month_cols[m1]
    m2 = month_cols[m2]

    # customer id in m2
    customer_m2 = df.loc[df.fecha_dato==m2, 'ncodpers'].values.tolist()
    customer_m2 = sorted(customer_m2)
    
    # sales in m1 for customers in both m1 and m2
    sales_m1 = df.loc[(df.fecha_dato==m1) & 
                      (df.ncodpers.isin(customer_m2)), 
                      ['ncodpers']+target_cols]
    sales_m1.set_index('ncodpers', drop=True, inplace=True)
    sales_m1.sort_index(inplace=True)

    feature_m2 = df.loc[df.fecha_dato==m2, feature_cols].copy()
    feature_m2.set_index('ncodpers', drop=False, inplace=True)
    feature_m2.sort_index(inplace=True)

    # customer features in m2, including sales in m1
    u = feature_m2.join(sales_m1)
    u[target_cols] = u[target_cols].fillna(0.0)

    x_train = u.copy()

    # generate target only for train dataset
    if train:
        # sales in m2 
        sales_m2 = df.loc[df.fecha_dato==m2, ['ncodpers']+target_cols].copy()
        sales_m2.fillna(0.0, inplace=True)
        sales_m2.set_index('ncodpers', inplace=True)
        sales_m2.sort_index(inplace=True)

        # sales in m1 for customers in m2, fill NAN with 0.0 for customers only m2
        n = sales_m2.copy()
        n.loc[n.index.isin(sales_m1.index), target_cols] = sales_m1
        n.loc[~n.index.isin(sales_m1.index), target_cols] = np.zeros(n.loc[~n.index.isin(sales_m1.index), target_cols].shape)
        n.fillna(0.0, inplace=True)

        # new products from m1 to m2
        sales_m2 = sales_m2.subtract(n)

        # for each newly purchased product, generate one sample, so one customer can have 0, 1, or multiple samples
        n = x_train.join(sales_m2, rsuffix='_t')
        n = n.melt(id_vars=x_train.columns)
        n.variable.replace(target_cols_map, inplace=True)
        n = n.loc[n.value>0].copy() # only keep added products
        n.drop('value', axis=1, inplace=True)
        n.reset_index(inplace=True, drop=True)

        x_train = n.iloc[:, :-1].copy()
        y_train = n.iloc[:, -1].copy()
        
        return x_train, y_train
    else:
        return x_train, customer_m2, sales_m1, feature_m2

In [8]:
x_train_june, y_train_june = prepare_data(4, 5)
x_train_july, y_train_july = prepare_data(5, 6)

In [9]:
x_test, customer_m2, sales_m1, feature_m2 = prepare_data(-2, -1, train=False)

In [11]:
param = {'objective': 'multi:softprob', 
         'eta': 0.5, 
         'max_depth': 8, 
         'silent': 0, 
         'num_class': len(target_cols),
         'eval_metric': 'mlogloss',
         'min_child_weight': 1,
         'subsample': 0.7,
         'colsample_bytree': 0.7,
         'seed': 1}
num_rounds = 12

dtrain = xgb.DMatrix(x_train_june.values, y_train_june.values)
dval = xgb.DMatrix(x_train_july.values, y_train_july.values)
model = xgb.train(param, dtrain, num_rounds, evals=[(dtrain, 'train'), (dval, 'val')], verbose_eval=True)

[0]	train-mlogloss:1.70981	val-mlogloss:1.52834
[1]	train-mlogloss:1.51446	val-mlogloss:1.38789
[2]	train-mlogloss:1.3978	val-mlogloss:1.30822
[3]	train-mlogloss:1.31504	val-mlogloss:1.26586
[4]	train-mlogloss:1.26	val-mlogloss:1.22854
[5]	train-mlogloss:1.21856	val-mlogloss:1.20626
[6]	train-mlogloss:1.18207	val-mlogloss:1.19273
[7]	train-mlogloss:1.15243	val-mlogloss:1.17872
[8]	train-mlogloss:1.12966	val-mlogloss:1.17558
[9]	train-mlogloss:1.11073	val-mlogloss:1.171
[10]	train-mlogloss:1.09395	val-mlogloss:1.16493
[11]	train-mlogloss:1.07685	val-mlogloss:1.16096


In [36]:
test_id = df.loc[df.fecha_dato==month_cols[-1], 'ncodpers'].values
preds = model.predict(xgb.DMatrix(x_test.values))
preds = pd.DataFrame(preds, index=test_id, columns=target_cols)
preds = preds.subtract(sales_m1)
preds[preds<0] = 0
preds = np.argsort(preds.values, axis=1)
preds = np.fliplr(preds)[:, :7]
preds = preds[:, :7]

In [37]:
#test_id = df.loc[df.fecha_dato==month_cols[-1], 'ncodpers'].values
final_preds = [' '.join([target_cols_map_reverse[k] for k in pred]) for pred in preds]
out_df = pd.DataFrame({'ncodpers': test_id, 'added_products': final_preds})
out_df.to_csv('../input/eda_4_2.csv.gz', compression='gzip', index=False)

In [38]:
out_df

Unnamed: 0,ncodpers,added_products
0,15889,ind_recibo_ult1 ind_reca_fin_ult1 ind_nom_pens...
1,1170544,ind_cco_fin_ult1 ind_dela_fin_ult1 ind_reca_fi...
2,1170545,ind_nomina_ult1 ind_nom_pens_ult1 ind_cno_fin_...
3,1170547,ind_recibo_ult1 ind_nom_pens_ult1 ind_reca_fin...
4,1170548,ind_cno_fin_ult1 ind_dela_fin_ult1 ind_fond_fi...
5,1170550,ind_nom_pens_ult1 ind_nomina_ult1 ind_cno_fin_...
6,1170552,ind_nomina_ult1 ind_recibo_ult1 ind_nom_pens_u...
7,1170553,ind_nomina_ult1 ind_cco_fin_ult1 ind_dela_fin_...
8,1170555,ind_recibo_ult1 ind_ecue_fin_ult1 ind_reca_fin...
9,1170557,ind_cno_fin_ult1 ind_nom_pens_ult1 ind_nomina_...
