In [1]:
import os
if os.name == 'nt':
    # Windows only: put the MinGW-w64 bin directory at the front of PATH
    # before the remaining imports run (presumably so that xgboost can find
    # the MinGW runtime DLLs -- TODO confirm).
    mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
    # Read PATH with .get() so a missing variable is handled explicitly;
    # no blanket exception handler is needed, and real errors stay visible.
    current_path = os.environ.get('PATH', '')
    os.environ['PATH'] = os.pathsep.join(
        entry for entry in (mingw_path, current_path) if entry
    )
    
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tqdm
import gc
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# Register tqdm's pandas integration (adds progress_apply and related methods
# to pandas objects).
tqdm.tqdm.pandas()

%matplotlib inline

In [2]:
# Load the full table from the HDF5 store (the key name suggests combined,
# already-converted train and test rows -- TODO confirm).
df = pd.read_hdf('../input/data_all.hdf', key='train_test_converted')

Use the 2015-05 and 2015-06 data to train a model, then predict 2016-06 from the 2016-05 data.

1. Use two months of data to create one dataset
    1. For each user in 2015-06, extract all of their historical data

In [3]:
# Product indicator columns used as prediction targets, kept in sorted order.
# Deliberately left out: ind_ahor_fin_ult1, ind_aval_fin_ult1,
# ind_deco_fin_ult1, ind_deme_fin_ult1, ind_viv_fin_ult1.
target_cols = [
    'ind_cco_fin_ult1',
    'ind_cder_fin_ult1',
    'ind_cno_fin_ult1',
    'ind_ctju_fin_ult1',
    'ind_ctma_fin_ult1',
    'ind_ctop_fin_ult1',
    'ind_ctpp_fin_ult1',
    'ind_dela_fin_ult1',
    'ind_ecue_fin_ult1',
    'ind_fond_fin_ult1',
    'ind_hip_fin_ult1',
    'ind_plan_fin_ult1',
    'ind_pres_fin_ult1',
    'ind_reca_fin_ult1',
    'ind_tjcr_fin_ult1',
    'ind_valo_fin_ult1',
    'ind_nomina_ult1',
    'ind_nom_pens_ult1',
    'ind_recibo_ult1',
]
# The position in this sorted list later serves as the class label.
target_cols.sort()

In [20]:
# Map each suffixed target column name ('<col>_t') to its integer class label,
# i.e. the column's position in target_cols.
target_cols_map = dict(
    (name + '_t', label) for label, name in enumerate(target_cols)
)

In [5]:
# Customer attribute columns used as model inputs, kept in sorted order.
feature_cols = [
    'age',
    'antiguedad',
    'canal_entrada',
    'cod_prov',
    'conyuemp',
    'fecha_alta',
    'fecha_dato',
    'ind_actividad_cliente',
    'ind_empleado',
    'ind_nuevo',
    'indext',
    'indfall',
    'indrel',
    'indrel_1mes',
    'indresi',
    'ncodpers',
    'nomprov',
    'pais_residencia',
    'renta',
    'segmento',
    'sexo',
    'tipodom',
    'tiprel_1mes',
    'ult_fec_cli_1t',
]
feature_cols.sort()

In [6]:
# Distinct snapshot dates (fecha_dato), in order of first appearance in df.
month_cols = df['fecha_dato'].unique()

In [7]:
# Positions 4, 5 and 6 of month_cols are used as May, June and July
# (assumes the data starts in January 2015 and is in date order -- TODO confirm).
may, june, july = (month_cols[position] for position in (4, 5, 6))

# Sorted customer ids (ncodpers) present in the June and July snapshots.
customer_june = sorted(df.loc[df['fecha_dato'] == june, 'ncodpers'].values.tolist())
customer_july = sorted(df.loc[df['fecha_dato'] == july, 'ncodpers'].values.tolist())

Prepare train data for predicting new products in 2015-06

In [34]:
# Build the June training set: one row per (customer, product newly added
# between May and June). Inputs are the June customer features plus the
# customer's May product holdings; the label is the product's class index.

# May product holdings of customers who are also present in June,
# indexed by customer id.
sales_may = (
    df.loc[(df['fecha_dato'] == may) & df['ncodpers'].isin(customer_june),
           ['ncodpers'] + target_cols]
    .set_index('ncodpers')
    .sort_index()
)

# June customer features; ncodpers stays as a column as well as the index.
feature_june = (
    df.loc[df['fecha_dato'] == june, feature_cols]
    .set_index('ncodpers', drop=False)
    .sort_index()
)

# Left join keeps every June customer; customers with no May row (and any
# missing May values) get 0 for all lagged product flags.
x_train_june = feature_june.join(sales_may)
x_train_june[target_cols] = x_train_june[target_cols].fillna(0.0)

del feature_june
gc.collect()

# Product holdings in June, indexed by customer id.
target_june = (
    df.loc[df['fecha_dato'] == june, ['ncodpers'] + target_cols]
    .fillna(0.0)
    .set_index('ncodpers')
    .sort_index()
)

# May holdings aligned row-for-row with the June table; customers that are
# absent in May, and missing May values, count as "not held" (0).
holdings_may = sales_may.reindex(target_june.index).fillna(0.0)

# June minus May: +1 = product added, 0 = unchanged, -1 = product dropped.
target_june = target_june.subtract(holdings_may)

# Attach the per-product deltas (suffixed '_t' because the lagged columns
# share their names) and unpivot them to one row per (customer, product).
long_june = x_train_june.join(target_june, rsuffix='_t')
long_june = long_june.melt(id_vars=list(x_train_june.columns))

# Turn the '<col>_t' names into integer class labels. This is an explicit
# column assignment: an in-place replace on `frame.variable` is a chained
# operation that does not update the frame under pandas Copy-on-Write.
long_june['variable'] = long_june['variable'].map(target_cols_map)

# Keep only newly added products; the delta itself is no longer needed.
long_june = (
    long_june.loc[long_june['value'] > 0]
    .drop('value', axis=1)
    .reset_index(drop=True)
)

# The last remaining column ('variable') is the label; the rest are inputs.
x_train_june = long_june.iloc[:, :-1].copy()
y_train_june = long_june.iloc[:, -1].copy()

del long_june, holdings_may
gc.collect()

91

In [None]:
# Multi-class model: one class per target product, predicting per-class
# probabilities and evaluated with multi-class log loss.
param = dict(
    objective='multi:softprob',
    eta=0.05,
    max_depth=8,
    silent=0,
    num_class=len(target_cols),
    eval_metric='mlogloss',
    min_child_weight=1,
    subsample=0.7,
    colsample_bytree=0.7,
    seed=1,
)
num_rounds = 50

# Train on the June set; the training matrix doubles as the only watch list,
# so the logged metric is training loss, not a validation score.
dtrain = xgb.DMatrix(x_train_june.values, label=y_train_june.values)
model = xgb.train(
    param,
    dtrain,
    num_boost_round=num_rounds,
    evals=[(dtrain, 'train')],
    verbose_eval=True,
)

Prepare train data for predicting new products in 2015-07

# June product holdings of customers who are also present in July,
# indexed by customer id.
sales_june = (
    df.loc[(df['fecha_dato'] == june) & df['ncodpers'].isin(customer_july),
           ['ncodpers'] + target_cols]
    .set_index('ncodpers')
    .sort_index()
)

# July customer features; ncodpers stays as a column as well as the index.
feature_july = (
    df.loc[df['fecha_dato'] == july, feature_cols]
    .set_index('ncodpers', drop=False)
    .sort_index()
)

# Left join keeps every July customer; customers with no June row (and any
# missing June values) get 0 for all lagged product flags.
x_train_july = feature_july.join(sales_june)
x_train_july[target_cols] = x_train_july[target_cols].fillna(0.0)

gc.collect()