## Simple CV with RAM-Limited Data

In [1]:
import os
# Windows only: put the MinGW-w64 runtime directory at the front of PATH before
# xgboost is imported further down (presumably so its native DLL dependencies
# resolve -- TODO confirm this is still required for the installed build).
# NOTE(review): the directory is a hard-coded absolute path for one machine.
if os.name == 'nt':
    mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
    # A missing PATH variable is the realistic failure the old bare
    # `except: pass` could hide; handle it explicitly instead of swallowing
    # every exception.
    current_path = os.environ.get('PATH', '')
    if current_path:
        os.environ['PATH'] = mingw_path + os.pathsep + current_path
    else:
        os.environ['PATH'] = mingw_path
    
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tqdm
import gc
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# Register tqdm's progress_apply / progress_map helpers on pandas objects.
tqdm.tqdm.pandas()

%matplotlib inline

In [2]:
# Customer-level columns that create_train_test copies from the later month's
# snapshot as model features. 'ncodpers' (the customer id) must stay in this
# list because it is used there as the join key.
# NOTE(review): despite the name, the list also holds numeric-looking fields
# ('age', 'antiguedad', 'renta') -- confirm how they are encoded upstream.
cat_cols = ['ncodpers',
 'canal_entrada',
 'conyuemp',
 'ind_actividad_cliente',
 'ind_empleado',
 'ind_nuevo',
 'indext',
 'indfall',
 'indrel',
 'indrel_1mes',
 'indresi',
 'pais_residencia',
 'segmento',
 'sexo',
 'tipodom',
 'tiprel_1mes',
 'age',
 'antiguedad',
 'renta']

# Product indicator columns. The position of a name in this list is its class
# index: create_train_test encodes labels with it, the model's num_class is
# len(target_cols), and the submission cell decodes predictions with
# target_cols[k]. Do not reorder without retraining.
target_cols = ['ind_cco_fin_ult1',
 'ind_cder_fin_ult1',
 'ind_cno_fin_ult1',
 'ind_ctju_fin_ult1',
 'ind_ctma_fin_ult1',
 'ind_ctop_fin_ult1',
 'ind_ctpp_fin_ult1',
 'ind_deco_fin_ult1',
 'ind_dela_fin_ult1',
 'ind_deme_fin_ult1',
 'ind_ecue_fin_ult1',
 'ind_fond_fin_ult1',
 'ind_hip_fin_ult1',
 'ind_nom_pens_ult1',
 'ind_nomina_ult1',
 'ind_plan_fin_ult1',
 'ind_pres_fin_ult1',
 'ind_reca_fin_ult1',
 'ind_recibo_ult1',
 'ind_tjcr_fin_ult1',
 'ind_valo_fin_ult1',
 'ind_viv_fin_ult1']

In [3]:
def create_train_test(month1, month2, target_flag=True):
    '''Build model inputs from two monthly customer snapshots.

    Reads ``../input/data_month_<month>.hdf`` (key ``data_month``) for both
    months. The features are the ``cat_cols`` of ``month2`` plus the
    ``target_cols`` product flags the same customers had in ``month1``
    (0.0 for customers that do not appear in ``month1``).

    Parameters
    ----------
    month1 : str
        Earlier month, exactly as it appears in the HDF file name.
    month2 : str
        Later month, exactly as it appears in the HDF file name.
    target_flag : bool, default True
        If False, return features only, one row per ``month2`` row.
        If True, also build labels and return one row per
        (customer, newly added product) pair.

    Returns
    -------
    x_vars : pandas.DataFrame
        Columns are ``ncodpers``, the remaining ``cat_cols`` and the
        ``month1`` product flags. Returned alone when ``target_flag`` is False.
    target : pandas.Series
        Only when ``target_flag`` is True: for every row of ``x_vars`` the
        position in ``target_cols`` of the product that was added. Rows are
        ordered by the original ``month2`` row order, then by class index.

    Notes
    -----
    Assumes ``ncodpers`` is unique within each month; the index joins below
    would otherwise duplicate rows.
    '''

    # first/early month
    df1 = pd.read_hdf('../input/data_month_{}.hdf'.format(month1), 'data_month')
    # second/later month
    df2 = pd.read_hdf('../input/data_month_{}.hdf'.format(month2), 'data_month')

    product_cols = ['ncodpers'] + target_cols

    # second month products, indexed by customer
    df2_target = df2.loc[:, product_cols].set_index('ncodpers')

    # first month products for all the customers in the second month:
    # left-join onto the second month's customer ids, NaN filled by 0.0
    df1_target = (
        pd.DataFrame(index=df2_target.index)
        .join(df1.loc[:, product_cols].set_index('ncodpers'), how='left')
        .fillna(0.0)
    )

    # new products from the first to second month: positive where a flag went
    # 0 -> 1; dropped products (negative) and missing values count as 0
    target = df2_target.subtract(df1_target)
    target[target < 0] = 0
    target = target.fillna(0.0)

    # features of the second month:
    # 1. customer features in the second month (cat_cols already includes ncodpers)
    # 2. products in the first month
    # 'sample_order' records the original row order of the second month
    x_vars = (
        df2[cat_cols]
        .reset_index(drop=True)
        .rename_axis('sample_order')
        .reset_index(drop=False)
        .set_index('ncodpers')
        .join(df1_target)  # direct join: df1_target covers every month2 customer
    )

    # features only: drop the bookkeeping column and expose ncodpers as a column
    if not target_flag:
        return x_vars.drop('sample_order', axis=1).reset_index(drop=False)

    # one row per (customer, product); the label columns get a '_t' suffix
    # because they share their names with the first-month product features
    id_cols = ['ncodpers'] + x_vars.columns.tolist()
    x_vars_new = (
        x_vars.join(target, rsuffix='_t')
        .reset_index(drop=False)
        .melt(id_vars=id_cols)
    )

    # keep new products only; filtering before sorting gives the same rows in
    # the same order while sorting a much smaller frame
    x_vars_new = x_vars_new[x_vars_new['value'] > 0].copy()

    # replace the label column name by its index in target_cols.
    # Assigning the mapped column back (instead of an in-place replace on the
    # attribute accessor) stays correct under pandas Copy-on-Write.
    target_cols_mapping = {c + '_t': n for (n, c) in enumerate(target_cols)}
    x_vars_new['variable'] = x_vars_new['variable'].map(target_cols_mapping)

    # restore the original row order, then drop the bookkeeping columns
    x_vars_new = (
        x_vars_new.sort_values(['sample_order', 'variable'])
        .drop(['sample_order', 'value'], axis=1)
        .reset_index(drop=True)
    )

    # everything but the last column is a feature; the last one is the label
    x_vars = x_vars_new.iloc[:, :-1].copy()
    target = x_vars_new.iloc[:, -1].copy()

    return x_vars, target

Generate data

In [4]:
# Training set: one row per product newly added between the 2015-05-28 and
# 2015-06-28 snapshots, with the class index of that product as the label.
x_train, y_train = create_train_test('2015-05-28', '2015-06-28', target_flag=True)

In [5]:
# Test features only (no labels): 2016-06-28 customers together with the
# products they had on 2016-05-28.
x_test = create_train_test('2016-05-28', '2016-06-28', target_flag=False)

In [6]:
# Previously saved arrays, used below as a reference for the data regenerated above.
# NOTE(review): nothing in this notebook writes these .npy files -- presumably
# they come from an earlier pipeline; confirm their provenance.
train_X = np.load('train_X.npy')
train_y = np.load('train_y.npy')
test_X = np.load('test_X.npy')

In [14]:
# Largest absolute element-wise difference between the reference training
# features and the regenerated ones (0 would mean an exact match).
a = train_X-x_train.values
np.abs(a).max()

5.000000000000143e-05

In [15]:
# Same check for the labels: 0 means every regenerated class index matches
# the reference.
a = train_y-y_train.values
np.abs(a).max()

0

In [16]:
# Same check for the test features.
a = test_X-x_test.values
np.abs(a).max()

5.000000000000143e-05

## Train model

In [17]:
# multi:softprob makes predict() return one probability per class for every
# row; num_class is tied to target_cols because the labels are positions in
# that list.
param = {'objective': 'multi:softprob', 
         'eta': 0.05, 
         'max_depth': 8, 
         'silent': 1, 
         'num_class': len(target_cols),
         'eval_metric': 'mlogloss',
         'min_child_weight': 1,
         'subsample': 0.7,
         'colsample_bytree': 0.7,
         'seed': 0}
num_rounds = 50

# .values drops the column names, so the booster only sees positional features;
# prediction inputs must therefore use the same column order as x_train.
dtrain = xgb.DMatrix(x_train.values, y_train.values)
# NOTE(review): the only evaluation set is the training data itself, so the
# logged mlogloss is training loss -- no held-out fold is scored here.
model = xgb.train(param, dtrain, num_rounds, evals=[(dtrain, 'train')], verbose_eval=True)

[0]	train-mlogloss:2.91645
[1]	train-mlogloss:2.78563
[2]	train-mlogloss:2.6751
[3]	train-mlogloss:2.58366
[4]	train-mlogloss:2.49453
[5]	train-mlogloss:2.41242
[6]	train-mlogloss:2.34612
[7]	train-mlogloss:2.28669
[8]	train-mlogloss:2.22937
[9]	train-mlogloss:2.17384
[10]	train-mlogloss:2.12307
[11]	train-mlogloss:2.08031
[12]	train-mlogloss:2.04
[13]	train-mlogloss:1.99893
[14]	train-mlogloss:1.95945
[15]	train-mlogloss:1.92307
[16]	train-mlogloss:1.88998
[17]	train-mlogloss:1.85772
[18]	train-mlogloss:1.82859
[19]	train-mlogloss:1.80279
[20]	train-mlogloss:1.77791
[21]	train-mlogloss:1.75428
[22]	train-mlogloss:1.72977
[23]	train-mlogloss:1.70668
[24]	train-mlogloss:1.68579
[25]	train-mlogloss:1.66468
[26]	train-mlogloss:1.6446
[27]	train-mlogloss:1.62652
[28]	train-mlogloss:1.61053
[29]	train-mlogloss:1.59396
[30]	train-mlogloss:1.57689
[31]	train-mlogloss:1.56132
[32]	train-mlogloss:1.54642
[33]	train-mlogloss:1.5324
[34]	train-mlogloss:1.5179
[35]	train-mlogloss:1.5043
[36]	train

Prediction from my model

In [18]:
# Per-class probabilities for every test row -> class indices sorted by
# ascending probability -> reversed so the most probable comes first ->
# truncated to the 7 best classes per row.
preds = np.argsort(
    model.predict(xgb.DMatrix(x_test.values)),
    axis=1,
)[:, ::-1][:, :7]

Write out prediction results from my model

In [20]:
# Customer ids, in the same row order as the predictions (both come from x_test).
test_id = x_test['ncodpers'].values
# Each row of class indices becomes a space-separated list of product names.
final_preds = [
    ' '.join(target_cols[class_idx] for class_idx in top_classes)
    for top_classes in preds
]

# One line per customer, written as a gzip-compressed CSV without the index.
out_df = pd.DataFrame({'ncodpers': test_id, 'added_products': final_preds})
out_df.to_csv('eda_4_12.csv.gz', index=False, compression='gzip')

In [22]:
# Reference predictions to compare against the ones computed above.
# NOTE(review): this notebook does not write preds.npy -- confirm where it comes from.
preds_t = np.load('preds.npy')

In [24]:
# Element-wise difference to the reference. preds holds class indices (argsort
# output); assuming preds_t does too, only zero vs. non-zero is meaningful here,
# not the size of the difference.
a = preds_t-preds

In [27]:
# Absolute difference per prediction slot.
# NOTE(review): the saved output of this cell is a single number, which a bare
# np.abs(a) on an array would not display -- the cell was probably edited after
# it last ran; re-run before relying on the recorded value.
np.abs(a)

1568946