## Feature Engineering in RAM-Limited Data, Part 1

In [1]:
import os
# On Windows, put the MinGW-w64 runtime directory at the front of PATH before the
# remaining imports run (presumably so xgboost can locate the MinGW DLLs — TODO confirm).
if os.name == 'nt':
    mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
    # .get() keeps this best-effort when PATH is unset, without a bare `except`
    # that would also hide unrelated errors.
    current_path = os.environ.get('PATH', '')
    # Only prepend once, so re-running the cell does not keep growing PATH.
    if mingw_path not in current_path.split(';'):
        os.environ['PATH'] = mingw_path + ';' + current_path
    
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tqdm
import gc
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# Enable tqdm's pandas integration.
tqdm.tqdm.pandas()

# IPython magic: show matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Customer attribute columns copied from the later month's snapshot into the
# feature frame. 'ncodpers' (the customer key used for all joins) must stay in
# the list; despite the name, 'age', 'antiguedad' and 'renta' are listed here too.
cat_cols = [
    'ncodpers',
    'canal_entrada',
    'conyuemp',
    'ind_actividad_cliente',
    'ind_empleado',
    'ind_nuevo',
    'indext',
    'indfall',
    'indrel',
    'indrel_1mes',
    'indresi',
    'pais_residencia',
    'segmento',
    'sexo',
    'tipodom',
    'tiprel_1mes',
    'age',
    'antiguedad',
    'renta',
]

# Product indicator columns used both as previous-month features and as the
# prediction classes; the position in this list is the class label.
# Deliberately left out (reason not recorded in this notebook):
#   'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_viv_fin_ult1'
target_cols = [
    'ind_cco_fin_ult1',
    'ind_cder_fin_ult1',
    'ind_cno_fin_ult1',
    'ind_ctju_fin_ult1',
    'ind_ctma_fin_ult1',
    'ind_ctop_fin_ult1',
    'ind_ctpp_fin_ult1',
    'ind_dela_fin_ult1',
    'ind_ecue_fin_ult1',
    'ind_fond_fin_ult1',
    'ind_hip_fin_ult1',
    'ind_nom_pens_ult1',
    'ind_nomina_ult1',
    'ind_plan_fin_ult1',
    'ind_pres_fin_ult1',
    'ind_reca_fin_ult1',
    'ind_recibo_ult1',
    'ind_tjcr_fin_ult1',
    'ind_valo_fin_ult1',
]

In [3]:
def _combine_with_previous(df_prev, df_curr, col, fill_value, base, out_name):
    '''Encode the current and previous month values of `col` as one number per customer.

    Both snapshots are indexed by ncodpers and left-joined with the current month
    on the left. Missing values on either side are replaced by `fill_value`, then
    base*current + previous is computed. Returns a one-column DataFrame named
    `out_name`, indexed by ncodpers.
    '''
    curr = df_curr[['ncodpers', col]].set_index('ncodpers').sort_index()
    prev = df_prev[['ncodpers', col]].set_index('ncodpers').sort_index()
    both = curr.join(prev, rsuffix='_p').fillna(fill_value)
    return (base*both[col] + both[col+'_p']).to_frame(out_name)


def create_train_test(month1, month2, target_flag=True):
    '''Create train and test data between month1 and month2.

    Reads '../input/data_month_{month}.hdf' (key 'data_month') for both months and
    builds, for every customer row of month2, a feature row made of:
      * the month2 customer columns listed in cat_cols,
      * the month1 values of target_cols (0.0 for customers absent in month1),
      * combined current/previous encodings of ind_actividad_cliente and tiprel_1mes,
      * 'target_combine', a weighted sum of the month1 product flags.

    Parameters
    ----------
    month1 : str
        Label of the earlier month, substituted into the file name.
    month2 : str
        Label of the later month, substituted into the file name.
    target_flag : bool, default True
        If False, only the feature frame is returned (one row per month2 row).
        If True, labels are built as well.

    Returns
    -------
    x_vars : pandas.DataFrame
        Features, with 'ncodpers' as the first column. When target_flag is True
        there is one row per (customer, newly added product) pair, ordered by the
        original month2 row order and then by product index.
    target : pandas.Series
        Only when target_flag is True: position in target_cols of the product that
        was added between month1 and month2, aligned with x_vars.
    '''
    # first/early month
    df1 = pd.read_hdf('../input/data_month_{}.hdf'.format(month1), 'data_month')
    # second/later month
    df2 = pd.read_hdf('../input/data_month_{}.hdf'.format(month2), 'data_month')

    # second month products, indexed by ncodpers (column kept for the moment)
    df2_target = df2.loc[:, ['ncodpers']+target_cols].set_index('ncodpers', drop=False)
    # one-column frame with the ncodpers of every second-month row; left side of the join below
    df2_ncodpers = df2_target[['ncodpers']]
    df2_target = df2_target.drop('ncodpers', axis=1)

    # first month products for all the customers in the second month;
    # customers that do not appear in the first month get 0.0 everywhere
    df1_target = df1.loc[:, ['ncodpers']+target_cols].set_index('ncodpers')
    df1_target = df2_ncodpers.join(df1_target, how='left').fillna(0.0).drop('ncodpers', axis=1)

    # new products from the first to second month: positive differences only
    target = df2_target.subtract(df1_target)
    target[target < 0] = 0
    target = target.fillna(0.0)

    # features of the second month:
    # 1. customer features in the second month (cat_cols already includes ncodpers)
    # 2. products in the first month
    # 'sample_order' records the original row position so it can be restored after melting
    x_vars = (df2[cat_cols]
              .reset_index(drop=True)
              .rename_axis('sample_order')
              .reset_index()
              .set_index('ncodpers')
              .join(df1_target))  # direct join since df1_target contains all customers in month2

    # this and previous month values of ind_actividad_cliente (missing -> 2.0, base 3)
    activity_combine = _combine_with_previous(
        df1, df2, 'ind_actividad_cliente', 2.0, 3, 'ind_actividad_client_combine')
    x_vars = pd.merge(x_vars, activity_combine, left_index=True, right_index=True, how='left')

    # this and previous month values of tiprel_1mes (missing -> 0.0, base 6)
    tiprel_combine = _combine_with_previous(
        df1, df2, 'tiprel_1mes', 0.0, 6, 'tiprel_1mes_combine')
    x_vars = pd.merge(x_vars, tiprel_combine, left_index=True, right_index=True, how='left')

    # combination of target columns: weighted sum with weights 2**-10 ... 2**(len(target_cols)-11)
    x_vars['target_combine'] = np.sum(x_vars[target_cols].values*
        np.float_power(2, np.arange(-10, len(target_cols)-10)), axis=1, dtype=np.float64)

    # features only: drop the helper column and expose ncodpers as a column
    if not target_flag:
        return x_vars.drop('sample_order', axis=1).reset_index()

    # attach the new-product flags; they clash with the month1 product columns
    # already in x_vars, so they receive the '_t' suffix
    x_vars_new = x_vars.join(target, rsuffix='_t').reset_index()
    x_vars = x_vars.reset_index()

    # one row per (customer, product) pair
    x_vars_new = x_vars_new.melt(id_vars=x_vars.columns.tolist())
    # replace the '<product>_t' column name by the product's position in target_cols;
    # assigned explicitly (not via an in-place call on the column) so the frame is
    # updated regardless of the pandas copy semantics in use
    target_cols_mapping = {c+'_t': n for (n, c) in enumerate(target_cols)}
    x_vars_new['variable'] = x_vars_new['variable'].map(target_cols_mapping)
    # restore the original row order, products in target_cols order within a customer
    x_vars_new = x_vars_new.sort_values(['sample_order', 'variable'])
    # keep new products only
    x_vars_new = x_vars_new[x_vars_new['value'] > 0]
    x_vars_new = x_vars_new.drop(['sample_order', 'value'], axis=1).reset_index(drop=True)

    var_cols = [c for c in x_vars.columns if c != 'sample_order']
    # variables
    x_vars = x_vars_new.loc[:, var_cols].copy()
    # target/label
    target = x_vars_new.loc[:, 'variable'].copy()

    return x_vars, target

Generate data

In [4]:
# Training set: features for the 2015-06-28 customers built from the 2015-05-28 and
# 2015-06-28 files; y_train holds the target_cols position of each newly added product.
x_train, y_train = create_train_test('2015-05-28', '2015-06-28', target_flag=True)

In [5]:
# Validation set: same construction for the 2015-11-28 -> 2015-12-28 month pair.
x_val, y_val = create_train_test('2015-11-28', '2015-12-28', target_flag=True)

### Combination of target cols

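Encode the product flags in `target_cols` as a single numeric value per row (a weighted sum with power-of-two weights).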

In [None]:
# Weighted sum of the product flags: column k of target_cols gets weight 2**(k-10).
# NOTE(review): `x_vars` is not defined at notebook level in the visible cells; it only
# exists inside create_train_test, which computes this same column. Unless it is defined
# elsewhere, this cell fails with NameError on a fresh kernel.
x_vars['target_combine'] = np.sum(x_vars[target_cols].values*np.float_power(2, np.arange(-10, len(target_cols)-10)), axis=1, dtype=np.float64)

Alternative encoding: concatenate the product flags in `target_cols` into one string per row.

In [None]:
# Cast the product flags to int, render each row with np.array_str, then strip the
# brackets and spaces so every row becomes one string of digits.
# NOTE(review): depends on a notebook-level `x_vars`, which the visible cells never create.
n = x_vars[target_cols].values.astype(int)
n = [np.array_str(n[k]).strip('[]').replace(' ', '') for k in range(n.shape[0])]
x_vars['target_str'] = n

### ind_actividad_cliente, this and previous months

In [None]:
# Prototype of the ind_actividad_cliente feature that create_train_test also builds.
# NOTE(review): `df1` and `df2` are locals of create_train_test and are not defined at
# notebook level in the visible cells, so this cell depends on hidden kernel state.

# current month values, indexed and sorted by ncodpers
df2_ind_actividad_cliente = df2[['ncodpers', 'ind_actividad_cliente']].copy()
df2_ind_actividad_cliente.set_index('ncodpers', inplace=True)
df2_ind_actividad_cliente.sort_index(inplace=True)

# previous month values, indexed and sorted by ncodpers
df1_ind_actividad_cliente = df1[['ncodpers', 'ind_actividad_cliente']].copy()
df1_ind_actividad_cliente.set_index('ncodpers', inplace=True)
df1_ind_actividad_cliente.sort_index(inplace=True)

# left join on ncodpers; the previous month column gets the '_p' suffix
df2_ind_actividad_cliente = df2_ind_actividad_cliente.join(df1_ind_actividad_cliente, rsuffix='_p')
# missing values (in either month) become 2.0
df2_ind_actividad_cliente.fillna(2.0, inplace=True)
# combined code: 3*current + previous
df2_ind_actividad_cliente['ind_actividad_client_combine'] = 3*df2_ind_actividad_cliente.ind_actividad_cliente+df2_ind_actividad_cliente.ind_actividad_cliente_p
# keep only the combined column (the last one)
df2_ind_actividad_cliente = pd.DataFrame(df2_ind_actividad_cliente.iloc[:, -1])

# NOTE(review): x_train as returned by create_train_test has a fresh 0..n-1 index, while the
# right-hand frame is indexed by ncodpers, so this index merge does not line up customers.
# x_train also already contains 'ind_actividad_client_combine'. Verify before running.
x_train = pd.merge(x_train, df2_ind_actividad_cliente, left_index=True, right_index=True, how='left')

### tiprel_1mes, this and previous months

In [None]:
# Current month tiprel_1mes, indexed and sorted by ncodpers.
# NOTE(review): these three lines are repeated verbatim at the start of the next cell, and
# `df2` is not defined at notebook level in the visible cells.
df2_tiprel_1mes = df2[['ncodpers', 'tiprel_1mes']].copy()
df2_tiprel_1mes.set_index('ncodpers', inplace=True)
df2_tiprel_1mes.sort_index(inplace=True)

In [None]:
# Prototype of the tiprel_1mes feature that create_train_test also builds.
# NOTE(review): `df1` and `df2` are locals of create_train_test and are not defined at
# notebook level in the visible cells, so this cell depends on hidden kernel state.

# current month values, indexed and sorted by ncodpers
df2_tiprel_1mes = df2[['ncodpers', 'tiprel_1mes']].copy()
df2_tiprel_1mes.set_index('ncodpers', inplace=True)
df2_tiprel_1mes.sort_index(inplace=True)

# previous month values, indexed and sorted by ncodpers
df1_tiprel_1mes = df1[['ncodpers', 'tiprel_1mes']].copy()
df1_tiprel_1mes.set_index('ncodpers', inplace=True)
df1_tiprel_1mes.sort_index(inplace=True)

# left join on ncodpers; the previous month column gets the '_p' suffix
df2_tiprel_1mes = df2_tiprel_1mes.join(df1_tiprel_1mes, rsuffix='_p')
# missing values (in either month) become 0.0
df2_tiprel_1mes.fillna(0.0, inplace=True)
# combined code: 6*current + previous
df2_tiprel_1mes['tiprel_1mes_combine'] = 6*df2_tiprel_1mes.tiprel_1mes+df2_tiprel_1mes.tiprel_1mes_p
# keep only the combined column (the last one)
df2_tiprel_1mes = pd.DataFrame(df2_tiprel_1mes.iloc[:, -1])

# NOTE(review): x_train as returned by create_train_test has a fresh 0..n-1 index, while the
# right-hand frame is indexed by ncodpers, so this index merge does not line up customers.
# x_train also already contains 'tiprel_1mes_combine'. Verify before running.
x_train = pd.merge(x_train, df2_tiprel_1mes, left_index=True, right_index=True, how='left')

In [None]:
# Disabled: same call as the x_val/y_val cell near the top of the notebook.
#x_val, y_val = create_train_test('2015-11-28', '2015-12-28', target_flag=True)

In [None]:
# NOTE(review): the prediction and submission cells below read `x_test`, and this disabled
# line is the only place in the visible cells that would create it. Re-enable it (or define
# x_test elsewhere) before running them.
#x_test = create_train_test('2016-05-28', '2016-06-28', target_flag=False)

## Train model

In [None]:
# Multi-class XGBoost setup: one class per entry of target_cols. The prediction cell
# treats the model output as one score column per class.
param = {'objective': 'multi:softprob', 
         'eta': 0.05, 
         'max_depth': 8, 
         'silent': 1, 
         'num_class': len(target_cols),
         'eval_metric': 'mlogloss',
         'min_child_weight': 1,
         'subsample': 0.7,
         'colsample_bytree': 0.7,
         'seed': 0}
# number of boosting rounds
num_rounds = 50

# features and labels as plain arrays; labels are positions in target_cols
dtrain = xgb.DMatrix(x_train.values, y_train.values)
dval = xgb.DMatrix(x_val.values, y_val.values)
# report mlogloss on both the training and the validation set after every round
model = xgb.train(param, dtrain, num_rounds, evals=[(dtrain, 'train'), (dval, 'val')], verbose_eval=True)

Prediction from my model

In [None]:
# Score every test row: one column of model output per product in target_cols.
class_scores = model.predict(xgb.DMatrix(x_test.values))

# Zero the score of products whose flag in x_test is already 1, so they are ranked last.
df_preds = pd.DataFrame(class_scores, index=x_test.index, columns=target_cols)
already_held = x_test[target_cols] == 1
df_preds[already_held] = 0

# Column indices sorted by ascending score, reversed, truncated to the best 7 per row.
ascending_rank = np.argsort(df_preds.values, axis=1)
preds = ascending_rank[:, ::-1][:, :7]

Write out prediction results from my model

In [None]:
# Customer ids in the same row order as `preds`.
test_id = x_test['ncodpers'].values
# Turn each row of product indices into a space-separated list of product names.
final_preds = [' '.join(map(target_cols.__getitem__, top_products)) for top_products in preds]

# Two-column submission table, written gzip-compressed without the index.
out_df = pd.DataFrame({'ncodpers': test_id, 'added_products': final_preds})
out_df.to_csv('eda_4_15.csv.gz', index=False, compression='gzip')