# Feature Engineering

First find the customers who acquire new products in each month, then create new features only for those customers. This approach uses less memory than building features for every customer.

In [1]:
from santander_helper import *
%matplotlib inline

### New products
1. Find (customer, product) pairs that are new in the second month
2. Find new products for each customer in each month, and create mean-encoding features for each product pattern. The statistics are computed over the entire history.

In [2]:
# Build, for every consecutive (first month, second month) pair:
#   * customer_product_pair[m2]: the (customer, product) pairs newly acquired in m2
#   * one frame per month pairing the first-month product pattern with what was
#     newly acquired, later averaged per pattern (mean encoding).

# column names for newly purchased products: '<product>_new'
new_cols = [k+'_new' for k in target_cols]
# '<product>_new' -> position of the product in target_cols
new_cols_map = {k+'_new': n for n, k in enumerate(target_cols)}
# names the second-month product columns get in the merge below (suffix '_l')
second_month_cols = [k+'_l' for k in target_cols]
# weight 2**i for product i; the weighted sum of a customer's product columns
# encodes the whole product pattern as one number (loop-invariant, so built once)
product_weights = np.float_power(2, np.arange(0, len(target_cols)))
# ordered dict containing new products in each month, key is the first month
# NOTE(review): not populated anywhere in this cell
du = collections.OrderedDict()
# per-month frames for mean encoding; concatenated after the loop
mean_encoding = []
# (customer, product) pair for new products in every month, keyed by second month
customer_product_pair = {}
for m1, m2 in tqdm.tqdm_notebook(list(zip(month_list[:-2], month_list[1:-1]))):
    # load both months, keeping only the customer id and product columns;
    # .copy() so the column assignments below act on independent frames
    df1 = pd.read_hdf('../input/data_month_{}.hdf'.format(m1), 'data_month')
    df1 = df1[['ncodpers']+target_cols].copy()
    df2 = pd.read_hdf('../input/data_month_{}.hdf'.format(m2), 'data_month')
    df2 = df2[['ncodpers']+target_cols].copy()

    # calculate new products
    # left merge keeps every second-month customer; customers missing from the
    # first month get NaN, filled with 0.0
    merged = df2.merge(df1, on='ncodpers', how='left', suffixes=('_l', ''))
    merged = merged.fillna(0.0)
    # the row labels below are taken from df2, so the merge must not add rows
    assert len(merged) == len(df2), 'duplicate ncodpers in month {}'.format(m1)
    # second month minus first month, selected by column name rather than by
    # hard-coded position so the code does not depend on the number of products
    gained = merged[second_month_cols].values - merged[target_cols].values
    x = pd.DataFrame(gained, index=df2.ncodpers, columns=new_cols)
    # negative differences become 0; only positive differences are kept
    x[x<0] = 0
    # only keep customers with new products
    x = x[x.sum(axis=1)>0]

    # keep copy of customers with new products (wide format, one column per product)
    new_product = x.copy()

    # obtain (customer, product) pairs: wide -> long
    x = x.stack().reset_index()
    x.columns = ['ncodpers', 'product', 'indicator']
    # plain column assignment (not .loc[:, col] = ...) so the column takes the
    # dtype of the mapped integer codes instead of staying object on pandas >= 2.0
    x['product'] = x['product'].map(new_cols_map)
    # only keep (customer, product) pairs for new products
    x = x.loc[x.indicator>0].drop('indicator', axis=1).reset_index(drop=True)
    # a list of customers with new products
    ncodpers_new_product = x.ncodpers.unique()
    customer_product_pair[m2] = x

    # only keep customers with new products in the second month
    # (df2 is not used again in this cell)
    df2 = df2.loc[df2.ncodpers.isin(ncodpers_new_product)]

    # prepare mean encoding
    # product pattern in the first month
    df1['target_combine'] = np.sum(df1[target_cols].values * product_weights,
        axis=1, dtype=np.float64)
    df1 = df1.drop(target_cols, axis=1)
    # number and indicator of new products
    new_product['n_new'] = new_product.loc[:, new_cols].sum(axis=1)
    new_product['ind_new'] = new_product.loc[:, new_cols].max(axis=1)
    # join with the first month; customers without new products get 0.0 everywhere
    df1 = df1.join(new_product, on='ncodpers', how='left').fillna(0.0)
    df1 = df1.drop('ncodpers', axis=1)

    # add results to list
    mean_encoding.append(df1)

# concatenate all data
mean_encoding = pd.concat(mean_encoding, ignore_index=True)
# calculate mean values per first-month product pattern
mean_encoding_result = mean_encoding.groupby('target_combine').mean()
# save mean encoding result
mean_encoding_result.to_hdf('../input/mean_encoding_result_eda_4_21.hdf',
    'mean_encoding_result')

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))




In [3]:
# Tag each month's (customer, product) pairs with its month, then stack all
# months into one frame and persist it.
pair_frames = []
for month, pairs in customer_product_pair.items():
    # adds the column to the frame stored in the dict as well
    pairs['fecha_dato'] = month
    pair_frames.append(pairs)
a = pd.concat(pair_frames, ignore_index=True)
a.to_hdf('../input/customer_product_pair.hdf', 'customer_product_pair')

### Create x_train and y_train

Load second month

In [233]:
# Second month: keep the cat_cols columns, and only the customers that have at
# least one (customer, product) pair recorded for this month.
month_name = '2015-05-28'
customers_with_new = customer_product_pair[month_name]['ncodpers'].unique()
df2 = pd.read_hdf('../input/data_month_{}.hdf'.format(month_name), 'data_month')
df2 = df2[cat_cols]
df2 = df2[df2['ncodpers'].isin(customers_with_new)]

Load first month

In [234]:
# First (previous) month: one frame with cat_cols plus products, and one with
# just the customer id plus products.
month_name_prev = '2015-04-28'
df1_0 = pd.read_hdf('../input/data_month_{}.hdf'.format(month_name_prev), 'data_month')
df1 = df1_0[cat_cols + target_cols]
df1_target = df1_0[['ncodpers'] + target_cols]

In [235]:
# Attach the previous month's product columns to the second-month rows;
# every NaN in the result (e.g. customers absent from the previous month) becomes 0.0.
df2 = df2.merge(df1_target, how='left', on='ncodpers').fillna(0.0)

Combination of `ind_actividad_cliente`

In [236]:
# Encode the (current, previous) month pair of ind_actividad_cliente as one number.
activity_cols = ['ncodpers', 'ind_actividad_cliente']
# current-month values
df2_copy = df2[activity_cols].copy()
# previous-month values, restricted to customers present in df2
df1_copy = df1.loc[df1['ncodpers'].isin(df2['ncodpers']), activity_cols].copy()
# previous-month value lands in 'ind_actividad_cliente_prev'; missing values become 2.0
df2_copy = df2_copy.merge(df1_copy, how='left', on='ncodpers', suffixes=('', '_prev'))
df2_copy = df2_copy.fillna(2.0)
# current*3 + previous
# NOTE(review): base 3 presumably matches the three codes 0, 1 and the 2.0 fill value - confirm
df2_copy['ind_actvidad_client_combine'] = (
    df2_copy['ind_actividad_cliente'].values*3
    + df2_copy['ind_actividad_cliente_prev'].values
)
# keep only the key and the combined feature
df2_copy = df2_copy.drop(['ind_actividad_cliente', 'ind_actividad_cliente_prev'], axis=1)
# attach the new feature to df2
df2 = df2.merge(df2_copy, how='left', on='ncodpers')

Combination of `tiprel_1mes`

In [237]:
# Encode the (current, previous) month pair of tiprel_1mes as one number.
tiprel_cols = ['ncodpers', 'tiprel_1mes']
# current-month values
df2_copy = df2[tiprel_cols].copy()
# previous-month values, restricted to customers present in df2
df1_copy = df1.loc[df1['ncodpers'].isin(df2['ncodpers']), tiprel_cols].copy()
# previous-month value lands in 'tiprel_1mes_prev'; missing values become 0.0
df2_copy = df2_copy.merge(df1_copy, how='left', on='ncodpers', suffixes=('', '_prev'))
df2_copy = df2_copy.fillna(0.0)
# current*6 + previous
# NOTE(review): base 6 presumably is the number of distinct tiprel_1mes codes - confirm
df2_copy['tiprel_1mes_combine'] = (
    df2_copy['tiprel_1mes'].values*6 + df2_copy['tiprel_1mes_prev'].values
)
# keep only the key and the combined feature
df2_copy = df2_copy.drop(['tiprel_1mes', 'tiprel_1mes_prev'], axis=1)
# attach the new feature to df2
df2 = df2.merge(df2_copy, how='left', on='ncodpers')

Combine target

In [238]:
# Collapse the product columns into one number: product i contributes
# value_i * 2**i, so each product pattern maps to its own code.
product_weights = np.float_power(2, np.arange(0, len(target_cols)))
weighted_products = df2[target_cols].values * product_weights
df2['target_combine'] = weighted_products.sum(axis=1, dtype=np.float64)

In [239]:
# Look up the mean-encoding statistics for each row's product pattern.
df2 = pd.merge(df2, mean_encoding_result, how='left', on='target_combine')

In [241]:
# Display the dtype of every column in df2 (inspection only; nothing is modified).
df2.dtypes

ncodpers                         int64
canal_entrada                  float64
conyuemp                         int64
ind_actividad_cliente          float64
ind_empleado                     int64
ind_nuevo                      float64
indext                           int64
indfall                          int64
indrel                         float64
indrel_1mes                    float64
indresi                          int64
pais_residencia                  int64
segmento                         int64
sexo                             int64
tipodom                        float64
tiprel_1mes                    float64
age                            float64
antiguedad                     float64
renta                          float64
ind_cco_fin_ult1               float64
ind_cder_fin_ult1              float64
ind_cno_fin_ult1               float64
ind_ctju_fin_ult1              float64
ind_ctma_fin_ult1              float64
ind_ctop_fin_ult1              float64
ind_ctpp_fin_ult1        

In [35]:
# create_train_test_2 is not defined in this notebook; presumably it comes from
# santander_helper (star-imported in the first cell) - TODO confirm what it does.
# The call returns three objects, unpacked here as x_train, y_train and df.
x_train, y_train, df = create_train_test_2('2015-06-28')

In [27]:
# Preview the first rows of x_train.
x_train.head()

Unnamed: 0,ncodpers,canal_entrada,conyuemp,ind_actividad_cliente,ind_empleado,ind_nuevo,indext,indfall,indrel,indrel_1mes,...,ind_nomina_ult1_new,ind_plan_fin_ult1_new,ind_pres_fin_ult1_new,ind_reca_fin_ult1_new,ind_recibo_ult1_new,ind_tjcr_fin_ult1_new,ind_valo_fin_ult1_new,n_new,ind_new,n_products
0,657786,5.0,0,1.0,1,0.0,2,2,0.0,1.0,...,0.003425,3.8e-05,1.9e-05,0.0,0.057903,0.00914,0.00053,0.082181,0.073136,2.0
1,658007,5.0,0,1.0,1,0.0,2,2,0.0,1.0,...,0.005525,0.0,0.0,0.0,0.053407,0.022099,0.0,0.090239,0.082873,3.0
2,658054,4.0,0,1.0,1,0.0,2,2,0.0,1.0,...,0.000807,1.9e-05,0.0,0.000356,0.016302,0.014633,0.000375,0.043373,0.04069,3.0
3,658045,4.0,0,0.0,1,0.0,2,2,0.0,1.0,...,0.001799,1.3e-05,2e-06,0.000359,0.011854,0.000798,0.000137,0.020539,0.016837,1.0
4,658042,4.0,0,1.0,1,0.0,2,2,0.0,1.0,...,0.003276,0.000128,0.0,0.001542,0.061404,0.0,0.001156,0.079067,0.072966,3.0


In [36]:
# (number of rows, number of columns) of x_train.
x_train.shape

(45140, 63)

### Create test data for '2016-06-28'

For the test month we need to predict for every customer, so there is no need to select (customer, product) pairs first.

In [28]:
def _combine_with_previous(current, previous, col, base, fill_value, combined_name):
    """Return `current` plus a column encoding the (current, previous) pair of `col`.

    The previous-month value of `col` is looked up in `previous` by 'ncodpers'
    (left merge, so every row of `current` is kept). Missing values are replaced
    by `fill_value`, then the new column is computed as
    ``current[col] * base + previous[col]`` and stored under `combined_name`.
    """
    pair = current[['ncodpers', col]].merge(
        previous[['ncodpers', col]], on='ncodpers', how='left',
        suffixes=('', '_prev'))
    pair = pair.fillna(fill_value)
    pair[combined_name] = pair[col].values*base + pair[col+'_prev'].values
    # merge only the key and the new feature back
    return current.merge(pair[['ncodpers', combined_name]], how='left', on='ncodpers')


month = '2016-06-28'

month2 = month # the second month
# the first month is the entry just before month2 in month_list; guard against
# position 0, where index -1 would silently select the LAST month instead
month2_pos = month_list.index(month2)
if month2_pos == 0:
    raise ValueError('{} is the first month in month_list; it has no previous month'.format(month2))
month1 = month_list[month2_pos-1] # the first month

# Load second month
df2 = pd.read_hdf('../input/data_month_{}.hdf'.format(month2), 'data_month')
df2 = df2.loc[:, cat_cols]

# Load first month
df1_0 = pd.read_hdf('../input/data_month_{}.hdf'.format(month1), 'data_month')
df1 = df1_0.loc[:, cat_cols+target_cols] # keep cat_cols and target_cols
df1_target = df1_0.loc[:, ['ncodpers']+target_cols] # keep targets

# Merge first month product with second month customer information;
# every NaN in the result becomes 0.0
df2 = df2.merge(df1_target, on='ncodpers', how='left')
df2 = df2.fillna(0.0)

# Combination of ind_actividad_cliente: current*3 + previous, missing -> 2.0
df2 = _combine_with_previous(df2, df1, 'ind_actividad_cliente', 3, 2.0,
    'ind_actvidad_client_combine')

# Combination of tiprel_1mes: current*6 + previous, missing -> 0.0
df2 = _combine_with_previous(df2, df1, 'tiprel_1mes', 6, 0.0,
    'tiprel_1mes_combine')

# Combine target: product i contributes value_i * 2**i
df2['target_combine'] = np.sum(df2[target_cols].values*
    np.float_power(2, np.arange(0, len(target_cols))), axis=1,
    dtype=np.float64)
# Load mean encoding data
mean_encoding_result = pd.read_hdf('../input/mean_encoding_result_eda_4_21.hdf',
    'mean_encoding_result')
# Merge with mean encoding result
df2 = df2.merge(mean_encoding_result, on='target_combine', how='left')

# number of products in the first month
df2['n_products'] = df2[target_cols].sum(axis=1)

In [29]:
# Preview the first rows of df2.
df2.head()

Unnamed: 0,ncodpers,canal_entrada,conyuemp,ind_actividad_cliente,ind_empleado,ind_nuevo,indext,indfall,indrel,indrel_1mes,...,ind_nomina_ult1_new,ind_plan_fin_ult1_new,ind_pres_fin_ult1_new,ind_reca_fin_ult1_new,ind_recibo_ult1_new,ind_tjcr_fin_ult1_new,ind_valo_fin_ult1_new,n_new,ind_new,n_products
0,15889,5.0,2,1.0,3,0.0,2,2,0.0,1.0,...,0.00678,0.0,0.0,0.0,0.115254,0.0,0.0,0.155932,0.135593,4.0
1,1170544,5.0,0,0.0,1,0.0,2,2,0.0,1.0,...,0.001799,1.3e-05,2e-06,0.000359,0.011854,0.000798,0.000137,0.020539,0.016837,1.0
2,1170545,1.0,0,1.0,1,0.0,2,2,0.0,1.0,...,0.001799,1.3e-05,2e-06,0.000359,0.011854,0.000798,0.000137,0.020539,0.016837,1.0
3,1170547,1.0,0,0.0,1,0.0,2,2,0.0,1.0,...,0.001799,1.3e-05,2e-06,0.000359,0.011854,0.000798,0.000137,0.020539,0.016837,1.0
4,1170548,1.0,0,0.0,1,0.0,2,2,0.0,1.0,...,0.001799,1.3e-05,2e-06,0.000359,0.011854,0.000798,0.000137,0.020539,0.016837,1.0


In [2]:
# create_test is not defined in this notebook; presumably it comes from
# santander_helper (star-imported in the first cell) - TODO confirm what it does.
x_test = create_test()

In [3]:
# (number of rows, number of columns) of x_test.
x_test.shape

(929615, 63)