# Brief 

I initially started with a GRU RNN that consumed each of the prior months while maintaining a context, followed by a final linear layer that took the user features along with the final hidden output. That approach required more validation because it has more variables, so due to time constraints I cut my losses and implemented a more basic gradient-boosting version that does pretty well. We make the assumption that the new products predicted for a given month are conditionally independent of everything that happened 5 or more months earlier, given the buying patterns of the past 4 months.

In [None]:
import pickle
import gc
import xgboost
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from google.colab import drive
# Mount Google Drive under ./drive; the CSV paths used later in this notebook
# all start with 'drive/My Drive/...'. force_remount=True is passed so the
# mount is redone even if one already exists.
drive.mount('drive', force_remount=True)

Mounted at drive


# Defining Features of Interest

This section simply defines which temporal/user features will be used in the prediction, along with which products we want to recommend. This is roughly the same information as on the competition's Kaggle landing page.

In [None]:
# Snapshot dates (one per month) present in the training file, oldest first.
months = [
    '2015-01-28', '2015-02-28', '2015-03-28', '2015-04-28',
    '2015-05-28', '2015-06-28', '2015-07-28', '2015-08-28',
    '2015-09-28', '2015-10-28', '2015-11-28', '2015-12-28',
    '2016-01-28', '2016-02-28', '2016-03-28', '2016-04-28',
    '2016-05-28',
]

# Every product-ownership column in the data set.
products = [
    'ind_ahor_fin_ult1', 'ind_aval_fin_ult1',
    'ind_cco_fin_ult1', 'ind_cder_fin_ult1',
    'ind_cno_fin_ult1', 'ind_ctju_fin_ult1',
    'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1',
    'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1',
    'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
    'ind_ecue_fin_ult1', 'ind_fond_fin_ult1',
    'ind_hip_fin_ult1', 'ind_plan_fin_ult1',
    'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
    'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1',
    'ind_viv_fin_ult1', 'ind_nomina_ult1',
    'ind_nom_pens_ult1', 'ind_recibo_ult1',
]

# Subset of `products` that we actually try to recommend. The order matters:
# prediction columns are mapped back to names by position in this list.
rec_products = [
    'ind_recibo_ult1', 'ind_cco_fin_ult1',
    'ind_nom_pens_ult1', 'ind_nomina_ult1',
    'ind_tjcr_fin_ult1', 'ind_ecue_fin_ult1',
    'ind_cno_fin_ult1', 'ind_ctma_fin_ult1',
    'ind_reca_fin_ult1', 'ind_ctop_fin_ult1',
    'ind_ctpp_fin_ult1', 'ind_valo_fin_ult1',
]

# Customer-level (non-product) columns provided in the CSV files.
features = [
    'fecha_dato', 'ncodpers', 'ind_empleado', 'pais_residencia',
    'sexo', 'age', 'fecha_alta', 'ind_nuevo',
    'antiguedad', 'indrel', 'ult_fec_cli_1t', 'indrel_1mes',
    'tiprel_1mes', 'indresi', 'indext', 'conyuemp',
    'canal_entrada', 'indfall', 'tipodom', 'cod_prov',
    'nomprov', 'ind_actividad_cliente', 'renta', 'segmento',
]

# Functions for parsing and munging/wrangling the data

These functions extract the data from the CSV file and structure it so that each instance's inputs merge several prior months of data into a single row. They exist primarily for convenience, and they also perform some small imputations, such as filling in missing `renta` (income) values.

In [None]:
def get_data_by_month(dates, months, features, products,
                      path='drive/My Drive/santander-product-recommendation/train_ver2.csv'):
    """Load the rows of the training CSV whose ``fecha_dato`` is in *months*.

    Only the needed slice of the (large) file is parsed: rows before the first
    match are skipped and reading stops after the last required row.

    Args:
        dates: DataFrame with a ``fecha_dato`` column holding one entry per CSV
            data row, indexed 0..N-1 in file order (i.e. what you get from
            reading just that column of the same file).
        months: List of ``fecha_dato`` values to keep.
        features: List of customer-feature column names to load.
        products: List of product column names to load.
        path: CSV file to read. Defaults to the training file on the mounted
            drive.

    Returns:
        DataFrame containing the requested columns for the matching rows, in
        file order. When no row matches, an empty frame with those columns.
    """
    usecols = list(features) + list(products)
    ids = dates.index[dates.fecha_dato.isin(months)]

    if len(ids) == 0:
        # Nothing to read: hand back just the header instead of failing on ids[0].
        return pd.read_csv(path, usecols=usecols, nrows=0, header=0)

    first, last = ids[0], ids[-1]
    # File line 0 is the header, so data row i lives on file line i + 1.
    skip = range(1, first + 1)
    if last - first + 1 != len(ids):
        # The matching rows are not one contiguous run (e.g. several
        # non-adjacent months): also skip the non-matching rows in between,
        # otherwise `nrows` would pick up rows from the wrong months.
        wanted = set(ids)
        skip = list(skip) + [i + 1 for i in range(first, last + 1) if i not in wanted]

    return pd.read_csv(path, usecols=usecols, skiprows=skip, nrows=len(ids), header=0)
    
def _parse_numeric(column):
    """Return *column* as numbers; padded strings are parsed, 'NA'/garbage becomes NaN."""
    if pd.api.types.is_numeric_dtype(column):
        return column
    return pd.to_numeric(column.astype(str).str.strip(), errors='coerce')


def get_features(data, pdata):
    """Build the per-customer feature matrix for one month.

    Cleans and imputes the customer attributes, one-hot encodes
    ``tiprel_1mes`` and the two-character ``segmento`` code, and left-joins
    the previous month's product holdings on ``ncodpers``.

    The input frame is not modified; all work happens on a private copy of the
    columns that are actually used.

    Args:
        data: Customer rows for the month of interest. Must contain
            ``ncodpers``, ``ind_empleado``, ``sexo``, ``age``, ``antiguedad``,
            ``indfall``, ``ind_actividad_cliente``, ``renta``, ``tiprel_1mes``
            and ``segmento``.
        pdata: Frame keyed by ``ncodpers`` holding the product columns of the
            previous month.

    Returns:
        DataFrame with one row per row of *data* (given unique ``ncodpers`` in
        *pdata*), no missing values, and columns: ``ncodpers``, the cleaned
        attributes, the one-hot columns, then the columns of *pdata*.

    NOTE(review): the one-hot columns depend on which categories occur in
    *data*, so two data sets can yield different column sets - callers that
    fit on one frame and predict on another should align columns.
    """
    used = ['ncodpers', 'ind_empleado', 'sexo', 'age', 'antiguedad', 'indfall',
            'ind_actividad_cliente', 'renta', 'tiprel_1mes', 'segmento']
    # Private copy: avoids mutating the caller's frame and the
    # SettingWithCopyWarning that chained/in-place assignment triggers.
    data = data[used].copy()

    # 'N' and 'S' are mapped to 0, every other value (including missing) to 1.
    data['isEmployed'] = (~data['ind_empleado'].isin(['N', 'S'])).astype(int)

    # Seniority: negative values are replaced by the column maximum, then
    # missing values get the median of the cleaned column.
    tenure = _parse_numeric(data['antiguedad'])
    tenure = tenure.mask(tenure < 0, tenure.max())
    data['antiguedad'] = tenure.fillna(tenure.median())

    age = _parse_numeric(data['age'])
    data['age'] = age.fillna(age.median())

    # Fill missing categorical values and binarise the flags.
    data['tiprel_1mes'] = data['tiprel_1mes'].fillna('I')
    data['sexo'] = (~data['sexo'].isin(['H'])).astype(int)  # 'H' -> 0, anything else (incl. missing) -> 1
    data['segmento'] = data['segmento'].fillna('02 - PARTICULARES').str[:2]  # keep only the 2-char code
    data['indfall'] = data['indfall'].isin(['S']).astype(int)  # missing counts as 'N'

    # Income: impute the median of the customer's own segment. A segment with
    # no known income at all stays NaN here and becomes 0 in the final fillna.
    data['renta'] = pd.to_numeric(data['renta'], errors='coerce')
    segment_median = data.groupby('segmento')['renta'].transform('median')
    data['renta'] = data['renta'].fillna(segment_median)

    Xclient = pd.concat(
        [data[['ncodpers', 'isEmployed', 'sexo', 'age', 'antiguedad', 'indfall',
               'ind_actividad_cliente', 'renta']],
         pd.get_dummies(data['tiprel_1mes'].astype(str)),
         pd.get_dummies(data['segmento'].astype(str))],
        axis=1)
    del data  # clean up to avoid memory issues
    gc.collect()

    # Attach last month's holdings; customers without a previous-month row
    # (and any remaining missing attribute) end up as 0.
    X = pd.merge(Xclient, pdata, how='left', on='ncodpers')
    return X.fillna(0)
    
def get_new_products(data, pdata, target_products=None):
    """Flag, per customer, which products were added since the previous month.

    Args:
        data: Current-month frame with ``ncodpers`` and the product columns.
        pdata: Previous-month frame with ``ncodpers`` and the product columns.
        target_products: Product column names to evaluate, in output order.
            Defaults to the module-level ``rec_products``.

    Returns:
        DataFrame with one row per row of the left join of *data* with *pdata*
        and one column per target product: 1 where the product is held now but
        was not held before, otherwise 0.

    Every row of *data* is kept (left join) so the result stays row-aligned
    with the feature matrix; a customer with no previous-month row is treated
    as having held nothing, so all of their current products count as new.
    """
    if target_products is None:
        target_products = rec_products

    merged = pd.merge(data, pdata, how='left', on='ncodpers').fillna(0)

    added = pd.DataFrame(index=merged.index)
    for pr in target_products:
        # current - previous is +1 for an added product and -1 for a dropped
        # one; dropped products are not of interest, so floor at 0.
        added[pr] = (merged[pr + '_x'] - merged[pr + '_y']).clip(lower=0)
    return added
    
def get_temp_features(merged, months):
    """Append the product holdings of each month in *months* to *merged*.

    For every month, the product columns of that month are loaded (using the
    module-level ``dates`` and ``products``) and left-joined onto *merged* by
    ``ncodpers``; customers without a row for that month get 0.

    Column naming: the merge is given the integer pair (i, i + 1) as suffixes,
    where i is the month's position in *months*. pandas only applies suffixes
    to column names present on both sides, so whether a given month's columns
    end up suffixed depends on the columns *merged* already has at that point.

    Args:
        merged: Feature frame containing ``ncodpers``.
        months: List of ``fecha_dato`` values to append, in order.

    Returns:
        The widened frame with no missing values.
    """
    for month in months:
        position = months.index(month)
        history = get_data_by_month(dates, [month], ['ncodpers'], products)
        merged = pd.merge(
            merged,
            history,
            how='left',
            on='ncodpers',
            suffixes=[position, position + 1],
        ).fillna(0)
    return merged

In [None]:
# One entry per CSV row: used to locate the rows of a given month without
# loading the whole training file.
dates = pd.read_csv('drive/My Drive/santander-product-recommendation/train_ver2.csv', usecols=['fecha_dato'], header=0)

month = '2015-06-28' # '2016-05-28'

# The features use the previous month plus the four months before it, so the
# target month needs five earlier months. Check this explicitly: a negative
# list index would silently wrap around to the *latest* months.
month_idx = months.index(month)
if month_idx < 5:
    raise ValueError('month %r needs 5 earlier months of history, only %d available' % (month, month_idx))
pmonth = months[month_idx - 1]
target_months = [months[month_idx - lag] for lag in range(2, 6)]  # 2..5 months back, most recent first

data = get_data_by_month(dates, [month], features, products)
pdata = get_data_by_month(dates, [pmonth], ['ncodpers'], products)

# Customer attributes + last month's holdings, then the four older months.
X = get_features(data[features], pdata)
X = get_temp_features(X, target_months)
X.drop(['ncodpers'], axis=1, inplace=True)

# Targets: products added between pmonth and month.
y = get_new_products(data[['ncodpers']+products], pdata)

del data, pdata

# Hold out 20% of the rows for a quick look at the predictions, but fit on
# *all* rows.
# NOTE(review): Xval/yval are therefore a subset of the training data, so any
# score computed on them is optimistic rather than a true validation score.
_, Xval, _, yval = train_test_split(X, y, test_size=0.2, random_state=0)
Xtrain = X
ytrain = y

del X, y
gc.collect() # running into memory issues here so explicitly calling gc

# Train only on customers who added at least one product.
new_products = ytrain.sum(axis=1)
Xtrain = Xtrain[new_products!=0]
ytrain = ytrain[new_products!=0]

# Per held-out customer, the names of the products actually added
# (yval's columns follow the order of rec_products).
targlist = [[pr for pr, flag in zip(rec_products, row) if flag == 1]
            for row in yval.values]

# Training each of the Gradient Boosting Predictors

We use the XGBoost library, which implements gradient boosting, to train a separate predictor for each product.

In [None]:
clfdict = {} # one fitted gradient-boosting classifier per recommendable product
probs = [] # predicted probabilities of products being purchased (held-out rows)
freq = ytrain.sum(axis=0) # number of training customers who added each product
n_train = Xtrain.shape[0]
eps = 1e-6
for pr in rec_products:
    # Start boosting from the product's empirical purchase rate. The rate is
    # clamped into the open interval (0, 1) because XGBoost's logistic
    # objective rejects a base_score of exactly 0 or 1 (a product that nobody,
    # or everybody, added in the training rows).
    prior = min(max(freq[pr] / n_train, eps), 1 - eps)
    clf = xgboost.XGBClassifier(max_depth=6, learning_rate = 0.08, subsample = 0.9, colsample_bytree = 0.9, n_estimators=100, base_score = prior, nthread=4)
    clf.fit(Xtrain, ytrain.loc[:, pr])
    clfdict[pr] = clf
    probs.append(clf.predict_proba(Xval)[:, 1]) # probability of the positive class

probs = np.array(probs).T # rows: customers, columns: rec_products
likeliestprods = np.argsort(probs, axis=1)[:, :-8:-1] # ids of seven greatest probs
prlist = [[rec_products[j] for j in row] for row in likeliestprods]


del Xtrain, Xval, ytrain, yval # avoid loading test and train in memory at same time
gc.collect()

In [None]:
# Restates the constants defined near the top of the notebook (same values).

# Snapshot dates (one per month) present in the training file, oldest first.
months = [
    '2015-01-28', '2015-02-28', '2015-03-28', '2015-04-28',
    '2015-05-28', '2015-06-28', '2015-07-28', '2015-08-28',
    '2015-09-28', '2015-10-28', '2015-11-28', '2015-12-28',
    '2016-01-28', '2016-02-28', '2016-03-28', '2016-04-28',
    '2016-05-28',
]

# Every product-ownership column in the data set.
products = [
    'ind_ahor_fin_ult1', 'ind_aval_fin_ult1',
    'ind_cco_fin_ult1', 'ind_cder_fin_ult1',
    'ind_cno_fin_ult1', 'ind_ctju_fin_ult1',
    'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1',
    'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1',
    'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
    'ind_ecue_fin_ult1', 'ind_fond_fin_ult1',
    'ind_hip_fin_ult1', 'ind_plan_fin_ult1',
    'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
    'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1',
    'ind_viv_fin_ult1', 'ind_nomina_ult1',
    'ind_nom_pens_ult1', 'ind_recibo_ult1',
]

# Subset of `products` that we actually try to recommend. The order matters:
# prediction columns are mapped back to names by position in this list.
rec_products = [
    'ind_recibo_ult1', 'ind_cco_fin_ult1',
    'ind_nom_pens_ult1', 'ind_nomina_ult1',
    'ind_tjcr_fin_ult1', 'ind_ecue_fin_ult1',
    'ind_cno_fin_ult1', 'ind_ctma_fin_ult1',
    'ind_reca_fin_ult1', 'ind_ctop_fin_ult1',
    'ind_ctpp_fin_ult1', 'ind_valo_fin_ult1',
]

# Customer-level (non-product) columns provided in the CSV files.
features = [
    'fecha_dato', 'ncodpers', 'ind_empleado', 'pais_residencia',
    'sexo', 'age', 'fecha_alta', 'ind_nuevo',
    'antiguedad', 'indrel', 'ult_fec_cli_1t', 'indrel_1mes',
    'tiprel_1mes', 'indresi', 'indext', 'conyuemp',
    'canal_entrada', 'indfall', 'tipodom', 'cod_prov',
    'nomprov', 'ind_actividad_cliente', 'renta', 'segmento',
]

# Predictions

Recommends 5 products per applicable customer in the test set.

In [None]:
month = '2016-06-28'
pmonth = '2016-05-28'

# Test customers (attributes only) and their holdings in the last known month.
data = pd.read_csv('drive/My Drive/santander-product-recommendation/test_ver2.csv', usecols=features, header=0)
pdata = get_data_by_month(dates, [pmonth], ['ncodpers'], products)

Xtest = get_features(data[features], pdata)

# The four months before pmonth, most recent first. Kept in its own variable
# (as in the training cell) so the global `months` list is not overwritten and
# this cell can be re-run. The bounds check prevents a negative index from
# silently wrapping around to the end of the list.
pmonth_idx = months.index(pmonth)
if pmonth_idx < 4:
    raise ValueError('pmonth %r needs 4 earlier months of history, only %d available' % (pmonth, pmonth_idx))
target_months = [months[pmonth_idx - lag] for lag in range(1, 5)]
Xtest = get_temp_features(Xtest, target_months)

ids = Xtest['ncodpers']
Xtest.drop(['ncodpers'], axis=1, inplace=True)

# The one-hot columns produced by get_features depend on the categories
# present in each data set, so make the test matrix match the columns the
# models were fitted on: missing columns are added as 0, unseen ones dropped.
train_columns = clfdict[rec_products[0]].get_booster().feature_names
if train_columns is not None:
    Xtest = Xtest.reindex(columns=train_columns, fill_value=0)

# One probability column per product, in rec_products order.
probs = np.array([clfdict[pr].predict_proba(Xtest)[:, 1] for pr in rec_products]).T

likeliestprods = np.argsort(probs, axis=1)[:, :-6:-1] # column ids of the five highest probabilities, best first
test_preds = [[rec_products[j] for j in row] for row in likeliestprods] # recommend 5 likeliest products alternatively threshold on probs

  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


## Write predictions to files

In [None]:
# Convert the list of per-customer recommendations to an array once, rather
# than rebuilding it for every output column.
pred_matrix = np.array(test_preds)

out = pd.DataFrame()
out['id'] = ids
for i in range(pred_matrix.shape[1]):
    out['p' + str(i)] = pred_matrix[:, i] # i-th most likely product for each customer
# NOTE(review): with to_csv's default index=True the frame's index is also
# written, as an unnamed first column.
out.to_csv('output.csv')