In [None]:
import numpy as np
import pandas as pd
import os
# Print the full path of every file found anywhere under the input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('my_input')
    for name in names
)
for path in input_paths:
    print(path)

/kaggle/input/santander-product-recommendation/test_ver2.csv.zip
/kaggle/input/santander-product-recommendation/sample_submission.csv.zip
/kaggle/input/santander-product-recommendation/train_ver2.csv.zip


In [None]:
import csv
import datetime as dt

# Data Viz
import seaborn as sns
import matplotlib.pyplot as plt

# Data Manipulation
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Similarity calculation
from sklearn.metrics.pairwise import cosine_similarity

# Import ML libraries
import lightgbm as lgb
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

# Display settings: show up to 100 rows and never hide columns of wide frames.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", None)

In [None]:
# Load the raw train and test tables straight from the zipped CSV files.
# low_memory=False makes pandas infer each column's dtype from the whole file instead of
# chunk by chunk, so every column ends up with one consistent dtype.
# NOTE(review): the warning recorded in this cell's output is presumably the mixed-dtype
# DtypeWarning that chunked parsing emits — confirm it disappears after this change.
train = pd.read_csv(
    filepath_or_buffer='my_input/santander-product-recommendation/train_ver2.csv.zip',
    low_memory=False,
)
test = pd.read_csv(
    filepath_or_buffer='my_input/santander-product-recommendation/test_ver2.csv.zip',
    low_memory=False,
)

  train = pd.read_csv(filepath_or_buffer='/kaggle/input/santander-product-recommendation/train_ver2.csv.zip')
  test = pd.read_csv(filepath_or_buffer='/kaggle/input/santander-product-recommendation/test_ver2.csv.zip')


In [None]:
# List the column names exactly as they were loaded from the CSV.
train.columns

Index(['fecha_dato', 'ncodpers', 'ind_empleado', 'pais_residencia', 'sexo',
       'age', 'fecha_alta', 'ind_nuevo', 'antiguedad', 'indrel',
       'ult_fec_cli_1t', 'indrel_1mes', 'tiprel_1mes', 'indresi', 'indext',
       'conyuemp', 'canal_entrada', 'indfall', 'tipodom', 'cod_prov',
       'nomprov', 'ind_actividad_cliente', 'renta', 'segmento',
       'ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1',
       'ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1',
       'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1',
       'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
       'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1',
       'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
       'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1',
       'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1'],
      dtype='object')

In [None]:
# The 24 product-ownership indicator columns (one flag per bank product).
bank_services_cols = [
    'ind_ahor_fin_ult1',
    'ind_aval_fin_ult1',
    'ind_cco_fin_ult1',
    'ind_cder_fin_ult1',
    'ind_cno_fin_ult1',
    'ind_ctju_fin_ult1',
    'ind_ctma_fin_ult1',
    'ind_ctop_fin_ult1',
    'ind_ctpp_fin_ult1',
    'ind_deco_fin_ult1',
    'ind_deme_fin_ult1',
    'ind_dela_fin_ult1',
    'ind_ecue_fin_ult1',
    'ind_fond_fin_ult1',
    'ind_hip_fin_ult1',
    'ind_plan_fin_ult1',
    'ind_pres_fin_ult1',
    'ind_reca_fin_ult1',
    'ind_tjcr_fin_ult1',
    'ind_valo_fin_ult1',
    'ind_viv_fin_ult1',
    'ind_nomina_ult1',
    'ind_nom_pens_ult1',
    'ind_recibo_ult1',
]

In [None]:
# Convert the product-ownership flags to booleans for the itemset mining below.
# Missing flags are filled with 0 first: astype(bool) treats NaN as truthy, so without the
# fill every unknown ownership would silently turn into "customer owns this product".
# NOTE(review): whether any of these columns actually contains NaN is not visible in the
# truncated outputs of this notebook — the fill is a no-op if they are complete.
train[bank_services_cols] = train[bank_services_cols].fillna(0).astype(bool)

In [None]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
# Mine the product combinations that appear together in at least 1% of the rows.
frequent_itemsets = apriori(
    train[bank_services_cols],
    min_support=0.01,
    use_colnames=True,
)

In [None]:
# Generate association rules, keeping only rules whose lift is at least 1
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

# Show the strongest rules first. Ending the cell with a bare expression (instead of
# print) lets the notebook render the frame as a table, so no column is wrapped or cut off.
rules.sort_values("lift", ascending=False).head(20)


             antecedents                                        consequents  \
0     (ind_cco_fin_ult1)                                (ind_ctop_fin_ult1)   
1    (ind_ctop_fin_ult1)                                 (ind_cco_fin_ult1)   
2     (ind_cco_fin_ult1)                                (ind_dela_fin_ult1)   
3    (ind_dela_fin_ult1)                                 (ind_cco_fin_ult1)   
4    (ind_fond_fin_ult1)                                 (ind_cco_fin_ult1)   
..                   ...                                                ...   
591  (ind_tjcr_fin_ult1)  (ind_cno_fin_ult1, ind_recibo_ult1, ind_nom_pe...   
592    (ind_nomina_ult1)  (ind_recibo_ult1, ind_tjcr_fin_ult1, ind_cno_f...   
593   (ind_cno_fin_ult1)  (ind_tjcr_fin_ult1, ind_recibo_ult1, ind_nom_p...   
594    (ind_recibo_ult1)  (ind_tjcr_fin_ult1, ind_cno_fin_ult1, ind_nom_...   
595  (ind_nom_pens_ult1)  (ind_recibo_ult1, ind_tjcr_fin_ult1, ind_cno_f...   

     antecedent support  consequent support   suppo

In [None]:
## Data Profiling
# Summarise the training frame: index range, column count and the dtype of every column.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13647309 entries, 0 to 13647308
Data columns (total 48 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   fecha_dato             object 
 1   ncodpers               int64  
 2   ind_empleado           object 
 3   pais_residencia        object 
 4   sexo                   object 
 5   age                    object 
 6   fecha_alta             object 
 7   ind_nuevo              float64
 8   antiguedad             object 
 9   indrel                 float64
 10  ult_fec_cli_1t         object 
 11  indrel_1mes            object 
 12  tiprel_1mes            object 
 13  indresi                object 
 14  indext                 object 
 15  conyuemp               object 
 16  canal_entrada          object 
 17  indfall                object 
 18  tipodom                float64
 19  cod_prov               float64
 20  nomprov                object 
 21  ind_actividad_cliente  float64
 22  renta           

In [None]:
# Count the missing values in every column of the raw training table.
train.isna().sum()

fecha_dato                      0
ncodpers                        0
ind_empleado                27734
pais_residencia             27734
sexo                        27804
age                             0
fecha_alta                  27734
ind_nuevo                   27734
antiguedad                      0
indrel                      27734
ult_fec_cli_1t           13622516
indrel_1mes                149781
tiprel_1mes                149781
indresi                     27734
indext                      27734
conyuemp                 13645501
canal_entrada              186126
indfall                     27734
tipodom                     27735
cod_prov                    93591
nomprov                     93591
ind_actividad_cliente       27734
renta                     2794375
segmento                   189368
ind_ahor_fin_ult1               0
ind_aval_fin_ult1               0
ind_cco_fin_ult1                0
ind_cder_fin_ult1               0
ind_cno_fin_ult1                0
ind_ctju_fin_u

In [None]:
# Peek at a few random rows; the fixed seed keeps the preview identical across re-runs.
train.sample(5, random_state=123)

Unnamed: 0,fecha_dato,ncodpers,ind_empleado,pais_residencia,sexo,age,fecha_alta,ind_nuevo,antiguedad,indrel,ult_fec_cli_1t,indrel_1mes,tiprel_1mes,indresi,indext,conyuemp,canal_entrada,indfall,tipodom,cod_prov,nomprov,ind_actividad_cliente,renta,segmento,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,ind_ctop_fin_ult1,ind_ctpp_fin_ult1,ind_deco_fin_ult1,ind_deme_fin_ult1,ind_dela_fin_ult1,ind_ecue_fin_ult1,ind_fond_fin_ult1,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
7906709,2015-11-28,922907,N,ES,V,24,2011-07-28,0.0,52,1.0,,1.0,I,S,N,,KHE,N,1.0,14.0,CORDOBA,0.0,76461.3,03 - UNIVERSITARIO,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
9076685,2016-01-28,1456902,N,ES,H,20,2015-09-22,1.0,4,1.0,,1.0,I,S,N,,KHQ,N,1.0,6.0,BADAJOZ,0.0,,03 - UNIVERSITARIO,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3031438,2015-05-28,832006,N,ES,V,53,2009-02-04,0.0,77,1.0,,1.0,A,S,N,,KAT,N,1.0,28.0,MADRID,1.0,407844.72,02 - PARTICULARES,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
4470046,2015-07-28,1271121,N,ES,H,22,2014-07-18,0.0,12,1.0,,1.0,I,S,N,,KHE,N,1.0,14.0,CORDOBA,0.0,32369.91,03 - UNIVERSITARIO,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1332259,2015-03-28,815064,N,ES,V,36,2008-11-08,0.0,80,1.0,,1.0,I,S,N,,KFC,N,1.0,36.0,PONTEVEDRA,0.0,138110.61,02 - PARTICULARES,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


# Data Cleaning

In [None]:
# Re-list the current column names for reference before they are renamed in the next cell.
train.columns

Index(['fecha_dato', 'ncodpers', 'ind_empleado', 'pais_residencia', 'sexo',
       'age', 'fecha_alta', 'ind_nuevo', 'antiguedad', 'indrel',
       'ult_fec_cli_1t', 'indrel_1mes', 'tiprel_1mes', 'indresi', 'indext',
       'conyuemp', 'canal_entrada', 'indfall', 'tipodom', 'cod_prov',
       'nomprov', 'ind_actividad_cliente', 'renta', 'segmento',
       'ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1',
       'ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1',
       'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1',
       'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
       'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1',
       'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
       'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1',
       'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1'],
      dtype='object')

In [None]:
# 1) Feature Name Transformation
# Map the original column names to more readable ones. Columns that are not listed
# (fecha_dato, age, indfall, tipodom and the product flags) keep their names.
col_names = {
    "ncodpers": "cust_id",
    "ind_empleado": "emp_index",
    "pais_residencia": "residence",
    "sexo": "sex",
    "fecha_alta": "first_date",
    "ind_nuevo": "new_cust",
    "antiguedad": "seniority",
    "indrel": "is_primary",
    "ult_fec_cli_1t": "last_primary_date",
    "indrel_1mes": "cust_type",
    "tiprel_1mes": "cust_rel_type",
    "indresi": "residence_index",
    "indext": "foreigner_index",
    "conyuemp": "spouse_index",
    "canal_entrada": "channel",
    "cod_prov": "province",
    "nomprov": "province_name",
    "ind_actividad_cliente": "active_index",
    "renta": "income",
    "segmento": "segment",
}

# Apply the same mapping to both tables so they share one schema.
for frame in (train, test):
    frame.rename(columns=col_names, inplace=True)

In [None]:
# Confirm that the rename took effect on the training table.
train.columns

Index(['fecha_dato', 'cust_id', 'emp_index', 'residence', 'sex', 'age',
       'first_date', 'new_cust', 'seniority', 'is_primary',
       'last_primary_date', 'cust_type', 'cust_rel_type', 'residence_index',
       'foreigner_index', 'spouse_index', 'channel', 'indfall', 'tipodom',
       'province', 'province_name', 'active_index', 'income', 'segment',
       'ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1',
       'ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1',
       'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1',
       'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
       'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1',
       'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
       'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1',
       'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1'],
      dtype='object')

In [None]:
# Give the text-typed columns proper numeric / datetime dtypes in both tables.
for frame in (train, test):
    # errors='coerce' turns entries that cannot be parsed into NaN instead of raising.
    for numeric_col in ('age', 'income', 'seniority'):
        frame[numeric_col] = pd.to_numeric(frame[numeric_col], errors='coerce')

    # Unparseable first-contract dates become NaT.
    frame['first_date'] = pd.to_datetime(frame['first_date'], errors='coerce')

    # The snapshot date is parsed strictly: a malformed value raises.
    frame['fecha_dato'] = pd.to_datetime(frame['fecha_dato'])

In [None]:
# Recount the missing values now that unparseable entries have been coerced to NaN / NaT.
train.isna().sum()

fecha_dato                  0
cust_id                     0
emp_index               27734
residence               27734
sex                     27804
age                     27734
first_date              27734
new_cust                27734
seniority               27734
is_primary              27734
last_primary_date    13622516
cust_type              149781
cust_rel_type          149781
residence_index         27734
foreigner_index         27734
spouse_index         13645501
channel                186126
indfall                 27734
tipodom                 27735
province                93591
province_name           93591
active_index            27734
income                2794375
segment                189368
ind_ahor_fin_ult1           0
ind_aval_fin_ult1           0
ind_cco_fin_ult1            0
ind_cder_fin_ult1           0
ind_cno_fin_ult1            0
ind_ctju_fin_ult1           0
ind_ctma_fin_ult1           0
ind_ctop_fin_ult1           0
ind_ctpp_fin_ult1           0
ind_deco_f

In [None]:
# Share of missing values per column, expressed as a percentage of all rows.
train.isna().sum().div(train.shape[0]).mul(100)

fecha_dato            0.000000
cust_id               0.000000
emp_index             0.203220
residence             0.203220
sex                   0.203732
age                   0.203220
first_date            0.203220
new_cust              0.203220
seniority             0.203220
is_primary            0.203220
last_primary_date    99.818330
cust_type             1.097513
cust_rel_type         1.097513
residence_index       0.203220
foreigner_index       0.203220
spouse_index         99.986752
channel               1.363829
indfall               0.203220
tipodom               0.203227
province              0.685784
province_name         0.685784
active_index          0.203220
income               20.475648
segment               1.387585
ind_ahor_fin_ult1     0.000000
ind_aval_fin_ult1     0.000000
ind_cco_fin_ult1      0.000000
ind_cder_fin_ult1     0.000000
ind_cno_fin_ult1      0.000000
ind_ctju_fin_ult1     0.000000
ind_ctma_fin_ult1     0.000000
ind_ctop_fin_ult1     0.000000
ind_ctpp

In [None]:
# 3) Missing values imputation

# Features with only a small share of missing values (each under 1.4% in train) are
# imputed with their most frequent level. Each table is filled from its own distribution.
cols = ['emp_index','residence','sex','first_date','new_cust','is_primary',"cust_type","cust_rel_type",
       "province","province_name","active_index","channel","segment"]

for i in cols:
    train.loc[train[i].isnull(), i] = train[i].value_counts().index[0]
    test.loc[test[i].isnull(), i] = test[i].value_counts().index[0]

# Income is missing in roughly 20% of the training rows; fill it with the overall mean
# income of the same table.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on train['income']: that chained form acts on a temporary
# object and stops modifying the frame from pandas 3.0 on.
# NOTE(review): age, seniority, indfall, tipodom, residence_index and foreigner_index
# also contain missing values and are left untouched here — confirm this is intended.
train['income'] = train['income'].fillna(train['income'].mean())
test['income'] = test['income'].fillna(test['income'].mean())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['income'].fillna(train['income'].mean(), inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['income'].fillna(test['income'].mean(), inplace = True)


In [None]:
# last_primary_date and spouse_index are missing in more than 99% of the rows,
# so both are removed from the two tables.
sparse_cols = ['last_primary_date', 'spouse_index']
for frame in (train, test):
    frame.drop(columns=sparse_cols, inplace=True)

In [None]:
# Preview the first rows of the training table after imputation and column removal.
train.head()

Unnamed: 0,fecha_dato,cust_id,emp_index,residence,sex,age,first_date,new_cust,seniority,is_primary,cust_type,cust_rel_type,residence_index,foreigner_index,channel,indfall,tipodom,province,province_name,active_index,income,segment,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,ind_ctop_fin_ult1,ind_ctpp_fin_ult1,ind_deco_fin_ult1,ind_deme_fin_ult1,ind_dela_fin_ult1,ind_ecue_fin_ult1,ind_fond_fin_ult1,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
0,2015-01-28,1375586,N,ES,H,35.0,2015-01-12,0.0,6.0,1.0,1.0,A,S,N,KHL,N,1.0,29.0,MALAGA,1.0,87218.1,02 - PARTICULARES,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,2015-01-28,1050611,N,ES,V,23.0,2012-08-10,0.0,35.0,1.0,1.0,I,S,S,KHE,N,1.0,13.0,CIUDAD REAL,0.0,35548.74,03 - UNIVERSITARIO,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,2015-01-28,1050612,N,ES,V,23.0,2012-08-10,0.0,35.0,1.0,1.0,I,S,N,KHE,N,1.0,13.0,CIUDAD REAL,0.0,122179.11,03 - UNIVERSITARIO,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,2015-01-28,1050613,N,ES,H,22.0,2012-08-10,0.0,35.0,1.0,1.0,I,S,N,KHD,N,1.0,50.0,ZARAGOZA,0.0,119775.54,03 - UNIVERSITARIO,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,2015-01-28,1050614,N,ES,V,23.0,2012-08-10,0.0,35.0,1.0,1.0,A,S,N,KHE,N,1.0,50.0,ZARAGOZA,1.0,134254.318238,03 - UNIVERSITARIO,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [None]:
# Bucket customers into age bands: (0, 20] young, (20, 45] adult, (45, 65] senior,
# (65, 100] elder. Missing ages and ages outside (0, 100] fall into no band and become NaN.
age_group = [0, 20, 45, 65, 100]
age_labels = ['young', 'adult', 'senior', 'elder']

train['age_grouped'] = pd.cut(train.age, bins = age_group, labels = age_labels)
# Use the same column name as in train (it was misspelled 'agg_grouped'), so that any
# step selecting 'age_grouped' works on both tables.
test['age_grouped'] = pd.cut(test.age, bins = age_group, labels = age_labels)

In [None]:
# Bucket income into four bands whose edges are 0 followed by the 25th / 50th / 75th / 100th
# percentiles. The edges are computed separately for each table, so the same label can
# cover a different income range in train and in test.
income_labels = ['Low','Ordinary','Median-high','High']
income_percentiles = [25, 50, 75, 100]

train_income_group = [0, *np.nanpercentile(train['income'], income_percentiles)]
test_income_group = [0, *np.nanpercentile(test['income'], income_percentiles)]

train['income_grouped'] = pd.cut(train['income'], train_income_group, labels = income_labels)
test['income_grouped'] = pd.cut(test['income'], test_income_group, labels = income_labels)

# Model Training

In [None]:
# Work on the first 10,000 rows of the 2016-05-28 snapshot (a positional slice, not a random sample).
train_gbdt = train.loc[train['fecha_dato'] == pd.Timestamp('2016-05-28')].iloc[:10000]

# Targets: one ownership flag per bank product.
product_list = [
    'ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1', 'ind_cder_fin_ult1',
    'ind_cno_fin_ult1', 'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1',
    'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
    'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1', 'ind_plan_fin_ult1',
    'ind_pres_fin_ult1', 'ind_reca_fin_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1',
    'ind_viv_fin_ult1', 'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1',
]

# Customer attributes kept as model inputs (they are dummified in the next cell).
feature_list = ['emp_index', 'sex', 'new_cust', 'is_primary', 'segment', 'age_grouped', 'income_grouped']

# Inputs first, then targets.
col_list = [*feature_list, *product_list]

In [None]:
# Keep only the model inputs and the product flags.
train_gbdt = train_gbdt[col_list]

# Give the customer segments short readable names before they are dummified.
segment_names = {'01 - TOP':'VIP','02 - PARTICULARES':'Individuals','03 - UNIVERSITARIO':'Graduated'}
train_gbdt['segment'] = train_gbdt['segment'].replace(segment_names)

# Cast every customer attribute to the pandas string dtype, so that the dtype-based
# selection below picks up exactly these columns and none of the product flags.
for attribute in ('new_cust', 'is_primary', 'segment', 'emp_index', 'sex', 'age_grouped', 'income_grouped'):
    train_gbdt[attribute] = train_gbdt[attribute].astype('string')

# The product flags are the responses to predict; the dummified customer attributes
# (train_dummy) are the inputs to the training process.
train_dummy = pd.get_dummies(train_gbdt.select_dtypes(include = 'string'))

In [None]:
# 1) Feature matrix shared by every per-product model: the dummified customer attributes.
#    The 80% / 20% train/test split itself is performed inside the training cell below.
X = train_dummy

# 2) Container for the GBDT + LR results: product column name -> predicted probabilities
#    for the held-out rows, filled by the training cell below.
pred = dict()

In [None]:
# Train one GBDT + LR model per product and store the predicted probability of the
# positive class for every held-out row in pred[<product>].
for i in product_list:

    # Use the target product column as the target variable
    y = train_gbdt[i]

    # A product that is constant in the sample cannot be modelled; leave it out of pred.
    if y.nunique() != 2:
        continue

    # Same X and same seed on every pass, so every product is split into identical rows
    # and the arrays stored in pred line up with X_test.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

    # The classifiers need both classes in the rows they are fitted on; a product whose
    # positives all landed in the test fold is skipped instead of crashing fit().
    if y_train.nunique() != 2:
        continue

    # Train the GBDT model
    gbdt_model = GradientBoostingClassifier(n_estimators=300,  # Number of boosting stages
                                       learning_rate=0.1,  # Learning rate
                                       max_depth=3,  # Maximum depth of each tree
                                       random_state=123)
    gbdt_model.fit(X_train, y_train)

    # Generate GBDT features: apply() gives, for every sample and every tree, the index of
    # the leaf the sample falls into. Those indices are arbitrary labels, not magnitudes,
    # so they are one-hot encoded before the linear model sees them.
    # handle_unknown='ignore' covers leaves that only test rows happen to reach.
    leaf_encoder = OneHotEncoder(handle_unknown='ignore')
    gbdt_features = leaf_encoder.fit_transform(gbdt_model.apply(X_train)[:, :, 0])

    # Train the LR model using the GBDT generated features
    # (max_iter is raised so lbfgs has room to converge on the wide one-hot matrix).
    lr_model = LogisticRegression(solver='lbfgs', C=1.0, max_iter=1000, random_state=123)
    lr_model.fit(gbdt_features, y_train)

    # Score the held-out rows: probability of the positive class
    gbdt_features_test = leaf_encoder.transform(gbdt_model.apply(X_test)[:, :, 0])
    lr_features = lr_model.predict_proba(gbdt_features_test)[:, 1]

    # Store the predicted score for the product
    pred[i] = lr_features


  return lib.map_infer(values, mapper, convert=convert)
  return lib.map_infer(values, mapper, convert=convert)
  return lib.map_infer(values, mapper, convert=convert)
  return lib.map_infer(values, mapper, convert=convert)
  return lib.map_infer(values, mapper, convert=convert)
  return lib.map_infer(values, mapper, convert=convert)
  return lib.map_infer(values, mapper, convert=convert)
  return lib.map_infer(values, mapper, convert=convert)
  return lib.map_infer(values, mapper, convert=convert)
  return lib.map_infer(values, mapper, convert=convert)
  return lib.map_infer(values, mapper, convert=convert)
  return lib.map_infer(values, mapper, convert=convert)
  return lib.map_infer(values, mapper, convert=convert)
  return lib.map_infer(values, mapper, convert=convert)
  return lib.map_infer(values, mapper, convert=convert)
  return lib.map_infer(values, mapper, convert=convert)
  return lib.map_infer(values, mapper, convert=convert)
  return lib.map_infer(values, mapper, convert=c

In [None]:
# One column of predicted probabilities per modelled product, one row per held-out customer.
pred = pd.DataFrame(pred)

# Turn the index of X_test into an identifier column.
# NOTE(review): that index is the row label carried over from `train`, not the renamed
# ncodpers customer number (which is not part of the training columns) — confirm that
# this is the identifier the recommender is meant to look up.
features_with_id = X_test.reset_index(names = 'cust_id')
col_list = ['cust_id', *X_test.columns, *pred.columns]

# Place features and predictions side by side (rows are matched by position), then
# restore the column names that ignore_index discards.
test_gbdt = pd.concat([features_with_id, pred], axis = 1, ignore_index = True)
test_gbdt.columns = col_list
test_gbdt['cust_id'] = test_gbdt['cust_id'].astype('string')

In [None]:
# Inspect the predicted probabilities for a few held-out rows; the fixed seed keeps the
# preview identical across re-runs.
test_gbdt[list(pred.columns)].sample(10, random_state=123)

Unnamed: 0,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,ind_ctop_fin_ult1,ind_ctpp_fin_ult1,ind_deco_fin_ult1,ind_deme_fin_ult1,ind_dela_fin_ult1,ind_ecue_fin_ult1,ind_fond_fin_ult1,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
323,0.549087,6.437101e-05,0.091373,2e-06,0.002438,0.131242,0.0474511,2.809532e-05,0.004100145,0.018795,0.074171,0.00428,5.420061e-06,0.004593965,0.01381,0.05845,0.03210163,0.01327687,0.00953984,0.068592,0.074373,0.148849
978,0.012476,4.214138e-08,0.024462,0.733031,5.3e-05,2.665688e-08,2.993937e-10,1.604537e-06,9.748362e-12,0.015211,0.06469,1e-05,6.843236e-08,1.845333e-07,2e-05,0.015907,1.572992e-07,5.033015e-08,2.464971e-07,0.004982,0.002347,6.7e-05
315,0.575367,2.220658e-06,0.087176,1e-06,0.000491,0.1140461,0.06493562,0.00246399,0.002404118,0.015059,0.065644,0.010999,1.787065e-06,0.001593524,0.005267,0.104434,0.05767718,0.02127298,0.01591865,0.058026,0.062489,0.138838
1531,0.519508,4.58399e-05,0.019969,2e-06,0.000243,0.06239404,0.01447965,0.008536693,4.273637e-09,0.023411,0.029827,0.002906,3.819388e-07,0.001219522,0.006036,0.037614,0.003418589,0.02644812,1.331762e-07,0.003194,0.02465,0.024931
1421,0.539001,7.27782e-06,0.081278,3e-06,0.002027,0.08179896,0.04801338,1.055221e-06,0.001130448,0.028992,0.095282,0.014314,0.004720721,0.003697618,0.013483,0.087641,0.05104569,0.02054235,0.004417282,0.058107,0.059614,0.14453
227,0.546645,0.0001495222,0.019233,9e-06,9.7e-05,0.08962193,0.02071774,8.923477e-07,0.00319127,0.05955,0.039128,0.006657,1.036228e-06,5.877917e-06,0.014496,0.046737,0.02024874,0.01559308,1.52616e-05,0.023418,0.028155,0.042006
899,0.575367,2.220658e-06,0.087176,1e-06,0.000491,0.1140461,0.06493562,0.00246399,0.002404118,0.015059,0.065644,0.010999,1.787065e-06,0.001593524,0.005267,0.104434,0.05767718,0.02127298,0.01591865,0.058026,0.062489,0.138838
292,0.539001,7.27782e-06,0.081278,3e-06,0.002027,0.08179896,0.04801338,1.055221e-06,0.001130448,0.028992,0.095282,0.014314,0.004720721,0.003697618,0.013483,0.087641,0.05104569,0.02054235,0.004417282,0.058107,0.059614,0.14453
1146,0.539001,7.27782e-06,0.081278,3e-06,0.002027,0.08179896,0.04801338,1.055221e-06,0.001130448,0.028992,0.095282,0.014314,0.004720721,0.003697618,0.013483,0.087641,0.05104569,0.02054235,0.004417282,0.058107,0.059614,0.14453
941,0.578976,0.002384588,0.066099,3e-06,0.006014,0.08203433,0.03929491,2.567205e-06,0.001544382,0.020728,0.065115,0.011169,0.004937701,0.002492795,0.009718,0.052216,0.04183953,0.02222552,5.801213e-06,0.032545,0.032866,0.126658


In [None]:
def gbdt_product_recommender(df, cust_id, top_n, product_cols=None, min_score=0.001):
    """Return the highest-scoring products for one customer.

    Parameters
    ----------
    df : pandas.DataFrame
        Scored table with a 'cust_id' column holding string identifiers and one
        predicted-score column per product.
    cust_id : str or int
        Identifier to look up; it is converted to str before being compared
        with df['cust_id'].
    top_n : int
        Maximum number of products to return.
    product_cols : list-like of str, optional
        Names of the score columns in df. When omitted, the columns of the
        notebook-level `pred` frame are used.
    min_score : float, default 0.001
        Products whose predicted score is below this value are never recommended.

    Returns
    -------
    pandas.DataFrame
        Indexed by product column name, with a single 'pred_score' column sorted
        from highest to lowest. Empty (and a message is printed) when no product
        reaches min_score.

    Raises
    ------
    ValueError
        If cust_id does not match exactly one row of df.
    """
    if product_cols is None:
        # Fall back to the products that were actually modelled in this notebook.
        product_cols = pred.columns

    cust_id = str(cust_id)
    matches = df.loc[df['cust_id'] == cust_id, list(product_cols)]

    # Fail with a clear message instead of the length-mismatch error that renaming
    # the transposed frame would raise for zero or several matching rows.
    if len(matches) != 1:
        raise ValueError(
            f"Expected exactly one row for cust_id {cust_id!r}, found {len(matches)}"
        )

    # One row per product, ranked by predicted score.
    prod_list = matches.T
    prod_list.columns = ['pred_score']
    prod_list = prod_list.sort_values(by = 'pred_score', ascending = False)

    # Output the top N products for this customer, leaving out any product whose
    # score is below min_score.
    prod_list = prod_list[prod_list['pred_score'] >= min_score]
    recommend_list = prod_list.iloc[:top_n]

    if recommend_list.empty:
        print("Based on the customer's info, there is no bank product recommended for now")
    return recommend_list

In [None]:
# Recommend up to 10 products for one identifier taken from the scored hold-out table.
gbdt_product_recommender(df = test_gbdt, cust_id = "12718116", top_n = 10)

Unnamed: 0,pred_score
ind_cco_fin_ult1,0.516553
ind_ctop_fin_ult1,0.083149
ind_recibo_ult1,0.082106
ind_cno_fin_ult1,0.062352
ind_reca_fin_ult1,0.05111
ind_ecue_fin_ult1,0.049002
ind_tjcr_fin_ult1,0.036589
ind_nom_pens_ult1,0.036542
ind_nomina_ult1,0.034199
ind_ctpp_fin_ult1,0.032839


In [None]:
# Number of rows in the full training table for each customer segment.
train['segment'].value_counts()

segment
02 - PARTICULARES     8149588
03 - UNIVERSITARIO    4935579
01 - TOP               562142
Name: count, dtype: int64