## santander product recommendation by logistic regression

## logistic regression

### import packages

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
import warnings
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *

warnings.filterwarnings('ignore')

pd.set_option("display.max_columns", None)

### feature selection
- 날짜데이터 2개 제외 : antiguedad에 포함 되어있음
- 대부분이 null값인 column 2개 제외
- 중복되는 칼럼 제외

In [2]:
X_features= ["ncodpers",
 'ind_empleado','pais_residencia','sexo','age','ind_nuevo','antiguedad','indrel',
 'indrel_1mes','tiprel_1mes','indresi','indext','canal_entrada','indfall',
 'tipodom','cod_prov','ind_actividad_cliente','renta','segmento']

notuse = ["ult_fec_cli_1t","nomprov", "conyuemp", "fecha_dato", "fecha_alta"]

y_features = [
 'ind_ahor_fin_ult1','ind_aval_fin_ult1','ind_cco_fin_ult1','ind_cder_fin_ult1','ind_cno_fin_ult1','ind_ctju_fin_ult1',
 'ind_ctma_fin_ult1','ind_ctop_fin_ult1','ind_ctpp_fin_ult1','ind_deco_fin_ult1','ind_deme_fin_ult1',
 'ind_dela_fin_ult1','ind_ecue_fin_ult1','ind_fond_fin_ult1','ind_hip_fin_ult1','ind_plan_fin_ult1',
 'ind_pres_fin_ult1','ind_reca_fin_ult1','ind_tjcr_fin_ult1','ind_valo_fin_ult1','ind_viv_fin_ult1','ind_nomina_ult1',
 'ind_nom_pens_ult1','ind_recibo_ult1']

train_cols = X_features + y_features

In [3]:
df_train = pd.read_csv("all_clean.csv", usecols = train_cols, nrows = 1000000)

In [None]:
# df_test = pd.read_csv("test_ver2.csv", usecols = X_features)

## train - null clear

In [4]:
df_train = df_train[df_train['ind_nuevo'] == 0]
df_train = df_train[df_train['antiguedad'] != -999999]
df_train = df_train[df_train['indrel'] == 1]
df_train = df_train[df_train['indresi'] == 'S']
df_train = df_train[df_train['indfall'] == 'N']
df_train = df_train[df_train['tipodom'] == 1]
df_train = df_train[df_train['ind_empleado'] == 'N']
df_train = df_train[df_train['pais_residencia'] == 'ES']
df_train = df_train[df_train['indrel_1mes'] == 1]
df_train = df_train[df_train['tiprel_1mes'] == ('A' or 'I')]
df_train = df_train[df_train['indext'] == 'N']

# null - 최빈값
df_train["canal_entrada"] = df_train["canal_entrada"].fillna(df_train["canal_entrada"].value_counts().index[0])
# null - 중앙값
df_train["renta"] = df_train["renta"].fillna(df_train["renta"].median())
# null - 최빈값
df_train["segmento"] = df_train["segmento"].fillna(df_train["segmento"].value_counts().index[0])

## train - dtype clear
- to int

In [5]:
#segmento - dtype 변형
def func_for_segmento(x) :    
    if type(x) != float :
        if "1" in x :
            result = 1
        elif "2" in x :
            result = 2
        elif "3" in x :
            result = 3
    else : 
        result = None
    return result

df_train["segmento"] = df_train["segmento"].apply(func_for_segmento)

#canal_entrada - 상위 4개 제외, 그 이외 vlaue는 5번째껄로 통일
target_values = ["KHE", "KAT", "KFC", "KHQ"]

df_train["canal_entrada"] = df_train["canal_entrada"].apply(lambda x : "KHM" if x not in target_values else x)

## test - null clear

In [None]:
df_test = df_test[df_test['ind_nuevo'] == 0]
df_test = df_test[df_test['antiguedad'] != -999999]
df_test = df_test[df_test['indrel'] == 1]
df_test = df_test[df_test['indresi'] == 'S']
df_test = df_test[df_test['indfall'] == 'N']
df_test = df_test[df_test['tipodom'] == 1]
df_test = df_test[df_test['ind_empleado'] == 'N']
df_test = df_test[df_test['pais_residencia'] == 'ES']
df_test = df_test[df_test['indrel_1mes'] == 1]
df_test = df_test[df_test['tiprel_1mes'] == ('A' or 'I')]
df_test = df_test[df_test['indext'] == 'N']

# null - 최빈값
df_test["canal_entrada"] = df_test["canal_entrada"].fillna(df_test["canal_entrada"].value_counts().index[0])
# null - 평균값
df_test["renta"] = df_test["renta"].fillna(df_test["renta"].mean())
# null - 최빈값
df_test["segmento"] = df_test["segmento"].fillna(df_test["segmento"].value_counts().index[0])

In [None]:
#segmento - dtype 변형
df_test["segmento"] = df_test["segmento"].apply(func_for_segmento)

#canal_entrada - 상위 4개 제외, 그 이외 vlaue는 5번째껄로 통일
df_test["canal_entrada"] = df_test["canal_entrada"].apply(lambda x : "KHM" if x not in target_values else x)

In [7]:
df_train["age"]   = pd.to_numeric(df_train["age"], errors="coerce")
# df_test["age"]   = pd.to_numeric(df_test["age"], errors="coerce")

age_group = [[0,19],[19,26],[26,36],[36,41],[41,47],[47,55],[55,60],[60,70],[70,80],[80,170]]

def create_age_group(df):  
    df['age_group'] = np.nan
    for i,age in enumerate(age_group):
        row_index = (df['age'] >= age[0]) & (df['age'] < age[1])
        df.loc[row_index,'age_group'] = i
        
create_age_group(df_train)
# create_age_group(df_test)

df_train.drop('age', axis=1, inplace = True)
# df_test.drop('age', axis=1, inplace = True)

In [8]:
renta_group = [[0,50000],[50000,70000],[70000,100000],[100000,150000],[150000,200000],[200000,1000000],[1000000,29000000]]

def create_renta_group(df):  
    df['renta_group'] = np.nan
    for i,renta in enumerate(renta_group):
        row_index = (df['renta'] >= renta[0]) & (df['renta'] < renta[1])
        df.loc[row_index,'renta_group'] = i
        
create_renta_group(df_train)
# create_renta_group(df_test)

df_train.drop('renta', axis=1, inplace = True)
# df_test.drop('renta', axis=1, inplace = True)

In [9]:
drop_column = ['ind_nuevo','indrel','indresi','indfall','tipodom','ind_empleado','pais_residencia','indrel_1mes', 'indext', 'tiprel_1mes']

df_train.drop(drop_column, axis=1, inplace = True)

In [10]:
dummy_col = ["canal_entrada", "cod_prov", "sexo", "segmento"]

X_train = pd.get_dummies(df_train, prefix = dummy_col, columns = dummy_col)
# y_test = pd.get_dummies(df_test, prefix = dummy_col, columns = dummy_col)

In [15]:
y_train = df_train[["ncodpers"] + y_features]

In [17]:
X_train.reset_index(inplace = True, drop = True)
y_train.reset_index(inplace = True, drop = True)

In [18]:
X_train.head()

Unnamed: 0,ncodpers,antiguedad,ind_actividad_cliente,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,ind_ctop_fin_ult1,ind_ctpp_fin_ult1,ind_deco_fin_ult1,ind_deme_fin_ult1,ind_dela_fin_ult1,ind_ecue_fin_ult1,ind_fond_fin_ult1,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1,age_group,renta_group,canal_entrada_KAT,canal_entrada_KFC,canal_entrada_KHE,canal_entrada_KHM,canal_entrada_KHQ,cod_prov_1.0,cod_prov_2.0,cod_prov_3.0,cod_prov_4.0,cod_prov_5.0,cod_prov_6.0,cod_prov_33.0,sexo_H,sexo_V,segmento_1,segmento_2,segmento_3
0,1377105,16.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.0,0.0,0,3.0,2.0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0
1,760289,88.0,1.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,4.0,2.0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0
2,1150802,32.0,1.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,1.0,2.0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1
3,179011,187.0,1.0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0.0,0.0,1,4.0,2.0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0
4,761473,88.0,1.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,4.0,2.0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0


In [19]:
y_train.head()

Unnamed: 0,ncodpers,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,ind_ctop_fin_ult1,ind_ctpp_fin_ult1,ind_deco_fin_ult1,ind_deme_fin_ult1,ind_dela_fin_ult1,ind_ecue_fin_ult1,ind_fond_fin_ult1,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
0,1377105,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.0,0.0,0
1,760289,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0
2,1150802,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0
3,179011,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0.0,0.0,1
4,761473,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0


In [20]:
X_train.to_csv("X_train.csv", index = False)
y_train.to_csv("y_train.csv", index = False)

In [11]:
#y_features 제거
X_train = X_train.drop(labels = y_features + ["ncodpers"], axis = 1)
# y_test = y_test.drop(labels = "ncodpers", axis = 1)

In [15]:
df_train.head()

Unnamed: 0,ncodpers,sexo,antiguedad,canal_entrada,cod_prov,ind_actividad_cliente,segmento,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,ind_ctop_fin_ult1,ind_ctpp_fin_ult1,ind_deco_fin_ult1,ind_deme_fin_ult1,ind_dela_fin_ult1,ind_ecue_fin_ult1,ind_fond_fin_ult1,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1,age_group,renta_group
0,1377105,V,16.0,KHM,1.0,1.0,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.0,0.0,0,3.0,2.0
1,760289,V,88.0,KFC,1.0,1.0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,4.0,2.0
3,1150802,H,32.0,KHE,1.0,1.0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,1.0,2.0
6,179011,V,187.0,KAT,1.0,1.0,2,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0.0,0.0,1,4.0,2.0
7,761473,H,88.0,KAT,1.0,1.0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,4.0,2.0


In [23]:
X_train.head()

Unnamed: 0,antiguedad,ind_actividad_cliente,age_group,renta_group,canal_entrada_KAT,canal_entrada_KFC,canal_entrada_KHE,canal_entrada_KHM,canal_entrada_KHQ,cod_prov_1.0,cod_prov_2.0,cod_prov_3.0,cod_prov_4.0,cod_prov_5.0,cod_prov_6.0,cod_prov_33.0,sexo_H,sexo_V,segmento_1,segmento_2,segmento_3
0,16.0,1.0,3.0,2.0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0
1,88.0,1.0,4.0,2.0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0
3,32.0,1.0,1.0,2.0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1
6,187.0,1.0,4.0,2.0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0
7,88.0,1.0,4.0,2.0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0


In [24]:
#train model
for col in y_features : 
    if len(y_train[col].unique()) != 1 : 
        print(col)
        model = LogisticRegression().fit(X_train, y_train[col])
        y_pred = model.predict(X_train)
        print(roc_auc_score(y_train[col], y_pred))

ind_ahor_fin_ult1
0.5
ind_cco_fin_ult1
0.551418541458
ind_cder_fin_ult1
0.5
ind_cno_fin_ult1
0.5
ind_ctju_fin_ult1
0.963037098328
ind_ctma_fin_ult1
0.5
ind_ctop_fin_ult1
0.622529169015
ind_ctpp_fin_ult1
0.5
ind_deco_fin_ult1
0.5
ind_deme_fin_ult1
0.5
ind_dela_fin_ult1
0.604579975686
ind_ecue_fin_ult1
0.532068290294
ind_fond_fin_ult1
0.503513032855
ind_hip_fin_ult1
0.5
ind_plan_fin_ult1
0.5
ind_pres_fin_ult1
0.5
ind_reca_fin_ult1
0.5
ind_tjcr_fin_ult1
0.5
ind_valo_fin_ult1
0.511821868185
ind_viv_fin_ult1
0.5
ind_nomina_ult1
0.5
ind_nom_pens_ult1
0.5
ind_recibo_ult1
0.499330945084


In [25]:
df_train.to_csv("train_preprocessing.csv", index = False)