## santander product recommendation by logistic regression

## logistic regression

### import packages

In [1]:
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
import multiprocessing
from scipy import sparse
import scipy as sp
import statsmodels.api as sm
import seaborn as sns
import matplotlib as mpl
import matplotlib.pylab as plt
import statsmodels.stats.api as sms
%matplotlib inline
import warnings
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *

# Render every column when a DataFrame is displayed instead of truncating.
pd.set_option("display.max_columns", None)

# Suppress all warnings for the remainder of the session.
warnings.filterwarnings("ignore")



  from pandas.core import datetools


### feature selection
- 날짜데이터 2개 제외 : antiguedad에 포함 되어있음
- 대부분이 null값인 column 2개 제외
- 중복되는 칼럼 제외

In [2]:
# Customer attribute columns read from both the train and the test file.
X_features = [
    "ncodpers",
    "ind_empleado",
    "pais_residencia",
    "sexo",
    "age",
    "ind_nuevo",
    "antiguedad",
    "indrel",
    "indrel_1mes",
    "tiprel_1mes",
    "indresi",
    "indext",
    "canal_entrada",
    "indfall",
    "tipodom",
    "cod_prov",
    "ind_actividad_cliente",
    "renta",
    "segmento",
]

# Columns deliberately left out of the feature set.
notuse = [
    "ult_fec_cli_1t",
    "nomprov",
    "conyuemp",
    "fecha_dato",
    "fecha_alta",
]

# Product indicator columns used as targets; the order matters because later
# cells pick the first column by position.
y_features = [
    "ind_ahor_fin_ult1",
    "ind_aval_fin_ult1",
    "ind_cco_fin_ult1",
    "ind_cder_fin_ult1",
    "ind_cno_fin_ult1",
    "ind_ctju_fin_ult1",
    "ind_ctma_fin_ult1",
    "ind_ctop_fin_ult1",
    "ind_ctpp_fin_ult1",
    "ind_deco_fin_ult1",
    "ind_deme_fin_ult1",
    "ind_dela_fin_ult1",
    "ind_ecue_fin_ult1",
    "ind_fond_fin_ult1",
    "ind_hip_fin_ult1",
    "ind_plan_fin_ult1",
    "ind_pres_fin_ult1",
    "ind_reca_fin_ult1",
    "ind_tjcr_fin_ult1",
    "ind_valo_fin_ult1",
    "ind_viv_fin_ult1",
    "ind_nomina_ult1",
    "ind_nom_pens_ult1",
    "ind_recibo_ult1",
]

# Everything that has to be loaded from the training file.
train_cols = [*X_features, *y_features]

In [3]:
# Read the first million rows of the training file, restricted to the
# attribute and target columns listed in train_cols.
df_train = pd.read_csv(
    "all_clean.csv",
    usecols=train_cols,
    nrows=1000000,
)

In [4]:
# Read the test file, restricted to the customer attribute columns.
df_test = pd.read_csv(
    "test_ver2.csv",
    usecols=X_features,
)

## train - null clear

In [5]:
# Restrict the training rows to customers matching the fixed values below.
# Rows where a compared column is NaN fail an `==` test and are dropped too.
train_mask = (
    (df_train["ind_nuevo"] == 0)
    # -999999 is presumably a missing-value sentinel - TODO confirm.
    & (df_train["antiguedad"] != -999999)
    & (df_train["indrel"] == 1)
    & (df_train["indresi"] == "S")
    & (df_train["indfall"] == "N")
    & (df_train["tipodom"] == 1)
    & (df_train["ind_empleado"] == "N")
    & (df_train["pais_residencia"] == "ES")
    & (df_train["indrel_1mes"] == 1)
    # The previous `== ('A' or 'I')` evaluated to `== 'A'` because `'A' or 'I'`
    # is just 'A'; isin() keeps both relation types as intended.
    & df_train["tiprel_1mes"].isin(["A", "I"])
    & (df_train["indext"] == "N")
)
# Copy so the column assignments below write to an independent frame rather
# than to a slice of the original.
df_train = df_train[train_mask].copy()

# Missing canal_entrada -> most frequent value.
df_train["canal_entrada"] = df_train["canal_entrada"].fillna(
    df_train["canal_entrada"].value_counts().index[0]
)
# Missing renta -> column mean.
df_train["renta"] = df_train["renta"].fillna(df_train["renta"].mean())
# Missing segmento -> most frequent value.
df_train["segmento"] = df_train["segmento"].fillna(
    df_train["segmento"].value_counts().index[0]
)

## train - dtype clear
- to int

In [6]:
# segmento - dtype conversion
def func_for_segmento(x):
    """Map a segmento label to its integer code.

    Parameters
    ----------
    x : str or float
        A segmento label, or a float (e.g. NaN) standing for a missing value.

    Returns
    -------
    int or None
        1, 2 or 3 for the first of the digits "1", "2", "3" (checked in that
        order) found in the label; None when ``x`` is a float or when the
        label contains none of those digits.
    """
    # Floats (NaN placeholders) carry no label to parse.
    if isinstance(x, float):
        return None
    for code in (1, 2, 3):
        if str(code) in x:
            return code
    # No digit matched: previously this path raised UnboundLocalError because
    # `result` was never assigned.
    return None

# Turn the segmento labels into integer codes.
df_train["segmento"] = df_train["segmento"].apply(func_for_segmento)

# canal_entrada - keep the four listed channels as they are and merge every
# other value into "KHM" (presumably the fifth most common channel - confirm).
target_values = ["KHE", "KAT", "KFC", "KHQ"]

df_train["canal_entrada"] = df_train["canal_entrada"].apply(
    lambda channel: channel if channel in target_values else "KHM"
)

## test - null clear

In [None]:
# Apply the same row filter to the test frame as to the training frame.
# NOTE(review): this removes test customers; if a prediction is required for
# every ncodpers in the test file, these rows need handling instead - confirm.
test_mask = (
    (df_test["ind_nuevo"] == 0)
    # -999999 is presumably a missing-value sentinel - TODO confirm.
    & (df_test["antiguedad"] != -999999)
    & (df_test["indrel"] == 1)
    & (df_test["indresi"] == "S")
    & (df_test["indfall"] == "N")
    & (df_test["tipodom"] == 1)
    & (df_test["ind_empleado"] == "N")
    & (df_test["pais_residencia"] == "ES")
    & (df_test["indrel_1mes"] == 1)
    # The previous `== ('A' or 'I')` evaluated to `== 'A'` because `'A' or 'I'`
    # is just 'A'; isin() keeps both relation types as intended.
    & df_test["tiprel_1mes"].isin(["A", "I"])
    & (df_test["indext"] == "N")
)
# Copy so the column assignments below write to an independent frame rather
# than to a slice of the original.
df_test = df_test[test_mask].copy()

# Missing canal_entrada -> most frequent value (computed on the test frame).
df_test["canal_entrada"] = df_test["canal_entrada"].fillna(
    df_test["canal_entrada"].value_counts().index[0]
)
# Missing renta -> column mean (computed on the test frame).
df_test["renta"] = df_test["renta"].fillna(df_test["renta"].mean())
# Missing segmento -> most frequent value (computed on the test frame).
df_test["segmento"] = df_test["segmento"].fillna(
    df_test["segmento"].value_counts().index[0]
)

In [None]:
# segmento - turn the labels into integer codes, as done for the train frame.
df_test["segmento"] = df_test["segmento"].apply(func_for_segmento)

# canal_entrada - keep the channels listed in target_values and merge every
# other value into "KHM".
df_test["canal_entrada"] = df_test["canal_entrada"].apply(
    lambda channel: channel if channel in target_values else "KHM"
)

In [None]:
# Categorical columns to expand into one-hot indicator columns.
dummy_col = [
    "ind_empleado",
    "indresi",
    "indext",
    "canal_entrada",
    "indfall",
    "tiprel_1mes",
    "sexo",
    "pais_residencia",
]

X_train = pd.get_dummies(df_train, prefix=dummy_col, columns=dummy_col)
# NOTE: despite its name, y_test holds the test-set features, not targets.
y_test = pd.get_dummies(df_test, prefix=dummy_col, columns=dummy_col)

# The two frames are encoded independently, so a category level present in
# only one of them produces an indicator column the other lacks. Align the test
# matrix to the training columns (all of them except the targets): levels
# missing from test become all-zero columns, levels unseen in train are
# dropped, and the column order matches the training matrix.
test_columns = [col for col in X_train.columns if col not in y_features]
y_test = y_test.reindex(columns=test_columns, fill_value=0)

In [None]:
# Split off the targets, then remove the target columns and the customer id
# from the predictor matrices.
y_train = df_train[y_features]
X_train = X_train.drop(y_features + ["ncodpers"], axis=1)
y_test = y_test.drop("ncodpers", axis=1)

In [None]:
# Total number of missing cells left in the training predictors.
X_train.isnull().sum().sum()

In [None]:
# Preview the first five rows of the encoded test predictors.
y_test.head(5)

In [None]:
# Train a logistic regression on the first target column only.
first_target = y_train[y_train.columns[0]]
model = LogisticRegression().fit(X_train, first_target)

In [None]:
# Predicted class labels for the training rows themselves (in-sample).
y_pred = model.predict(X=X_train)

In [None]:
# Confusion matrix of the in-sample predictions against the first target.
confusion_matrix(y_true=y_train[y_train.columns[0]], y_pred=y_pred)

In [None]:
# ROC AUC is meant to be computed from continuous scores. Passing the hard 0/1
# labels from predict() reduces the ROC curve to a single operating point, so
# score with the predicted probability of the positive class instead
# (column 1 of predict_proba corresponds to model.classes_[1]).
# NOTE: this is measured on the training rows, so it is an optimistic estimate.
train_scores = model.predict_proba(X_train)[:, 1]
print(roc_auc_score(y_train[y_train.columns[0]], train_scores))

In [None]:
# Load the sample submission file.
sample = pd.read_csv(filepath_or_buffer="sample_submission.csv")

In [None]:
# Preview the first five rows of the sample submission.
sample.head(5)

In [None]:
# Preview the first five rows of the cleaned test frame.
df_test.head(5)