In [764]:


import numpy as np
import pandas as pd   
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

In [765]:
data = pd.read_csv('application_record.csv') 
record = pd.read_csv('credit_record.csv')  

In [766]:
data.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0


In [767]:
record.head()

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
0,5001711,0,X
1,5001711,-1,0
2,5001711,-2,0
3,5001711,-3,0
4,5001712,0,C


In [768]:
plt.rcParams['figure.facecolor'] = 'white'

# Feature Engineering

## Response Variable

In [769]:
# find all users' account open month.
begin_month=pd.DataFrame(record.groupby(["ID"])["MONTHS_BALANCE"].agg(min))
begin_month=begin_month.rename(columns={'MONTHS_BALANCE':'begin_month'}) 
new_data=pd.merge(data,begin_month,how="left",on="ID") #merge to record data

Generally, users in risk should be in 3%, thus I choose users who overdue for more than 60 days as target risk users. Those samples are marked as '1', else are '0'.

In [770]:
record['dep_value'] = None
record['dep_value'][record['STATUS'] =='2']='Yes' 
record['dep_value'][record['STATUS'] =='3']='Yes' 
record['dep_value'][record['STATUS'] =='4']='Yes' 
record['dep_value'][record['STATUS'] =='5']='Yes' 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  record['dep_value'][record['STATUS'] =='2']='Yes'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  record['dep_value'][record['STATUS'] =='3']='Yes'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  record['dep_value'][record['STATUS'] =='4']='Yes'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  record['dep_value'

In [771]:
cpunt=record.groupby('ID').count()
cpunt['dep_value'][cpunt['dep_value'] > 0]='Yes' 
cpunt['dep_value'][cpunt['dep_value'] == 0]='No' 
cpunt = cpunt[['dep_value']]
new_data=pd.merge(new_data,cpunt,how='inner',on='ID')
new_data['target']=new_data['dep_value']
new_data.loc[new_data['target']=='Yes','target']=1
new_data.loc[new_data['target']=='No','target']=0
new_data.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,...,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,begin_month,dep_value,target
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,...,-4542,1,1,0,0,,2.0,-15.0,No,0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,...,-4542,1,1,0,0,,2.0,-14.0,No,0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,...,-1134,1,0,0,0,Security staff,2.0,-29.0,No,0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,-3051,1,0,1,1,Sales staff,1.0,-4.0,No,0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,-3051,1,0,1,1,Sales staff,1.0,-26.0,No,0


In [772]:
print(cpunt['dep_value'].value_counts())
cpunt['dep_value'].value_counts(normalize=True)

No     45318
Yes      667
Name: dep_value, dtype: int64


No     0.985495
Yes    0.014505
Name: dep_value, dtype: float64

## Features

+ rename 

In [773]:
new_data.rename(columns={'CODE_GENDER':'Gender','FLAG_OWN_CAR':'Car','FLAG_OWN_REALTY':'Reality',
                         'CNT_CHILDREN':'ChldNo','AMT_INCOME_TOTAL':'inc',
                         'NAME_EDUCATION_TYPE':'edutp','NAME_FAMILY_STATUS':'famtp',
                        'NAME_HOUSING_TYPE':'houtp','FLAG_EMAIL':'email',
                         'NAME_INCOME_TYPE':'inctp','FLAG_WORK_PHONE':'wkphone',
                         'FLAG_PHONE':'phone','CNT_FAM_MEMBERS':'famsize',
                        'OCCUPATION_TYPE':'occyp'
                        },inplace=True)

In [774]:
new_data.isnull().sum()

ID                   0
Gender               0
Car                  0
Reality              0
ChldNo               0
inc                  0
inctp                0
edutp                0
famtp                0
houtp                0
DAYS_BIRTH           0
DAYS_EMPLOYED        0
FLAG_MOBIL           0
wkphone              0
phone                0
email                0
occyp            11323
famsize              0
begin_month          0
dep_value            0
target               0
dtype: int64

In [775]:
new_data.dropna()
new_data = new_data.mask(new_data == 'NULL').dropna()

#### Gender

In [776]:
new_data['Gender'] = new_data['Gender'].replace(['F','M'],[0,1])
print(new_data['Gender'].value_counts())
new_data.head()

0    15630
1     9504
Name: Gender, dtype: int64


Unnamed: 0,ID,Gender,Car,Reality,ChldNo,inc,inctp,edutp,famtp,houtp,...,DAYS_EMPLOYED,FLAG_MOBIL,wkphone,phone,email,occyp,famsize,begin_month,dep_value,target
2,5008806,1,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,...,-1134,1,0,0,0,Security staff,2.0,-29.0,No,0
3,5008808,0,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,-3051,1,0,1,1,Sales staff,1.0,-4.0,No,0
4,5008809,0,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,-3051,1,0,1,1,Sales staff,1.0,-26.0,No,0
5,5008810,0,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,-3051,1,0,1,1,Sales staff,1.0,-26.0,No,0
6,5008811,0,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,-3051,1,0,1,1,Sales staff,1.0,-38.0,No,0


#### Having a car or not

In [777]:
new_data['Car'] = new_data['Car'].replace(['N','Y'],[0,1])
print(new_data['Car'].value_counts())
new_data.head()

0    14618
1    10516
Name: Car, dtype: int64


Unnamed: 0,ID,Gender,Car,Reality,ChldNo,inc,inctp,edutp,famtp,houtp,...,DAYS_EMPLOYED,FLAG_MOBIL,wkphone,phone,email,occyp,famsize,begin_month,dep_value,target
2,5008806,1,1,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,...,-1134,1,0,0,0,Security staff,2.0,-29.0,No,0
3,5008808,0,0,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,-3051,1,0,1,1,Sales staff,1.0,-4.0,No,0
4,5008809,0,0,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,-3051,1,0,1,1,Sales staff,1.0,-26.0,No,0
5,5008810,0,0,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,-3051,1,0,1,1,Sales staff,1.0,-26.0,No,0
6,5008811,0,0,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,-3051,1,0,1,1,Sales staff,1.0,-38.0,No,0


#### Having house reality or not

In [778]:
new_data['Reality'] = new_data['Reality'].replace(['N','Y'],[0,1])
print(new_data['Reality'].value_counts())
new_data.head()

1    16461
0     8673
Name: Reality, dtype: int64


Unnamed: 0,ID,Gender,Car,Reality,ChldNo,inc,inctp,edutp,famtp,houtp,...,DAYS_EMPLOYED,FLAG_MOBIL,wkphone,phone,email,occyp,famsize,begin_month,dep_value,target
2,5008806,1,1,1,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,...,-1134,1,0,0,0,Security staff,2.0,-29.0,No,0
3,5008808,0,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,-3051,1,0,1,1,Sales staff,1.0,-4.0,No,0
4,5008809,0,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,-3051,1,0,1,1,Sales staff,1.0,-26.0,No,0
5,5008810,0,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,-3051,1,0,1,1,Sales staff,1.0,-26.0,No,0
6,5008811,0,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,-3051,1,0,1,1,Sales staff,1.0,-38.0,No,0


#### Having a phone or not

In [779]:
new_data['phone'] = new_data['phone'].replace(['N','Y'],[0,1])
print(new_data['phone'].value_counts())
new_data.head()

0    17775
1     7359
Name: phone, dtype: int64


Unnamed: 0,ID,Gender,Car,Reality,ChldNo,inc,inctp,edutp,famtp,houtp,...,DAYS_EMPLOYED,FLAG_MOBIL,wkphone,phone,email,occyp,famsize,begin_month,dep_value,target
2,5008806,1,1,1,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,...,-1134,1,0,0,0,Security staff,2.0,-29.0,No,0
3,5008808,0,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,-3051,1,0,1,1,Sales staff,1.0,-4.0,No,0
4,5008809,0,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,-3051,1,0,1,1,Sales staff,1.0,-26.0,No,0
5,5008810,0,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,-3051,1,0,1,1,Sales staff,1.0,-26.0,No,0
6,5008811,0,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,-3051,1,0,1,1,Sales staff,1.0,-38.0,No,0


#### Having an email or not

In [780]:
new_data['email'] = new_data['email'].replace([0,1],[0,1])
print(new_data['email'].value_counts())
new_data.head()

0    22604
1     2530
Name: email, dtype: int64


Unnamed: 0,ID,Gender,Car,Reality,ChldNo,inc,inctp,edutp,famtp,houtp,...,DAYS_EMPLOYED,FLAG_MOBIL,wkphone,phone,email,occyp,famsize,begin_month,dep_value,target
2,5008806,1,1,1,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,...,-1134,1,0,0,0,Security staff,2.0,-29.0,No,0
3,5008808,0,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,-3051,1,0,1,1,Sales staff,1.0,-4.0,No,0
4,5008809,0,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,-3051,1,0,1,1,Sales staff,1.0,-26.0,No,0
5,5008810,0,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,-3051,1,0,1,1,Sales staff,1.0,-26.0,No,0
6,5008811,0,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,-3051,1,0,1,1,Sales staff,1.0,-38.0,No,0


#### Having a Work Phone or not

In [781]:
new_data['Reality'] = new_data['Reality'].replace(['N','Y'],[0,1])
print(new_data['Reality'].value_counts())
new_data.head()

1    16461
0     8673
Name: Reality, dtype: int64


Unnamed: 0,ID,Gender,Car,Reality,ChldNo,inc,inctp,edutp,famtp,houtp,...,DAYS_EMPLOYED,FLAG_MOBIL,wkphone,phone,email,occyp,famsize,begin_month,dep_value,target
2,5008806,1,1,1,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,...,-1134,1,0,0,0,Security staff,2.0,-29.0,No,0
3,5008808,0,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,-3051,1,0,1,1,Sales staff,1.0,-4.0,No,0
4,5008809,0,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,-3051,1,0,1,1,Sales staff,1.0,-26.0,No,0
5,5008810,0,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,-3051,1,0,1,1,Sales staff,1.0,-26.0,No,0
6,5008811,0,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,-3051,1,0,1,1,Sales staff,1.0,-38.0,No,0


### Continuous Variables

#### Children Numbers

In [782]:
new_data.loc[new_data['ChldNo'] >= 2,'ChldNo']=2
print(new_data['ChldNo'].value_counts(sort=False))

0    15908
1     6118
2     3108
Name: ChldNo, dtype: int64


In [783]:
new_data['ChldNo'].unique()

array([0, 2, 1], dtype=int64)

#### Age
Bucketing Continuous Variables

In [784]:
new_data['age'] = new_data['DAYS_BIRTH']//-365

In [785]:
new_data.head()

Unnamed: 0,ID,Gender,Car,Reality,ChldNo,inc,inctp,edutp,famtp,houtp,...,FLAG_MOBIL,wkphone,phone,email,occyp,famsize,begin_month,dep_value,target,age
2,5008806,1,1,1,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,...,1,0,0,0,Security staff,2.0,-29.0,No,0,58
3,5008808,0,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,1,0,1,1,Sales staff,1.0,-4.0,No,0,52
4,5008809,0,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,1,0,1,1,Sales staff,1.0,-26.0,No,0,52
5,5008810,0,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,1,0,1,1,Sales staff,1.0,-26.0,No,0,52
6,5008811,0,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,1,0,1,1,Sales staff,1.0,-38.0,No,0,52


In [786]:
new_data = new_data.drop('DAYS_BIRTH',1)


#### Working Years
+ Equal-length Bucketing

In [787]:
new_data['work years']=(new_data['DAYS_EMPLOYED'])//-365

In [788]:
new_data = new_data.drop('DAYS_EMPLOYED',1)


#### Famliy Size

In [789]:
new_data['famsize'].value_counts(sort=False)

20.0        1
4.0      2576
7.0        18
2.0     12697
3.0      5216
15.0        3
6.0        51
5.0       307
9.0         2
1.0      4263
Name: famsize, dtype: int64

In [790]:
new_data['famsize']=new_data['famsize'].astype(int)
new_data.loc[new_data['famsize']>=4,'famsize']= 4
new_data['famsize'].unique()

array([2, 1, 4, 3])

### Categorical Features

#### Income Type

In [791]:
print(new_data['inctp'].value_counts(sort=False))

Working                 15622
Commercial associate     7052
Pensioner                  13
State servant            2437
Student                    10
Name: inctp, dtype: int64


In [792]:
print(new_data['inctp'].value_counts(sort=False))
print(new_data['inctp'].value_counts(normalize=True,sort=False))
new_data.loc[new_data['inctp']=='Pensioner','inctp']='State servant'
new_data.loc[new_data['inctp']=='Student','inctp']='State servant'

Working                 15622
Commercial associate     7052
Pensioner                  13
State servant            2437
Student                    10
Name: inctp, dtype: int64
Working                 0.621549
Commercial associate    0.280576
Pensioner               0.000517
State servant           0.096960
Student                 0.000398
Name: inctp, dtype: float64


In [793]:
inctp = pd.get_dummies(new_data['inctp'], drop_first=True)



In [794]:
new_data = new_data.drop('inctp',1)


In [795]:
new_data = new_data.join(inctp)

In [796]:
new_data.head()

Unnamed: 0,ID,Gender,Car,Reality,ChldNo,inc,edutp,famtp,houtp,FLAG_MOBIL,...,email,occyp,famsize,begin_month,dep_value,target,age,work years,State servant,Working
2,5008806,1,1,1,0,112500.0,Secondary / secondary special,Married,House / apartment,1,...,0,Security staff,2,-29.0,No,0,58,3,0,1
3,5008808,0,0,1,0,270000.0,Secondary / secondary special,Single / not married,House / apartment,1,...,1,Sales staff,1,-4.0,No,0,52,8,0,0
4,5008809,0,0,1,0,270000.0,Secondary / secondary special,Single / not married,House / apartment,1,...,1,Sales staff,1,-26.0,No,0,52,8,0,0
5,5008810,0,0,1,0,270000.0,Secondary / secondary special,Single / not married,House / apartment,1,...,1,Sales staff,1,-26.0,No,0,52,8,0,0
6,5008811,0,0,1,0,270000.0,Secondary / secondary special,Single / not married,House / apartment,1,...,1,Sales staff,1,-38.0,No,0,52,8,0,0


In [797]:
new_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25134 entries, 2 to 36456
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ID             25134 non-null  int64  
 1   Gender         25134 non-null  int64  
 2   Car            25134 non-null  int64  
 3   Reality        25134 non-null  int64  
 4   ChldNo         25134 non-null  int64  
 5   inc            25134 non-null  float64
 6   edutp          25134 non-null  object 
 7   famtp          25134 non-null  object 
 8   houtp          25134 non-null  object 
 9   FLAG_MOBIL     25134 non-null  int64  
 10  wkphone        25134 non-null  int64  
 11  phone          25134 non-null  int64  
 12  email          25134 non-null  int64  
 13  occyp          25134 non-null  object 
 14  famsize        25134 non-null  int32  
 15  begin_month    25134 non-null  float64
 16  dep_value      25134 non-null  object 
 17  target         25134 non-null  object 
 18  age   

#### Occupation Type

In [798]:
new_data = new_data.drop('occyp',1)

In [799]:
new_data.head()

Unnamed: 0,ID,Gender,Car,Reality,ChldNo,inc,edutp,famtp,houtp,FLAG_MOBIL,...,phone,email,famsize,begin_month,dep_value,target,age,work years,State servant,Working
2,5008806,1,1,1,0,112500.0,Secondary / secondary special,Married,House / apartment,1,...,0,0,2,-29.0,No,0,58,3,0,1
3,5008808,0,0,1,0,270000.0,Secondary / secondary special,Single / not married,House / apartment,1,...,1,1,1,-4.0,No,0,52,8,0,0
4,5008809,0,0,1,0,270000.0,Secondary / secondary special,Single / not married,House / apartment,1,...,1,1,1,-26.0,No,0,52,8,0,0
5,5008810,0,0,1,0,270000.0,Secondary / secondary special,Single / not married,House / apartment,1,...,1,1,1,-26.0,No,0,52,8,0,0
6,5008811,0,0,1,0,270000.0,Secondary / secondary special,Single / not married,House / apartment,1,...,1,1,1,-38.0,No,0,52,8,0,0


#### House Type

In [800]:
home_type = pd.get_dummies(new_data['houtp'], drop_first=True)

In [801]:
new_data = new_data.join(home_type)

In [802]:
new_data = new_data.drop('houtp', 1)

#### Education

In [803]:
new_data['edutp'].unique()

array(['Secondary / secondary special', 'Higher education',
       'Incomplete higher', 'Lower secondary', 'Academic degree'],
      dtype=object)

In [804]:
educ = pd.get_dummies(new_data['edutp'],drop_first=True)

In [805]:
new_data = new_data.join(educ)
new_data = new_data.drop('edutp', 1)

In [806]:
new_data.head()

Unnamed: 0,ID,Gender,Car,Reality,ChldNo,inc,famtp,FLAG_MOBIL,wkphone,phone,...,Working,House / apartment,Municipal apartment,Office apartment,Rented apartment,With parents,Higher education,Incomplete higher,Lower secondary,Secondary / secondary special
2,5008806,1,1,1,0,112500.0,Married,1,0,0,...,1,1,0,0,0,0,0,0,0,1
3,5008808,0,0,1,0,270000.0,Single / not married,1,0,1,...,0,1,0,0,0,0,0,0,0,1
4,5008809,0,0,1,0,270000.0,Single / not married,1,0,1,...,0,1,0,0,0,0,0,0,0,1
5,5008810,0,0,1,0,270000.0,Single / not married,1,0,1,...,0,1,0,0,0,0,0,0,0,1
6,5008811,0,0,1,0,270000.0,Single / not married,1,0,1,...,0,1,0,0,0,0,0,0,0,1


####  Marriage Condition

In [807]:
new_data['famtp'].unique()

array(['Married', 'Single / not married', 'Civil marriage', 'Separated',
       'Widow'], dtype=object)

In [808]:
marry = pd.get_dummies(new_data['famtp'], drop_first=True)
new_data = new_data.join(marry)

In [809]:
new_data = new_data.drop('famtp',1)

In [810]:
new_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25134 entries, 2 to 36456
Data columns (total 31 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   ID                             25134 non-null  int64  
 1   Gender                         25134 non-null  int64  
 2   Car                            25134 non-null  int64  
 3   Reality                        25134 non-null  int64  
 4   ChldNo                         25134 non-null  int64  
 5   inc                            25134 non-null  float64
 6   FLAG_MOBIL                     25134 non-null  int64  
 7   wkphone                        25134 non-null  int64  
 8   phone                          25134 non-null  int64  
 9   email                          25134 non-null  int64  
 10  famsize                        25134 non-null  int32  
 11  begin_month                    25134 non-null  float64
 12  dep_value                      25134 non-null 

In [811]:
new_data['dep_value'] = new_data['dep_value'].replace(['No','Yes'],[0,1])
print(new_data['dep_value'].value_counts())
new_data.head()

0    24712
1      422
Name: dep_value, dtype: int64


Unnamed: 0,ID,Gender,Car,Reality,ChldNo,inc,FLAG_MOBIL,wkphone,phone,email,...,Rented apartment,With parents,Higher education,Incomplete higher,Lower secondary,Secondary / secondary special,Married,Separated,Single / not married,Widow
2,5008806,1,1,1,0,112500.0,1,0,0,0,...,0,0,0,0,0,1,1,0,0,0
3,5008808,0,0,1,0,270000.0,1,0,1,1,...,0,0,0,0,0,1,0,0,1,0
4,5008809,0,0,1,0,270000.0,1,0,1,1,...,0,0,0,0,0,1,0,0,1,0
5,5008810,0,0,1,0,270000.0,1,0,1,1,...,0,0,0,0,0,1,0,0,1,0
6,5008811,0,0,1,0,270000.0,1,0,1,1,...,0,0,0,0,0,1,0,0,1,0


In [812]:
X.iloc[:,14:]

Unnamed: 0,Working,House / apartment,Municipal apartment,Office apartment,Rented apartment,With parents,Higher education,Incomplete higher,Lower secondary,Secondary / secondary special,Married,Separated,Single / not married,Widow
2,0.780295,0.370374,-0.182713,-0.089333,-0.133327,-0.245611,-0.629414,-0.20281,-0.086577,0.703804,0.659904,-0.248963,-0.398535,-0.15369
3,-1.281515,0.370374,-0.182713,-0.089333,-0.133327,-0.245611,-0.629414,-0.20281,-0.086577,0.703804,-1.515312,-0.248963,2.509092,-0.15369
4,-1.281515,0.370374,-0.182713,-0.089333,-0.133327,-0.245611,-0.629414,-0.20281,-0.086577,0.703804,-1.515312,-0.248963,2.509092,-0.15369
5,-1.281515,0.370374,-0.182713,-0.089333,-0.133327,-0.245611,-0.629414,-0.20281,-0.086577,0.703804,-1.515312,-0.248963,2.509092,-0.15369
6,-1.281515,0.370374,-0.182713,-0.089333,-0.133327,-0.245611,-0.629414,-0.20281,-0.086577,0.703804,-1.515312,-0.248963,2.509092,-0.15369
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36452,0.780295,0.370374,-0.182713,-0.089333,-0.133327,-0.245611,-0.629414,-0.20281,-0.086577,0.703804,0.659904,-0.248963,-0.398535,-0.15369
36453,-1.281515,0.370374,-0.182713,-0.089333,-0.133327,-0.245611,1.588715,-0.20281,-0.086577,-1.420794,0.659904,-0.248963,-0.398535,-0.15369
36454,-1.281515,0.370374,-0.182713,-0.089333,-0.133327,-0.245611,1.588715,-0.20281,-0.086577,-1.420794,0.659904,-0.248963,-0.398535,-0.15369
36455,0.780295,0.370374,-0.182713,-0.089333,-0.133327,-0.245611,-0.629414,-0.20281,-0.086577,0.703804,0.659904,-0.248963,-0.398535,-0.15369


In [755]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25134 entries, 2 to 36456
Data columns (total 28 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Gender                         25134 non-null  float64
 1   Car                            25134 non-null  float64
 2   Reality                        25134 non-null  float64
 3   ChldNo                         25134 non-null  float64
 4   inc                            25134 non-null  float64
 5   wkphone                        25134 non-null  float64
 6   phone                          25134 non-null  float64
 7   email                          25134 non-null  float64
 8   famsize                        25134 non-null  float64
 9   begin_month                    25134 non-null  float64
 10  dep_value                      25134 non-null  float64
 11  age                            25134 non-null  float64
 12  work years                     25134 non-null 

# Algorithms

+ Split Dataset

In [816]:
new_data = new_data.reset_index()

In [819]:
new_data = new_data.drop('index',1)


KeyError: "['index'] not found in axis"

In [818]:
Y = new_data['target']
X = new_data.drop(['ID', 'target'],1)


In [758]:
X = (X-X.mean())/X.std()

In [759]:
X = X.drop('FLAG_MOBIL',1)

In [760]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25134 entries, 2 to 36456
Data columns (total 28 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Gender                         25134 non-null  float64
 1   Car                            25134 non-null  float64
 2   Reality                        25134 non-null  float64
 3   ChldNo                         25134 non-null  float64
 4   inc                            25134 non-null  float64
 5   wkphone                        25134 non-null  float64
 6   phone                          25134 non-null  float64
 7   email                          25134 non-null  float64
 8   famsize                        25134 non-null  float64
 9   begin_month                    25134 non-null  float64
 10  dep_value                      25134 non-null  float64
 11  age                            25134 non-null  float64
 12  work years                     25134 non-null 

+ After over sampling, the number between 1 and 0 is balanced. It can be seen from the confusion matrix.

In [761]:
X_train, X_test, y_train, y_test = train_test_split(X,Y,  test_size=0.2,
                                                    random_state = 0)

In [762]:
X_train

Unnamed: 0,Gender,Car,Reality,ChldNo,inc,wkphone,phone,email,famsize,begin_month,...,Rented apartment,With parents,Higher education,Incomplete higher,Lower secondary,Secondary / secondary special,Married,Separated,Single / not married,Widow
32644,1.282383,1.17899,-1.377637,0.722207,0.073353,1.628506,-0.643422,-0.334548,0.826422,-1.878349,...,-0.133327,-0.245611,-0.629414,4.930539,-0.086577,-1.420794,0.659904,-0.248963,-0.398535,-0.15369
23562,-0.779767,-0.84815,-1.377637,0.722207,-0.787801,1.628506,1.554128,-0.334548,0.826422,0.311478,...,-0.133327,-0.245611,1.588715,-0.202810,-0.086577,-1.420794,0.659904,-0.248963,-0.398535,-0.15369
28234,-0.779767,-0.84815,0.725852,-0.695914,-0.787801,-0.614035,-0.643422,-0.334548,-0.310796,-1.513377,...,7.500041,-0.245611,-0.629414,-0.202810,-0.086577,0.703804,0.659904,-0.248963,-0.398535,-0.15369
35037,1.282383,-0.84815,-1.377637,2.140328,-0.787801,-0.614035,-0.643422,-0.334548,1.963641,-0.296807,...,-0.133327,-0.245611,1.588715,-0.202810,-0.086577,-1.420794,0.659904,-0.248963,-0.398535,-0.15369
31972,-0.779767,-0.84815,0.725852,-0.695914,0.202526,-0.614035,-0.643422,-0.334548,-0.310796,-0.722607,...,-0.133327,-0.245611,-0.629414,-0.202810,-0.086577,0.703804,0.659904,-0.248963,-0.398535,-0.15369
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19323,-0.779767,-0.84815,0.725852,-0.695914,-0.787801,-0.614035,1.554128,-0.334548,-0.310796,1.588877,...,-0.133327,-0.245611,-0.629414,-0.202810,-0.086577,0.703804,0.659904,-0.248963,-0.398535,-0.15369
28690,-0.779767,-0.84815,0.725852,0.722207,-0.357224,-0.614035,1.554128,-0.334548,0.826422,0.858935,...,-0.133327,-0.245611,-0.629414,-0.202810,-0.086577,0.703804,0.659904,-0.248963,-0.398535,-0.15369
14503,1.282383,-0.84815,-1.377637,-0.695914,0.719218,-0.614035,-0.643422,-0.334548,-1.448015,0.250650,...,-0.133327,-0.245611,-0.629414,-0.202810,-0.086577,0.703804,-1.515312,-0.248963,2.509092,-0.15369
16065,-0.779767,-0.84815,0.725852,-0.695914,-0.357224,-0.614035,-0.643422,-0.334548,-0.310796,1.528048,...,-0.133327,-0.245611,1.588715,-0.202810,-0.086577,-1.420794,0.659904,-0.248963,-0.398535,-0.15369


## Logistic Regression   

In [763]:
model = LogisticRegression(C=0.8,
                           random_state=0,
                           solver='lbfgs')
model.fit(X_train, y_train)
y_predict = model.predict(X_test)


ValueError: Unknown label type: 'unknown'

## Decision Tree

In [589]:
model = DecisionTreeClassifier(max_depth=12,
                               min_samples_split=8,
                               random_state=1024)
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

print('Accuracy Score is {:.5}'.format(accuracy_score(y_test, y_predict)))
print(pd.DataFrame(confusion_matrix(y_test,y_predict)))

plot_confusion_matrix(confusion_matrix(y_test,y_predict),
                      classes=class_names, normalize = True, 
                      title='Normalized Confusion Matrix: CART')

ValueError: Unknown label type: 'unknown'

## Random Forest   



<center>
    <img style="border-radius: 0.3125em;
    box-shadow: 0 2px 4px 0 rgba(34,36,38,.12),0 2px 10px 0 rgba(34,36,38,.08);" 
    src="https://d1rwhvwstyk9gu.cloudfront.net/2019/03/Random-Forest-Algorithm.jpg">
    <br>
    <div style="color:orange; border-bottom: 1px solid #d9d9d9;
    display: inline-block;
    color: #999;
    padding: 2px;">Random Forest</div>
</center>

In [None]:
model = RandomForestClassifier(n_estimators=250,
                              max_depth=12,
                              min_samples_leaf=16
                              )
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

print('Accuracy Score is {:.5}'.format(accuracy_score(y_test, y_predict)))
print(pd.DataFrame(confusion_matrix(y_test,y_predict)))

plot_confusion_matrix(confusion_matrix(y_test,y_predict),
                      classes=class_names, normalize = True, 
                      title='Normalized Confusion Matrix: Ramdom Forests')

## SVM


<center>
    <img style="border-radius: 0.3125em;
    box-shadow: 0 2px 4px 0 rgba(34,36,38,.12),0 2px 10px 0 rgba(34,36,38,.08);" 
    src="https://i.loli.net/2019/11/13/fryWG5al7OPHDiA.gif">
    <br>
    <div style="color:orange; border-bottom: 1px solid #d9d9d9;
    display: inline-block;
    color: #999;
    padding: 2px;">Support Vector Machine</div>
</center>

In [None]:
model = svm.SVC(C = 0.8,
                kernel='linear')
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

print('Accuracy Score is {:.5}'.format(accuracy_score(y_test, y_predict)))
print(pd.DataFrame(confusion_matrix(y_test,y_predict)))

plot_confusion_matrix(confusion_matrix(y_test,y_predict),
                      classes=class_names, normalize = True, 
                      title='Normalized Confusion Matrix: SVM')

## LightGBM

In [None]:
model = LGBMClassifier(num_leaves=31,
                       max_depth=8, 
                       learning_rate=0.02,
                       n_estimators=250,
                       subsample = 0.8,
                       colsample_bytree =0.8
                      )
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
print('Accuracy Score is {:.5}'.format(accuracy_score(y_test, y_predict)))
print(pd.DataFrame(confusion_matrix(y_test,y_predict)))

Showing important features:

In [None]:
def plot_importance(classifer, x_train, point_size = 25):
    '''plot feature importance'''
    values = sorted(zip(x_train.columns, classifer.feature_importances_), key = lambda x: x[1] * -1)
    imp = pd.DataFrame(values,columns = ["Name", "Score"])
    imp.sort_values(by = 'Score',inplace = True)
    sns.scatterplot(x = 'Score',y='Name', linewidth = 0,
                data = imp,s = point_size, color='red').set(
    xlabel='importance', 
    ylabel='features')
    
plot_importance(model, X_train,20)   

In [None]:
model.booster_.feature_importance(importance_type='gain')

## Xgboost

In [None]:
model = XGBClassifier(max_depth=12,
                      n_estimators=250,
                      min_child_weight=8, 
                      subsample=0.8, 
                      learning_rate =0.02,    
                      seed=42)

model.fit(X_train, y_train)
y_predict = model.predict(X_test)
print('Accuracy Score is {:.5}'.format(accuracy_score(y_test, y_predict)))
print(pd.DataFrame(confusion_matrix(y_test,y_predict)))

In [None]:
plot_importance(model, X_train, 20)   

## CatBoost

In [None]:
model = CatBoostClassifier(iterations=250,
                           learning_rate=0.2,
                           od_type='Iter',
                           verbose=25,
                           depth=16,
                           random_seed=42)

model.fit(X_train, y_train)
y_predict = model.predict(X_test)
print('CatBoost Accuracy Score is {:.5}'.format(accuracy_score(y_test, y_predict)))
print(pd.DataFrame(confusion_matrix(y_test,y_predict)))

<font size=3 > Please upvote it if you like it! </font>