In [1]:
# libraries
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the churn dataset (CSV) from the local data directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='./data/churn.csv')

In [3]:
# Inspect the column labels of the freshly loaded frame.
df.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [4]:
# Show the distinct values of the 'Geography' column before encoding it.
df['Geography'].unique()

array(['France', 'Spain', 'Germany'], dtype=object)

In [5]:
# Keep only the columns used for modelling: the predictors plus the 'Exited'
# target. The identifier columns (RowNumber, CustomerId, Surname) are left out.
# NOTE(review): 'Age' is not selected either - confirm that this is intentional.
df = df.loc[:, [
    'CreditScore', 'Geography', 'Gender', 'Tenure', 'Balance',
    'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
    'Exited',
]]
print(df.head())

   CreditScore Geography  Gender  Tenure    Balance  NumOfProducts  HasCrCard  \
0          619    France  Female       2       0.00              1          1   
1          608     Spain  Female       1   83807.86              1          0   
2          502    France  Female       8  159660.80              3          1   
3          699    France  Female       1       0.00              2          0   
4          850     Spain  Female       2  125510.82              1          1   

   IsActiveMember  EstimatedSalary  Exited  
0               1        101348.88       1  
1               1        112542.58       0  
2               0        113931.57       1  
3               0         93826.63       0  
4               1         79084.10       0  


In [6]:
# One-hot encode the two categorical columns into indicator columns.
# dtype=int is pinned explicitly: pandas >= 2.0 returns boolean dummies by
# default, and integer 0/1 columns keep the feature table purely numeric
# regardless of the installed pandas version.
geo_df = pd.get_dummies(df['Geography'], dtype=int)
gender_df = pd.get_dummies(df['Gender'], dtype=int)

print(geo_df.head())

   France  Germany  Spain
0       1        0      0
1       0        0      1
2       1        0      0
3       1        0      0
4       0        0      1


In [7]:
# Append the indicator columns to the main frame, side by side.
# Note: this reassigns `df`, so running the cell twice appends them twice.
df = pd.concat(objs=[df, geo_df, gender_df], axis='columns')

In [8]:
# Remove the original text columns 'Geography' and 'Gender' from the frame,
# then preview the result.
df = df.drop(columns=['Geography', 'Gender'])
print(df.head())

   CreditScore  Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0          619       2       0.00              1          1               1   
1          608       1   83807.86              1          0               1   
2          502       8  159660.80              3          1               0   
3          699       1       0.00              2          0               0   
4          850       2  125510.82              1          1               1   

   EstimatedSalary  Exited  France  Germany  Spain  Female  Male  
0        101348.88       1       1        0      0       1     0  
1        112542.58       0       0        0      1       1     0  
2        113931.57       1       1        0      0       1     0  
3         93826.63       0       1        0      0       1     0  
4         79084.10       0       0        0      1       1     0  


In [9]:
# Feature matrix: every column except the target. `drop` returns a new frame,
# so `df` itself is left untouched.
X = df.drop(columns=['Exited'])

# Target vector: the 'Exited' column.
Y = df['Exited']

In [10]:
# Separate training and test sets: hold out 33% of the rows for testing,
# with a fixed seed so the split is the same on every run.
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(
    X, Y,
    test_size=0.33,
    random_state=1,
)

In [None]:
####################################################
# RandomForestClassifier
####################################################

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Seed the forest so the accuracy and the feature importances are reproducible
# across kernel restarts (the train/test split is already seeded the same way).
model = RandomForestClassifier(random_state=1)
model.fit(xtrain, ytrain)

# Predicted class labels for the held-out rows.
pred = model.predict(xtest)

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, but the conventional order matters once other metrics are added.
accuracy_score(ytest, pred)

# TODO: confusion matrix, classification report

0.8133333333333334

In [15]:
# Importance score the fitted forest assigns to each feature
# (one value per column of xtrain).
model.feature_importances_

array([0.20955707, 0.1133983 , 0.20082673, 0.1494052 , 0.02040766,
       0.02947913, 0.21983893, 0.01077489, 0.02017403, 0.00866159,
       0.0076992 , 0.00977727])

In [16]:
# Column names of the training features, for matching against the
# importance scores.
xtrain.columns

Index(['CreditScore', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'France', 'Germany', 'Spain',
       'Female', 'Male'],
      dtype='object')

In [None]:
# Scaler (Standard, MinMax, ...)
# train with important features only
# Correlation (VIF, correlation matrix) - multicollinearity

In [None]:
####################################################
# Logistic Regression
####################################################

In [None]:
# simple logistic regression (LogisticRegression)
from sklearn.linear_model import LogisticRegression



In [None]:
####################################################
# KNN Classifier
####################################################

In [None]:
# simple KNN classification
from sklearn.neighbors import KNeighborsClassifier


In [None]:
####################################################
# Classification Tree
####################################################

In [None]:
# classification tree, SVM, RF
from sklearn.tree import DecisionTreeClassifier
