# 분류

## 서비스 이탈예측 데이터

데이터 설명 : 고객의 신상정보 데이터를 통한 회사 서비스 이탈 예측 (종속변수 : Exited)


In [2]:
# Raw-CSV locations of the churn dataset files (train features, train labels, test features).
# NOTE: these names hold URL strings only until the loading cell rebinds them to DataFrames.
x_train, y_train, x_test = (
    f"https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/{stem}.csv"
    for stem in ("X_train", "y_train", "X_test")
)
# Evaluation labels for the test set (y_test), not loaded in the cells shown here:
# https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/y_test.csv

In [3]:
import pandas as pd

# Load the data: each name is rebound from its URL string to the parsed DataFrame.
# NOTE: because of that rebinding, re-running this cell requires re-running the
# URL-assignment cell first (read_csv would otherwise receive a DataFrame).
x_train = pd.read_csv(filepath_or_buffer=x_train)
y_train = pd.read_csv(filepath_or_buffer=y_train)
x_test = pd.read_csv(filepath_or_buffer=x_test)

In [4]:
# Display the training feature frame (rendered as the cell's output below).
x_train

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,15799217,Zetticci,791,Germany,Female,35,7,52436.20,1,1,0,161051.75
1,15748986,Bischof,705,Germany,Male,42,8,166685.92,2,1,1,55313.51
2,15722004,Hsiung,543,France,Female,31,4,138317.94,1,0,0,61843.73
3,15780966,Pritchard,709,France,Female,32,2,0.00,2,0,0,109681.29
4,15636731,Ts'ai,714,Germany,Female,36,1,101609.01,2,1,1,447.73
...,...,...,...,...,...,...,...,...,...,...,...,...
6494,15702806,Martin,696,Spain,Male,24,9,0.00,1,0,0,10883.52
6495,15674179,Vorobyova,513,Germany,Male,34,7,60515.13,1,0,0,124571.09
6496,15790204,Myers,663,Spain,Female,22,9,0.00,1,1,0,29135.89
6497,15690772,Hughes,635,Spain,Female,48,2,0.00,2,1,1,136551.25


In [5]:
# Display the training label frame; it carries CustomerId alongside the Exited target.
y_train

Unnamed: 0,CustomerId,Exited
0,15799217,0
1,15748986,0
2,15722004,0
3,15780966,0
4,15636731,0
...,...,...
6494,15702806,0
6495,15674179,0
6496,15790204,1
6497,15690772,0


In [6]:
# Print per-column dtypes and non-null counts to check for missing values
# and to see which columns are non-numeric (object) and need encoding.
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6499 entries, 0 to 6498
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerId       6499 non-null   int64  
 1   Surname          6499 non-null   object 
 2   CreditScore      6499 non-null   int64  
 3   Geography        6499 non-null   object 
 4   Gender           6499 non-null   object 
 5   Age              6499 non-null   int64  
 6   Tenure           6499 non-null   int64  
 7   Balance          6499 non-null   float64
 8   NumOfProducts    6499 non-null   int64  
 9   HasCrCard        6499 non-null   int64  
 10  IsActiveMember   6499 non-null   int64  
 11  EstimatedSalary  6499 non-null   float64
dtypes: float64(2), int64(7), object(3)
memory usage: 609.4+ KB


In [7]:
# Count distinct values per column to gauge the cardinality of each feature.
# NOTE(review): the recorded output lists 4 distinct Gender values, which suggests
# inconsistently spelled labels — inspect x_train['Gender'].unique() before encoding.
x_train.nunique()

CustomerId         6499
Surname            2289
CreditScore         459
Geography             3
Gender                4
Age                  69
Tenure               11
Balance            4162
NumOfProducts         4
HasCrCard             2
IsActiveMember        2
EstimatedSalary    6499
dtype: int64

In [8]:
# Columns removed from the feature set before modelling.
drop_col = ['CustomerId', 'Surname']

# drop() returns new frames, so x_train / x_test themselves are left unmodified.
x_train_drop = x_train.drop(drop_col, axis="columns")
x_test_drop = x_test.drop(drop_col, axis="columns")

In [9]:
from sklearn.model_selection import train_test_split

# One-hot encode the non-numeric columns of the training features.
x_train_dummies = pd.get_dummies(x_train_drop)
# Encode the test features the same way so both frames share one preprocessing path.
x_test_dummies = pd.get_dummies(x_test_drop)
# Align the test frame to the training columns (same set, same order).
# reindex is used instead of plain column selection so that a category level that
# never occurs in the test set becomes an all-zero column rather than a KeyError;
# levels that occur only in the test set are dropped.
x_test_dummies = x_test_dummies.reindex(columns=x_train_dummies.columns, fill_value=0)
# NOTE(review): the nunique output reports 4 distinct Gender values, so get_dummies
# emits 4 Gender_* columns — confirm whether the labels need normalising first.
# Only the label column is used as the target.
# Assumes y_train rows are in the same order as x_train — TODO confirm via CustomerId.
y = y_train['Exited']

# Hold out 33% of the training data for validation; the seed is fixed for reproducibility.
X_train, X_validation, Y_train, Y_validation = train_test_split(x_train_dummies, y, test_size=0.33, random_state=42)

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with library-default hyperparameters; only the seed is fixed.
rf = RandomForestClassifier(random_state=42)

# Fit on the training split only; the validation split stays held out for scoring.
rf.fit(X=X_train, y=Y_train)

In [12]:
# metrics
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, recall_score, precision_score

# Model scores: compare the fitted forest on the training split vs. the held-out
# validation split. A large train/validation gap indicates overfitting.
predict_train_label = rf.predict(X_train)
predict_train_proba = rf.predict_proba(X_train)[:,1]  # column 1 of predict_proba

predict_validation_label = rf.predict(X_validation)
predict_validation_proba = rf.predict_proba(X_validation)[:,1]

# Metrics computed from hard label predictions. Driving the report from one table
# keeps each printed label tied to the function that produced the number (the
# validation recall line was previously printed under the label "accuracy").
label_metrics = (
    ('accuracy', accuracy_score),
    ('f1_score', f1_score),
    ('recall_score', recall_score),
    ('precision_score', precision_score),
)
for metric_name, metric_fn in label_metrics:
    print(f'train {metric_name} :', metric_fn(Y_train, predict_train_label))
    print(f'validation {metric_name} :', metric_fn(Y_validation, predict_validation_label))
    print('\n')

# ROC AUC is scored on the predicted probabilities rather than the hard labels.
print('train roc_auc_score :', roc_auc_score(Y_train, predict_train_proba))
print('validation roc_auc_score :', roc_auc_score(Y_validation, predict_validation_proba))

# Predictions for the test features (labels and probabilities).
predict_test_label = rf.predict(x_test_dummies)
predict_test_proba = rf.predict_proba(x_test_dummies)[:,1]

train accuracy : 1.0
validation accuracy : 0.8657342657342657


train f1_score : 1.0
validation f1_score : 0.5920679886685553


train recall_score : 1.0
validation accuracy : 0.4543478260869565


train precision_score : 1.0
validation precision_score : 0.8495934959349594


train roc_auc_score : 1.0
validation roc_auc_score : 0.8542568700812798
