# 작업 2유형
- https://www.datamanim.com/dataset/03_dataq/typetwo.html#id3

# 1. 서비스 이탈예측 데이터(Classification)

> Attention
- 데이터 설명 : 고객의 신상정보 데이터를 통한 회사 서비스 이탈 예측 (종속변수 : Exited)
- 데이터 출처 : https://www.kaggle.com/shubh0799/churn-modelling 에서 변형
- X_train : https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/X_train.csv
- y_train : https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/y_train.csv
- X_test : https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/X_test.csv
- y_test(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/y_test.csv

#### 0. 시험 환경 세팅

In [None]:
import pandas as pd

# Download the four churn CSV splits (train/test features and labels) in a fixed order.
X_train, y_train, X_test, y_test = (
    pd.read_csv(
        "https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/" + name + ".csv"
    )
    for name in ("X_train", "y_train", "X_test", "y_test")
)

# Show the (rows, columns) shape of every split as the cell result.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((6499, 12), (6499, 2), (3501, 12), (3501, 2))

#### 1. 라이브러리 및 데이터 호출

In [None]:
import warnings
# Suppress every warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
# 필요한 라이브러리 import

In [None]:
import numpy as np
import pandas as pd

- CustomerId 컬럼 제거
- y_train의 Exited 컬럼을 label data 로 지정

In [None]:
# Remove the identifier column from both feature tables, in place.
X_train.drop(columns="CustomerId", inplace=True)
X_test.drop(columns="CustomerId", inplace=True)

# Keep only the target column, so each label table becomes a Series.
y_train = y_train["Exited"]
y_test = y_test["Exited"]

In [None]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,Zetticci,791,Germany,Female,35,7,52436.2,1,1,0,161051.75
1,Bischof,705,Germany,Male,42,8,166685.92,2,1,1,55313.51
2,Hsiung,543,France,Female,31,4,138317.94,1,0,0,61843.73
3,Pritchard,709,France,Female,32,2,0.0,2,0,0,109681.29
4,Ts'ai,714,Germany,Female,36,1,101609.01,2,1,1,447.73


In [None]:
# Preview the first five training labels.
y_train.head(5)

0    0
1    0
2    0
3    0
4    0
Name: Exited, dtype: int64

#### 2. EDA
- X_train 의 기초 통계량, null 값 확인

In [None]:
# Inspect X_train: column dtypes / non-null counts, then the null total per column.
# DataFrame.info() prints its report itself and returns None, so the outer
# print() also emits a trailing "None" line.
print(X_train.info())
print(X_train.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6499 entries, 0 to 6498
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Surname          6499 non-null   object 
 1   CreditScore      6499 non-null   int64  
 2   Geography        6499 non-null   object 
 3   Gender           6499 non-null   object 
 4   Age              6499 non-null   int64  
 5   Tenure           6499 non-null   int64  
 6   Balance          6499 non-null   float64
 7   NumOfProducts    6499 non-null   int64  
 8   HasCrCard        6499 non-null   int64  
 9   IsActiveMember   6499 non-null   int64  
 10  EstimatedSalary  6499 non-null   float64
dtypes: float64(2), int64(6), object(3)
memory usage: 558.6+ KB
None
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
dtype: int64

In [None]:
# Inspect X_test: column dtypes / non-null counts, then the null total per column.
# DataFrame.info() prints its report itself and returns None, so the outer
# print() also emits a trailing "None" line.
print(X_test.info())
print(X_test.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3501 entries, 0 to 3500
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Surname          3501 non-null   object 
 1   CreditScore      3501 non-null   int64  
 2   Geography        3501 non-null   object 
 3   Gender           3501 non-null   object 
 4   Age              3501 non-null   int64  
 5   Tenure           3501 non-null   int64  
 6   Balance          3501 non-null   float64
 7   NumOfProducts    3501 non-null   int64  
 8   HasCrCard        3501 non-null   int64  
 9   IsActiveMember   3501 non-null   int64  
 10  EstimatedSalary  3501 non-null   float64
dtypes: float64(2), int64(6), object(3)
memory usage: 301.0+ KB
None
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
dtype: int64

In [None]:
# Select the object-dtype (string) columns by comparing each dtype with "object"
# directly. The previous form looked the dtype up via X_train.dtypes[0], which
# only worked because the first column happened to be object, and which uses a
# positional lookup on a label-indexed Series (deprecated in recent pandas).
# The result is still a Series whose index holds the matching column names.
object_columns = X_train.dtypes[X_train.dtypes == "object"]

In [None]:
# The index of object_columns holds the names of the object-dtype columns.
object_columns.index

Index(['Surname', 'Geography', 'Gender'], dtype='object')

In [None]:
# Print every object-dtype column name followed by its number of distinct values.
for i in object_columns.index:
    print(f"{i} {X_train[i].nunique()}")

Surname 2289
Geography 3
Gender 4


In [None]:
# Remove the Surname column from both feature tables, in place.
X_train.drop(columns="Surname", inplace=True)
X_test.drop(columns="Surname", inplace=True)

#### 3. Preprocessing
- Gender : 띄어쓰기 제거 및 대문자 변경
- 수치형 컬럼 : 표준화(StandardScaler)
- 범주형 컬럼 : 인코딩(LabelEncoder)

In [None]:
# Show the raw Gender labels of the training split before cleaning.
print(X_train["Gender"].unique())
# Normalize Gender in both splits: remove spaces and upper-case every label.
X_train["Gender"] = X_train["Gender"].str.replace(" ", "").str.upper()
X_test["Gender"] = X_test["Gender"].str.replace(" ", "").str.upper()
# Show the cleaned Gender labels of the test split.
print(X_test["Gender"].unique())

['Female' 'Male' ' male' 'female']
['FEMALE' 'MALE']


In [None]:
def remove_space_upper(df, col):
    """Strip spaces from and upper-case the string column ``col`` of ``df``.

    The column is overwritten in place on ``df``; the same DataFrame object
    is returned so the call can also be used in an assignment.

    Args:
        df: DataFrame that contains ``col``.
        col: Name of a string column to normalize.

    Returns:
        The DataFrame passed in, with ``col`` cleaned.
    """
    without_spaces = df[col].str.replace(" ", "")
    df[col] = without_spaces.str.upper()
    return df

In [None]:
# Clean the Gender column of both splits with remove_space_upper.
X_train = remove_space_upper(X_train, "Gender")
X_test = remove_space_upper(X_test, "Gender")

In [None]:
# Clean the Geography column of both splits with remove_space_upper.
X_train = remove_space_upper(X_train, "Geography")
X_test = remove_space_upper(X_test, "Geography")

In [None]:
# Preview the training features after the string clean-up.
X_train.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,791,GERMANY,FEMALE,35,7,52436.2,1,1,0,161051.75
1,705,GERMANY,MALE,42,8,166685.92,2,1,1,55313.51
2,543,FRANCE,FEMALE,31,4,138317.94,1,0,0,61843.73
3,709,FRANCE,FEMALE,32,2,0.0,2,0,0,109681.29
4,714,GERMANY,FEMALE,36,1,101609.01,2,1,1,447.73


In [None]:
# 수치형 컬럼 정규화

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Method 1: standardize each numeric column with its own StandardScaler.
int_col = X_train.select_dtypes(include=["int", "float"]).columns
for i in int_col:
    ss = StandardScaler()
    X_train[i] = ss.fit_transform(X_train[[i]])  # fit on the TRAIN column, then transform it
    X_test[i] = ss.transform(X_test[[i]])  # already fitted on train, so only transform the test column

In [None]:
# 방법 2
# scaler = StandardScaler()
# X_train[int_col] = scaler.fit_transform(X_train[int_col])
# X_test[int_col] = scaler.transform(X_test[int_col])

In [None]:
# Preview the training features after scaling the numeric columns.
X_train.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,1.455346,GERMANY,FEMALE,-0.376792,0.677301,-0.391014,-0.897814,0.640843,-1.0292,1.047721
1,0.565183,GERMANY,MALE,0.289748,1.023136,1.439829,0.829508,0.640843,0.971629,-0.777233
2,-1.111636,FRANCE,FEMALE,-0.757672,-0.360202,0.985234,-0.897814,-1.560445,-1.0292,-0.664527
3,0.606586,FRANCE,FEMALE,-0.662452,-1.051871,-1.231301,0.829508,-1.560445,-1.0292,0.16111
4,0.658339,GERMANY,FEMALE,-0.281572,-1.397706,0.396976,0.829508,0.640843,0.971629,-1.724171


In [None]:
# 범주형 컬럼 인코딩

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Integer-encode every remaining object-dtype column, one LabelEncoder per column.
obj_col = X_train.select_dtypes(include="object").columns
for i in obj_col:
    le = LabelEncoder()
    # Learn the label -> integer mapping from the training column and apply it.
    X_train[i] = le.fit_transform(X_train[i])
    # Reuse the mapping learned on train for the test column (no refit).
    X_test[i] = le.transform(X_test[i])


In [None]:
# Preview the training features after label encoding.
X_train.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,1.455346,1,0,-0.376792,0.677301,-0.391014,-0.897814,0.640843,-1.0292,1.047721
1,0.565183,1,1,0.289748,1.023136,1.439829,0.829508,0.640843,0.971629,-0.777233
2,-1.111636,0,0,-0.757672,-0.360202,0.985234,-0.897814,-1.560445,-1.0292,-0.664527
3,0.606586,0,0,-0.662452,-1.051871,-1.231301,0.829508,-1.560445,-1.0292,0.16111
4,0.658339,1,0,-0.281572,-1.397706,0.396976,0.829508,0.640843,0.971629,-1.724171


#### 4. Modeling
- 분류MODEL 을 이용해 서비스 이탈예측
- 각 MODEL 의 기본값을 이용해 예측해 보세요
    - logistic regressor
    - KNN
    - SVC
    - RandomForestClassifier
    - XGBClassifier
- accuracy_score, roc_auc_score 를 이용해 평가
    - accuracy : 0.85
    - auc : 0.85
- gridsearch를 활용한 best model 과 best_parameter, RandomForestClassifier의 feature_importances_ 는 어떤 값을 나타내는가?

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

In [None]:
# Candidate classifiers, each created with a fixed random_state.
rf = RandomForestClassifier(random_state=1)
gb = GradientBoostingClassifier(random_state=1)
xgb = XGBClassifier(random_state=1)

# The list order is rf, gb, xgb.
models = [rf, gb, xgb]

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Tune every candidate model over the same grid with 5-fold CV scored by ROC AUC,
# report the winner of each search, and collect the tuned estimators in order.
params = {
    "n_estimators": [100, 200, 300, 400, 500],
    "max_depth": [1, 2, 3, 4, 5],
}
best_models = []
for model in models:
    gs = GridSearchCV(model, param_grid=params, cv=5, scoring="roc_auc", n_jobs=4)
    gs.fit(X_train, y_train)

    # Separator line, then the model and its best grid point / CV score.
    print("=" * 10)
    print(f"model : {model}")
    print(f"params : {gs.best_params_}")
    print(f"score : {gs.best_score_}")

    best_models.append(gs.best_estimator_)

model : RandomForestClassifier()
params : {'max_depth': 5, 'n_estimators': 300}
score : 0.8477363987150698
model : GradientBoostingClassifier()
params : {'max_depth': 3, 'n_estimators': 100}
score : 0.862244776172996
model : XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)
params : {'max_depth': 

In [None]:
# Number of tuned estimators held in best_models.
len(best_models)

3

In [None]:
# Pick the tuned estimator at index 1. best_models is filled in the order of
# `models` ([rf, gb, xgb]), so this is the GradientBoostingClassifier entry.
model = best_models[1]

In [None]:
# Predictions for the test features: class labels and per-class probabilities.
test_predict = model.predict(X_test)
test_predict_prob = model.predict_proba(X_test)

In [None]:
# Accuracy is computed from the predicted class labels.
print(f"accuracy_score: {accuracy_score(y_test, test_predict)}")

# ROC AUC is computed from column 1 of predict_proba
# (presumably the probability of Exited == 1 — confirm via model.classes_).
print(f"roc_auc_score: {roc_auc_score(y_test, test_predict_prob[:,1])}")

accuracy_score: 0.8608968866038275
roc_auc_score: 0.8602910993015549


In [None]:
#===================================================