# Random Search, Grid Search Project 3

- Grid Search로 KNN 알고리즘 모델 튜닝
- Mobile 데이터로 모델링

In [1]:
# Load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence all warnings for the rest of the session so they do not clutter cell output
warnings.filterwarnings('ignore')
# IPython magic (not plain Python): request the 'retina' figure format from the inline backend
%config InlineBackend.figure_format='retina'

In [2]:
# Load the data: read the mobile CSV directly from its raw GitHub URL
path = 'https://raw.githubusercontent.com/jangrae/csv/master/mobile.csv'
data = pd.read_csv(filepath_or_buffer=path)

In [3]:
# Show the first 5 rows
data.head()

Unnamed: 0,id,COLLEGE,INCOME,OVERAGE,LEFTOVER,HOUSE,HANDSET_PRICE,OVER_15MINS_CALLS_PER_MONTH,AVERAGE_CALL_DURATION,REPORTED_SATISFACTION,REPORTED_USAGE_LEVEL,CONSIDERING_CHANGE_OF_PLAN,CHURN
0,1,0,31953,0,6,313378,161,0,4,unsat,little,no,0
1,2,1,36147,0,13,800586,244,0,6,unsat,little,considering,0
2,3,1,27273,230,0,305049,201,16,15,unsat,very_little,perhaps,0
3,4,0,120070,38,33,788235,780,3,2,unsat,very_high,considering,1
4,5,1,29215,208,85,224784,241,21,1,very_unsat,little,never_thought,0


In [4]:
# Show summary statistics (count, mean, std, min, quartiles, max)
data.describe()

Unnamed: 0,id,COLLEGE,INCOME,OVERAGE,LEFTOVER,HOUSE,HANDSET_PRICE,OVER_15MINS_CALLS_PER_MONTH,AVERAGE_CALL_DURATION,CHURN
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,10000.5,0.5024,80281.44775,85.97955,23.89865,493155.26425,389.6151,8.0007,6.00225,0.4926
std,5773.647028,0.500007,41680.586319,85.992324,26.816645,252407.884692,213.820682,8.925418,4.402558,0.499958
min,1.0,0.0,20007.0,-2.0,0.0,150002.0,130.0,0.0,1.0,0.0
25%,5000.75,0.0,42217.0,0.0,0.0,263714.25,219.0,1.0,2.0,0.0
50%,10000.5,1.0,75366.5,59.0,14.0,452259.5,326.0,4.0,5.0,0.0
75%,15000.25,1.0,115881.75,179.0,41.0,702378.0,533.25,15.0,10.0,1.0
max,20000.0,1.0,159983.0,335.0,89.0,999996.0,899.0,29.0,15.0,1.0


In [5]:
# Remove variable: id
# Rebinding `data` (rather than mutating it with inplace=True) and passing
# errors='ignore' keeps this cell safe to re-run: on a second execution 'id'
# is already gone, and the original call would raise KeyError.
drop_cols = ['id']
data = data.drop(columns=drop_cols, errors='ignore')
data.head()

Unnamed: 0,COLLEGE,INCOME,OVERAGE,LEFTOVER,HOUSE,HANDSET_PRICE,OVER_15MINS_CALLS_PER_MONTH,AVERAGE_CALL_DURATION,REPORTED_SATISFACTION,REPORTED_USAGE_LEVEL,CONSIDERING_CHANGE_OF_PLAN,CHURN
0,0,31953,0,6,313378,161,0,4,unsat,little,no,0
1,1,36147,0,13,800586,244,0,6,unsat,little,considering,0
2,1,27273,230,0,305049,201,16,15,unsat,very_little,perhaps,0
3,0,120070,38,33,788235,780,3,2,unsat,very_high,considering,1
4,1,29215,208,85,224784,241,21,1,very_unsat,little,never_thought,0


In [6]:
# Separate the features (x) from the label (y)
target = 'CHURN'

y = data[target]
x = data.drop(columns=target)

In [7]:
# Create dummy variables for: REPORTED_SATISFACTION, REPORTED_USAGE_LEVEL, CONSIDERING_CHANGE_OF_PLAN
# drop_first=True omits the first level of each column
dumm_cols = [
    'REPORTED_SATISFACTION',
    'REPORTED_USAGE_LEVEL',
    'CONSIDERING_CHANGE_OF_PLAN',
]
x = pd.get_dummies(x, columns=dumm_cols, drop_first=True)
x.head()

Unnamed: 0,COLLEGE,INCOME,OVERAGE,LEFTOVER,HOUSE,HANDSET_PRICE,OVER_15MINS_CALLS_PER_MONTH,AVERAGE_CALL_DURATION,REPORTED_SATISFACTION_sat,REPORTED_SATISFACTION_unsat,REPORTED_SATISFACTION_very_sat,REPORTED_SATISFACTION_very_unsat,REPORTED_USAGE_LEVEL_high,REPORTED_USAGE_LEVEL_little,REPORTED_USAGE_LEVEL_very_high,REPORTED_USAGE_LEVEL_very_little,CONSIDERING_CHANGE_OF_PLAN_considering,CONSIDERING_CHANGE_OF_PLAN_never_thought,CONSIDERING_CHANGE_OF_PLAN_no,CONSIDERING_CHANGE_OF_PLAN_perhaps
0,0,31953,0,6,313378,161,0,4,0,1,0,0,0,1,0,0,0,0,1,0
1,1,36147,0,13,800586,244,0,6,0,1,0,0,0,1,0,0,1,0,0,0
2,1,27273,230,0,305049,201,16,15,0,1,0,0,0,0,0,1,0,0,0,1
3,0,120070,38,33,788235,780,3,2,0,1,0,0,0,0,1,0,1,0,0,0
4,1,29215,208,85,224784,241,21,1,0,0,0,1,0,1,0,0,0,1,0,0


In [8]:
# Split into train and test sets at a 7:3 ratio
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,  # fixed seed so the split is reproducible
)

In [9]:
# Normalization: min-max scale the features
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training split only, then reuse it for the test split
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [12]:
# Estimate performance: cross-validate a KNN classifier with n_neighbors=5 on the training data
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

model = KNeighborsClassifier(n_neighbors=5)
cv_score = cross_val_score(estimator=model, X=x_train, y=y_train)

# Print the per-fold scores, then their mean and standard deviation
print(cv_score)
for label, value in (('평균:', cv_score.mean()), ('표준편차:', cv_score.std())):
    print(label, value)

[0.58571429 0.60035714 0.57785714 0.60321429 0.59785714]
평균: 0.5930000000000001
표준편차: 0.009637829377097347


In [13]:
# Model tuning: grid search over the number of neighbors
from sklearn.model_selection import GridSearchCV

# Candidate values 3 through 10 (the end of range is exclusive)
param = dict(n_neighbors=range(3, 11))

# Base estimator to tune; note it is a KNN classifier despite the `_dt` suffix
model_dt = KNeighborsClassifier()

# 5-fold cross-validated search, scored on accuracy
model = GridSearchCV(estimator=model_dt, param_grid=param, cv=5, scoring='accuracy')

In [14]:
# Train: fit the grid search on the training data
model.fit(x_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': range(3, 11)}, scoring='accuracy')

In [15]:
# Review the results: mean test score per candidate, best parameters, best score
heavy_rule = '=' * 80
light_rule = '-' * 80

print(heavy_rule)
print(model.cv_results_['mean_test_score'])
print(light_rule)
print(model.best_params_)
print(light_rule)
print(model.best_score_)
print(heavy_rule)

[0.5905     0.58628571 0.593      0.58807143 0.59714286 0.59414286
 0.59535714 0.59542857]
--------------------------------------------------------------------------------
{'n_neighbors': 7}
--------------------------------------------------------------------------------
0.5971428571428572


In [16]:
# Evaluate performance on the test set
from sklearn.metrics import confusion_matrix, classification_report

y_pred = model.predict(x_test)

# Compute both summaries first, then print them
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print('confusion matrix:\n', cm)
print(report)

confusion matrix:
 [[1945 1120]
 [1194 1741]]
              precision    recall  f1-score   support

           0       0.62      0.63      0.63      3065
           1       0.61      0.59      0.60      2935

    accuracy                           0.61      6000
   macro avg       0.61      0.61      0.61      6000
weighted avg       0.61      0.61      0.61      6000

