#### Hyperparameter-Tuning
- 최적의 옵션값(하이퍼파라미터)을 찾아서 알고리즘의 성능을 극대화하는 방법

In [6]:
import pandas as pd

In [7]:
# Load the dataset for the tuning experiments; its 'IsCanceled' column is used as the label in the next cell.
df = pd.read_csv('test2_modify.csv')

In [8]:
# Target column and the predictor columns handed to every model in this notebook.
label = 'IsCanceled'
features = [
    'LeadTime', 'ArrivalDateYear', 'ArrivalDateWeekNumber', 'ArrivalDateDayOfMonth',
    'StaysInWeekendNights', 'StaysInWeekNights',
    'Adults', 'Children', 'Babies',
    'IsRepeatedGuest', 'PreviousCancellations', 'PreviousBookingsNotCanceled',
    'BookingChanges', 'DaysInWaitingList', 'ADR',
    'TotalOfSpecialRequests', 'day_stay',
    'Foreigner', 'Country_Ratio', 'car_yes', 'RA_Changed',
]

# Deterministic 50/50 split: even-positioned rows go to train, odd-positioned rows to test.
# reset_index() renumbers each half from 0 (the old row number is kept as an 'index' column),
# so later index-aligned assignments against fresh 0..n-1 frames line up.
train = df.iloc[0::2].reset_index()
test = df.iloc[1::2].reset_index()

X_train, X_test = train[features], test[features]
Y_train, Y_test = train[label], test[label]

In [9]:
from sklearn.tree import DecisionTreeClassifier as dt
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.ensemble import GradientBoostingClassifier as gb

In [10]:
from sklearn.metrics import confusion_matrix as cm

In [11]:
# Baseline: random forest with default hyperparameters, fitted on the training half.
model = rf()
model.fit(X_train, Y_train)

RandomForestClassifier()

In [12]:
# Predictions next to the true labels for the test half.
# The prediction column gets a fresh 0..n-1 index; `actual` is aligned to it by index.
for_plot = pd.DataFrame({'predict': model.predict(X_test)}).assign(actual=Y_test)
for_plot.head()

Unnamed: 0,predict,actual
0,0,0
1,0,0
2,1,1
3,1,0
4,0,0


In [13]:
# Flatten the confusion matrix into its four cells (unpacking into four names
# requires a 2x2 matrix, i.e. exactly two classes).
matrix = cm(for_plot['actual'], for_plot['predict'])
TN, FP, FN, TP = matrix.ravel()

# Recall: share of actual positives that the model predicted as positive.
recall = TP / (FN + TP)
recall

0.7809954751131222

In [14]:
def scorer(model):
    """Fit `model` on the training split and return its recall on the test split.

    Reads the notebook-level globals X_train, Y_train, X_test and Y_test.
    The model is fitted in place, so the caller's object is trained afterwards.

    Returns TP / (TP + FN) taken from the confusion matrix of the test labels
    versus the model's predictions.

    NOTE(review): candidates are compared on the test split itself, so the
    best score found by a search is an optimistic estimate - consider a
    separate validation split.
    """
    model.fit(X_train, Y_train)

    # Predictions get a fresh 0..n-1 index; `actual` is aligned to it by index.
    outcome = pd.DataFrame({'predict': model.predict(X_test)}).assign(actual=Y_test)

    # Only the false-negative and true-positive cells are needed for recall.
    _, _, false_neg, true_pos = cm(outcome['actual'], outcome['predict']).ravel()
    return true_pos / (true_pos + false_neg)

In [16]:
model = rf(max_depth=15, max_features=0.5)
scorer(model)

0.7650678733031674

#### 접근법 3가지
- Grid-Search
- Random-Search
- Optimizer

In [18]:
# Grid search: fit and score one random forest per (max_depth, max_features) pair.
md_Is = [3, 5, 7, 9, 11, 13, 15, 17]   # candidate max_depth values
mf_Is = [0.1, 0.3, 0.5, 0.7, 0.9]      # candidate max_features values (floats)
all_scores = []  # one record per trial; the random-search cells below keep appending here

for md in md_Is:
    for mf in mf_Is:
        model = rf(max_depth=md, max_features=mf, n_jobs=-1)
        all_scores.append({'recall':scorer(model), 'md':md, 'mf':mf})

# Build the results table once, after every combination has been scored.
# (It used to sit inside the outer loop and was rebuilt on every max_depth value.)
hyper_df = pd.DataFrame(all_scores)

In [19]:
hyper_df.sort_values(by='recall', ascending=False)

Unnamed: 0,recall,md,mf
39,0.793846,17,0.9
37,0.791312,17,0.5
38,0.787692,17,0.7
33,0.778281,15,0.7
34,0.776109,15,0.9
32,0.77086,15,0.5
36,0.770498,17,0.3
29,0.751493,13,0.9
31,0.746968,15,0.3
28,0.746244,13,0.7


In [20]:
import time

# Demo of a time-boxed loop: keep spinning until `target_time` seconds have elapsed.
target_time = 5
t0 = time.time()

t1 = time.time() - t0          # seconds elapsed so far
while t1 <= target_time:
    t1 = time.time() - t0

print('반복 종료')

반복 종료


In [21]:
from random import randint

# Integer drawn from 1..999 (inclusive) scaled to a float in [0.001, 0.999];
# the random-search cells below use the same recipe for max_features.
randint(1, 999) / 1000

0.498

In [24]:
# Random search within a wall-clock budget (seconds). The budget is only checked
# between trials, so the last fit may finish after the budget has expired.
target_time = 600
t0 = time.time()

t1 = time.time() - t0
while t1 <= target_time:
    # Sample one candidate: depth in 15..20, max_features in 0.001..0.999.
    md = randint(15, 20)
    mf = randint(1, 999) / 1000

    model = rf(max_depth=md, max_features=mf, n_jobs=-1)
    trial = {'recall':scorer(model), 'md':md, 'mf':mf}
    all_scores.append(trial)   # appended to the same list as the grid-search results

    t1 = time.time() - t0

hyper_df = pd.DataFrame(all_scores)
hyper_df.sort_values(by='recall', ascending=False)

Unnamed: 0,recall,md,mf
447,0.802715,20,0.503
480,0.801991,19,0.585
405,0.801810,19,0.711
503,0.801448,20,0.715
89,0.801448,20,0.950
...,...,...,...
1,0.576833,3,0.300
15,0.576109,9,0.100
10,0.506063,7,0.100
5,0.447059,5,0.100


In [25]:
# Coarse-to-fine search: a second time-boxed random-search run over the same
# narrowed range, appending to the trials already collected in `all_scores`.
target_time = 600
t0 = time.time()

t1 = time.time() - t0
while t1 <= target_time:
    md = randint(15, 20)
    mf = randint(1, 999) / 1000

    model = rf(max_depth=md, max_features=mf, n_jobs=-1)
    trial = {'recall':scorer(model), 'md':md, 'mf':mf}
    all_scores.append(trial)

    t1 = time.time() - t0   # budget is re-checked only after a trial finishes

# Show the ten best trials across every run so far.
hyper_df = pd.DataFrame(all_scores)
hyper_df.sort_values(by='recall', ascending=False).head(10)

Unnamed: 0,recall,md,mf
447,0.802715,20,0.503
960,0.801991,20,0.663
480,0.801991,19,0.585
405,0.80181,19,0.711
840,0.80181,18,0.819
89,0.801448,20,0.95
652,0.801448,19,0.928
503,0.801448,20,0.715
737,0.801448,20,0.972
611,0.801267,20,0.536


In [42]:
# Load the second dataset. This rebinds `df`, replacing the frame read from test2_modify.csv.
df = pd.read_csv('test3.csv')
print(df.shape)
# Transpose the preview so each column is shown as a row.
df.head().T

# Source: https://community.ibm.com/accelerators/

(7043, 21)


Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [43]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


#### Business Understanding
- 통신사 고객 관리 데이터
- 고객과의 약정을 통해 정기적인 현금 흐름을 만들어냄
- 고객 인원 수가 중요해보임 → 신규 고객 유치, 기존 고객 이탈을 방지

- customerID : 고객 ID
- gender : 성별
- SeniorCitizen : 고령자(시니어) 여부 (0/1)
- Partner : 파트너 여부
- Dependents : 부양하고 있는 가족이 있는지 여부
- tenure : 계약 기간
- PhoneService : 핸드폰 서비스 사용 여부
- MultipleLines : 다중 회선 사용 여부
- InternetService : 인터넷 서비스 종류
- OnlineSecurity : 온라인 보안 사용 여부
- OnlineBackup : 온라인 백업 선택 여부
- DeviceProtection : 장치 보호 선택 여부
- TechSupport : 기술 지원 선택 여부
- StreamingTV : 스트리밍 TV 서비스 선택 여부
- StreamingMovies : 스트리밍 영화 서비스 선택 여부
- Contract : 재계약하는 기간
- PaperlessBilling : 종이가 아닌 청구서 선택 여부
- PaymentMethod : 결제 방식
- MonthlyCharges : 월별 청구 금액
- TotalCharges : 총 청구 금액
- Churn : 이동, 통신사 변경 여부

In [44]:
# Summary statistics. Only SeniorCitizen, tenure and MonthlyCharges appear in the output;
# TotalCharges is missing from it (the next cell shows it has dtype object).
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [45]:
# Sort the TotalCharges column on its own. It is stored as object (strings),
# so the ordering is lexicographic and blank entries come first.
df['TotalCharges'].sort_values()

936           
3826          
4380          
753           
5218          
         ...  
6646    997.75
5598     998.1
3686    999.45
3353     999.8
2845     999.9
Name: TotalCharges, Length: 7043, dtype: object

In [46]:
# Raw value at row label 936, one of the rows that sorted first above.
# A direct label lookup returns the same value without sorting the whole frame.
df.loc[936, 'TotalCharges']

' '

#### 빈 값 처리 방법 (NULL 처리)
- 그냥 지우기
- 도메인 지식을 이용해 채워넣기
- 평균 대체법
- 해당 column을 label로 두고, 모델을 학습시켜서 예측값으로 빈 값 채우기