# 1. 라이브러리 Import 및 데이터 Read

In [10]:
import os
import pandas as pd

In [11]:
# 경로 설정
os.chdir(r'C:\Users\user\Python_study\data')

In [12]:
# 분석 데이터 Read
data = pd.read_csv("bank-additional-full.csv", sep = ';')

data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


#### One Hot Encoding
- 기계는 숫자만 인식해서 패턴으로 학습할 수 있음
- Input Data 중 카테고리형 변수(object)를 수치화
- pd.get_dummies 함수를 통해 실행

In [13]:
data.dtypes

age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact            object
month              object
day_of_week        object
duration            int64
campaign            int64
pdays               int64
previous            int64
poutcome           object
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
y                  object
dtype: object

In [14]:
# One Hot Encoding
data = pd.get_dummies(data,columns=['job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome'])

data.head()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
0,56,261,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,0,0,1,0,0,0,0,1,0
1,57,149,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,0,0,1,0,0,0,0,1,0
2,37,226,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,0,0,1,0,0,0,0,1,0
3,40,151,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,0,0,1,0,0,0,0,1,0
4,56,307,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,0,0,1,0,0,0,0,1,0


In [16]:
# 아이디 생성
data['id']=range(len(data))

In [17]:
len(data)

41188

In [18]:
# data 중에서 30000개를 랜덤으로 추출해 train 데이터로 지정
train = data.sample(30000,replace=False,random_state=2020).reset_index().drop(['index'],axis=1)

In [19]:
# 추출되지 않은 나머지를 test 데이터로 지정
test = data.loc[ ~data['id'].isin(train['id']) ].reset_index().drop(['index'],axis=1)

# 2. RandomForest 실습

<strong>특징</strong>

1. 해석이 어려움 (앙상블 기법)
2. 매우 느림
3. Decision Tree보다 더 객관적인 변수 중요도를 뽑아낼 수 있음 (더 좋은 성능)


<strong>파라미터</strong>

1. n_estimators : 몇 개의 Decision Tree를 만들 것인지
2. max_depth : 각 Decision Tree의 최대 깊이
3. min_samples_split : 각 Decision Tree에서 각 노드의 최소 샘플 수

<strong>RandomForest 학습</strong>

In [30]:
# Decision Tree를 위한 패키지
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators = 500, min_samples_split = 10)

In [31]:
# 학습에 활용할 input 변수를 지정
input_var = ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed',
       'job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'marital_unknown', 'education_basic.4y', 'education_basic.6y',
       'education_basic.9y', 'education_high.school', 'education_illiterate',
       'education_professional.course', 'education_university.degree',
       'education_unknown', 'default_no', 'default_unknown', 'default_yes',
       'housing_no', 'housing_unknown', 'housing_yes', 'loan_no',
       'loan_unknown', 'loan_yes', 'contact_cellular', 'contact_telephone',
       'month_apr', 'month_aug', 'month_dec', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep',
       'day_of_week_fri', 'day_of_week_mon', 'day_of_week_thu',
       'day_of_week_tue', 'day_of_week_wed', 'poutcome_failure',
       'poutcome_nonexistent', 'poutcome_success']

In [32]:
# Decision Tree 모델을 train 데이터에서 학습
rf.fit(train[input_var],train['y'])

RandomForestClassifier(min_samples_split=10, n_estimators=500)

In [33]:
# 예측 진행
predictions = rf.predict(test[input_var])

In [35]:
test['pred'] = predictions
test.head()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,id,pred
0,37,226,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,1,0,0,0,0,1,0,2,no
1,59,139,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,1,0,0,0,0,1,0,6,no
2,41,217,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,1,0,0,0,0,1,0,7,no
3,29,137,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,1,0,0,0,0,1,0,12,no
4,30,38,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,1,0,0,0,0,1,0,20,no


In [36]:
# 정확도 확인
(pd.Series(predictions)==test['y']).mean()

0.9134787272077226

### Decision Tree와의 비교

In [37]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(min_samples_split=10)

In [39]:
dt.fit(train[input_var], train['y'])

test['pred'] = dt.predict(test[input_var])

In [40]:
(test['pred']==test['y']).mean()

0.8977475867000357

<strong>변수 중요도 뽑아내기</strong>

In [41]:
feature_imp = rf.feature_importances_
imp_df = pd.DataFrame({'var':input_var,
                       'imp':feature_imp})

imp_df.sort_values(['imp'],ascending=False)

Unnamed: 0,var,imp
1,duration,3.218195e-01
8,euribor3m,9.906194e-02
9,nr.employed,7.555290e-02
0,age,5.597246e-02
3,pdays,3.753325e-02
...,...,...
21,job_unknown,1.775689e-03
47,month_dec,7.855791e-04
25,marital_unknown,6.965761e-04
30,education_illiterate,1.062912e-04
