## 의사결정트리에 GridSearchCV를 활용한 모델 탐색

In [3]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import pandas as pd

# Tune a decision tree on the iris data with an exhaustive grid search.
# 20% of the samples are held out; the fixed seed keeps the split reproducible.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=121
)

# Candidate hyperparameters: 3 depths x 2 split thresholds = 6 combinations.
parameters = {'max_depth': [1, 2, 3], 'min_samples_split': [2, 3]}

# Seed the tree too: scikit-learn permutes features at every split, so equally
# good splits can be chosen differently between runs unless random_state is set.
dtree = DecisionTreeClassifier(random_state=121)
# cv=3 scores every combination with 3-fold cross-validation; refit=True
# retrains the best combination on the full training set afterwards.
grid_dtree = GridSearchCV(dtree, param_grid=parameters, cv=3, refit=True)
grid_dtree.fit(X_train, y_train)

# One row per parameter combination, with per-fold and aggregated scores.
scores_df = pd.DataFrame(grid_dtree.cv_results_)

print(f'GridSearchCV 최적 파라미터: {grid_dtree.best_params_}')
print(f'GridSearchCV 최고 정확도: {grid_dtree.best_score_}')
# Compact view: parameters, rank, and mean/std of the cross-validation score.
scores_df[['params', 'rank_test_score', 'mean_test_score', 'std_test_score']]


GridSearchCV 최적 파라미터: {'max_depth': 3, 'min_samples_split': 2}
GridSearchCV 최고 정확도: 0.975


Unnamed: 0,params,rank_test_score,mean_test_score,std_test_score
0,"{'max_depth': 1, 'min_samples_split': 2}",5,0.7,1.110223e-16
1,"{'max_depth': 1, 'min_samples_split': 3}",5,0.7,1.110223e-16
2,"{'max_depth': 2, 'min_samples_split': 2}",3,0.958333,0.03118048
3,"{'max_depth': 2, 'min_samples_split': 3}",3,0.958333,0.03118048
4,"{'max_depth': 3, 'min_samples_split': 2}",1,0.975,0.02041241
5,"{'max_depth': 3, 'min_samples_split': 3}",1,0.975,0.02041241


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.datasets import load_iris
import pandas as pd

# Grid-search an SVM classifier on the iris data.
# 20% of the samples are held out; the fixed seed keeps the split reproducible.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=42
)

# Candidates: 2 values of C x 3 kernels = 6 combinations.
parameters = {'C': [1, 10], 'kernel': ('linear', 'poly', 'rbf')}
svm_model = svm.SVC()
# cv=5 scores every combination with 5-fold cross-validation; refit=True
# retrains the best combination on the full training set afterwards.
clf = GridSearchCV(svm_model, param_grid=parameters, cv=5, refit=True)
clf.fit(X_train, y_train)

# One row per parameter combination, with per-fold and aggregated scores.
clf_df = pd.DataFrame(clf.cv_results_)

# best_params_ / best_score_ are attributes of the fitted search object (clf),
# not of the cv_results_ DataFrame (clf_df).
print(f'최적 파라미터 : {clf.best_params_}')
print(f'최적 결과 : {clf.best_score_}')
# Show every combination that is ranked first.
clf_df[clf_df['rank_test_score'] == 1]


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000801,0.0004,0.000201,0.000401,1,linear,"{'C': 1, 'kernel': 'linear'}",1.0,0.958333,0.875,1.0,0.958333,0.958333,0.045644,1


In [58]:
import pandas as pd
from numpy import nan
# Sample roster, written one person per row as (name, age, gender).
_people = [
    ('홍길동', 0, 'male'),
    ('이나영', 4, 'female'),
    ('마징가', 21, 'male'),
    ('변사또', 20, 'female'),
    ('원더우먼', 38, 'male'),
    ('원빈', 100, 'female'),
    ('현빈', 40, 'male'),
    ('박보검', 61, 'female'),
    ('김상민', 11, 'male'),
    ('김한림', 23, 'male'),
]
# Transpose the rows into the three column lists.
name, age, gender = (list(column) for column in zip(*_people))
# Assemble the table with columns in name / age / gender order.
people_df = pd.DataFrame({'name': name, 'age': age, 'gender': gender})

In [59]:
# Inspect the dtype of every column.
people_df.dtypes
# Convert `gender` to a categorical column. astype returns a new Series, so
# the result has to be assigned back to the frame to take effect.
people_df['gender'] = people_df['gender'].astype(pd.CategoricalDtype())


In [60]:
# Bucket the continuous `age` column into five labelled age groups
# (child/teen, young adult, middle-aged, senior, elderly).
# pd.cut(data, bins, labels): an integer `bins` requests that many
# equal-width intervals over the observed range of the data (0..100 here),
# so the edges depend on the data's min and max. Per the pandas docs the
# range is widened slightly so the minimum value is included.
# Intervals are closed on the right: 20 lands in the first group and 40 in
# the second.
age_group_labels = ['아동청소년', '청년', '중년', '장년', '노년']
ret1 = pd.cut(
    people_df['age'],
    bins=len(age_group_labels),
    labels=age_group_labels,
)
# Store the group next to the raw age.
people_df['cate_age'] = ret1
people_df

Unnamed: 0,name,age,gender,cate_age
0,홍길동,0,male,아동청소년
1,이나영,4,female,아동청소년
2,마징가,21,male,청년
3,변사또,20,female,아동청소년
4,원더우먼,38,male,청년
5,원빈,100,female,노년
6,현빈,40,male,청년
7,박보검,61,female,장년
8,김상민,11,male,아동청소년
9,김한림,23,male,청년


In [61]:
import pandas as pd
# One-hot encode the age group: one indicator column per category, appended
# to the right of the existing columns.
# dtype=int pins the indicators to 0/1 integers; pandas 2.0+ otherwise
# returns True/False booleans.
age_dummies = pd.get_dummies(people_df['cate_age'], dtype=int)
people_df = pd.concat([people_df, age_dummies], axis=1)
people_df

Unnamed: 0,name,age,gender,cate_age,아동청소년,청년,중년,장년,노년
0,홍길동,0,male,아동청소년,1,0,0,0,0
1,이나영,4,female,아동청소년,1,0,0,0,0
2,마징가,21,male,청년,0,1,0,0,0
3,변사또,20,female,아동청소년,1,0,0,0,0
4,원더우먼,38,male,청년,0,1,0,0,0
5,원빈,100,female,노년,0,0,0,0,1
6,현빈,40,male,청년,0,1,0,0,0
7,박보검,61,female,장년,0,0,0,1,0
8,김상민,11,male,아동청소년,1,0,0,0,0
9,김한림,23,male,청년,0,1,0,0,0


In [74]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
import numpy as np
# Encode the age-group labels as integer class codes.
le = LabelEncoder()
# Two-step form: fit() inspects the data and builds the label set, then
# transform() maps each label to its integer code.
en_age = le.fit(people_df['cate_age']).transform(people_df['cate_age'])
# One-step form doing both at once (the result is not kept here).
le.fit_transform(people_df['cate_age'])
# Map the integer codes back to the original string labels.
le.inverse_transform(en_age)



array(['아동청소년', '아동청소년', '청년', '아동청소년', '청년', '노년', '청년', '장년', '아동청소년',
       '청년'], dtype=object)

In [76]:
# One-hot encoding with scikit-learn.
OH = OneHotEncoder()
# Reshape the labels into a single-column 2-D array before encoding.
cate_age_column = people_df['cate_age'].to_numpy().reshape(-1, 1)
# Convert the encoder result to a dense array so the 0/1 matrix can be shown.
items = OH.fit_transform(cate_age_column).toarray()
items

array([[0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.]])

In [75]:
# Ordinal encoding: every category is replaced by a single float code.
# NOTE(review): the displayed codes follow the encoder's own category order
# (노년=0, 아동청소년=1, 장년=2, 청년=3), not age order - confirm whether an
# explicit `categories=` ordering is wanted.
Ode = OrdinalEncoder()
# Reshape the labels into a single-column 2-D array before encoding.
cate_age_2d = people_df['cate_age'].to_numpy().reshape(-1, 1)
Ode.fit(cate_age_2d).transform(cate_age_2d)


array([[1.],
       [1.],
       [3.],
       [1.],
       [3.],
       [0.],
       [3.],
       [2.],
       [1.],
       [3.]])

In [79]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np
# Rescale the integers 0..10 into the range 0..1.
train_array = np.arange(11).reshape(-1, 1)
test_array = np.arange(6).reshape(-1, 1)
scaler = MinMaxScaler()
# Learn min/max from the training data and scale it in one call.
train_scale = scaler.fit_transform(train_array)
print(f'train 스케일링 결과{train_scale}')

# Counter-example: fitting again on the test data makes the scaler use the
# test set's own min/max (0..5), so 5 is mapped to 1.0 instead of 0.5.
test_scale = scaler.fit_transform(test_array)
print(f'test 스케일링: {test_scale}')
# => train and test were not scaled by the same ratio.

#fit을 원래있는 train_array를 기준으로 하는 것이 맞음
from sklearn.preprocessing import MinMaxScaler


train 스케일링 결과[[0. ]
 [0.1]
 [0.2]
 [0.3]
 [0.4]
 [0.5]
 [0.6]
 [0.7]
 [0.8]
 [0.9]
 [1. ]]
test 스케일링: [[0. ]
 [0.2]
 [0.4]
 [0.6]
 [0.8]
 [1. ]]
