In [41]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

train = pd.read_csv('./data/train.csv', index_col='no')
test = pd.read_csv('./data/test.csv', index_col='no')

def strip_title(row):
    """Return ``row`` without its leading and trailing whitespace.

    Despite its name, the argument is a single value: this notebook passes
    the function to ``Series.apply``, which calls it once per element.
    Values that have no ``strip`` method (numbers, for example) raise
    ``AttributeError``.
    """
    cleaned = row.strip()
    return cleaned

# Strip surrounding whitespace from the text columns of train and test.
# Numeric columns are skipped explicitly instead of relying on a bare
# `except: pass`, which hid every kind of error and silently left a whole
# column unstripped as soon as one of its values was not a string.
for frame in (train, test):
    for column in frame.columns:
        if pd.api.types.is_numeric_dtype(frame[column]):
            continue
        # Non-string entries (e.g. NaN) are passed through unchanged.
        frame[column] = frame[column].map(
            lambda value: value.strip() if isinstance(value, str) else value
        )


# Replace the '?' placeholder with the explicit category 'other' in the
# 'occupation', 'workclass' and 'native-country' columns of train and test.
for frame in (train, test):
    for column in ('occupation', 'workclass', 'native-country'):
        frame[column] = frame[column].replace('?', 'other')

# Earlier variant, kept for reference: it also dropped the capital columns.
# train.drop(['fnlwgt','capital-gain', 'capital-loss','education'],axis=1,inplace=True)
# test.drop(['fnlwgt','capital-gain', 'capital-loss','education'],axis=1,inplace=True)

# Remove the 'fnlwgt' and 'education' columns from both frames.
unused_columns = ['fnlwgt', 'education']
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)

# Collapse 'native-country' into two categories: 'United-States' and
# 'other-country'. The 'other' placeholder is folded into 'United-States';
# every value missing from the mapping becomes NaN in `map` and is then
# filled with 'other-country'.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `frame['native-country']`: that chained
# in-place call works on an intermediate object, is deprecated in pandas 2.x
# and leaves the DataFrame unchanged once Copy-on-Write is enabled.
country_map = {'United-States': 'United-States', 'other': 'United-States'}
train['native-country'] = train['native-country'].map(country_map).fillna('other-country')
test['native-country'] = test['native-country'].map(country_map).fillna('other-country')

# Names of the columns to one-hot encode.
categorical_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
# categorical_features = ['workclass','occupation']

# Replace every listed column by its indicator columns. The indicators are
# named '<column>_<category>' and appended after the remaining columns.
train = pd.get_dummies(train, columns=categorical_features)
test = pd.get_dummies(test, columns=categorical_features)

# Drop the indicator of the 'other' workclass category from both frames.
train = train.drop(columns="workclass_other")
test = test.drop(columns="workclass_other")
# NOTE(review): train and test are encoded independently, so a category that
# occurs in only one of them yields different column sets - confirm that the
# two frames end up with matching feature columns before predicting on test.


# Put the columns of both frames in alphabetical order.
train = train.sort_index(axis=1)
test = test.sort_index(axis=1)

# Separate the feature columns from the 'income' target.
X_all = train.drop(columns=['income'])
y_all = train['income']

# Hold out part of the labelled data for evaluation (scikit-learn's default
# split size; random_state fixes the shuffle).
# The earlier `X_test = test` assignment was removed: it was overwritten by
# this split before ever being used, and the extra in-place sorts of X_train
# and X_test only re-sorted frames that were already sorted. X_test below is
# therefore a hold-out slice of train.csv, not the contents of test.csv.
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, random_state=0)

In [42]:
from sklearn.ensemble import AdaBoostClassifier

# Fit an AdaBoost ensemble of 90 estimators with a fixed seed.
# (The variable is called `bagging_adb`, but AdaBoost is a boosting method.)
bagging_adb = AdaBoostClassifier(random_state=10, n_estimators=90)
bagging_adb.fit(X_train, y_train)

# Score the fitted model on the training split and on the held-out split.
train_accuracy = bagging_adb.score(X_train, y_train)
holdout_accuracy = bagging_adb.score(X_test, y_test)
print("훈련 세트 정확도 : {:.3f}".format(train_accuracy))
print("테스트 세트 정확도 : {:.3f}".format(holdout_accuracy))

훈련 세트 정확도 : 0.861
테스트 세트 정확도 : 0.863


In [60]:
# Search space for the random forest: each 2-tuple is a (start, end) range.
# NOTE(review): 'max_depth' holds three values rather than two - presumably
# they are read as a set of candidate values, not as bounds; confirm against
# the scikit-optimize documentation.
param_grid = dict(
    n_estimators=(50, 100),
    max_features=(10, 30),
    max_depth=(10, 14, 18),
)

In [61]:
from skopt import BayesSearchCV
import warnings

# Install a filter that ignores every warning from this point on.
warnings.filterwarnings('ignore')

# Bayesian hyper-parameter search for a random forest over `param_grid`,
# cross-validated with cv=8 and run on all available cores (n_jobs=-1).
bayes_search = BayesSearchCV(
    estimator=RandomForestClassifier(),
    search_spaces=param_grid,
    cv=8,
    n_jobs=-1,
    random_state=721,
)

bayes_search.fit(X_train, y_train)

BayesSearchCV(cv=8, estimator=RandomForestClassifier(), n_jobs=-1,
              random_state=721,
              search_spaces={'max_depth': (10, 14, 18),
                             'max_features': (10, 30),
                             'n_estimators': (50, 100)})

In [62]:
# Score of the finished search on the held-out split.
holdout_score = bayes_search.score(X_test, y_test)
print("테스트 세트 점수: {:.4f}".format(holdout_score))

테스트 세트 점수: 0.8622


In [63]:
# Report the best parameters, the best cross-validation score and the
# best estimator found by the search.
best_params = bayes_search.best_params_
best_cv_score = bayes_search.best_score_
best_model = bayes_search.best_estimator_
print("최적 매개변수: {}".format(best_params))
print("최고 교차 검증 점수: {:.4f}".format(best_cv_score))
print("최고 성능 모델:\n{}".format(best_model))

최적 매개변수: OrderedDict([('max_depth', 14), ('max_features', 18), ('n_estimators', 65)])
최고 교차 검증 점수: 0.8624
최고 성능 모델:
RandomForestClassifier(max_depth=14, max_features=18, n_estimators=65)


In [None]:
# Search space for the random forest: each 2-tuple is a (start, end) range;
# 'max_depth' lists three values.
param_grid = dict(
    n_estimators=(50, 100),
    max_features=(10, 30),
    max_depth=(10, 14, 18),
)
# Recorded results of a previous run with this search space:
#   best parameters: OrderedDict([('max_depth', 14), ('max_features', 18), ('n_estimators', 65)])
#   best cross-validation score: 0.8624
#   best model:
#   RandomForestClassifier(max_depth=14, max_features=18, n_estimators=65)

In [None]:
# Second Bayesian search: same ranges for n_estimators and max_features,
# max_depth candidates (4, 5, 6), cross-validated with cv=5.
param_grid = {"n_estimators": (50, 100),
              "max_features": (10, 30),
              "max_depth": (4, 5, 6)}
bayes_search = BayesSearchCV(RandomForestClassifier(),
                             param_grid,
                             random_state=721,
                             cv=5,
                             n_jobs=-1)

# The fit call and the first print used to share one line, which is a
# SyntaxError; they are now separate statements.
bayes_search.fit(X_train, y_train)
print("최적 매개변수: {}".format(bayes_search.best_params_))
print("최고 교차 검증 점수: {:.4f}".format(bayes_search.best_score_))
print("최고 성능 모델:\n{}".format(bayes_search.best_estimator_))
# Recorded results of a previous run:
#   best parameters: OrderedDict([('max_depth', 6), ('max_features', 21), ('n_estimators', 69)])
#   best cross-validation score: 0.8543
#   best model:
#   RandomForestClassifier(max_depth=6, max_features=21, n_estimators=69)

In [None]:
# Report the best parameters, the best cross-validation score and the
# best estimator of the most recently fitted `bayes_search`.
best_params = bayes_search.best_params_
best_cv_score = bayes_search.best_score_
best_model = bayes_search.best_estimator_
print("최적 매개변수: {}".format(best_params))
print("최고 교차 검증 점수: {:.4f}".format(best_cv_score))
print("최고 성능 모델:\n{}".format(best_model))
# Recorded results (no max_depth entry - presumably from a search whose
# space did not include it; TODO confirm):
#   best parameters: OrderedDict([('max_features', 17), ('n_estimators', 88)])
#   best cross-validation score: 0.85
#   best model:
#   RandomForestClassifier(max_features=17, n_estimators=88)

In [None]:
# Candidate values of each hyper-parameter, given as a dictionary.
param_grid = {
    "n_estimators": range(50, 100, 5),
    "max_features": range(10, 30, 3),
    "max_depth": range(3, 6),
}

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Notes on GridSearchCV options:
# return_train_score : whether the scores on the training folds are recorded too
# scoring : classification (accuracy, f1), regression (neg_mean_squared_error, r2)
# n_jobs : number of parallel jobs (set it when enough CPU cores are available,
#          default 1) - runs as multiple processes internally -> faster
#grid_search = GridSearchCV(SVC(), param_grid, cv=5, return_train_score=True, scoring='f1_micro', n_jobs=2)
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid,
    n_jobs=-1,
    cv=10,
)

grid_search.fit(X_train, y_train)

holdout_score = grid_search.score(X_test, y_test)
print("테스트 세트 점수: {:.4f}".format(holdout_score))
# Recorded test-set score: 0.8546
best_params = grid_search.best_params_
best_cv_score = grid_search.best_score_
best_model = grid_search.best_estimator_
print("최적 매개변수: {}".format(best_params))
print("최고 교차 검증 점수: {:.4f}".format(best_cv_score))
print("최고 성능 모델:\n{}".format(best_model))
# Recorded results of a previous run:
#   best parameters: {'max_depth': 5, 'max_features': 28, 'n_estimators': 75}
#   best cross-validation score: 0.8502
#   best model:
#   RandomForestClassifier(max_depth=5, max_features=28, n_estimators=75)

In [None]:
# Randomized search (RandomizedSearchCV)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search over the current `param_grid` with cv=5.
# NOTE(review): this cell reuses whichever `param_grid` was defined last;
# the recorded results below fit the range-based grid of the grid-search
# cell - confirm the intended execution order.
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(),
    param_distributions=param_grid,
    cv=5,
    random_state=0,
)

random_search.fit(X_train, y_train)

holdout_score = random_search.score(X_test, y_test)
print("테스트 세트 점수: {:.4f}".format(holdout_score))
# Recorded test-set score: 0.8529
best_params = random_search.best_params_
best_cv_score = random_search.best_score_
best_model = random_search.best_estimator_
print("최적 매개변수: {}".format(best_params))
print("최고 교차 검증 점수: {:.4f}".format(best_cv_score))
print("최고 성능 모델:\n{}".format(best_model))
# Recorded results of a previous run:
#   best parameters: {'n_estimators': 90, 'max_features': 22, 'max_depth': 5}
#   best cross-validation score: 0.8479
#   best model:
#   RandomForestClassifier(max_depth=5, max_features=22, n_estimators=90)