In [2]:
# 기본
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress every Python warning so library notices do not clutter the cell outputs.
# NOTE(review): this also hides any warnings scikit-learn emits when a cross-validation
# fit or scoring step fails — confirm that silencing them is intended.
import warnings
warnings.filterwarnings('ignore')

# Apply seaborn's default theme to all matplotlib figures.
sns.set()

# Global matplotlib defaults (set after sns.set() so they are not overwritten by the theme).
# NOTE(review): 'Malgun Gothic' is presumably a Windows font — verify it exists on the
# runtime that executes this notebook, otherwise Korean labels may not render.
plt.rcParams['font.family'] = 'Malgun Gothic'
# Alternative font kept for reference (presumably for macOS):
# plt.rcParams['font.family'] = 'AppleGothic'
# Default figure size as (width, height).
plt.rcParams['figure.figsize'] = 12, 6
# Default font size for figure text.
plt.rcParams['font.size'] = 14
# Use the ASCII hyphen for minus signs on axes instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

# 데이터 전처리 알고리즘
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# 학습용과 검증용으로 나누는 함수
from sklearn.model_selection import train_test_split

# 교차 검증
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# 평가함수
# 분류용
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# 회귀용
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# 모델의 최적의 하이퍼 파라미터를 찾기 위한 도구
from sklearn.model_selection import GridSearchCV

# 머신러닝 알고리즘 - 분류
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

# 머신러닝 알고리즘 - 회귀
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor

# 학습 모델 저장을 위한 라이브러리
import pickle

In [3]:
# Mount Google Drive into the Colab file system; the data and model paths used
# later in this notebook live under this mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Check whether a CUDA GPU is visible to this runtime; the LightGBM and XGBoost
# cells further down request GPU training.
import torch

gpu_available = torch.cuda.is_available()
print(gpu_available)  # True means a GPU can be used
if gpu_available:
    print(torch.cuda.get_device_name(0))  # name of the first GPU
else:
    # get_device_name(0) raises when no CUDA device exists, so report instead of crashing.
    print('GPU를 사용할 수 없습니다.')

True
Tesla T4


### 프로젝트 셋팅

In [4]:
# File that will receive the pickled final model.
best_model_path = '/content/drive/MyDrive/model/best_model_spaceship_titanic.dat'

# Number of cross-validation folds, and the shuffled K-fold splitter shared by every
# cross_val_score / GridSearchCV call below (fixed seed => reproducible fold assignment).
# NOTE(review): StratifiedKFold is imported above and may suit a classification target
# better — confirm before switching.
cv_count = 10
kfold = KFold(cv_count, shuffle=True, random_state=1)

# Parallel result lists: model_name_list[i] labels the score stored in f1_score_list[i].
# Add further lists here if other metrics need to be tracked.
f1_score_list, model_name_list = [], []

### 데이터 준비
- 데이터를 읽어오고 필요한 전처리까지 다 한 다음, 입력 데이터는 train_X, 결과 데이터는 train_y라는 변수에 담아서 준비해 주세요.

In [5]:
# Load the training and test tables (parquet files on Google Drive).
train_path = '/content/drive/MyDrive/data/pca_train.parquet'
test_path = '/content/drive/MyDrive/data/pca_test.parquet'
train_df, test_df = pd.read_parquet(train_path), pd.read_parquet(test_path)

# Render both frames, train first and then test.
display(train_df, test_df)

Unnamed: 0,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,...,p17,p18,p19,p20,p21,p22,p23,p24,p25,Segment
0,-2243.552933,73177.822751,-23307.067102,7561.747680,-3243.689945,4528.594709,-4408.328899,8369.204846,-6309.014641,1232.178495,...,-883.032259,7041.455364,-670.054714,1078.355402,1920.870547,2544.053619,1235.750939,-679.654072,-2314.977216,D
1,-4123.593577,-18596.799844,-5281.553928,552.003971,2047.728562,2851.121296,-2005.535562,3877.903215,-889.924568,3031.998145,...,1026.829959,673.134529,657.221496,1475.141165,863.926840,-1177.211648,2831.235120,-68.137893,-409.367207,E
2,93844.494981,123215.256247,-21110.839100,-7111.076138,-5582.733393,13547.760971,-1048.762694,9425.227220,1943.737288,-143.578503,...,-2319.200651,-747.349426,-2800.120608,1230.518402,-1406.511713,-2206.140597,1277.749684,738.283748,1992.859764,C
3,10648.448214,115700.176918,-21742.094573,11510.390672,-7770.532201,-7559.367421,-5333.651781,13026.736609,-76.952401,259.298011,...,-483.344560,5666.289339,-71.778393,949.211049,2950.268504,975.595853,801.520108,-9.139923,-968.875714,D
4,-76132.442560,-2534.422667,-1881.492818,724.219680,-55.011442,-2366.802627,-931.083210,71.543290,-16.547834,-290.180408,...,183.348897,-279.655391,-284.008621,-2322.540842,-2072.645932,634.432208,-1033.942702,473.094923,-216.472421,E
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2399995,-70708.144660,-4606.543015,2805.802812,-7312.121123,-3526.939821,-4965.947914,220.135111,1057.678517,2360.636914,1934.651584,...,-14.643096,-632.996628,-775.194681,-1093.950869,-880.464696,-509.537637,-358.298234,6.232325,-456.213452,E
2399996,140275.356530,814.052457,-29512.108767,23504.905499,-9175.280457,5675.125393,16409.620662,2465.542079,-4928.311445,2604.849964,...,-977.313742,-6006.581774,245.148039,-1467.533990,-3041.636512,-2123.278748,2089.569477,-2060.825006,-146.818560,D
2399997,17590.595793,3334.474385,5996.492939,645.269909,-4197.372200,1762.693541,-60.440877,2115.391058,-919.578172,2820.212880,...,684.830802,1682.248555,-405.046693,302.598929,-458.846992,-558.545886,-1726.557745,966.842573,-1173.216523,C
2399998,-75089.653851,-3665.731945,-3620.020117,709.662982,42.502434,-1556.732214,-1652.830372,84.509015,-302.824774,-692.984227,...,68.816194,-250.279003,3.779371,-234.113635,-94.101872,-461.100118,-253.046171,45.248177,-381.512640,E


Unnamed: 0,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,...,p16,p17,p18,p19,p20,p21,p22,p23,p24,p25
0,5449.770391,-15457.519040,7975.048602,-2971.418102,-1137.004767,4011.962178,11551.144025,3357.707117,-2924.619991,-1328.630772,...,-3675.470317,89.699954,192.708076,-2603.045310,6486.179111,5040.282733,884.032817,-776.952494,2111.313036,-694.188774
1,-39956.605754,26232.031298,1163.543109,-8719.802046,-6359.839502,-9991.102021,-4302.131559,15137.268711,10291.412366,815.171481,...,-2610.468074,1837.496243,2076.203376,2799.656589,-765.383540,2007.854072,1813.109968,-817.880168,2123.054119,213.050147
2,91343.368173,20400.129674,49117.164568,18297.225678,-3701.289740,445.182612,-8089.748597,9430.604081,-8090.458230,-63.631268,...,1316.920665,569.088380,-2300.244976,-2180.985710,1976.296387,1498.115258,617.151610,557.424181,281.863780,741.645539
3,-60622.721464,-2779.211139,172.147096,-1520.579234,494.335648,-2245.292748,-2498.491928,4213.196662,2446.223072,-925.403801,...,253.566125,-410.776261,611.909126,677.146880,1175.953033,339.431785,63.668976,-470.056671,-770.600264,-332.992309
4,-48203.789429,188.499776,8676.875460,-8345.019526,-4152.234195,-8421.964779,-2943.931762,4537.336519,1748.248578,-1903.155041,...,-1117.081463,114.225258,538.235135,1567.426466,490.377157,422.267112,1492.421103,482.156591,1349.365467,38.372575
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599995,-75126.065175,-3705.200634,-3607.179077,706.459174,88.174569,-1576.755827,-1673.193119,66.734157,-311.373311,-693.622316,...,115.492325,108.838596,-99.179096,52.185987,25.174890,-525.887342,-346.753069,-283.930403,81.819950,-374.758094
599996,-69393.140789,-2960.239476,-1778.824783,-236.728825,-975.862235,-3279.293119,-1335.075258,1572.949261,733.136012,-850.853994,...,871.562358,-175.810517,1104.853614,-301.709977,-972.853633,-1086.022765,239.215755,-739.975975,154.691256,-982.463430
599997,-75112.821114,-3689.619067,-3601.622684,707.283464,73.604369,-1557.581357,-1663.292209,80.980196,-310.201058,-691.082182,...,138.351906,107.394462,-24.579710,64.067763,53.626928,-627.955001,-311.882147,-294.117050,93.183060,-326.612659
599998,429765.434690,-147776.679200,-45333.927642,-79709.429139,-51300.598740,-19337.740212,4039.117750,2184.738303,42456.455268,14162.848870,...,-1217.003942,16497.104569,-3638.910661,1245.666898,1028.243274,-1718.329770,3274.497209,1430.706916,-5689.597895,-9427.709388


In [7]:
# Stack the train rows on top of the test rows and renumber the index from 0,
# discarding the two original indexes.
all_df = pd.concat([train_df, test_df], ignore_index=True)
all_df

Unnamed: 0,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,...,p17,p18,p19,p20,p21,p22,p23,p24,p25,Segment
0,-2243.552933,73177.822751,-23307.067102,7561.747680,-3243.689945,4528.594709,-4408.328899,8369.204846,-6309.014641,1232.178495,...,-883.032259,7041.455364,-670.054714,1078.355402,1920.870547,2544.053619,1235.750939,-679.654072,-2314.977216,D
1,-4123.593577,-18596.799844,-5281.553928,552.003971,2047.728562,2851.121296,-2005.535562,3877.903215,-889.924568,3031.998145,...,1026.829959,673.134529,657.221496,1475.141165,863.926840,-1177.211648,2831.235120,-68.137893,-409.367207,E
2,93844.494981,123215.256247,-21110.839100,-7111.076138,-5582.733393,13547.760971,-1048.762694,9425.227220,1943.737288,-143.578503,...,-2319.200651,-747.349426,-2800.120608,1230.518402,-1406.511713,-2206.140597,1277.749684,738.283748,1992.859764,C
3,10648.448214,115700.176918,-21742.094573,11510.390672,-7770.532201,-7559.367421,-5333.651781,13026.736609,-76.952401,259.298011,...,-483.344560,5666.289339,-71.778393,949.211049,2950.268504,975.595853,801.520108,-9.139923,-968.875714,D
4,-76132.442560,-2534.422667,-1881.492818,724.219680,-55.011442,-2366.802627,-931.083210,71.543290,-16.547834,-290.180408,...,183.348897,-279.655391,-284.008621,-2322.540842,-2072.645932,634.432208,-1033.942702,473.094923,-216.472421,E
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999995,-75126.065175,-3705.200634,-3607.179077,706.459174,88.174569,-1576.755827,-1673.193119,66.734157,-311.373311,-693.622316,...,108.838596,-99.179096,52.185987,25.174890,-525.887342,-346.753069,-283.930403,81.819950,-374.758094,
2999996,-69393.140789,-2960.239476,-1778.824783,-236.728825,-975.862235,-3279.293119,-1335.075258,1572.949261,733.136012,-850.853994,...,-175.810517,1104.853614,-301.709977,-972.853633,-1086.022765,239.215755,-739.975975,154.691256,-982.463430,
2999997,-75112.821114,-3689.619067,-3601.622684,707.283464,73.604369,-1557.581357,-1663.292209,80.980196,-310.201058,-691.082182,...,107.394462,-24.579710,64.067763,53.626928,-627.955001,-311.882147,-294.117050,93.183060,-326.612659,
2999998,429765.434690,-147776.679200,-45333.927642,-79709.429139,-51300.598740,-19337.740212,4039.117750,2184.738303,42456.455268,14162.848870,...,16497.104569,-3638.910661,1245.666898,1028.243274,-1718.329770,3274.497209,1430.706916,-5689.597895,-9427.709388,


In [8]:
# Remove the target column so that only the feature columns remain in the combined frame.
all_df = all_df.drop(columns='Segment')
all_df

Unnamed: 0,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,...,p16,p17,p18,p19,p20,p21,p22,p23,p24,p25
0,-2243.552933,73177.822751,-23307.067102,7561.747680,-3243.689945,4528.594709,-4408.328899,8369.204846,-6309.014641,1232.178495,...,3042.690499,-883.032259,7041.455364,-670.054714,1078.355402,1920.870547,2544.053619,1235.750939,-679.654072,-2314.977216
1,-4123.593577,-18596.799844,-5281.553928,552.003971,2047.728562,2851.121296,-2005.535562,3877.903215,-889.924568,3031.998145,...,-3572.373987,1026.829959,673.134529,657.221496,1475.141165,863.926840,-1177.211648,2831.235120,-68.137893,-409.367207
2,93844.494981,123215.256247,-21110.839100,-7111.076138,-5582.733393,13547.760971,-1048.762694,9425.227220,1943.737288,-143.578503,...,2877.510224,-2319.200651,-747.349426,-2800.120608,1230.518402,-1406.511713,-2206.140597,1277.749684,738.283748,1992.859764
3,10648.448214,115700.176918,-21742.094573,11510.390672,-7770.532201,-7559.367421,-5333.651781,13026.736609,-76.952401,259.298011,...,2832.593677,-483.344560,5666.289339,-71.778393,949.211049,2950.268504,975.595853,801.520108,-9.139923,-968.875714
4,-76132.442560,-2534.422667,-1881.492818,724.219680,-55.011442,-2366.802627,-931.083210,71.543290,-16.547834,-290.180408,...,397.133626,183.348897,-279.655391,-284.008621,-2322.540842,-2072.645932,634.432208,-1033.942702,473.094923,-216.472421
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999995,-75126.065175,-3705.200634,-3607.179077,706.459174,88.174569,-1576.755827,-1673.193119,66.734157,-311.373311,-693.622316,...,115.492325,108.838596,-99.179096,52.185987,25.174890,-525.887342,-346.753069,-283.930403,81.819950,-374.758094
2999996,-69393.140789,-2960.239476,-1778.824783,-236.728825,-975.862235,-3279.293119,-1335.075258,1572.949261,733.136012,-850.853994,...,871.562358,-175.810517,1104.853614,-301.709977,-972.853633,-1086.022765,239.215755,-739.975975,154.691256,-982.463430
2999997,-75112.821114,-3689.619067,-3601.622684,707.283464,73.604369,-1557.581357,-1663.292209,80.980196,-310.201058,-691.082182,...,138.351906,107.394462,-24.579710,64.067763,53.626928,-627.955001,-311.882147,-294.117050,93.183060,-326.612659
2999998,429765.434690,-147776.679200,-45333.927642,-79709.429139,-51300.598740,-19337.740212,4039.117750,2184.738303,42456.455268,14162.848870,...,-1217.003942,16497.104569,-3638.910661,1245.666898,1028.243274,-1718.329770,3274.497209,1430.706916,-5689.597895,-9427.709388


In [9]:
# Learn per-feature mean and standard deviation from the combined (train + test) features.
scalerX = StandardScaler().fit(all_df)
# fit() returns the scaler itself, so showing the variable reproduces the cell output.
scalerX

In [19]:
# Encode the string labels in `Segment` as integers and store them in a new column.
# The fitted encoder `le` holds the label <-> integer mapping.
le = LabelEncoder()
le.fit(train_df['Segment'])
train_df['Segment_encoded'] = le.transform(train_df['Segment'])

In [20]:
# Target: the integer-encoded segment.
y = train_df['Segment_encoded']
# Features: every column except the raw and the encoded target
# (alternatively, select only the features you want here).
X = train_df.drop(columns=['Segment', 'Segment_encoded'])

In [21]:
# Standardise the training features with the scaler fitted earlier in the notebook;
# the result is a NumPy array, shown as the cell output.
X2 = scalerX.transform(X)
X2

array([[-0.02196453,  2.2166774 , -1.28748522, ...,  0.82717119,
        -0.4947715 , -1.69319227],
       [-0.04065735, -0.5625636 , -0.29170579, ...,  1.89487053,
        -0.04934318, -0.29942064],
       [ 0.93341769,  3.7319774 , -1.16615948, ...,  0.85527679,
         0.53805426,  1.45757884],
       ...,
       [ 0.17524203,  0.10158837,  0.33132486, ..., -1.15520823,
         0.70453649, -0.85810343],
       [-0.74625721, -0.11040117, -0.19991804, ..., -0.16913307,
         0.03324722, -0.27904769],
       [-0.35548661,  0.13190854,  0.4412536 , ...,  0.59395851,
        -1.270201  ,  1.01083912]])

In [22]:
# Expose the prepared inputs/targets under the names every modelling cell below uses.
train_X, train_y = X2, y

### 기본 모델 사용하기
- 기본 모델 중에 만족하는 것을 찾았다면 하이퍼 파라미터 튜닝 과정은 생략하세요

In [None]:
# KNN baseline with default hyper-parameters.
knn_basic_model = KNeighborsClassifier()
# Cross-validate with macro-averaged F1. The plain 'f1' scorer is binary-only and
# cannot score the multi-class Segment target, so every fold would come back as NaN.
r1 = cross_val_score(knn_basic_model, train_X, train_y, scoring='f1_macro', cv=kfold)
# Record the mean fold score.
f1_score_list.append(r1.mean())
# Record the matching model name.
model_name_list.append("KNN Basic")

In [13]:
# Logistic regression baseline with default hyper-parameters.
lr_basic_model = LogisticRegression()
# Cross-validate with macro-averaged F1. The plain 'f1' scorer is binary-only and
# cannot score the multi-class Segment target, so every fold would come back as NaN.
r1 = cross_val_score(lr_basic_model, train_X, train_y, scoring='f1_macro', cv=kfold)
# Record the mean fold score.
f1_score_list.append(r1.mean())
# Record the matching model name.
model_name_list.append("LogisticRegression Basic")

In [None]:
# SVC baseline; probability=True enables predict_proba on the fitted model.
svc_basic_model = SVC(probability=True)
# Cross-validate with macro-averaged F1. The plain 'f1' scorer is binary-only and
# cannot score the multi-class Segment target, so every fold would come back as NaN.
r1 = cross_val_score(svc_basic_model, train_X, train_y, scoring='f1_macro', cv=kfold)
# Record the mean fold score.
f1_score_list.append(r1.mean())
# Record the matching model name.
model_name_list.append("SVC Basic")

In [None]:
# Decision tree baseline with default hyper-parameters.
tree_basic_model = DecisionTreeClassifier()
# Cross-validate with macro-averaged F1. The plain 'f1' scorer is binary-only and
# cannot score the multi-class Segment target, so every fold would come back as NaN.
r1 = cross_val_score(tree_basic_model, train_X, train_y, scoring='f1_macro', cv=kfold)
# Record the mean fold score.
f1_score_list.append(r1.mean())
# Record the matching model name.
model_name_list.append("DecisionTree Basic")

In [None]:
# Random forest baseline with default hyper-parameters.
rf_basic_model = RandomForestClassifier()
# Cross-validate with macro-averaged F1. The plain 'f1' scorer is binary-only and
# cannot score the multi-class Segment target, so every fold would come back as NaN.
r1 = cross_val_score(rf_basic_model, train_X, train_y, scoring='f1_macro', cv=kfold)
# Record the mean fold score.
f1_score_list.append(r1.mean())
# Record the matching model name.
model_name_list.append("RandomForest Basic")

In [None]:
# AdaBoost baseline with default hyper-parameters.
ada_basic_model = AdaBoostClassifier()
# Cross-validate with macro-averaged F1. The plain 'f1' scorer is binary-only and
# cannot score the multi-class Segment target, so every fold would come back as NaN.
r1 = cross_val_score(ada_basic_model, train_X, train_y, scoring='f1_macro', cv=kfold)
# Record the mean fold score.
f1_score_list.append(r1.mean())
# Record the matching model name.
model_name_list.append("AdaBoost Basic")

In [None]:
# Gradient boosting baseline with default hyper-parameters.
gb_basic_model = GradientBoostingClassifier()
# Cross-validate with macro-averaged F1. The plain 'f1' scorer is binary-only and
# cannot score the multi-class Segment target, so every fold would come back as NaN.
r1 = cross_val_score(gb_basic_model, train_X, train_y, scoring='f1_macro', cv=kfold)
# Record the mean fold score.
f1_score_list.append(r1.mean())
# Record the matching model name.
model_name_list.append("GradientBoost Basic")

In [25]:
# LightGBM baseline: GPU training, gradient-boosted trees, 5-class objective,
# library logging silenced.
lgbm_basic_model = LGBMClassifier(
    device='gpu',
    boosting_type='gbdt',
    objective='multiclass',
    num_class=5,
    verbose=-1
)
# Cross-validate with macro-averaged F1. The plain 'f1' scorer is binary-only and
# cannot score this 5-class target, so every fold would come back as NaN.
r1 = cross_val_score(lgbm_basic_model, train_X, train_y, scoring='f1_macro', cv=kfold)
# Record the mean fold score.
f1_score_list.append(r1.mean())
# Record the matching model name.
model_name_list.append("LGBM Basic")

In [23]:
# XGBoost baseline trained on the GPU with library logging silenced.
# eval_metric is 'mlogloss' (multi-class log loss) because the target has more than
# two classes; 'logloss' is the binary variant.
# NOTE(review): 'gpu_hist' / 'gpu_predictor' / 'use_label_encoder' are kept as the
# notebook had them — confirm they are still accepted by the installed XGBoost version.
xgb_basic_model = XGBClassifier(
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    use_label_encoder=False,
    eval_metric='mlogloss',
    verbosity=0
)
# Cross-validate with macro-averaged F1. The plain 'f1' scorer is binary-only and
# cannot score the multi-class Segment target, so every fold would come back as NaN.
r1 = cross_val_score(xgb_basic_model, train_X, train_y, scoring='f1_macro', cv=kfold)
# Record the mean fold score.
f1_score_list.append(r1.mean())
# Record the matching model name.
model_name_list.append("XGBoost Basic")

In [None]:
# Gather the scores collected so far into one table, one row per model.
# (pandas is already imported as `pd` in the first cell, so it is not re-imported here.)
result_df = pd.DataFrame({
    'Model': model_name_list,
    'F1_Score': f1_score_list
})

# Write the table to a CSV file in the current working directory, without the index.
result_df.to_csv('model_f1_scores.csv', index=False)

In [None]:
# Hard-voting ensemble of all nine baseline classifiers (default hyper-parameters).
# XGBoost is silenced with verbosity=0, as in the XGBoost baseline cell; the
# `silent` / `verbose` keywords used before are not XGBClassifier options.
hard_voting_model_list = [
    ('model1', KNeighborsClassifier()),
    ('model2', LogisticRegression()),
    ('model3', SVC(probability=True)),
    ('model4', DecisionTreeClassifier()),
    ('model5', RandomForestClassifier()),
    ('model6', AdaBoostClassifier()),
    ('model7', GradientBoostingClassifier()),
    ('model8', LGBMClassifier(verbose=-1)),
    ('model9', XGBClassifier(verbosity=0)),
]

# voting='hard': the ensemble predicts the class chosen by most members.
hard_voting_basic_model = VotingClassifier(estimators=hard_voting_model_list, voting='hard')

# Cross-validate with macro-averaged F1. The plain 'f1' scorer is binary-only and
# cannot score the multi-class Segment target, so every fold would come back as NaN.
r1 = cross_val_score(hard_voting_basic_model, train_X, train_y, scoring='f1_macro', cv=kfold)
# Record the mean fold score.
f1_score_list.append(r1.mean())
# Record the matching model name.
model_name_list.append("HardVoting Basic")

In [None]:
# Soft-voting ensemble of all nine baseline classifiers (default hyper-parameters).
# SVC needs probability=True because soft voting relies on predict_proba.
# XGBoost is silenced with verbosity=0, as in the XGBoost baseline cell; the
# `silent` / `verbose` keywords used before are not XGBClassifier options.
soft_voting_model_list = [
    ('model1', KNeighborsClassifier()),
    ('model2', LogisticRegression()),
    ('model3', SVC(probability=True)),
    ('model4', DecisionTreeClassifier()),
    ('model5', RandomForestClassifier()),
    ('model6', AdaBoostClassifier()),
    ('model7', GradientBoostingClassifier()),
    ('model8', LGBMClassifier(verbose=-1)),
    ('model9', XGBClassifier(verbosity=0)),
]

# voting='soft': the ensemble averages the members' predicted class probabilities.
soft_voting_basic_model = VotingClassifier(estimators=soft_voting_model_list, voting='soft')

# Cross-validate with macro-averaged F1. The plain 'f1' scorer is binary-only and
# cannot score the multi-class Segment target, so every fold would come back as NaN.
r1 = cross_val_score(soft_voting_basic_model, train_X, train_y, scoring='f1_macro', cv=kfold)
# Record the mean fold score.
f1_score_list.append(r1.mean())
# Record the matching model name.
model_name_list.append("SoftVoting Basic")

In [12]:
# NOTE(review): this expression always raises ZeroDivisionError. Presumably it is a
# deliberate stop so that "Run All" halts before the hyper-parameter tuning cells
# below — confirm, and remove it once the stop is no longer wanted.
10/0

ZeroDivisionError: division by zero

### 하이퍼 파라미터 튜닝

In [None]:
# KNN: grid-search the number of neighbours from 1 to 10.
params = {
    'n_neighbors' : list(range(1, 11))
}

temp_model = KNeighborsClassifier()
# Macro-averaged F1 is used because the plain 'f1' scorer is binary-only and cannot
# score the multi-class Segment target (every candidate would score NaN).
knn_grid_clf = GridSearchCV(temp_model, param_grid=params, scoring='f1_macro', cv=kfold)
knn_grid_clf.fit(train_X, train_y)

# Record the best mean cross-validated score.
f1_score_list.append(knn_grid_clf.best_score_)
# Record the matching model name.
model_name_list.append("KNN Tuning")

In [None]:
# LogisticRegression: grid-search the penalty type and the inverse regularisation strength C.
# C must be strictly positive; the 0 in the original list is replaced by the missing 1.
c_values = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
# The solver is part of the grid so that best_params_ is self-contained:
#   - 'l1' and 'elasticnet' are not supported by the default 'lbfgs' solver; 'saga' supports them.
#   - 'elasticnet' additionally requires an l1_ratio (0.5 = equal L1/L2 mix).
#   - no penalty is spelled None; the string 'none' was removed in scikit-learn 1.4.
#     NOTE(review): on scikit-learn < 1.2 use 'none' instead — confirm the installed version.
#     C has no effect without a penalty, so it is not searched for that case.
params = [
    {'solver' : ['saga'], 'penalty' : ['l1', 'l2'], 'C' : c_values},
    {'solver' : ['saga'], 'penalty' : ['elasticnet'], 'l1_ratio' : [0.5], 'C' : c_values},
    {'solver' : ['lbfgs'], 'penalty' : [None]},
]

temp_model = LogisticRegression()
# Macro-averaged F1 is used because the plain 'f1' scorer is binary-only and cannot
# score the multi-class Segment target (every candidate would score NaN).
lr_grid_clf = GridSearchCV(temp_model, param_grid=params, scoring='f1_macro', cv=kfold)
lr_grid_clf.fit(train_X, train_y)

# Record the best mean cross-validated score.
f1_score_list.append(lr_grid_clf.best_score_)
# Record the matching model name.
model_name_list.append("LogisticRegression Tuning")

In [None]:
# SVC: grid-search the regularisation parameter C.
# C must be strictly positive; the 0 in the original list is replaced by the missing 1.
params = {
    'C' : [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
}

temp_model = SVC(probability=True)
# Macro-averaged F1 is used because the plain 'f1' scorer is binary-only and cannot
# score the multi-class Segment target (every candidate would score NaN).
svc_grid_clf = GridSearchCV(temp_model, param_grid=params, scoring='f1_macro', cv=kfold)
svc_grid_clf.fit(train_X, train_y)

# Record the best mean cross-validated score.
f1_score_list.append(svc_grid_clf.best_score_)
# Record the matching model name.
model_name_list.append("SVC Tuning")

In [None]:
# DecisionTree: grid-search the maximum depth (None = unlimited, then 1..10).
params = {
    'max_depth' : [None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
}

temp_model = DecisionTreeClassifier()
# Macro-averaged F1 is used because the plain 'f1' scorer is binary-only and cannot
# score the multi-class Segment target (every candidate would score NaN).
tree_grid_clf = GridSearchCV(temp_model, param_grid=params, scoring='f1_macro', cv=kfold)
tree_grid_clf.fit(train_X, train_y)

# Record the best mean cross-validated score.
f1_score_list.append(tree_grid_clf.best_score_)
# Record the matching model name.
model_name_list.append("DecisionTree Tuning")

In [None]:
# RandomForest: grid-search the number of trees.
params = {
    'n_estimators' : [50, 100, 150, 200, 250, 300]
}

temp_model = RandomForestClassifier()
# Macro-averaged F1 is used because the plain 'f1' scorer is binary-only and cannot
# score the multi-class Segment target (every candidate would score NaN).
rf_grid_clf = GridSearchCV(temp_model, param_grid=params, scoring='f1_macro', cv=kfold)
rf_grid_clf.fit(train_X, train_y)

# Record the best mean cross-validated score.
f1_score_list.append(rf_grid_clf.best_score_)
# Record the matching model name.
model_name_list.append("RandomForest Tuning")

In [None]:
# AdaBoost: grid-search the number of estimators and the learning rate.
# The original list also contained 0, which is dropped: AdaBoost requires a strictly
# positive learning_rate.
# NOTE(review): learning rates of 10 and above are unusual — confirm they are intended.
params = {
    'n_estimators' : [50, 100, 150, 200, 250, 300],
    'learning_rate' : [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
}

temp_model = AdaBoostClassifier()
# Macro-averaged F1 is used because the plain 'f1' scorer is binary-only and cannot
# score the multi-class Segment target (every candidate would score NaN).
ada_grid_clf = GridSearchCV(temp_model, param_grid=params, scoring='f1_macro', cv=kfold)
ada_grid_clf.fit(train_X, train_y)

# Record the best mean cross-validated score.
f1_score_list.append(ada_grid_clf.best_score_)
# Record the matching model name.
model_name_list.append("AdaBoost Tuning")

In [None]:
# GradientBoosting: grid-search the number of boosting stages and the learning rate.
# The original list also contained 0, which is dropped: a zero learning rate scales
# every tree's contribution to nothing, so such a model cannot learn.
# NOTE(review): learning rates of 10 and above are unusual — confirm they are intended.
params = {
    'n_estimators' : [50, 100, 150, 200, 250, 300],
    'learning_rate' : [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
}

temp_model = GradientBoostingClassifier()
# Macro-averaged F1 is used because the plain 'f1' scorer is binary-only and cannot
# score the multi-class Segment target (every candidate would score NaN).
gb_grid_clf = GridSearchCV(temp_model, param_grid=params, scoring='f1_macro', cv=kfold)
gb_grid_clf.fit(train_X, train_y)

# Record the best mean cross-validated score.
f1_score_list.append(gb_grid_clf.best_score_)
# Record the matching model name.
model_name_list.append("GradientBoosting Tuning")

In [None]:
# LightGBM: grid-search the number of boosting rounds and the learning rate.
# The original list also contained 0, which is dropped: LightGBM needs a positive
# learning_rate.
# NOTE(review): learning rates of 10 and above are unusual — confirm they are intended.
params = {
    'n_estimators' : [50, 100, 150, 200, 250, 300],
    'learning_rate' : [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
}

temp_model = LGBMClassifier(verbose=-1)
# Macro-averaged F1 is used because the plain 'f1' scorer is binary-only and cannot
# score the multi-class Segment target (every candidate would score NaN).
lgbm_grid_clf = GridSearchCV(temp_model, param_grid=params, scoring='f1_macro', cv=kfold)
lgbm_grid_clf.fit(train_X, train_y)

# Record the best mean cross-validated score.
f1_score_list.append(lgbm_grid_clf.best_score_)
# Record the matching model name.
model_name_list.append("LGBM Tuning")

In [None]:
# XGBoost: grid-search the booster type, number of boosting rounds and learning rate.
# The original list also contained 0, which is dropped: with a zero learning rate the
# boosting rounds contribute nothing.
# NOTE(review): learning rates of 10 and above are unusual — confirm they are intended.
params = {
    'booster' : ['gbtree', 'gblinear'],
    'n_estimators' : [50, 100, 150, 200, 250, 300],
    'learning_rate' : [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
}

# verbosity=0 silences XGBoost, as in the XGBoost baseline cell; the `silent` /
# `verbose` keywords used before are not XGBClassifier options.
temp_model = XGBClassifier(verbosity=0)
# Macro-averaged F1 is used because the plain 'f1' scorer is binary-only and cannot
# score the multi-class Segment target (every candidate would score NaN).
xgboost_grid_clf = GridSearchCV(temp_model, param_grid=params, scoring='f1_macro', cv=kfold)
xgboost_grid_clf.fit(train_X, train_y)

# Record the best mean cross-validated score.
f1_score_list.append(xgboost_grid_clf.best_score_)
# Record the matching model name.
model_name_list.append("XGBoost Tuning")

In [None]:
# Hard-voting ensemble whose members use the best hyper-parameters found by the
# grid searches above. set_params() returns the estimator itself, so each member is
# created and configured in a single expression.
# XGBoost is silenced with verbosity=0, as in the XGBoost baseline cell; the
# `silent` / `verbose` keywords used before are not XGBClassifier options.
hard_voting_model_list = [
    ('model1', KNeighborsClassifier().set_params(**knn_grid_clf.best_params_)),
    ('model2', LogisticRegression().set_params(**lr_grid_clf.best_params_)),
    ('model3', SVC(probability=True).set_params(**svc_grid_clf.best_params_)),
    ('model4', DecisionTreeClassifier().set_params(**tree_grid_clf.best_params_)),
    ('model5', RandomForestClassifier().set_params(**rf_grid_clf.best_params_)),
    ('model6', AdaBoostClassifier().set_params(**ada_grid_clf.best_params_)),
    ('model7', GradientBoostingClassifier().set_params(**gb_grid_clf.best_params_)),
    ('model8', LGBMClassifier(verbose=-1).set_params(**lgbm_grid_clf.best_params_)),
    ('model9', XGBClassifier(verbosity=0).set_params(**xgboost_grid_clf.best_params_)),
]

# voting='hard': the ensemble predicts the class chosen by most members.
hard_voting_tuning_model = VotingClassifier(estimators=hard_voting_model_list, voting='hard')

# Cross-validate with macro-averaged F1. The plain 'f1' scorer is binary-only and
# cannot score the multi-class Segment target, so every fold would come back as NaN.
r1 = cross_val_score(hard_voting_tuning_model, train_X, train_y, scoring='f1_macro', cv=kfold)
# Record the mean fold score.
f1_score_list.append(r1.mean())
# Record the matching model name.
model_name_list.append("HardVoting Tuning")

In [None]:
# Soft-voting ensemble whose members use the best hyper-parameters found by the
# grid searches above. set_params() returns the estimator itself, so each member is
# created and configured in a single expression.
# SVC needs probability=True because soft voting relies on predict_proba.
# XGBoost is silenced with verbosity=0, as in the XGBoost baseline cell; the
# `silent` / `verbose` keywords used before are not XGBClassifier options.
soft_voting_model_list = [
    ('model1', KNeighborsClassifier().set_params(**knn_grid_clf.best_params_)),
    ('model2', LogisticRegression().set_params(**lr_grid_clf.best_params_)),
    ('model3', SVC(probability=True).set_params(**svc_grid_clf.best_params_)),
    ('model4', DecisionTreeClassifier().set_params(**tree_grid_clf.best_params_)),
    ('model5', RandomForestClassifier().set_params(**rf_grid_clf.best_params_)),
    ('model6', AdaBoostClassifier().set_params(**ada_grid_clf.best_params_)),
    ('model7', GradientBoostingClassifier().set_params(**gb_grid_clf.best_params_)),
    ('model8', LGBMClassifier(verbose=-1).set_params(**lgbm_grid_clf.best_params_)),
    ('model9', XGBClassifier(verbosity=0).set_params(**xgboost_grid_clf.best_params_)),
]

# voting='soft': the ensemble averages the members' predicted class probabilities.
soft_voting_tuning_model = VotingClassifier(estimators=soft_voting_model_list, voting='soft')

# Cross-validate with macro-averaged F1. The plain 'f1' scorer is binary-only and
# cannot score the multi-class Segment target, so every fold would come back as NaN.
r1 = cross_val_score(soft_voting_tuning_model, train_X, train_y, scoring='f1_macro', cv=kfold)
# Record the mean fold score.
f1_score_list.append(r1.mean())
# Record the matching model name.
model_name_list.append("SoftVoting Tuning")

In [None]:
# Leaderboard of every evaluated model: one row per model name, ordered from the
# highest to the lowest F1 score.
result_df = (
    pd.DataFrame({'f1 score' : f1_score_list}, index=model_name_list)
    .sort_values(by='f1 score', ascending=False)
)
result_df

In [None]:
# Fit the tuned soft-voting ensemble on the full training set.
soft_voting_tuning_model.fit(train_X, train_y)
# Predicted class probabilities: one row per sample, one column per class,
# with columns ordered as in the model's `classes_` attribute.
proba_a1 = soft_voting_tuning_model.predict_proba(train_X)

# Plot one series per class. The target has more than two classes, so plotting only
# columns 0 and 1 (as a binary problem would) left the remaining classes out.
sample_idx = np.arange(len(proba_a1))
for col, cls in enumerate(soft_voting_tuning_model.classes_):
    plt.scatter(sample_idx, proba_a1[:, col], label=f'{cls}일 확률')
# Probabilities lie in [0, 1]; pad the axis slightly so boundary points stay visible.
plt.ylim(-0.1, 1.1)
plt.legend()
plt.show()

In [None]:
# Best LightGBM model found by the grid search (already refit on the training data).
lgbm_best_model = lgbm_grid_clf.best_estimator_
# Predicted class probabilities: one row per sample, one column per class,
# with columns ordered as in the model's `classes_` attribute.
proba_a1 = lgbm_best_model.predict_proba(train_X)

# Plot one series per class. The target has more than two classes, so plotting only
# columns 0 and 1 (as a binary problem would) left the remaining classes out.
sample_idx = np.arange(len(proba_a1))
for col, cls in enumerate(lgbm_best_model.classes_):
    plt.scatter(sample_idx, proba_a1[:, col], label=f'{cls}일 확률')
# Probabilities lie in [0, 1]; pad the axis slightly so boundary points stay visible.
plt.ylim(-0.1, 1.1)
plt.legend()
plt.show()

In [None]:
# Best gradient-boosting model found by the grid search (already refit on the training data).
gb_best_model = gb_grid_clf.best_estimator_
# Predicted class probabilities: one row per sample, one column per class,
# with columns ordered as in the model's `classes_` attribute.
proba_a1 = gb_best_model.predict_proba(train_X)

# Plot one series per class. The target has more than two classes, so plotting only
# columns 0 and 1 (as a binary problem would) left the remaining classes out.
sample_idx = np.arange(len(proba_a1))
for col, cls in enumerate(gb_best_model.classes_):
    plt.scatter(sample_idx, proba_a1[:, col], label=f'{cls}일 확률')
# Probabilities lie in [0, 1]; pad the axis slightly so boundary points stay visible.
plt.ylim(-0.1, 1.1)
plt.legend()
plt.show()

In [None]:
# Best AdaBoost model found by the grid search (already refit on the training data).
ada_best_model = ada_grid_clf.best_estimator_
# Predicted class probabilities: one row per sample, one column per class,
# with columns ordered as in the model's `classes_` attribute.
proba_a1 = ada_best_model.predict_proba(train_X)

# Plot one series per class. The target has more than two classes, so plotting only
# columns 0 and 1 (as a binary problem would) left the remaining classes out.
sample_idx = np.arange(len(proba_a1))
for col, cls in enumerate(ada_best_model.classes_):
    plt.scatter(sample_idx, proba_a1[:, col], label=f'{cls}일 확률')
# Probabilities lie in [0, 1]; pad the axis slightly so boundary points stay visible.
plt.ylim(-0.1, 1.1)
plt.legend()
plt.show()

In [None]:
# Build the final model from the best gradient-boosting hyper-parameters found by
# the grid search, then train it on the entire training set.
best_model = GradientBoostingClassifier(**gb_grid_clf.best_params_)
best_model.fit(train_X, train_y)
best_model

In [None]:
# Persist the final model together with the preprocessing objects this notebook created.
# The previous version pickled six encoder variables that are never defined in this
# notebook (NameError), wrote best_model twice, and did not save the label encoder.
# The objects are pickled one after another into a single file; read them back with
# successive pickle.load calls in the same order: best_model, le, scalerX.
with open(best_model_path, 'wb') as fp :
    # Trained classifier.
    pickle.dump(best_model, fp)
    # LabelEncoder holding the Segment label <-> integer mapping.
    pickle.dump(le, fp)
    # StandardScaler used to standardise the features.
    pickle.dump(scalerX, fp)

print('저장완료')