---

# stacking

In [None]:
from matplotlib import pyplot as plt
from matplotlib import rcParams
import numpy as np
import pandas as pd
import seaborn as sns
import math
import warnings
from tqdm import tqdm
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from scipy.stats import skew, kurtosis
import itertools
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from catboost import CatBoostClassifier, Pool
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from pathlib import Path
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

# 데이터셋 생성

In [None]:
# Load the train / test tables; the first CSV column becomes the index.
trn = pd.read_csv('C:/python/DACON_DATA/train.csv', index_col=0)
tst = pd.read_csv('C:/python/DACON_DATA/test.csv', index_col=0)

# Column groups reused by the feature-engineering steps below.
names = ['u', 'g', 'r', 'i', 'z']
names_2 = ['dered_u', 'dered_g', 'dered_r', 'dered_i', 'dered_z']
airmass = ['airmass_u', 'airmass_g', 'airmass_r', 'airmass_i', 'airmass_z']
fea = names + ['redshift'] + names_2
fea2 = fea + ['nObserve', 'nDetect']
all_fea = fea2 + airmass

# Outlier removal: keep only training rows whose value lies strictly inside
# the (min, max) range that the same column takes in the test set.
for col in fea:
    lower, upper = tst[col].min(), tst[col].max()
    trn = trn[(trn[col] > lower) & (trn[col] < upper)]
    
# Separate the response variable from the explanatory variables.
trn_target = trn['class']
trn = trn.drop(columns='class')

# Treat the observation / detection counts as continuous (float) values.
trn = trn.astype({'nObserve': 'float', 'nDetect': 'float'})
trnnO, trnnD = trn['nObserve'], trn['nDetect']

# Row-wise summary features (range, std, sum, max, min) for three column
# groups: every raw feature, the ugriz columns and the dered_* columns.
stat_groups = [('', all_fea), ('_ugriz', names), ('_dered', names_2)]
row_max = {sfx: trn[grp].max(axis=1) for sfx, grp in stat_groups}
row_min = {sfx: trn[grp].min(axis=1) for sfx, grp in stat_groups}
row_std = {sfx: trn[grp].std(axis=1) for sfx, grp in stat_groups}
row_sum = {sfx: trn[grp].sum(axis=1) for sfx, grp in stat_groups}

# max - min
for sfx, _ in stat_groups:
    trn['max-min' + sfx] = row_max[sfx] - row_min[sfx]
# std
for sfx, _ in stat_groups:
    trn['std' + sfx] = row_std[sfx]
# sum
for sfx, _ in stat_groups:
    trn['sum' + sfx] = row_sum[sfx]
# max
for sfx, _ in stat_groups:
    trn['max' + sfx] = row_max[sfx]
# min
for sfx, _ in stat_groups:
    trn['min' + sfx] = row_min[sfx]

# Differences between the ugriz statistic and the matching dered_* statistic.
trn['max-max'] = row_max['_ugriz'] - row_max['_dered']
trn['min-min'] = row_min['_ugriz'] - row_min['_dered']
trn['sum-sum'] = row_sum['_ugriz'] - row_sum['_dered']

# Row-wise skewness and kurtosis for the ugriz, dered_* and airmass_* groups.
for prefix, grp in [('', names), ('dered_', names_2), ('airmass_', airmass)]:
    trn[prefix + 'skew'] = skew(trn[grp], axis=1)
    trn[prefix + 'kurtosis'] = kurtosis(trn[grp], axis=1)


# Arithmetic features for every unordered pair of columns in `fea2`:
# difference, ratio, sum and product (the first column is always the left operand).
for c1, c2 in tqdm(itertools.combinations(fea2, 2)):
    left, right = trn[c1], trn[c2]
    trn[f'diff_{c1}_{c2}'] = left - right
    trn[f'div_{c1}_{c2}'] = left / right
    trn[f'sum_{c1}_{c2}'] = left + right
    trn[f'mul_{c1}_{c2}'] = left * right


# asinh-transformed variables, one per band.
# NOTE(review): the original comment says these are shown to four decimal
# places, but no rounding is applied here.
# band -> (first divisor, second divisor, offset subtracted from arcsinh)
asinh_consts = {
    'u': (24.63, 2.8e-10, 22.689378693319245),
    'g': (25.11, 1.8e-10, 23.131211445598282),
    'r': (24.80, 2.4e-10, 22.843529373146502),
    'i': (24.36, 3.6e-10, 22.43806426503834),
    'z': (22.83, 1.48e-09, 21.024370929730330),
}
asinh_factor = -2.5 / np.log(10)
for band, (div_a, div_b, offset) in asinh_consts.items():
    trn['asinh_m' + band] = asinh_factor * (np.arcsinh(trn[band] / div_a / div_b) - offset)

# Redshift modulo 14 and log(1 + redshift); NaN results of the log are set to 0.
trn['redshift%14'] = trn['redshift'] % 14
trn['log_redshift'] = np.log1p(trn['redshift']).fillna(0)

# Derived variables taken from domain knowledge.
# Source: https://www.sdss.org/dr16/algorithms/segue_target_selection/#Legacy
col_u, col_g, col_r, col_i = (trn[band] for band in ('u', 'g', 'r', 'i'))
trn['l-color'] = (-0.436 * col_u) + (1.129 * col_g) - (0.119 * col_r) - (0.574 * col_i) + 0.1984
trn['s-color'] = (-0.249 * col_u) + (0.794 * col_g) - (0.555 * col_r) + 0.234
trn['P1'] = (0.91 * (col_u - col_g)) + (0.415 * (col_g - col_r)) - 1.280

# Put the label column back now that every feature has been built.
trn['class'] = trn_target

# Feature removal: drop eight hand-listed batches of columns (PI30_1 .. PI30_8)
# from the training table, one batch at a time.
# NOTE(review): "PI" presumably stands for permutation importance - confirm.
# `columns.difference` keeps every column not named in the batch; per the pandas
# docs it also returns the remaining labels sorted - TODO confirm for the pandas
# version in use.
PI30_1=['dered_z','dered_g','div_dered_g_dered_z','sum_dered_i_nDetect','mul_dered_g_dered_z','sum_z_dered_i','diff_r_redshift','sum_u_redshift','sum_dered_r_nDetect','mul_g_dered_z','sum_dered_r_dered_z','div_r_nObserve','sum_i_redshift','diff_dered_g_dered_z','sum_dered_g_dered_r','sum_r_dered_z','diff_u_redshift','mul_u_dered_z','mul_dered_g_dered_r','mul_i_dered_r','div_i_dered_g','sum_r_redshift','div_u_dered_z','mul_r_z','div_g_z','diff_u_dered_z','mul_r_dered_g','sum_redshift_dered_z','u','div_redshift_dered_u']
trn=trn[trn.columns.difference(PI30_1)]
PI30_2=['mul_dered_g_nDetect','sum_r_nObserve','sum_i_dered_r','diff_z_nObserve','mul_nObserve_nDetect','asinh_mz','nObserve','asinh_mg','diff_g_dered_z','mul_g_dered_u','log_redshift','nDetect','mul_u_z','sum_r_nDetect','div_dered_i_nObserve','sum_i_z','sum_u_dered_r','dered_i','mul_g_dered_r','mul_z_dered_i','div_g_nDetect','diff_dered_g_nObserve','mul_r_dered_z','sum_i_nDetect','diff_z_dered_g','mul_g_z','sum_z_dered_z','mul_dered_u_nObserve','div_redshift_nObserve','dered_r']
trn=trn[trn.columns.difference(PI30_2)]
PI30_3=["mul_g_i","sum_redshift_dered_u","sum_g_i","sum_z_dered_g","sum_i_dered_z","mul_r_dered_r","div_g_i","mul_dered_r_dered_i","g","div_dered_i_nDetect","mul_dered_r_nObserve","sum_r_i","min_dered","mul_i_dered_z","div_dered_r_nObserve","diff_dered_z_nDetect","div_i_nDetect","sum_nObserve_nDetect","max_dered","mul_g_r","div_z_nObserve","sum_i_dered_u","mul_dered_r_nDetect","div_dered_g_nObserve","mul_r_i","diff_i_nDetect","sum_u_r","sum_dered_g_nDetect","div_u_nDetect","sum_u_dered_g"]
trn=trn[trn.columns.difference(PI30_3)]
PI30_4=["div_r_nDetect","mul_u_g","diff_dered_i_nDetect","mul_u_i","mul_dered_i_nObserve","div_i_dered_i","mul_i_nDetect","mul_g_dered_g","sum_u_dered_z","mul_r_dered_i","sum_g_dered_g","sum_u_g","mul_dered_u_dered_i","sum_g_nDetect","mul_z_nObserve","mul_g_nDetect","mul_dered_i_nDetect","sum_dered_g_dered_i","mul_u_dered_i","div_dered_z_nDetect","i","sum_g_z","sum_u_dered_u","sum_g_r","sum_dered","mul_u_nDetect","mul_i_dered_i","mul_dered_g_nObserve","div_dered_g_nDetect","div_u_i"]
trn=trn[trn.columns.difference(PI30_4)]
PI30_5=["asinh_mr","div_dered_r_nDetect","mul_z_dered_r","mul_g_nObserve","diff_u_nDetect","sum_g_dered_u","sum_redshift_dered_i","div_i_nObserve","div_g_nObserve","z","mul_dered_z_nObserve","sum_dered_r_nObserve","sum_z_dered_r","sum_u_dered_i","mul_u_dered_g","mul_dered_u_dered_r","diff_z_nDetect","sum_r_dered_r","sum_dered_u_dered_g","asinh_mi","diff_i_redshift","diff_r_nObserve","mul_i_nObserve","diff_dered_r_nObserve","sum_i_dered_i","sum_r_dered_i","diff_dered_r_nDetect","sum_u_nObserve","div_dered_u_nDetect","sum_z_dered_u"]
trn=trn[trn.columns.difference(PI30_5)]
PI30_6=["sum_dered_i_nObserve","diff_nObserve_nDetect","mul_dered_u_dered_z","div_u_nObserve","diff_u_nObserve","diff_redshift_nDetect","mul_u_nObserve","diff_dered_g_nDetect","sum_u_z","max_ugriz","sum_g_dered_r","mul_r_nObserve","div_z_nDetect","max","mul_dered_u_nDetect","sum_i_nObserve","diff_g_nDetect","sum_dered_z_nDetect","mul_z_nDetect","mul_i_z","diff_r_nDetect","diff_g_nObserve","div_g_dered_i","mul_dered_z_nDetect","mul_i_dered_u","diff_dered_u_nObserve","div_dered_z_nObserve","mul_u_r","diff_i_nObserve","sum_ugriz"]
trn=trn[trn.columns.difference(PI30_6)]
PI30_7=["dered_u","mul_i_dered_g","mul_r_dered_u","mul_r_nDetect","sum_dered_g_nObserve","sum-sum","diff_dered_u_dered_z","diff_dered_u_nDetect","mul_dered_r_dered_z","sum_dered_u_nObserve","diff_dered_i_nObserve","diff_g_z","diff_z_redshift","diff_i_dered_g","mul_z_dered_z","min_ugriz","diff_dered_z_nObserve","mul_g_dered_i","asinh_mu","sum_dered_u_dered_r","diff_u_z","sum_u_nDetect","std_ugriz","sum_dered_r_dered_i","sum_dered_z_nObserve","mul_dered_u_dered_g","sum_r_dered_g","mul_u_dered_r","max-min_ugriz","diff_redshift_dered_r"]
trn=trn[trn.columns.difference(PI30_7)]
PI30_8=["div_i_redshift","sum_g_redshift","div_r_dered_r","mul_z_dered_g","sum_redshift_nDetect","diff_u_dered_u","diff_i_dered_i","diff_i_dered_u","sum_g_dered_i","diff_u_dered_i","div_g_redshift","sum_u_i","div_z_dered_r","sum_i_dered_g","sum_redshift_dered_r","sum_z_nObserve","sum_dered_u_nDetect","diff_redshift_nObserve","sum_dered_u_dered_z","div_g_dered_z","sum_z_nDetect","diff_redshift_dered_z","div_dered_g_dered_i","div_dered_u_nObserve","mul_u_dered_u","sum_r_z","r","max-min_dered","mul_dered_g_dered_i","sum_g_nObserve"]
trn=trn[trn.columns.difference(PI30_8)]

In [None]:
# Column groups, redefined here so this cell can run on its own.
names = ['u', 'g', 'r', 'i', 'z']
names_2 = ['dered_u', 'dered_g', 'dered_r', 'dered_i', 'dered_z']
airmass = ['airmass_u', 'airmass_g', 'airmass_r', 'airmass_i', 'airmass_z']
fea = names + ['redshift'] + names_2
fea2 = fea + ['nObserve', 'nDetect']
all_fea = fea2 + airmass

# Unlike the training table, the test rows are not range-filtered and no
# `class` column is split off (presumably the test file carries no label).

# Treat the observation / detection counts as continuous (float) values.
tst = tst.astype({'nObserve': 'float', 'nDetect': 'float'})
tstnO, tstnD = tst['nObserve'], tst['nDetect']

# Row-wise summary features (range, std, sum, max, min) for three column
# groups: every raw feature, the ugriz columns and the dered_* columns.
stat_groups = [('', all_fea), ('_ugriz', names), ('_dered', names_2)]
row_max = {sfx: tst[grp].max(axis=1) for sfx, grp in stat_groups}
row_min = {sfx: tst[grp].min(axis=1) for sfx, grp in stat_groups}
row_std = {sfx: tst[grp].std(axis=1) for sfx, grp in stat_groups}
row_sum = {sfx: tst[grp].sum(axis=1) for sfx, grp in stat_groups}

# max - min
for sfx, _ in stat_groups:
    tst['max-min' + sfx] = row_max[sfx] - row_min[sfx]
# std
for sfx, _ in stat_groups:
    tst['std' + sfx] = row_std[sfx]
# sum
for sfx, _ in stat_groups:
    tst['sum' + sfx] = row_sum[sfx]
# max
for sfx, _ in stat_groups:
    tst['max' + sfx] = row_max[sfx]
# min
for sfx, _ in stat_groups:
    tst['min' + sfx] = row_min[sfx]

# Differences between the ugriz statistic and the matching dered_* statistic.
tst['max-max'] = row_max['_ugriz'] - row_max['_dered']
tst['min-min'] = row_min['_ugriz'] - row_min['_dered']
tst['sum-sum'] = row_sum['_ugriz'] - row_sum['_dered']

# Row-wise skewness and kurtosis for the ugriz, dered_* and airmass_* groups.
for prefix, grp in [('', names), ('dered_', names_2), ('airmass_', airmass)]:
    tst[prefix + 'skew'] = skew(tst[grp], axis=1)
    tst[prefix + 'kurtosis'] = kurtosis(tst[grp], axis=1)


# Arithmetic features for every unordered pair of columns in `fea2`:
# difference, ratio, sum and product (the first column is always the left operand).
for c1, c2 in tqdm(itertools.combinations(fea2, 2)):
    left, right = tst[c1], tst[c2]
    tst[f'diff_{c1}_{c2}'] = left - right
    tst[f'div_{c1}_{c2}'] = left / right
    tst[f'sum_{c1}_{c2}'] = left + right
    tst[f'mul_{c1}_{c2}'] = left * right


# asinh-transformed variables, one per band.
# NOTE(review): the original comment says these are shown to four decimal
# places, but no rounding is applied here.
# band -> (first divisor, second divisor, offset subtracted from arcsinh)
asinh_consts = {
    'u': (24.63, 2.8e-10, 22.689378693319245),
    'g': (25.11, 1.8e-10, 23.131211445598282),
    'r': (24.80, 2.4e-10, 22.843529373146502),
    'i': (24.36, 3.6e-10, 22.43806426503834),
    'z': (22.83, 1.48e-09, 21.024370929730330),
}
asinh_factor = -2.5 / np.log(10)
for band, (div_a, div_b, offset) in asinh_consts.items():
    tst['asinh_m' + band] = asinh_factor * (np.arcsinh(tst[band] / div_a / div_b) - offset)

# Redshift modulo 14 and log(1 + redshift); NaN results of the log are set to 0.
tst['redshift%14'] = tst['redshift'] % 14
tst['log_redshift'] = np.log1p(tst['redshift']).fillna(0)

# Derived variables taken from domain knowledge.
# Source: https://www.sdss.org/dr16/algorithms/segue_target_selection/#Legacy
col_u, col_g, col_r, col_i = (tst[band] for band in ('u', 'g', 'r', 'i'))
tst['l-color'] = (-0.436 * col_u) + (1.129 * col_g) - (0.119 * col_r) - (0.574 * col_i) + 0.1984
tst['s-color'] = (-0.249 * col_u) + (0.794 * col_g) - (0.555 * col_r) + 0.234
tst['P1'] = (0.91 * (col_u - col_g)) + (0.415 * (col_g - col_r)) - 1.280

# Feature removal: drop eight hand-listed batches of columns (PI30_1 .. PI30_8)
# from the test table, one batch at a time.
# NOTE(review): "PI" presumably stands for permutation importance - confirm.
# `columns.difference` keeps every column not named in the batch; per the pandas
# docs it also returns the remaining labels sorted - TODO confirm for the pandas
# version in use.
PI30_1=['dered_z','dered_g','div_dered_g_dered_z','sum_dered_i_nDetect','mul_dered_g_dered_z','sum_z_dered_i','diff_r_redshift','sum_u_redshift','sum_dered_r_nDetect','mul_g_dered_z','sum_dered_r_dered_z','div_r_nObserve','sum_i_redshift','diff_dered_g_dered_z','sum_dered_g_dered_r','sum_r_dered_z','diff_u_redshift','mul_u_dered_z','mul_dered_g_dered_r','mul_i_dered_r','div_i_dered_g','sum_r_redshift','div_u_dered_z','mul_r_z','div_g_z','diff_u_dered_z','mul_r_dered_g','sum_redshift_dered_z','u','div_redshift_dered_u']
tst=tst[tst.columns.difference(PI30_1)]
PI30_2=['mul_dered_g_nDetect','sum_r_nObserve','sum_i_dered_r','diff_z_nObserve','mul_nObserve_nDetect','asinh_mz','nObserve','asinh_mg','diff_g_dered_z','mul_g_dered_u','log_redshift','nDetect','mul_u_z','sum_r_nDetect','div_dered_i_nObserve','sum_i_z','sum_u_dered_r','dered_i','mul_g_dered_r','mul_z_dered_i','div_g_nDetect','diff_dered_g_nObserve','mul_r_dered_z','sum_i_nDetect','diff_z_dered_g','mul_g_z','sum_z_dered_z','mul_dered_u_nObserve','div_redshift_nObserve','dered_r']
tst=tst[tst.columns.difference(PI30_2)]
PI30_3=["mul_g_i","sum_redshift_dered_u","sum_g_i","sum_z_dered_g","sum_i_dered_z","mul_r_dered_r","div_g_i","mul_dered_r_dered_i","g","div_dered_i_nDetect","mul_dered_r_nObserve","sum_r_i","min_dered","mul_i_dered_z","div_dered_r_nObserve","diff_dered_z_nDetect","div_i_nDetect","sum_nObserve_nDetect","max_dered","mul_g_r","div_z_nObserve","sum_i_dered_u","mul_dered_r_nDetect","div_dered_g_nObserve","mul_r_i","diff_i_nDetect","sum_u_r","sum_dered_g_nDetect","div_u_nDetect","sum_u_dered_g"]
tst=tst[tst.columns.difference(PI30_3)]
PI30_4=["div_r_nDetect","mul_u_g","diff_dered_i_nDetect","mul_u_i","mul_dered_i_nObserve","div_i_dered_i","mul_i_nDetect","mul_g_dered_g","sum_u_dered_z","mul_r_dered_i","sum_g_dered_g","sum_u_g","mul_dered_u_dered_i","sum_g_nDetect","mul_z_nObserve","mul_g_nDetect","mul_dered_i_nDetect","sum_dered_g_dered_i","mul_u_dered_i","div_dered_z_nDetect","i","sum_g_z","sum_u_dered_u","sum_g_r","sum_dered","mul_u_nDetect","mul_i_dered_i","mul_dered_g_nObserve","div_dered_g_nDetect","div_u_i"]
tst=tst[tst.columns.difference(PI30_4)]
PI30_5=["asinh_mr","div_dered_r_nDetect","mul_z_dered_r","mul_g_nObserve","diff_u_nDetect","sum_g_dered_u","sum_redshift_dered_i","div_i_nObserve","div_g_nObserve","z","mul_dered_z_nObserve","sum_dered_r_nObserve","sum_z_dered_r","sum_u_dered_i","mul_u_dered_g","mul_dered_u_dered_r","diff_z_nDetect","sum_r_dered_r","sum_dered_u_dered_g","asinh_mi","diff_i_redshift","diff_r_nObserve","mul_i_nObserve","diff_dered_r_nObserve","sum_i_dered_i","sum_r_dered_i","diff_dered_r_nDetect","sum_u_nObserve","div_dered_u_nDetect","sum_z_dered_u"]
tst=tst[tst.columns.difference(PI30_5)]
PI30_6=["sum_dered_i_nObserve","diff_nObserve_nDetect","mul_dered_u_dered_z","div_u_nObserve","diff_u_nObserve","diff_redshift_nDetect","mul_u_nObserve","diff_dered_g_nDetect","sum_u_z","max_ugriz","sum_g_dered_r","mul_r_nObserve","div_z_nDetect","max","mul_dered_u_nDetect","sum_i_nObserve","diff_g_nDetect","sum_dered_z_nDetect","mul_z_nDetect","mul_i_z","diff_r_nDetect","diff_g_nObserve","div_g_dered_i","mul_dered_z_nDetect","mul_i_dered_u","diff_dered_u_nObserve","div_dered_z_nObserve","mul_u_r","diff_i_nObserve","sum_ugriz"]
tst=tst[tst.columns.difference(PI30_6)]
PI30_7=["dered_u","mul_i_dered_g","mul_r_dered_u","mul_r_nDetect","sum_dered_g_nObserve","sum-sum","diff_dered_u_dered_z","diff_dered_u_nDetect","mul_dered_r_dered_z","sum_dered_u_nObserve","diff_dered_i_nObserve","diff_g_z","diff_z_redshift","diff_i_dered_g","mul_z_dered_z","min_ugriz","diff_dered_z_nObserve","mul_g_dered_i","asinh_mu","sum_dered_u_dered_r","diff_u_z","sum_u_nDetect","std_ugriz","sum_dered_r_dered_i","sum_dered_z_nObserve","mul_dered_u_dered_g","sum_r_dered_g","mul_u_dered_r","max-min_ugriz","diff_redshift_dered_r"]
tst=tst[tst.columns.difference(PI30_7)]
PI30_8=["div_i_redshift","sum_g_redshift","div_r_dered_r","mul_z_dered_g","sum_redshift_nDetect","diff_u_dered_u","diff_i_dered_i","diff_i_dered_u","sum_g_dered_i","diff_u_dered_i","div_g_redshift","sum_u_i","div_z_dered_r","sum_i_dered_g","sum_redshift_dered_r","sum_z_nObserve","sum_dered_u_nDetect","diff_redshift_nObserve","sum_dered_u_dered_z","div_g_dered_z","sum_z_nDetect","diff_redshift_dered_z","div_dered_g_dered_i","div_dered_u_nObserve","mul_u_dered_u","sum_r_z","r","max-min_dered","mul_dered_g_dered_i","sum_g_nObserve"]
tst=tst[tst.columns.difference(PI30_8)]

---

# Base learner modeling
- 1) Light GBM
- 2) XGBoost
- 3) Random Forest
- 4) Multi Layer Perceptron
- 5) Naive Bayes
- 6) Support Vector Machine
- 7) Hist GBM
- 8) Cat Boost
- 9) GBM
- 10) Extra Trees

In [None]:
# Shared cross-validation settings for every base learner.
n_fold = 5    # number of CV folds; also the divisor when averaging test predictions
n_class = 3   # width of the per-row class-probability arrays
seed = 42     # random seed shared by the splitter and the models
# Build the splitter from n_fold so the fold count and the `/ n_fold` averaging
# in the training loops cannot drift apart.
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

### 1) Light GBM

In [None]:
# NumPy views of the full feature table: training features, labels, test features.
ftr = trn.drop(columns="class").values
target = trn['class'].values
tst_ar = tst.values

In [None]:
# LightGBM base learner.
# lgb_p_val collects the class probabilities predicted for each training row by
# the fold model that did not train on it; lgb_p_tst is the mean of the test
# probabilities over the n_fold fold models.
lgb_p_val = np.zeros((ftr.shape[0], n_class))
lgb_p_tst = np.zeros((tst_ar.shape[0], n_class))
for i, (i_trn, i_val) in enumerate(cv.split(ftr, target), 1):
    print(f'training model for CV #{i}')
    lgb_clf = lgb.LGBMClassifier(objective='multiclass',
                                 n_estimators=600,
                                 boosting_type='dart',
                                 num_leaves=100,
                                 learning_rate=0.1,
                                 max_depth=30,
                                 feature_fraction=0.8,
                                 random_state=seed,
                                 n_jobs=-1)
    lgb_clf.fit(ftr[i_trn], target[i_trn],
                eval_set=[(ftr[i_val], target[i_val])],
                eval_metric='multi_error')

    lgb_p_val[i_val, :] = lgb_clf.predict_proba(ftr[i_val])
    lgb_p_tst += lgb_clf.predict_proba(tst_ar) / n_fold
print(f'{accuracy_score(target, np.argmax(lgb_p_val, axis=1)) * 100:.4f}%')
# The matrix is printed on its own: it is a table of counts, so no '%' suffix.
print(confusion_matrix(target, np.argmax(lgb_p_val, axis=1)))

In [None]:
# Save the LightGBM validation / test probabilities as CSV (six decimals).
tst_dir = Path('C:/Users/ATIV/python/stacking/tst')
val_dir = Path('C:/Users/ATIV/python/stacking/val')
# Create the output folders first so a missing directory cannot make
# np.savetxt fail after the model has already been trained.
for out_dir in (val_dir, tst_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

algo_name = 'lgb'
feature_name = '124'
model_name = f'{algo_name}_{feature_name}'

p_val_file = val_dir / f'{model_name}.val.csv'
p_tst_file = tst_dir / f'{model_name}.tst.csv'

np.savetxt(p_val_file, lgb_p_val, fmt='%.6f', delimiter=',')
np.savetxt(p_tst_file, lgb_p_tst, fmt='%.6f', delimiter=',')

### 2) XGBoost

In [None]:
# NumPy views of the full feature table: training features, labels, test features.
ftr = trn.drop(columns="class").values
target = trn['class'].values
tst_ar = tst.values

In [None]:
# XGBoost base learner.
# xgb_p_val collects the class probabilities predicted for each training row by
# the fold model that did not train on it; xgb_p_tst is the mean of the test
# probabilities over the n_fold fold models.
xgb_p_val = np.zeros((ftr.shape[0], n_class))
xgb_p_tst = np.zeros((tst_ar.shape[0], n_class))
for i, (i_trn, i_val) in enumerate(cv.split(ftr, target), 1):
    print(f'training model for CV #{i}')
    xgb_clf = xgb.XGBClassifier(learning_rate=0.1,
                                n_estimators=220,
                                max_depth=10,
                                # XGBoost's name for per-tree column subsampling;
                                # `feature_fraction` is the LightGBM spelling and
                                # is not an XGBoost parameter.
                                colsample_bytree=0.8,
                                booster='dart',
                                random_state=seed,
                                tree_method='exact',
                                # 'multiclass' is a LightGBM objective name; the
                                # XGBoost objective that yields class
                                # probabilities is 'multi:softprob'.
                                objective='multi:softprob',
                                num_class=n_class,
                                n_jobs=-1)
    xgb_clf.fit(ftr[i_trn], target[i_trn],
                eval_set=[(ftr[i_val], target[i_val])])

    xgb_p_val[i_val, :] = xgb_clf.predict_proba(ftr[i_val])
    xgb_p_tst += xgb_clf.predict_proba(tst_ar) / n_fold
print(f'{accuracy_score(target, np.argmax(xgb_p_val, axis=1)) * 100:.4f}%')
# The matrix is printed on its own: it is a table of counts, so no '%' suffix.
print(confusion_matrix(target, np.argmax(xgb_p_val, axis=1)))

In [None]:
# Save the XGBoost validation / test probabilities as CSV (six decimals).
tst_dir = Path('C:/Users/ATIV/python/stacking/tst')
val_dir = Path('C:/Users/ATIV/python/stacking/val')
# Create the output folders first so a missing directory cannot make
# np.savetxt fail after the model has already been trained.
for out_dir in (val_dir, tst_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

algo_name = 'xgb'
feature_name = '124'
model_name = f'{algo_name}_{feature_name}'

p_val_file = val_dir / f'{model_name}.val.csv'
p_tst_file = tst_dir / f'{model_name}.tst.csv'

np.savetxt(p_val_file, xgb_p_val, fmt='%.6f', delimiter=',')
np.savetxt(p_tst_file, xgb_p_tst, fmt='%.6f', delimiter=',')

### 3) Random Forest

In [None]:
# NumPy views of the full feature table: training features, labels, test features.
ftr = trn.drop(columns="class").values
target = trn['class'].values
tst_ar = tst.values

In [None]:
# Random Forest base learner.
# rf_p_val collects the class probabilities predicted for each training row by
# the fold model that did not train on it; rf_p_tst is the mean of the test
# probabilities over the n_fold fold models.
rf_p_val = np.zeros((ftr.shape[0], n_class))
rf_p_tst = np.zeros((tst_ar.shape[0], n_class))
for i, (i_trn, i_val) in enumerate(cv.split(ftr, target), 1):
    print(f'training model for CV #{i}')
    rf_clf = RandomForestClassifier(n_estimators=200,
                                    random_state=seed,
                                    verbose=True,
                                    oob_score=True,  # computed by fit but not read in this cell
                                    n_jobs=-1,
                                    max_depth=25)
    rf_clf.fit(ftr[i_trn], target[i_trn])
    rf_p_val[i_val, :] = rf_clf.predict_proba(ftr[i_val])
    rf_p_tst += rf_clf.predict_proba(tst_ar) / n_fold
print(f'{accuracy_score(target, np.argmax(rf_p_val, axis=1)) * 100:.4f}%')
# The matrix is printed on its own: it is a table of counts, so no '%' suffix.
print(confusion_matrix(target, np.argmax(rf_p_val, axis=1)))

In [None]:
# Save the Random Forest validation / test probabilities as CSV (six decimals).
tst_dir = Path('C:/Users/ATIV/python/stacking/tst')
val_dir = Path('C:/Users/ATIV/python/stacking/val')
# Create the output folders first so a missing directory cannot make
# np.savetxt fail after the model has already been trained.
for out_dir in (val_dir, tst_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

algo_name = 'rf'
feature_name = '124'
model_name = f'{algo_name}_{feature_name}'

p_val_file = val_dir / f'{model_name}.val.csv'
p_tst_file = tst_dir / f'{model_name}.tst.csv'

np.savetxt(p_val_file, rf_p_val, fmt='%.6f', delimiter=',')
np.savetxt(p_tst_file, rf_p_tst, fmt='%.6f', delimiter=',')

### 4) Multi Layer Perceptron

In [None]:
#상관계수 높은 변수 제거
corr = trn.loc[trn['class']!=0,:].corr()
cor_list=[]
for i in range(0,trn.loc[trn['class']!=0,:].shape[1]):
    for j in range(0,trn.loc[trn['class']!=0,:].shape[1]):
        if abs(corr.iloc[i,j])>=0.9 and corr.index[i]!=corr.columns[j]:
            cor_list += [corr.columns[j]]
            cor_list += [corr.index[i]]
groupby=cor_list
cor_result=dict()
for ip in tqdm(cor_list):
    cor_result[ip]=cor_list.count(ip)
final_corr= []
for i in range(0,len(cor_result)):
    if list(cor_result.values())[i] > 20 :
        final_corr.append(list(cor_result.keys())[i])
print(cor_result)
print()
print(final_corr)
cols = list(trn.columns[~trn.columns.isin(list(cor_result.keys()))])
for i in ['diff_u_dered_g','P1','diff_u_dered_r','airmass_i','skew','kurtosis','diff_g_i','sum_dered_u_dered_i',
          'sum_r_dered_u','diff_redshift_dered_u','sum_z_redshift', 'mul_redshift_dered_i','mul_redshift_nObserve']:
    cols.append(i)
print(cols)

In [None]:
# Restrict both tables to the selected columns; "class" is taken out of `cols`
# in place before indexing the test table.
trn_mlp = trn.loc[:, cols].copy()
cols.remove("class")
tst_mlp = tst.loc[:, cols].copy()
target = trn_mlp['class']
ftr = trn_mlp.drop(columns="class")

In [None]:
# Convert the MLP frames to plain NumPy arrays.
ftr, tst_ar, target = ftr.values, tst_mlp.values, target.values

In [None]:
# Standardise the features: the scaler is fitted on the training matrix and
# the same transform is then applied to the test matrix.
scaler = StandardScaler()
ftr = scaler.fit_transform(ftr)
tst_ar = scaler.transform(tst_ar)

In [None]:
# Multi-layer perceptron base learner: out-of-fold probabilities for the
# training rows and fold-averaged probabilities for the test rows.
mlp_params = dict(
    hidden_layer_sizes=(30, 30, 10),
    max_iter=10000,
    learning_rate_init=0.001,
    activation='relu',
    verbose=10,
    n_iter_no_change=20,
    random_state=seed,
)
mlp_p_val = np.zeros((ftr.shape[0], n_class))
mlp_p_tst = np.zeros((tst_ar.shape[0], n_class))
for i, (i_trn, i_val) in enumerate(cv.split(ftr, target), 1):
    print(f'training model for CV #{i}')
    mlp_clf = MLPClassifier(**mlp_params)
    mlp_clf.fit(ftr[i_trn], target[i_trn])

    fold_val_proba = mlp_clf.predict_proba(ftr[i_val])
    fold_tst_proba = mlp_clf.predict_proba(tst_ar)
    mlp_p_val[i_val, :] = fold_val_proba
    mlp_p_tst += fold_tst_proba / n_fold

mlp_oof_pred = np.argmax(mlp_p_val, axis=1)
print(f'{accuracy_score(target, mlp_oof_pred) * 100:.4f}%')
print(confusion_matrix(target, mlp_oof_pred))

In [None]:
# Save the MLP validation / test probabilities as CSV (six decimals).
tst_dir = Path('C:/Users/ATIV/python/stacking/tst')
val_dir = Path('C:/Users/ATIV/python/stacking/val')
# Create the output folders first so a missing directory cannot make
# np.savetxt fail after the model has already been trained.
for out_dir in (val_dir, tst_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

algo_name = 'mlp'
feature_name = '41_scale'
model_name = f'{algo_name}_{feature_name}'

p_val_file = val_dir / f'{model_name}.val.csv'
p_tst_file = tst_dir / f'{model_name}.tst.csv'

np.savetxt(p_val_file, mlp_p_val, fmt='%.6f', delimiter=',')
np.savetxt(p_tst_file, mlp_p_tst, fmt='%.6f', delimiter=',')

### 5) Naive Bayes

In [None]:
# Rebuild the arrays from the full feature table before scaling. Without this,
# `ftr` / `tst_ar` are whatever the previous section left behind: the MLP's
# reduced column subset, already standardised once. The results of this section
# are saved under the '124_scale' label, i.e. the full feature set.
ftr = trn.drop("class", axis=1).values
target = trn['class'].values
tst_ar = tst.values

# Standardise with statistics fitted on the training matrix only.
scaler = StandardScaler()
scaler.fit(ftr)
ftr_scale = scaler.transform(ftr)
tst_ar_scale = scaler.transform(tst_ar)

In [None]:
# Gaussian Naive Bayes base learner on the standardised features: out-of-fold
# probabilities for the training rows, fold-averaged probabilities for test.
n_trn_rows, n_tst_rows = ftr_scale.shape[0], tst_ar_scale.shape[0]
nb_p_val = np.zeros((n_trn_rows, n_class))
nb_p_tst = np.zeros((n_tst_rows, n_class))
for i, (i_trn, i_val) in enumerate(cv.split(ftr_scale, target), 1):
    print(f'training model for CV #{i}')
    nb_clf = GaussianNB().fit(ftr_scale[i_trn], target[i_trn])

    nb_p_val[i_val, :] = nb_clf.predict_proba(ftr_scale[i_val])
    nb_p_tst += nb_clf.predict_proba(tst_ar_scale) / n_fold

nb_oof_pred = np.argmax(nb_p_val, axis=1)
print(f'{accuracy_score(target, nb_oof_pred) * 100:.4f}%')
print(confusion_matrix(target, nb_oof_pred))

In [None]:
# Save the Naive Bayes validation / test probabilities as CSV (six decimals).
tst_dir = Path('C:/Users/ATIV/python/stacking/tst')
val_dir = Path('C:/Users/ATIV/python/stacking/val')
# Create the output folders first so a missing directory cannot make
# np.savetxt fail after the model has already been trained.
for out_dir in (val_dir, tst_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

algo_name = 'nb'
feature_name = '124_scale'
model_name = f'{algo_name}_{feature_name}'

p_val_file = val_dir / f'{model_name}.val.csv'
p_tst_file = tst_dir / f'{model_name}.tst.csv'

np.savetxt(p_val_file, nb_p_val, fmt='%.6f', delimiter=',')
np.savetxt(p_tst_file, nb_p_tst, fmt='%.6f', delimiter=',')

### 6) Support Vector Machine

In [None]:
from collections import Counter

# Fifth filtering pass: columns added back after the correlation filter below.
clist5 = ['diff_u_i','mul_dered_i_dered_z','div_redshift_dered_r','mul_u_redshift']
# The correlation matrix is computed on the rows whose class is not 0.
corr = trn.loc[trn['class']!=0,:].corr()
abs_corr = corr.abs().values
# Loop over the correlation matrix's own size so the indices stay in range
# even if `corr` ends up with fewer columns than `trn`.
n_corr = corr.shape[0]

cor_list=[]

for i in range(n_corr):
    for j in range(n_corr):
        # Every off-diagonal pair with |r| >= 0.9 contributes both column names.
        if abs_corr[i, j] >= 0.9 and corr.index[i] != corr.columns[j]:
            cor_list += [corr.columns[j], corr.index[i]]

groupby=cor_list
# Occurrences per column name, tallied in a single pass (keys keep
# first-appearance order).
cor_result = dict(Counter(cor_list))
print(cor_result)

# Keep the columns that never appear in a highly correlated pair ...
cols = [c for c in trn.columns if c not in cor_result]

# ... then add back the hand-picked ones.
cols.extend(clist5)

print(len(cols))

In [None]:
# Restrict both tables to the selected columns; "class" is taken out of `cols`
# in place before indexing the test table.
trn_svm = trn.loc[:, cols].copy()
cols.remove("class")
tst_svm = tst.loc[:, cols].copy()
target = trn_svm['class']
ftr = trn_svm.drop(columns="class")

In [None]:
# Convert to NumPy arrays and standardise: the scaler is fitted on the
# training matrix and the same transform is applied to the test matrix.
target = target.values
scaler = StandardScaler()
ftr = scaler.fit_transform(ftr.values)
tst_ar = scaler.transform(tst_svm.values)

In [None]:
# Bagged SVM base learner: out-of-fold probabilities for the training rows and
# fold-averaged probabilities for the test rows.
svm_p_val = np.zeros((ftr.shape[0], n_class))
svm_p_tst = np.zeros((tst_ar.shape[0], n_class))
for i, (i_trn, i_val) in enumerate(cv.split(ftr, target), 1):
    print(f'training model for CV #{i}')
    # NOTE(review): SVC is created without probability=True, so the bagging
    # ensemble's predict_proba presumably falls back to the share of votes per
    # class across its 10 members - confirm against the BaggingClassifier docs.
    estimator = SVC(verbose=True)
    svm_clf = BaggingClassifier(base_estimator=estimator,
                                n_estimators=10,
                                # Seed the bootstrap sampling so reruns give the
                                # same predictions, like the other base learners.
                                random_state=seed,
                                n_jobs=-1)
    svm_clf.fit(ftr[i_trn], target[i_trn])

    svm_p_val[i_val, :] = svm_clf.predict_proba(ftr[i_val])
    svm_p_tst += svm_clf.predict_proba(tst_ar) / n_fold
print(f'{accuracy_score(target, np.argmax(svm_p_val, axis=1)) * 100:.4f}%')
print(confusion_matrix(target, np.argmax(svm_p_val, axis=1)))

In [None]:
# Save the bagged-SVM validation / test probabilities as CSV (six decimals).
tst_dir = Path('C:/Users/ATIV/python/stacking/tst')
val_dir = Path('C:/Users/ATIV/python/stacking/val')
# Create the output folders first so a missing directory cannot make
# np.savetxt fail after the model has already been trained.
for out_dir in (val_dir, tst_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

algo_name = 'svm'
feature_name = '124'
model_name = f'{algo_name}_{feature_name}'

p_val_file = val_dir / f'{model_name}.val.csv'
p_tst_file = tst_dir / f'{model_name}.tst.csv'

np.savetxt(p_val_file, svm_p_val, fmt='%.6f', delimiter=',')
np.savetxt(p_tst_file, svm_p_tst, fmt='%.6f', delimiter=',')

### 7) Hist GBM

In [None]:
# NumPy views of the full feature table: training features, labels, test features.
ftr = trn.drop(columns="class").values
target = trn['class'].values
tst_ar = tst.values

In [None]:
# Histogram-based gradient boosting base learner: out-of-fold probabilities for
# the training rows and fold-averaged probabilities for the test rows.
hgb_p_val = np.zeros((ftr.shape[0], n_class))
hgb_p_tst = np.zeros((tst_ar.shape[0], n_class))

for i, (i_trn, i_val) in enumerate(cv.split(ftr, target), 1):
    print(f'training model for CV #{i}')
    hgb_clf = HistGradientBoostingClassifier(max_iter=2000,
                                             verbose=True,
                                             random_state=seed,
                                             learning_rate=0.01)

    hgb_clf.fit(ftr[i_trn], target[i_trn])
    hgb_p_val[i_val, :] = hgb_clf.predict_proba(ftr[i_val])
    # Average over the configured number of folds rather than a literal 5.
    hgb_p_tst += hgb_clf.predict_proba(tst_ar) / n_fold

    # Accuracy on this fold's held-out rows.
    y_pred = hgb_clf.predict(ftr[i_val])
    y_pred = pd.Series(y_pred)
    print(accuracy_score(pd.Series(target[i_val]), y_pred))
    print()
print(f'{accuracy_score(target, np.argmax(hgb_p_val, axis=1)) * 100:.4f}%')
print(confusion_matrix(target, np.argmax(hgb_p_val, axis=1)))

In [None]:
# Save the Hist GBM validation / test probabilities as CSV (six decimals).
tst_dir = Path('C:/Users/ATIV/python/stacking/tst')
val_dir = Path('C:/Users/ATIV/python/stacking/val')
# Create the output folders first so a missing directory cannot make
# np.savetxt fail after the model has already been trained.
for out_dir in (val_dir, tst_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

algo_name = 'hgb'
feature_name = '124'
model_name = f'{algo_name}_{feature_name}'

p_val_file = val_dir / f'{model_name}.val.csv'
p_tst_file = tst_dir / f'{model_name}.tst.csv'

np.savetxt(p_val_file, hgb_p_val, fmt='%.6f', delimiter=',')
np.savetxt(p_tst_file, hgb_p_tst, fmt='%.6f', delimiter=',')

### 8) Cat Boost

In [None]:
# NumPy views of the full feature table: training features, labels, test features.
ftr = trn.drop(columns="class").values
target = trn['class'].values
tst_ar = tst.values

In [None]:
# CatBoost base learner: out-of-fold probabilities for the training rows and
# fold-averaged probabilities for the test rows.
cb_params = dict(
    iterations=10000,
    loss_function='MultiClass',
    random_seed=seed,
    task_type="GPU",
    eval_metric='Accuracy',
)
cb_p_val = np.zeros((ftr.shape[0], n_class))
cb_p_tst = np.zeros((tst_ar.shape[0], n_class))
for i, (i_trn, i_val) in enumerate(cv.split(ftr, target), 1):
    print(f'training model for CV #{i}')
    cb_clf = CatBoostClassifier(**cb_params)
    # The held-out fold is also passed to fit as the evaluation set.
    fold_eval = [(ftr[i_val], target[i_val])]
    cb_clf.fit(ftr[i_trn], target[i_trn], eval_set=fold_eval)

    cb_p_val[i_val, :] = cb_clf.predict_proba(ftr[i_val])
    cb_p_tst += cb_clf.predict_proba(tst_ar) / n_fold

cb_oof_pred = np.argmax(cb_p_val, axis=1)
print(f'{accuracy_score(target, cb_oof_pred) * 100:.4f}%')
print(confusion_matrix(target, cb_oof_pred))

In [None]:
# Save the CatBoost validation / test probabilities as CSV (six decimals).
tst_dir = Path('C:/Users/ATIV/python/stacking/tst')
val_dir = Path('C:/Users/ATIV/python/stacking/val')
# Create the output folders first so a missing directory cannot make
# np.savetxt fail after the model has already been trained.
for out_dir in (val_dir, tst_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

algo_name = 'cb'
feature_name = '124'
model_name = f'{algo_name}_{feature_name}'

p_val_file = val_dir / f'{model_name}.val.csv'
p_tst_file = tst_dir / f'{model_name}.tst.csv'

np.savetxt(p_val_file, cb_p_val, fmt='%.6f', delimiter=',')
np.savetxt(p_tst_file, cb_p_tst, fmt='%.6f', delimiter=',')

### 9) GBM

### 10) Extra Trees

In [None]:
# NumPy views of the full feature table: training features, labels, test features.
ftr = trn.drop(columns="class").values
target = trn['class'].values
tst_ar = tst.values

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra Trees base learner: out-of-fold probabilities for the training rows and
# fold-averaged probabilities for the test rows.
ext_p_val = np.zeros((ftr.shape[0], n_class))
ext_p_tst = np.zeros((tst_ar.shape[0], n_class))

for i, (i_trn, i_val) in enumerate(cv.split(ftr, target), 1):
    print(f'training model for CV #{i}')
    ext_clf = ExtraTreesClassifier(n_estimators=500, random_state=seed)

    ext_clf.fit(ftr[i_trn], target[i_trn])
    ext_p_val[i_val, :] = ext_clf.predict_proba(ftr[i_val])
    # Average over the configured number of folds rather than a literal 5.
    ext_p_tst += ext_clf.predict_proba(tst_ar) / n_fold

    # Accuracy on this fold's held-out rows.
    y_pred = ext_clf.predict(ftr[i_val])
    y_pred = pd.Series(y_pred)
    print(accuracy_score(pd.Series(target[i_val]), y_pred))
    print()
print(f'{accuracy_score(target, np.argmax(ext_p_val, axis=1)) * 100:.4f}%')
print(confusion_matrix(target, np.argmax(ext_p_val, axis=1)))

In [None]:
# Save the Extra Trees validation / test probabilities as CSV (six decimals).
tst_dir = Path('C:/Users/ATIV/python/stacking/tst')
val_dir = Path('C:/Users/ATIV/python/stacking/val')
# Create the output folders first so a missing directory cannot make
# np.savetxt fail after the model has already been trained.
for out_dir in (val_dir, tst_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

algo_name = 'ext'
feature_name = '124'
model_name = f'{algo_name}_{feature_name}'

p_val_file = val_dir / f'{model_name}.val.csv'
p_tst_file = tst_dir / f'{model_name}.tst.csv'

np.savetxt(p_val_file, ext_p_val, fmt='%.6f', delimiter=',')
np.savetxt(p_tst_file, ext_p_tst, fmt='%.6f', delimiter=',')