# RF / LGBM / XGBOOST

In [1]:
from matplotlib import pyplot as plt
from matplotlib import rcParams
import numpy as np
import pandas as pd
import seaborn as sns
import math
import warnings
from tqdm import tqdm
import lightgbm as lgbm
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from scipy.stats import skew, kurtosis
import itertools
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [2]:
trn = pd.read_csv('C:/python/DACON_DATA/train.csv', index_col=0)
tst = pd.read_csv('C:/python/DACON_DATA/test.csv', index_col=0)

In [3]:
all_fea=['u','g','r','i','z','redshift','dered_u','dered_g','dered_r','dered_i','dered_z','nObserve','nDetect','airmass_u','airmass_g','airmass_r','airmass_i','airmass_z']
fea=['u', 'g', 'r', 'i', 'z', 'redshift', 'dered_u', 'dered_g','dered_r', 'dered_i', 'dered_z']
fea2=['u','g','r','i','z','redshift','dered_u','dered_g','dered_r','dered_i','dered_z','nObserve','nDetect']
names=['u','g','r','i','z']
names_2 = ['dered_u','dered_g','dered_r','dered_i','dered_z']
airmass=['airmass_u','airmass_g','airmass_r','airmass_i','airmass_z']

#이상치 제거
for i in range(len(fea)):
    trn=trn[trn[fea[i]]>np.min(tst[fea[i]], axis=0)]
    trn=trn[trn[fea[i]]<np.max(tst[fea[i]], axis=0)]
    
# 설명변수와 반응변수 분리
trn_target = trn['class']
trn = trn.drop('class', axis=1)

# 옵저브 디텍트 연속형으로 전환
trn['nObserve']=trn['nObserve'].astype('float')
trn['nDetect']=trn['nDetect'].astype('float')
trnnO = trn['nObserve']
trnnD = trn['nDetect']

#카테고리별 max, min, max-min, std, sum을 구한다.
#max-min
trn['max-min'] = trn[all_fea].max(axis=1)-trn[all_fea].min(axis=1)
trn['max-min_ugriz'] = trn[names].max(axis=1)-trn[names].min(axis=1)
trn['max-min_dered'] = trn[names_2].max(axis=1)-trn[names_2].min(axis=1)
#std
trn['std'] = trn[all_fea].std(axis=1)
trn['std_ugriz'] = trn[names].std(axis=1)
trn['std_dered'] = trn[names_2].std(axis=1)
#파장별 합
trn['sum'] = trn[all_fea].sum(axis=1)
trn['sum_ugriz'] = trn[names].sum(axis=1)
trn['sum_dered'] = trn[names_2].sum(axis=1)
#파장별 최대값
trn['max'] = trn[all_fea].max(axis=1)
trn['max_ugriz'] = trn[names].max(axis=1)
trn['max_dered'] = trn[names_2].max(axis=1)
#파장별 최소값
trn['min'] = trn[all_fea].min(axis=1)
trn['min_ugriz'] = trn[names].min(axis=1)
trn['min_dered'] = trn[names_2].min(axis=1)
#파장별 max-max,min=min,sum-sum
trn['max-max']=trn[names].max(axis=1)-trn[names_2].max(axis=1)
trn['min-min']=trn[names].min(axis=1)-trn[names_2].min(axis=1)
trn['sum-sum']=trn[names].sum(axis=1)-trn[names_2].sum(axis=1)

#왜도,첨도 구하기
trn['skew']=skew(trn[names],axis=1)
trn['kurtosis']=kurtosis(trn[names],axis=1)
trn['dered_skew']=skew(trn[names_2],axis=1)
trn['dered_kurtosis']=kurtosis(trn[names_2],axis=1)
trn['airmass_skew']=skew(trn[airmass],axis=1)
trn['airmass_kurtosis']=kurtosis(trn[airmass],axis=1)


#조합으로 연산 피쳐 생성
for c1,c2 in tqdm(itertools.combinations(fea2,2)):
    dif_col=f'diff_{c1}_{c2}'
    div_col=f'div_{c1}_{c2}'
    sum_col=f'sum_{c1}_{c2}'
    mul_col=f'mul_{c1}_{c2}'
    trn[dif_col]=trn[c1]-trn[c2]
    trn[div_col]=trn[c1]/trn[c2]
    trn[sum_col]=trn[c1]+trn[c2]
    trn[mul_col]=trn[c1]*trn[c2]


# 소수점 4자리 까지만 나타내는 asinh 변수 생성
trn['asinh_mu'] = -2.5/np.log(10)*(np.arcsinh(trn.u/24.63/(2.8e-10))-22.689378693319245)
trn['asinh_mg'] = -2.5/np.log(10)*(np.arcsinh(trn.g/25.11/(1.8e-10))-23.131211445598282)
trn['asinh_mr'] = -2.5/np.log(10)*(np.arcsinh(trn.r/24.80/(2.4e-10))-22.843529373146502)
trn['asinh_mi'] = -2.5/np.log(10)*(np.arcsinh(trn.i/24.36/(3.6e-10))-22.43806426503834)
trn['asinh_mz'] = -2.5/np.log(10)*(np.arcsinh(trn.z/22.83/(1.48e-09))-21.024370929730330)


trn['redshift%14'] = trn['redshift']%14
trn['log_redshift']=np.log1p(trn['redshift'])
trn['log_redshift']=trn['log_redshift'].fillna(0)

#도메인에서 얻은 파생변수 생성
#출처: https://www.sdss.org/dr16/algorithms/segue_target_selection/#Legacy
trn['l-color'] = (-0.436*trn['u']) + (1.129*trn['g']) - (0.119*trn['r']) - (0.574*trn['i']) + (0.1984)
trn['s-color'] = (-0.249*trn['u']) + (0.794*trn['g']) - (0.555*trn['r']) + (0.234)
trn['P1'] = (0.91*(trn['u']-trn['g'])) + (0.415*(trn['g']-trn['r'])) - (1.280)


trn['class'] = trn_target

#피쳐 제거
PI30_1=['dered_z','dered_g','div_dered_g_dered_z','sum_dered_i_nDetect','mul_dered_g_dered_z','sum_z_dered_i','diff_r_redshift','sum_u_redshift','sum_dered_r_nDetect','mul_g_dered_z','sum_dered_r_dered_z','div_r_nObserve','sum_i_redshift','diff_dered_g_dered_z','sum_dered_g_dered_r','sum_r_dered_z','diff_u_redshift','mul_u_dered_z','mul_dered_g_dered_r','mul_i_dered_r','div_i_dered_g','sum_r_redshift','div_u_dered_z','mul_r_z','div_g_z','diff_u_dered_z','mul_r_dered_g','sum_redshift_dered_z','u','div_redshift_dered_u']
trn=trn[trn.columns.difference(PI30_1)]
PI30_2=['mul_dered_g_nDetect','sum_r_nObserve','sum_i_dered_r','diff_z_nObserve','mul_nObserve_nDetect','asinh_mz','nObserve','asinh_mg','diff_g_dered_z','mul_g_dered_u','log_redshift','nDetect','mul_u_z','sum_r_nDetect','div_dered_i_nObserve','sum_i_z','sum_u_dered_r','dered_i','mul_g_dered_r','mul_z_dered_i','div_g_nDetect','diff_dered_g_nObserve','mul_r_dered_z','sum_i_nDetect','diff_z_dered_g','mul_g_z','sum_z_dered_z','mul_dered_u_nObserve','div_redshift_nObserve','dered_r']
trn=trn[trn.columns.difference(PI30_2)]
PI30_3=["mul_g_i","sum_redshift_dered_u","sum_g_i","sum_z_dered_g","sum_i_dered_z","mul_r_dered_r","div_g_i","mul_dered_r_dered_i","g","div_dered_i_nDetect","mul_dered_r_nObserve","sum_r_i","min_dered","mul_i_dered_z","div_dered_r_nObserve","diff_dered_z_nDetect","div_i_nDetect","sum_nObserve_nDetect","max_dered","mul_g_r","div_z_nObserve","sum_i_dered_u","mul_dered_r_nDetect","div_dered_g_nObserve","mul_r_i","diff_i_nDetect","sum_u_r","sum_dered_g_nDetect","div_u_nDetect","sum_u_dered_g"]
trn=trn[trn.columns.difference(PI30_3)]
PI30_4=["div_r_nDetect","mul_u_g","diff_dered_i_nDetect","mul_u_i","mul_dered_i_nObserve","div_i_dered_i","mul_i_nDetect","mul_g_dered_g","sum_u_dered_z","mul_r_dered_i","sum_g_dered_g","sum_u_g","mul_dered_u_dered_i","sum_g_nDetect","mul_z_nObserve","mul_g_nDetect","mul_dered_i_nDetect","sum_dered_g_dered_i","mul_u_dered_i","div_dered_z_nDetect","i","sum_g_z","sum_u_dered_u","sum_g_r","sum_dered","mul_u_nDetect","mul_i_dered_i","mul_dered_g_nObserve","div_dered_g_nDetect","div_u_i"]
trn=trn[trn.columns.difference(PI30_4)]
PI30_5=["asinh_mr","div_dered_r_nDetect","mul_z_dered_r","mul_g_nObserve","diff_u_nDetect","sum_g_dered_u","sum_redshift_dered_i","div_i_nObserve","div_g_nObserve","z","mul_dered_z_nObserve","sum_dered_r_nObserve","sum_z_dered_r","sum_u_dered_i","mul_u_dered_g","mul_dered_u_dered_r","diff_z_nDetect","sum_r_dered_r","sum_dered_u_dered_g","asinh_mi","diff_i_redshift","diff_r_nObserve","mul_i_nObserve","diff_dered_r_nObserve","sum_i_dered_i","sum_r_dered_i","diff_dered_r_nDetect","sum_u_nObserve","div_dered_u_nDetect","sum_z_dered_u"]
trn=trn[trn.columns.difference(PI30_5)]
PI30_6=["sum_dered_i_nObserve","diff_nObserve_nDetect","mul_dered_u_dered_z","div_u_nObserve","diff_u_nObserve","diff_redshift_nDetect","mul_u_nObserve","diff_dered_g_nDetect","sum_u_z","max_ugriz","sum_g_dered_r","mul_r_nObserve","div_z_nDetect","max","mul_dered_u_nDetect","sum_i_nObserve","diff_g_nDetect","sum_dered_z_nDetect","mul_z_nDetect","mul_i_z","diff_r_nDetect","diff_g_nObserve","div_g_dered_i","mul_dered_z_nDetect","mul_i_dered_u","diff_dered_u_nObserve","div_dered_z_nObserve","mul_u_r","diff_i_nObserve","sum_ugriz"]
trn=trn[trn.columns.difference(PI30_6)]
PI30_7=["dered_u","mul_i_dered_g","mul_r_dered_u","mul_r_nDetect","sum_dered_g_nObserve","sum-sum","diff_dered_u_dered_z","diff_dered_u_nDetect","mul_dered_r_dered_z","sum_dered_u_nObserve","diff_dered_i_nObserve","diff_g_z","diff_z_redshift","diff_i_dered_g","mul_z_dered_z","min_ugriz","diff_dered_z_nObserve","mul_g_dered_i","asinh_mu","sum_dered_u_dered_r","diff_u_z","sum_u_nDetect","std_ugriz","sum_dered_r_dered_i","sum_dered_z_nObserve","mul_dered_u_dered_g","sum_r_dered_g","mul_u_dered_r","max-min_ugriz","diff_redshift_dered_r"]
trn=trn[trn.columns.difference(PI30_7)]
PI30_8=["div_i_redshift","sum_g_redshift","div_r_dered_r","mul_z_dered_g","sum_redshift_nDetect","diff_u_dered_u","diff_i_dered_i","diff_i_dered_u","sum_g_dered_i","diff_u_dered_i","div_g_redshift","sum_u_i","div_z_dered_r","sum_i_dered_g","sum_redshift_dered_r","sum_z_nObserve","sum_dered_u_nDetect","diff_redshift_nObserve","sum_dered_u_dered_z","div_g_dered_z","sum_z_nDetect","diff_redshift_dered_z","div_dered_g_dered_i","div_dered_u_nObserve","mul_u_dered_u","sum_r_z","r","max-min_dered","mul_dered_g_dered_i","sum_g_nObserve"]
trn=trn[trn.columns.difference(PI30_8)]

78it [00:01, 40.54it/s] 
  result = getattr(ufunc, method)(*inputs, **kwargs)


# 데이터셋 분할

In [4]:
trn.shape

(319954, 125)

In [5]:
ftr=trn.drop("class",axis=1)
target=trn['class']
X_train_1,X_val_1,y_train_1,y_val_1=train_test_split(ftr,target,test_size=0.3,random_state=8282)
evals_1=[(X_val_1,y_val_1)]

In [6]:
def get_clf_eval(y_test, y_pred):
    confusion = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
#    precision = precision_score(y_test, y_pred)
#     recall = recall_score(y_test, y_pred)
#     F1 = f1_score(y_test, y_pred)
#     AUC = roc_auc_score(y_test, y_pred)
    
    print('오차행렬:\n', confusion)
    print('\n정확도: {:.4f}'.format(accuracy))
#     print('정밀도: {:.4f}'.format(precision))
#     print('재현율: {:.4f}'.format(recall))
#     print('F1: {:.4f}'.format(F1))
#     print('AUC: {:.4f}'.format(AUC))

# lgbm

In [7]:
lgbm_clf=LGBMClassifier(
        n_estimators = 500,
        boosting_type ='dart',
        learning_rate =0.1,
        num_leaves = 95,
        max_depth = 28,
        objective='multiclass',
        num_class=3
        )

In [300]:
lgbm_clf.fit(X_train_1,y_train_1,eval_set=evals_1,eval_metric='multi_error',verbose=True)

[1]	valid_0's multi_error: 0.136956	valid_0's multi_logloss: 0.839609
[2]	valid_0's multi_error: 0.134925	valid_0's multi_logloss: 0.729963
[3]	valid_0's multi_error: 0.134602	valid_0's multi_logloss: 0.642837
[4]	valid_0's multi_error: 0.0904393	valid_0's multi_logloss: 0.572077
[5]	valid_0's multi_error: 0.0828029	valid_0's multi_logloss: 0.513769
[6]	valid_0's multi_error: 0.0778543	valid_0's multi_logloss: 0.465094
[7]	valid_0's multi_error: 0.0753852	valid_0's multi_logloss: 0.424194
[8]	valid_0's multi_error: 0.0770208	valid_0's multi_logloss: 0.443262
[9]	valid_0's multi_error: 0.0747393	valid_0's multi_logloss: 0.405629
[10]	valid_0's multi_error: 0.0728536	valid_0's multi_logloss: 0.373602
[11]	valid_0's multi_error: 0.0713326	valid_0's multi_logloss: 0.346145
[12]	valid_0's multi_error: 0.072166	valid_0's multi_logloss: 0.356575
[13]	valid_0's multi_error: 0.0706346	valid_0's multi_logloss: 0.33155
[14]	valid_0's multi_error: 0.0699053	valid_0's multi_logloss: 0.310035
[15]	v

[227]	valid_0's multi_error: 0.0635919	valid_0's multi_logloss: 0.15394
[228]	valid_0's multi_error: 0.0637274	valid_0's multi_logloss: 0.153799
[229]	valid_0's multi_error: 0.063717	valid_0's multi_logloss: 0.153875
[230]	valid_0's multi_error: 0.0637795	valid_0's multi_logloss: 0.153775
[231]	valid_0's multi_error: 0.0637065	valid_0's multi_logloss: 0.153638
[232]	valid_0's multi_error: 0.0636649	valid_0's multi_logloss: 0.153509
[233]	valid_0's multi_error: 0.0636649	valid_0's multi_logloss: 0.15357
[234]	valid_0's multi_error: 0.0635815	valid_0's multi_logloss: 0.153471
[235]	valid_0's multi_error: 0.0636024	valid_0's multi_logloss: 0.153534
[236]	valid_0's multi_error: 0.0636128	valid_0's multi_logloss: 0.153445
[237]	valid_0's multi_error: 0.0635086	valid_0's multi_logloss: 0.153379
[238]	valid_0's multi_error: 0.0636545	valid_0's multi_logloss: 0.153306
[239]	valid_0's multi_error: 0.0636232	valid_0's multi_logloss: 0.153184
[240]	valid_0's multi_error: 0.0637274	valid_0's multi

[451]	valid_0's multi_error: 0.0635711	valid_0's multi_logloss: 0.153519
[452]	valid_0's multi_error: 0.0635503	valid_0's multi_logloss: 0.153498
[453]	valid_0's multi_error: 0.0635815	valid_0's multi_logloss: 0.153483
[454]	valid_0's multi_error: 0.0635711	valid_0's multi_logloss: 0.153467
[455]	valid_0's multi_error: 0.0636024	valid_0's multi_logloss: 0.153483
[456]	valid_0's multi_error: 0.0636336	valid_0's multi_logloss: 0.153466
[457]	valid_0's multi_error: 0.0636128	valid_0's multi_logloss: 0.153449
[458]	valid_0's multi_error: 0.0636336	valid_0's multi_logloss: 0.153476
[459]	valid_0's multi_error: 0.0637065	valid_0's multi_logloss: 0.153487
[460]	valid_0's multi_error: 0.0636753	valid_0's multi_logloss: 0.153476
[461]	valid_0's multi_error: 0.0636336	valid_0's multi_logloss: 0.153495
[462]	valid_0's multi_error: 0.0636128	valid_0's multi_logloss: 0.153476
[463]	valid_0's multi_error: 0.0636545	valid_0's multi_logloss: 0.153458
[464]	valid_0's multi_error: 0.0635919	valid_0's mu

LGBMClassifier(boosting_type='dart', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=28,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=500, n_jobs=-1, num_class=3, num_leaves=95,
               objective='multiclass', random_state=None, reg_alpha=0.0,
               reg_lambda=0.0, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)

In [330]:
#accuracy
preds_val_1=lgbm_clf.predict(X_val_1)
preds_train_1=lgbm_clf.predict(X_train_1)
print('***valid_1***')
print(get_clf_eval(y_val_1,preds_val_1))
print()
print('***train_1***')
print(get_clf_eval(y_train_1,preds_train_1))

***valid_1***
오차행렬:
 [[35959    34    60]
 [   16  8963  3788]
 [   30  2164 44973]]

정확도: 0.9365
None

***train_1***
오차행렬:
 [[ 83913      0      0]
 [     0  24928   5083]
 [     0   1646 108397]]

정확도: 0.9700
None


# random forest

In [11]:
trn.shape

(319954, 125)

In [12]:
rf_clf = RandomForestClassifier(n_estimators = 200, 
                                random_state=2020,
                                verbose=True,
                                oob_score=True,
                                n_jobs=-1)

In [281]:
rf_clf.fit(X_train_1,y_train_1,)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   41.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  3.7min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=-1, oob_score=False, random_state=2020,
                       verbose=True, warm_start=False)

In [283]:
#accuracy
preds_val_1=rf_clf.predict(X_val_1)
preds_train_1=rf_clf.predict(X_train_1)
print('***valid_1***')
print(get_clf_eval(y_val_1,preds_val_1))
print()
print('***train_1***')
print(get_clf_eval(y_train_1,preds_train_1))

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.3s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    1.4s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    3.0s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    3.2s finished


***valid_1***
오차행렬:
 [[35963    35    55]
 [   13  8919  3835]
 [   30  2099 45038]]

정확도: 0.9368
None

***train_1***
오차행렬:
 [[ 83913      0      0]
 [     0  30011      0]
 [     0      0 110043]]

정확도: 1.0000
None


## xgboost

In [13]:
import xgboost as xgb

In [14]:
xgb_clf = xgb.XGBClassifier(learning_rate=0.1,
                          n_estimators=220,
                          max_depth=10,
                          feature_fraction=0.9,
                          booster='dart',
                          tree_method='exact',
                            objective='multiclass',
                            num_class=3,
                            n_jobs=-1)
xgb_clf

XGBClassifier(booster='dart', feature_fraction=0.9, max_depth=10,
              n_estimators=220, n_jobs=-1, num_class=3, objective='multiclass',
              tree_method='exact')

In [296]:
xgb_clf.fit(X_train_1,y_train_1,eval_set=evals_1,verbose=True)

[0]	validation_0-merror:0.070687
[1]	validation_0-merror:0.069509
[2]	validation_0-merror:0.06852
[3]	validation_0-merror:0.068238
[4]	validation_0-merror:0.067874
[5]	validation_0-merror:0.067415
[6]	validation_0-merror:0.067051
[7]	validation_0-merror:0.067072
[8]	validation_0-merror:0.066769
[9]	validation_0-merror:0.066582
[10]	validation_0-merror:0.066217
[11]	validation_0-merror:0.066321
[12]	validation_0-merror:0.066207
[13]	validation_0-merror:0.065894
[14]	validation_0-merror:0.065811
[15]	validation_0-merror:0.065801
[16]	validation_0-merror:0.065623
[17]	validation_0-merror:0.065509
[18]	validation_0-merror:0.065373
[19]	validation_0-merror:0.065509
[20]	validation_0-merror:0.065415
[21]	validation_0-merror:0.065228
[22]	validation_0-merror:0.06528
[23]	validation_0-merror:0.065228
[24]	validation_0-merror:0.065175
[25]	validation_0-merror:0.064978
[26]	validation_0-merror:0.065061
[27]	validation_0-merror:0.06504
[28]	validation_0-merror:0.064769
[29]	validation_0-merror:0.

XGBClassifier(base_score=0.5, booster='dart', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, feature_fraction=0.9,
              gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=None, n_estimators=220, n_jobs=-1,
              nthread=None, num_class=3, objective='multi:softprob',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=None, subsample=1, tree_method='exact',
              verbosity=1)

In [297]:
#accuracy
preds_val_1=xgb_clf.predict(X_val_1)
preds_train_1=xgb_clf.predict(X_train_1)
print('***valid_1***')
print(get_clf_eval(y_val_1,preds_val_1))
print()
print('***train_1***')
print(get_clf_eval(y_train_1,preds_train_1))

***valid_1***
오차행렬:
 [[35967    36    50]
 [   15  9019  3733]
 [   26  2198 44943]]

정확도: 0.9369
None

***train_1***
오차행렬:
 [[ 83913      0      0]
 [     0  27349   2662]
 [     0    737 109306]]

정확도: 0.9848
None



## ensemble

In [15]:
from sklearn.ensemble import VotingClassifier

In [34]:
lgbm_clf=LGBMClassifier(
        n_estimators = 500,
        boosting_type ='dart',
        learning_rate =0.1,
        num_leaves = 95,
        max_depth = 28,
        objective='multiclass',
        num_class=3
        )

In [35]:
xgb_clf = xgb.XGBClassifier(learning_rate=0.1,
                          n_estimators=220,
                          max_depth=10,
                          feature_fraction=0.9,
                          booster='dart',
                          tree_method='exact',
                            objective='multiclass',
                            num_class=3,
                            n_jobs=-1)
xgb_clf

XGBClassifier(booster='dart', feature_fraction=0.9, max_depth=10,
              n_estimators=220, n_jobs=-1, num_class=3, objective='multiclass',
              tree_method='exact')

In [36]:
rf_clf = RandomForestClassifier(n_estimators = 200, 
                                random_state=2020,
                                verbose=True,
                                oob_score=True,
                                n_jobs=-1)

In [37]:
voting_clf = VotingClassifier(estimators=[('lgbm',lgbm_clf),('rf',rf_clf),('xgb',xgb_clf)],voting='soft',verbose=True,n_jobs=-1)
voting_clf

VotingClassifier(estimators=[('lgbm',
                              LGBMClassifier(boosting_type='dart', max_depth=28,
                                             n_estimators=500, num_class=3,
                                             num_leaves=95,
                                             objective='multiclass')),
                             ('rf',
                              RandomForestClassifier(n_estimators=200,
                                                     n_jobs=-1, oob_score=True,
                                                     random_state=2020,
                                                     verbose=True)),
                             ('xgb',
                              XGBClassifier(booster='dart',
                                            feature_fraction=0.9, max_depth=10,
                                            n_estimators=220, n_jobs=-1,
                                            num_class=3, objective='multiclass',
               

In [None]:
voting_clf.fit(X_train_1,y_train_1)

In [21]:
print(voting_clf)
print(X_train_1.shape)
#accuracy
preds_val_1=voting_clf.predict(X_val_1)
preds_train_1=voting_clf.predict(X_train_1)
print('***valid_1***')
print(get_clf_eval(y_val_1,preds_val_1))
print()
print('***train_1***')
print(get_clf_eval(y_train_1,preds_train_1))

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.2s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    1.3s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    3.5s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    3.7s finished


***valid_1***
오차행렬:
 [[35965    36    52]
 [   13  8967  3787]
 [   27  2093 45047]]

정확도: 0.9374
None

***train_1***
오차행렬:
 [[ 83913      0      0]
 [     0  27473   2538]
 [     0    630 109413]]

정확도: 0.9859
None


In [22]:
trn.shape

(319954, 125)

In [23]:
ftr=trn.drop("class",axis=1)
target=trn['class']

In [24]:
from sklearn.ensemble import VotingClassifier

In [25]:
voting_clf = VotingClassifier(estimators=[('lgbm',lgbm_clf),('rf',rf_clf),('xgb',xgb_clf)],voting='hard',verbose=True,n_jobs=-1)
voting_clf

VotingClassifier(estimators=[('lgbm',
                              LGBMClassifier(boosting_type='dart', max_depth=28,
                                             n_estimators=500, num_class=3,
                                             num_leaves=95,
                                             objective='multiclass')),
                             ('rf',
                              RandomForestClassifier(n_estimators=200,
                                                     n_jobs=-1, oob_score=True,
                                                     random_state=2020,
                                                     verbose=True)),
                             ('xgb',
                              XGBClassifier(booster='dart',
                                            feature_fraction=0.9, max_depth=10,
                                            n_estimators=220, n_jobs=-1,
                                            num_class=3, objective='multiclass',
               

In [26]:
voting_clf.fit(ftr,target)

VotingClassifier(estimators=[('lgbm',
                              LGBMClassifier(boosting_type='dart', max_depth=28,
                                             n_estimators=500, num_class=3,
                                             num_leaves=95,
                                             objective='multiclass')),
                             ('rf',
                              RandomForestClassifier(n_estimators=200,
                                                     n_jobs=-1, oob_score=True,
                                                     random_state=2020,
                                                     verbose=True)),
                             ('xgb',
                              XGBClassifier(booster='dart',
                                            feature_fraction=0.9, max_depth=10,
                                            n_estimators=220, n_jobs=-1,
                                            num_class=3, objective='multiclass',
               

# 제출

In [27]:
# 테스트 데이터 정리
tst = pd.read_csv('C:/python/DACON_DATA/test.csv', index_col=0)

In [28]:
all_fea=['u','g','r','i','z','redshift','dered_u','dered_g','dered_r','dered_i','dered_z','nObserve','nDetect','airmass_u','airmass_g','airmass_r','airmass_i','airmass_z']
fea=['u', 'g', 'r', 'i', 'z', 'redshift', 'dered_u', 'dered_g','dered_r', 'dered_i', 'dered_z']
fea2=['u','g','r','i','z','redshift','dered_u','dered_g','dered_r','dered_i','dered_z','nObserve','nDetect']
names=['u','g','r','i','z']
names_2 = ['dered_u','dered_g','dered_r','dered_i','dered_z']

# #이상치 제거
# for i in range(len(fea)):
#     tst=tst[tst[fea[i]]>np.min(tst[fea[i]], axis=0)]
#     tst=tst[tst[fea[i]]<np.max(tst[fea[i]], axis=0)]
    
# # 설명변수와 반응변수 분리
# tst_target = tst['class']
# tst = tst.drop('class', axis=1)

# 옵저브 디텍트 연속형으로 전환
tst['nObserve']=tst['nObserve'].astype('float')
tst['nDetect']=tst['nDetect'].astype('float')
tstnO = tst['nObserve']
tstnD = tst['nDetect']

#카테고리별 max, min, max-min, std, sum을 구한다.
#max-min
tst['max-min'] = tst[all_fea].max(axis=1)-tst[all_fea].min(axis=1)
tst['max-min_ugriz'] = tst[names].max(axis=1)-tst[names].min(axis=1)
tst['max-min_dered'] = tst[names_2].max(axis=1)-tst[names_2].min(axis=1)
#std
tst['std'] = tst[all_fea].std(axis=1)
tst['std_ugriz'] = tst[names].std(axis=1)
tst['std_dered'] = tst[names_2].std(axis=1)
#파장별 합
tst['sum'] = tst[all_fea].sum(axis=1)
tst['sum_ugriz'] = tst[names].sum(axis=1)
tst['sum_dered'] = tst[names_2].sum(axis=1)
#파장별 최대값
tst['max'] = tst[all_fea].max(axis=1)
tst['max_ugriz'] = tst[names].max(axis=1)
tst['max_dered'] = tst[names_2].max(axis=1)
#파장별 최소값
tst['min'] = tst[all_fea].min(axis=1)
tst['min_ugriz'] = tst[names].min(axis=1)
tst['min_dered'] = tst[names_2].min(axis=1)
#파장별 max-max,min=min,sum-sum
tst['max-max']=tst[names].max(axis=1)-tst[names_2].max(axis=1)
tst['min-min']=tst[names].min(axis=1)-tst[names_2].min(axis=1)
tst['sum-sum']=tst[names].sum(axis=1)-tst[names_2].sum(axis=1)

#왜도,첨도 구하기
tst['skew']=skew(tst[names],axis=1)
tst['kurtosis']=kurtosis(tst[names],axis=1)
tst['dered_skew']=skew(tst[names_2],axis=1)
tst['dered_kurtosis']=kurtosis(tst[names_2],axis=1)
tst['airmass_skew']=skew(tst[airmass],axis=1)
tst['airmass_kurtosis']=kurtosis(tst[airmass],axis=1)

#조합으로 연산 피쳐 생성
for c1,c2 in tqdm(itertools.combinations(fea2,2)):
    dif_col=f'diff_{c1}_{c2}'
    div_col=f'div_{c1}_{c2}'
    sum_col=f'sum_{c1}_{c2}'
    mul_col=f'mul_{c1}_{c2}'
    tst[dif_col]=tst[c1]-tst[c2]
    tst[div_col]=tst[c1]/tst[c2]
    tst[sum_col]=tst[c1]+tst[c2]
    tst[mul_col]=tst[c1]*tst[c2]


# 소수점 4자리 까지만 나타내는 asinh 변수 생성
tst['asinh_mu'] = -2.5/np.log(10)*(np.arcsinh(tst.u/24.63/(2.8e-10))-22.689378693319245)
tst['asinh_mg'] = -2.5/np.log(10)*(np.arcsinh(tst.g/25.11/(1.8e-10))-23.131211445598282)
tst['asinh_mr'] = -2.5/np.log(10)*(np.arcsinh(tst.r/24.80/(2.4e-10))-22.843529373146502)
tst['asinh_mi'] = -2.5/np.log(10)*(np.arcsinh(tst.i/24.36/(3.6e-10))-22.43806426503834)
tst['asinh_mz'] = -2.5/np.log(10)*(np.arcsinh(tst.z/22.83/(1.48e-09))-21.024370929730330)


tst['redshift%14'] = tst['redshift']%14
tst['log_redshift']=np.log1p(tst['redshift'])
tst['log_redshift']=tst['log_redshift'].fillna(0)

#도메인에서 얻은 파생변수 생성
#출처: https://www.sdss.org/dr16/algorithms/segue_target_selection/#Legacy
tst['l-color'] = (-0.436*tst['u']) + (1.129*tst['g']) - (0.119*tst['r']) - (0.574*tst['i']) + (0.1984)
tst['s-color'] = (-0.249*tst['u']) + (0.794*tst['g']) - (0.555*tst['r']) + (0.234)
tst['P1'] = (0.91*(tst['u']-tst['g'])) + (0.415*(tst['g']-tst['r'])) - (1.280)


# tst['class'] = tst_target
PI30_1=['dered_z','dered_g','div_dered_g_dered_z','sum_dered_i_nDetect','mul_dered_g_dered_z','sum_z_dered_i','diff_r_redshift','sum_u_redshift','sum_dered_r_nDetect','mul_g_dered_z','sum_dered_r_dered_z','div_r_nObserve','sum_i_redshift','diff_dered_g_dered_z','sum_dered_g_dered_r','sum_r_dered_z','diff_u_redshift','mul_u_dered_z','mul_dered_g_dered_r','mul_i_dered_r','div_i_dered_g','sum_r_redshift','div_u_dered_z','mul_r_z','div_g_z','diff_u_dered_z','mul_r_dered_g','sum_redshift_dered_z','u','div_redshift_dered_u']
tst=tst[tst.columns.difference(PI30_1)]
PI30_2=['mul_dered_g_nDetect','sum_r_nObserve','sum_i_dered_r','diff_z_nObserve','mul_nObserve_nDetect','asinh_mz','nObserve','asinh_mg','diff_g_dered_z','mul_g_dered_u','log_redshift','nDetect','mul_u_z','sum_r_nDetect','div_dered_i_nObserve','sum_i_z','sum_u_dered_r','dered_i','mul_g_dered_r','mul_z_dered_i','div_g_nDetect','diff_dered_g_nObserve','mul_r_dered_z','sum_i_nDetect','diff_z_dered_g','mul_g_z','sum_z_dered_z','mul_dered_u_nObserve','div_redshift_nObserve','dered_r']
tst=tst[tst.columns.difference(PI30_2)]
PI30_3=["mul_g_i","sum_redshift_dered_u","sum_g_i","sum_z_dered_g","sum_i_dered_z","mul_r_dered_r","div_g_i","mul_dered_r_dered_i","g","div_dered_i_nDetect","mul_dered_r_nObserve","sum_r_i","min_dered","mul_i_dered_z","div_dered_r_nObserve","diff_dered_z_nDetect","div_i_nDetect","sum_nObserve_nDetect","max_dered","mul_g_r","div_z_nObserve","sum_i_dered_u","mul_dered_r_nDetect","div_dered_g_nObserve","mul_r_i","diff_i_nDetect","sum_u_r","sum_dered_g_nDetect","div_u_nDetect","sum_u_dered_g"]
tst=tst[tst.columns.difference(PI30_3)]
PI30_4=["div_r_nDetect","mul_u_g","diff_dered_i_nDetect","mul_u_i","mul_dered_i_nObserve","div_i_dered_i","mul_i_nDetect","mul_g_dered_g","sum_u_dered_z","mul_r_dered_i","sum_g_dered_g","sum_u_g","mul_dered_u_dered_i","sum_g_nDetect","mul_z_nObserve","mul_g_nDetect","mul_dered_i_nDetect","sum_dered_g_dered_i","mul_u_dered_i","div_dered_z_nDetect","i","sum_g_z","sum_u_dered_u","sum_g_r","sum_dered","mul_u_nDetect","mul_i_dered_i","mul_dered_g_nObserve","div_dered_g_nDetect","div_u_i"]
tst=tst[tst.columns.difference(PI30_4)]
PI30_5=["asinh_mr","div_dered_r_nDetect","mul_z_dered_r","mul_g_nObserve","diff_u_nDetect","sum_g_dered_u","sum_redshift_dered_i","div_i_nObserve","div_g_nObserve","z","mul_dered_z_nObserve","sum_dered_r_nObserve","sum_z_dered_r","sum_u_dered_i","mul_u_dered_g","mul_dered_u_dered_r","diff_z_nDetect","sum_r_dered_r","sum_dered_u_dered_g","asinh_mi","diff_i_redshift","diff_r_nObserve","mul_i_nObserve","diff_dered_r_nObserve","sum_i_dered_i","sum_r_dered_i","diff_dered_r_nDetect","sum_u_nObserve","div_dered_u_nDetect","sum_z_dered_u"]
tst=tst[tst.columns.difference(PI30_5)]
PI30_6=["sum_dered_i_nObserve","diff_nObserve_nDetect","mul_dered_u_dered_z","div_u_nObserve","diff_u_nObserve","diff_redshift_nDetect","mul_u_nObserve","diff_dered_g_nDetect","sum_u_z","max_ugriz","sum_g_dered_r","mul_r_nObserve","div_z_nDetect","max","mul_dered_u_nDetect","sum_i_nObserve","diff_g_nDetect","sum_dered_z_nDetect","mul_z_nDetect","mul_i_z","diff_r_nDetect","diff_g_nObserve","div_g_dered_i","mul_dered_z_nDetect","mul_i_dered_u","diff_dered_u_nObserve","div_dered_z_nObserve","mul_u_r","diff_i_nObserve","sum_ugriz"]
tst=tst[tst.columns.difference(PI30_6)]
PI30_7=["dered_u","mul_i_dered_g","mul_r_dered_u","mul_r_nDetect","sum_dered_g_nObserve","sum-sum","diff_dered_u_dered_z","diff_dered_u_nDetect","mul_dered_r_dered_z","sum_dered_u_nObserve","diff_dered_i_nObserve","diff_g_z","diff_z_redshift","diff_i_dered_g","mul_z_dered_z","min_ugriz","diff_dered_z_nObserve","mul_g_dered_i","asinh_mu","sum_dered_u_dered_r","diff_u_z","sum_u_nDetect","std_ugriz","sum_dered_r_dered_i","sum_dered_z_nObserve","mul_dered_u_dered_g","sum_r_dered_g","mul_u_dered_r","max-min_ugriz","diff_redshift_dered_r"]
tst=tst[tst.columns.difference(PI30_7)]
PI30_8=["div_i_redshift","sum_g_redshift","div_r_dered_r","mul_z_dered_g","sum_redshift_nDetect","diff_u_dered_u","diff_i_dered_i","diff_i_dered_u","sum_g_dered_i","diff_u_dered_i","div_g_redshift","sum_u_i","div_z_dered_r","sum_i_dered_g","sum_redshift_dered_r","sum_z_nObserve","sum_dered_u_nDetect","diff_redshift_nObserve","sum_dered_u_dered_z","div_g_dered_z","sum_z_nDetect","diff_redshift_dered_z","div_dered_g_dered_i","div_dered_u_nObserve","mul_u_dered_u","sum_r_z","r","max-min_dered","mul_dered_g_dered_i","sum_g_nObserve"]
tst=tst[tst.columns.difference(PI30_8)]

78it [00:00, 87.29it/s] 
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [29]:
tst.shape

(80000, 124)

In [30]:
tst_data = tst.copy()

In [31]:
preds=voting_clf.predict(tst_data)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.1s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    1.2s finished


In [32]:
sub = pd.read_csv('C:/python/DACON_DATA/sample_submission.csv')
sub['class'] = preds.reshape(-1,1)
sub.to_csv('submission_file/submission_LGB_RF_XG_ensemble09281930.csv')

In [33]:
pd.Series(preds).value_counts()

2    40823
0    29981
1     9196
dtype: int64

## 신경망 1,2 분류

In [None]:
ftr=trn[trn["class"]!=0].drop("class",axis=1)
target=trn[trn["class"]!=0]['class']

In [24]:
corr = trn.loc[trn['class']!=0,:].corr()

cor_list=[]

for i in range(0,trn.loc[trn['class']!=0,:].shape[1]):
    for j in range(0,trn.loc[trn['class']!=0,:].shape[1]):
        if abs(corr.iloc[i,j])>=0.9 and corr.index[i]!=corr.columns[j]:
            cor_list += [corr.columns[j]]
            cor_list += [corr.index[i]]


groupby=cor_list
cor_result=dict()
for ip in tqdm(cor_list):
    cor_result[ip]=cor_list.count(ip)

    
final_corr= []
for i in range(0,len(cor_result)):
    if list(cor_result.values())[i] > 20 :
        final_corr.append(list(cor_result.keys())[i])

print(cor_result)
print()
print(final_corr)



cols = list(trn.columns[~trn.columns.isin(list(cor_result.keys()))])

for i in ['diff_u_dered_g','P1','diff_u_dered_r','airmass_i','skew','kurtosis','diff_g_i','sum_dered_u_dered_i','sum_r_dered_u','diff_redshift_dered_u','sum_z_redshift', 'mul_redshift_dered_i','mul_redshift_nObserve']:
    cols.append(i)

print(cols)

100%|███████████████████████████████████████████████████████████████████████████| 1060/1060 [00:00<00:00, 55940.24it/s]


{'diff_u_dered_g': 6, 'P1': 14, 'diff_u_dered_r': 12, 'diff_u_g': 6, 'diff_u_i': 14, 'diff_u_r': 14, 'div_u_g': 6, 'div_u_r': 14, 'airmass_i': 8, 'airmass_g': 8, 'airmass_r': 8, 'airmass_u': 8, 'airmass_z': 8, 'airmass_skew': 2, 'airmass_kurtosis': 2, 'kurtosis': 2, 'dered_kurtosis': 2, 'skew': 2, 'dered_skew': 2, 'diff_g_dered_i': 4, 'diff_dered_g_dered_i': 4, 'diff_g_i': 4, 'diff_g_r': 6, 'diff_dered_g_dered_r': 6, 'diff_r_dered_g': 2, 'div_dered_g_dered_r': 2, 'diff_i_dered_z': 4, 'diff_dered_i_dered_z': 4, 'div_dered_i_dered_z': 2, 'diff_r_i': 6, 'diff_dered_r_dered_i': 6, 'div_dered_r_dered_i': 6, 'div_r_i': 6, 'diff_r_dered_z': 4, 'diff_dered_r_dered_z': 4, 'div_dered_r_dered_z': 2, 'diff_g_dered_u': 2, 'diff_dered_u_dered_g': 4, 'div_dered_u_dered_g': 2, 'diff_dered_u_dered_r': 16, 'diff_dered_u_dered_i': 12, 'diff_r_dered_u': 14, 'div_dered_u_dered_i': 14, 'div_dered_u_dered_z': 8, 'div_i_dered_u': 8, 'div_dered_u_dered_r': 8, 'div_r_dered_u': 6, 'diff_g_dered_r': 4, 'div_g_r':

In [209]:
trn_nn=trn[cols]

In [210]:
trn_nn=trn_nn[trn_nn["class"]!=0]

In [211]:
X=trn_nn.drop('class', axis=1)
y=trn_nn['class']-1

In [212]:
X.shape

(199988, 41)

In [213]:
X_train, X_val, y_train, y_val =  train_test_split(X,y,test_size=0.3,random_state=0)

In [214]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [215]:
scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [216]:
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_val)

In [217]:
from sklearn.neural_network import MLPClassifier

In [222]:
mlp = MLPClassifier(hidden_layer_sizes=(60,50),max_iter=100000,learning_rate_init=0.001,activation='relu')

In [223]:
mlp.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(60, 50), learning_rate='constant',
              learning_rate_init=0.001, max_iter=100000, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [None]:
prob=mlp.predict_proba(X_val)
prob

In [None]:
predictions = mlp.predict(X_val)
predictions_train = mlp.predict(X_train)
print('***valid_1***')
print(get_clf_eval(y_val,predictions))
print()
print('***train_1***')
print(get_clf_eval(y_train,predictions_train))