In [13]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from pycaret.classification import *
from sklearn.metrics import f1_score

In [14]:
import platform
platform.platform()

'Windows-10-10.0.22621-SP0'

In [15]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

In [16]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

In [17]:
train_x = train_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Quality'])

test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

In [18]:
# qualitative to quantitative
qual_col = ['LINE', 'PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_x[i])
    train_x[i] = le.transform(train_x[i])
    
    for label in np.unique(test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test_x[i] = le.transform(test_x[i]) 
print('Done.')

Done.


우선 가장 기본적인 Feature selection을 진행했다.(결측치가 많은 column drop)
1. Correlation 분석: feature의 수가 워낙 많기에 correlation을 일일이 따지기는 무리라고 판단했다.
2. na값이 많은 feature에 대해서(na값이 80퍼센트 이상인 column) drop을 시킨다
3. 이 때, train set에서 떨궈낸 column들을 test에도 동일하게 적용시켰다

In [19]:
col = train_x.columns
col = col[3:]
for i in col:
    if train_x.loc[:, i].isna().sum() > len(train_x) * 0.8:
        train_x.drop(i, axis = 1, inplace = True)

In [22]:
train_x

Unnamed: 0,Y_Class,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,...,X_2856,X_2857,X_2858,X_2859,X_2860,X_2861,X_2862,X_2863,X_2864,X_2865
0,1,2,0,,,,,,,,...,181.6,139.6,131.646667,115.4,209.0,197.286667,189.0,383.0,368.296296,353.0
1,2,3,0,,,,,,,,...,177.1,145.5,128.748276,119.7,198.0,193.296552,185.6,383.0,367.735849,353.0
2,1,2,0,,,,,,,,...,183.6,128.0,115.365517,104.0,193.4,179.820690,165.5,383.0,367.320755,353.0
3,2,3,0,,,,,,,,...,179.5,126.2,112.082759,94.5,190.3,181.920690,165.8,384.0,369.188679,353.0
4,1,2,0,,,,,,,,...,188.0,152.1,138.066667,109.7,208.6,196.393333,182.6,383.0,367.351852,352.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,1,5,2,2.0,95.0,0.0,45.0,10.0,0.0,50.0,...,,,,,,,,,,
594,0,2,0,,,,,,,,...,168.2,185.1,159.527586,144.2,198.3,180.810345,168.7,384.0,369.811321,353.0
595,0,2,0,,,,,,,,...,153.4,187.6,178.248276,159.6,186.5,176.486207,156.6,383.0,367.018868,352.0
596,1,4,1,40.0,94.0,0.0,45.0,11.0,0.0,45.0,...,,,,,,,,,,


In [20]:
test_x = test_x.loc[:, train_x.columns[1:]]
test_x

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2856,X_2857,X_2858,X_2859,X_2860,X_2861,X_2862,X_2863,X_2864,X_2865
0,5,2,2.0,94.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,
1,4,2,2.0,93.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
2,4,2,2.0,95.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
3,0,0,,,,,,,,,...,159.4,201.3,179.739286,149.9,198.0,191.450000,183.8,467.0,444.192308,423.0
4,1,0,,,,,,,,,...,175.3,227.9,190.664286,162.4,210.2,193.082143,179.7,465.0,443.211539,423.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,5,2,2.0,91.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,
306,4,2,2.0,96.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
307,5,2,2.0,91.0,0.0,45.0,10.0,0.0,50.0,10.0,...,,,,,,,,,,
308,5,2,2.0,95.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,


In [23]:
train_x = train_x.fillna(0)
test_x = test_x.fillna(0)

밑의 코드는 이산형 변수와 연속형 변수를 분리한 과정이다.
1. feature의 unique 값이 12개 이하이면 이산형 변수로 판단했다.
2. 그 외에는 연속형(numeric) 변수로 판단했다.

In [12]:
idx = train_x.columns[3:]
feat_discrete = []
feat_numeric = []
for i in idx:
    check = train_x.loc[:, i]
    if len(np.unique(check)) + 1 <= 13:
        feat_discrete.append(i)
    else:
        feat_numeric.append(i)

In [13]:
len(feat_discrete)

1053

결측치 처리 방식으로는 0으로 대체 / KNNImputer / IterativeImputer 또는 mlp을 이용한 회귀 분석 / SimpleImputer를 진행해봤다. 그 중 가장 좋은 결과를 낸 simpleimputer를 최종적으로 이용하였다.
1. 이산형 변수에 대해서는 strategy로 최빈값을 연속형 변수에는 default인 mean으로 채워 넣었다.
2. 연속형 변수에 대해서는 mean으로 채워 넣었다.
3. SimpleImputer를 우선 trainset으로 학습을 시키고 동일하게 testset에도 적용시켰다.

In [14]:
from sklearn.impute import SimpleImputer
imp_continuous = SimpleImputer()
imp_discrete = SimpleImputer(strategy = "most_frequent")
target = train_x["Y_Class"]
train_x.drop("Y_Class", axis = 1, inplace = True)
imp_continuous.fit(train_x.loc[:, feat_numeric])
imp_discrete.fit(train_x.loc[:, feat_discrete])
imputed_df_train_num = pd.DataFrame(imp_continuous.transform(train_x.loc[:, feat_numeric]), columns = feat_numeric)
imputed_df_train_dis = pd.DataFrame(imp_discrete.transform(train_x.loc[:, feat_discrete]), columns = feat_discrete)
imputed_df_train = pd.concat([imputed_df_train_num, imputed_df_train_dis], axis = 1)
imputed_df_test_num = pd.DataFrame(imp_continuous.transform(test_x.loc[:, feat_numeric]), columns = feat_numeric)
imputed_df_test_dis = pd.DataFrame(imp_discrete.transform(test_x.loc[:, feat_discrete]), columns = feat_discrete)
imputed_df_test = pd.concat([imputed_df_test_num, imputed_df_test_dis], axis = 1)

In [15]:
add_cols = train_x.iloc[:, 0:2]
add_col = test_x.iloc[:, 0:2]
imputed_df_train = pd.concat([add_cols, imputed_df_train], axis = 1)
imputed_df_test = pd.concat([add_col, imputed_df_test], axis = 1)

In [16]:
train = imputed_df_train
train["Y_Class"] = target
test = imputed_df_test

In [17]:
idx = train.columns
col = []
# 하나의 값만 들어있는 column 들은 나가리 시킨다.
for i in idx:
    check = train.loc[:, i]
    if len(np.unique(check)) == 1:
        col.append(i)

In [18]:
train.drop(col, axis = 1, inplace = True)
test.drop(col, axis = 1, inplace = True)

### automl
1. 모델 튜닝 등을 진행해 보면서 가장 best인 3개의 모델을 확인한다.
2. StratifiedKFold는 10회, random 값은 42로 진행했다.
3. 최종적으로는 f1 기준으로 3개의 가장 좋은 모델을 blend한다.

In [25]:
clf = setup(data = train_x, target = "Y_Class", session_id = 42, fold = 10, use_gpu = True)
best_model = compare_models(sort = "F1", n_select = 3)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,Y_Class
2,Target type,Multiclass
3,Original data shape,"(598, 2186)"
4,Transformed data shape,"(598, 2186)"
5,Transformed train set shape,"(418, 2186)"
6,Transformed test set shape,"(180, 2186)"
7,Numeric features,2185
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.7657,0.7816,0.7657,0.7646,0.7343,0.4233,0.4554,76.333
gbc,Gradient Boosting Classifier,0.756,0.7649,0.756,0.7514,0.7305,0.4221,0.4429,7.905
lightgbm,Light Gradient Boosting Machine,0.7536,0.7838,0.7536,0.7546,0.7268,0.4078,0.4399,5.21
xgboost,Extreme Gradient Boosting,0.7416,0.7741,0.7416,0.7344,0.7101,0.3711,0.4055,4.397
rf,Random Forest Classifier,0.744,0.7759,0.744,0.7244,0.701,0.3569,0.3868,0.874
et,Extra Trees Classifier,0.7441,0.7826,0.7441,0.7221,0.6983,0.3467,0.3827,0.569
ada,Ada Boost Classifier,0.6988,0.6184,0.6988,0.6452,0.6503,0.2524,0.2758,0.711
lr,Logistic Regression,0.6988,0.6995,0.6988,0.6411,0.6456,0.2407,0.2741,2.783
dt,Decision Tree Classifier,0.6413,0.6419,0.6413,0.6573,0.6427,0.2723,0.275,0.203
ridge,Ridge Classifier,0.706,0.0,0.706,0.6448,0.6421,0.2291,0.2738,0.159


Processing:   0%|          | 0/71 [00:00<?, ?it/s]

In [26]:
# top3 모델은 다음과 같다. GradientBoostingClassifier(f1기준: 0.7492), catboost(f1기준: 0.7363), lightgbm(f1기준: 0.7298)
# 추가적으로 tune_model을 이용하여 튜닝을 진행하였으나 기본 basemodel을 ensemble 시킨 것이 가장 좋은 결과를 나타냈다.
best_model

[<catboost.core.CatBoostClassifier at 0x1e9807ffe20>,
 GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                            learning_rate=0.1, loss='deviance', max_depth=3,
                            max_features=None, max_leaf_nodes=None,
                            min_impurity_decrease=0.0, min_samples_leaf=1,
                            min_samples_split=2, min_weight_fraction_leaf=0.0,
                            n_estimators=100, n_iter_no_change=None,
                            random_state=42, subsample=1.0, tol=0.0001,
                            validation_fraction=0.1, verbose=0,
                            warm_start=False),
 LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
                device='gpu', importance_type='split', learning_rate=0.1,
                max_depth=-1, min_child_samples=20, min_child_weight=0.001,
                min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31,
      

In [27]:
# model blending(votingclassifier 형성)
ensembled = blend_models(estimator_list = best_model, fold = 10, method = "soft")
pred_holdout = predict_model(ensembled)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7143,0.802,0.7143,0.7052,0.6997,0.3271,0.3361
1,0.7619,0.8342,0.7619,0.7275,0.7176,0.3842,0.4276
2,0.6667,0.648,0.6667,0.6337,0.6284,0.166,0.179
3,0.8095,0.8327,0.8095,0.8268,0.7919,0.5496,0.5679
4,0.7619,0.851,0.7619,0.7794,0.7316,0.4444,0.4747
5,0.7143,0.6956,0.7143,0.7237,0.6566,0.25,0.3203
6,0.8333,0.7973,0.8333,0.8667,0.8108,0.6005,0.6572
7,0.8095,0.8198,0.8095,0.8519,0.7738,0.5307,0.6052
8,0.8049,0.7601,0.8049,0.8236,0.8009,0.5724,0.579
9,0.8049,0.8384,0.8049,0.8033,0.8001,0.574,0.5781


Processing:   0%|          | 0/6 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.7778,0.7496,0.7778,0.7705,0.7529,0.4536,0.4852


In [29]:
# 최종 모델 형성
final_model = finalize_model(ensembled)
preds = predict_model(final_model, data = test_x)

In [30]:
pred = preds["prediction_label"]

In [31]:
np.unique(pred, return_counts = True)

(array([0, 1, 2]), array([ 37, 263,  10], dtype=int64))

In [32]:
submit = pd.read_csv('./sample_submission.csv')

In [33]:
submit['Y_Class'] = pred

In [34]:
submit.to_csv('./baseline_submission.csv', index=False)

In [35]:
np.unique(submit["Y_Class"], return_counts = True)

(array([0, 1, 2]), array([ 37, 263,  10], dtype=int64))

대회를 진행하면서 얻은 결론
이번 대회에서 최종적으로는 일반화를 못 시켰다는 결론이 나왔다. 다음과 같은 교훈을 얻었다.
1. 데이터 전처리의 중요성. 결측치를 내 임의대로 처리하는 경우 항상 주의해야 한다.
2. automl로는 우선 간단하게 어떤 모델이 괜찮게 나왔는지 정도만 대략적으로 파악한다.
3. 일반화를 잘 시킨다.
4. 추가 사항: 데이콘이나 캐글을 진행하면서 항상 고려할 점은 Public 점수로부터 내 cv가 신빙성있는 지 혹은 내 cv로부터 Public 점수가 신빙성이 있는지 확인하는 정도로 사용한다는 점이다. 논리적인 가설을 바탕으로 구축된 cv가 아닌 오직 Public만 높이기 위한 cv는 이번 대회처럼 신경을 썼다고 해도 오버피팅이 나올 수 있다. 내 논리적 가설과 public을 통해 적절한 cv를 구축했는지를 항상 체크한다.