In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Number of ShuffleSplit iterations used for cross-validation.
NFOLDS = 10
# Random seed shared by the CV splitter and the LightGBM regressor.
SEED = 22

In [3]:
# !pip install catboost
# !pip install category_encoders
# !pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, ShuffleSplit
from sklearn.metrics import mean_squared_error
from sklearn import set_config
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import TransformedTargetRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

import lightgbm as lgb
from category_encoders import CatBoostEncoder
import optuna
from optuna.distributions import CategoricalDistribution, IntDistribution, FloatDistribution
from optuna.integration import OptunaSearchCV
from optuna.integration.lightgbm import LightGBMTunerCV
from scipy.sparse import csr_matrix
import re

#### Load data

In [5]:
# Folder on the mounted Drive that holds the competition files; kept in one
# place so the location only has to be edited once.
DATA_DIR = '/content/drive/MyDrive/감귤착과량공모전/data'
# Name of the target column inside train.csv.
TARGET_COL = '착과량(int)'

# Feature tables for the train and test splits.
X_train = pd.read_csv(f'{DATA_DIR}/X_train_set.csv')
X_test = pd.read_csv(f'{DATA_DIR}/X_test_set.csv')

# Only the target column is needed from train.csv, so skip parsing the rest.
y_train = pd.read_csv(f'{DATA_DIR}/train.csv', usecols=[TARGET_COL])[TARGET_COL]

# Features and target come from different files; fail early with a clear
# message if their rows do not line up.
if len(X_train) != len(y_train):
    raise ValueError(
        f"X_train has {len(X_train)} rows but y_train has {len(y_train)} rows"
    )

#### 수치형/범주형 피처 분리

In [6]:
# Every column of the training table is treated as a numeric feature.
numeric_features = X_train.columns.tolist()

# Select both tables with the same column list so their column order matches
# (the order matters!).
X_train = X_train.loc[:, numeric_features]
X_test = X_test.loc[:, numeric_features]

X_train.shape, X_test.shape

((2207, 564), (2208, 564))

####  파이프라인 구축

In [7]:
def NMAE_CV(clf, x, y):
    """Normalized mean absolute error of a fitted estimator.

    Takes ``(estimator, X, y)`` so it can be passed as a callable ``scoring``
    argument. The value is an error, so lower is better.

    Args:
        clf: Fitted estimator exposing ``predict``.
        x: Features to predict on.
        y: True target values.

    Returns:
        ``mean(|y - pred|) / mean(|y|)``.
    """
    predictions = clf.predict(x)
    absolute_error = np.abs(y - predictions)
    # Normalize by the mean target magnitude to make the error scale-free.
    return np.mean(absolute_error) / np.mean(np.abs(y))

In [8]:
def remove_outlier(X, q=0.04):
    """Clip every column of ``X`` to its own ``[q, 1 - q]`` quantile range.

    The quantiles are computed from the data passed in on each call; nothing
    is remembered between calls.

    Args:
        X: 2-D array-like of values; converted to a DataFrame internally.
        q: Lower quantile; the upper quantile is ``1 - q``.

    Returns:
        NumPy array with the same shape as ``X`` holding the clipped values.
    """
    def _clip_column(column):
        lower_bound = column.quantile(q)
        upper_bound = column.quantile(1 - q)
        return column.clip(lower_bound, upper_bound)

    frame = pd.DataFrame(X)
    return frame.apply(_clip_column, axis=0).values

# Numeric branch: clip extreme values per column, then rescale.
# NOTE(review): remove_outlier derives its clipping bounds from the array it
# receives, so every transform() call (e.g. on test data) clips with that
# data's own quantiles rather than bounds learned during fit().
outlier_step = FunctionTransformer(remove_outlier, kw_args={'q': 0.04})
numeric_transformer = Pipeline(
    steps=[("outlier", outlier_step), ("scaler", MaxAbsScaler())]
)

# All features go through the numeric branch.
column_transformer = ColumnTransformer(
    transformers=[("num", numeric_transformer, numeric_features)]
)

# Wrapper pipeline so preprocessing can also be run without the regressor.
preprocessor = Pipeline(steps=[("column", column_transformer)])

# The step names below are part of the parameter paths used with set_params
# (e.g. "preprocessor__column__num__scaler"), so they must stay as they are.
regressor = LGBMRegressor(objective="regression", metric="mae", random_state=SEED)
model = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", regressor)]
)

set_config(display="diagram")  # To view the text pipeline, change to display='text'.
model

#### LGBM Baseline 성능 확인

In [None]:
# Repeated random splits with a 30% hold-out; the evaluation size is chosen to
# match the Public LB size.
sscv = ShuffleSplit(n_splits=NFOLDS, test_size=.3, random_state=SEED)

# NMAE_CV takes (estimator, X, y), so it is passed directly as the scorer.
fold_scores = cross_val_score(model, X_train, y_train, scoring=NMAE_CV, cv=sscv)
scores = fold_scores.mean()

print("Score : ", scores)

# 'q'값과 Scaler 조합별 성능
## StandardScaler
- 'q' = 0.01 : 0.07911
- 'q' = 0.02 : 0.07917
- 'q' = 0.03 : 0.07921
- 'q' = 0.04 : 0.07926  
## MINMAX
- 'q' = 0.01 : 0.07910
- 'q' = 0.02 : **0.07887** => 채택
- 'q' = 0.03 : 0.07917
- 'q' = 0.04 : 0.07930  
## PowerTransformer
- 'q' = 0.01 : 0.07921
- 'q' = 0.02 : 0.07893
- 'q' = 0.03 : 0.07954
- 'q' = 0.04 : 0.07924  
## Normalizer
- 'q' = 0.01 : 0.1032  
## RobustScaler 
- 'q' = 0.01 : 0.0790
- 'q' = 0.02 : 0.07894
- 'q' = 0.03 : 0.07929
- 'q' = 0.04 : 0.07933  
## MaxAbsScaler 
- 'q' = 0.01 : 0.07960
- 'q' = 0.02 : 0.07897
- 'q' = 0.03 : 0.07921
- 'q' = 0.04 : 0.07921

- 'q'가 0.02일 때 모든 스케일러에서 성능이 좋았음
- MinMax Scaler가 가장 성능이 좋았음

#### LGBM 하이퍼파라미터 최적화
- 1단계) 전처리 파라미터 최적값 탐색: OptunaSearchCV를 통해 전처리 관련 최적의 파라미터를 찾음 (LGBM은 No Tuning)

### 실패이유
- optuna.create_study의 direction 파라미터를 'minimize'로 설정할 수 없음 (OptunaSearchCV는 direction이 'maximize'인 study만 허용)
- negative_mean_absolute_error를 사용해도 maximize가 뽑힘 -> 오류 해결 안 됨

In [None]:
# Disabled experiment: the whole cell is wrapped in a string literal, so none of
# the code below runs (the `%%time` magic inside it is inert as well).
# NOTE(review): the study is created with direction='maximize' while the scorer
# NMAE_CV returns an error value where lower is better -- confirm the intended
# optimisation direction before re-enabling this search.
"""
%%time

param_distributions = {
    "preprocessor__column__num__outlier__kw_args": CategoricalDistribution([{'q':0.01},{'q':0.02},{'q':0.03},{'q':0.04}]),
    "preprocessor__column__num__scaler": CategoricalDistribution([StandardScaler(),MinMaxScaler(), PowerTransformer(), Normalizer(), RobustScaler(), MaxAbsScaler()]),
}

optuna_search = OptunaSearchCV(model, 
                               param_distributions, 
                               cv=sscv, scoring=NMAE_CV, 
                               n_trials=1,
                               study=optuna.create_study(sampler=optuna.samplers.TPESampler(seed=100), direction= 'maximize'))

optuna.logging.set_verbosity(optuna.logging.WARNING)
optuna_search.fit(X_train, y_train)
"""

In [None]:
# Disabled result printout: `optuna_search` is only created by the Optuna
# search code, which is itself wrapped in a string literal, so this cell is
# kept inert the same way. The opening delimiter previously had a stray fourth
# quote character, which made the literal start with a `"`.
"""
print(f"Best params: {optuna_search.best_params_}")
print(optuna_search.best_score_)
"""

- 2단계) 전처리 최적화 수행: 최적의 파라미터 값으로 파이프라인 재설정

In [9]:
# 최적값으로 파이프라인 재설정
model.set_params(preprocessor__column__num__outlier__kw_args =  {'q': 0.02}, preprocessor__column__num__scaler = MinMaxScaler())

# 전처리 파이프라인만 수행
X_train = preprocessor.fit_transform(X_train, y_train)
X_test = preprocessor.transform(X_test)

In [10]:
# Wrap the preprocessed outputs back into DataFrames.
# NOTE(review): presumably these are plain arrays at this point, in which case
# the columns get default integer labels instead of the feature names.
X_train, X_test = pd.DataFrame(X_train), pd.DataFrame(X_test)

In [11]:
# Save the preprocessed tables to the working directory, without the row index.
for scaled_frame, csv_path in (
    (X_train, 'X_train_scale.csv'),
    (X_test, 'X_test_scale.csv'),
):
    scaled_frame.to_csv(csv_path, index=False)