In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_percentage_error
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from tqdm import tqdm
from lightgbm import early_stopping

In [2]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
sample = pd.read_csv('./data/sample_submission.csv')

In [3]:
nan_columns = train.columns[train.isna().any()].tolist() # NaN값을 가진 컬럼 확인
print(nan_columns)

['num_sold']


In [4]:
train = train.dropna(subset=['num_sold']) # num_sold 열에서 결측값이 있는 행만 선택적으로 제거
nan_columns = train.columns[train.isna().any()].tolist()
print(nan_columns)

[]


In [5]:
# id 컬럼 삭제
train.drop('id', axis=1)
test.drop('id', axis=1)

Unnamed: 0,date,country,store,product
0,2017-01-01,Canada,Discount Stickers,Holographic Goose
1,2017-01-01,Canada,Discount Stickers,Kaggle
2,2017-01-01,Canada,Discount Stickers,Kaggle Tiers
3,2017-01-01,Canada,Discount Stickers,Kerneler
4,2017-01-01,Canada,Discount Stickers,Kerneler Dark Mode
...,...,...,...,...
98545,2019-12-31,Singapore,Premium Sticker Mart,Holographic Goose
98546,2019-12-31,Singapore,Premium Sticker Mart,Kaggle
98547,2019-12-31,Singapore,Premium Sticker Mart,Kaggle Tiers
98548,2019-12-31,Singapore,Premium Sticker Mart,Kerneler


In [6]:
def date(df):
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year
    df['day'] = df['date'].dt.day
    df['month'] = df['date'].dt.month
    df['month_name'] = df['date'].dt.month_name()
    df['day_of_week'] = df['date'].dt.day_name()
    df['week'] = df['date'].dt.isocalendar().week
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12) 
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    df['day_sin'] = np.sin(2 * np.pi * df['day'] / 31)  
    df['day_cos'] = np.cos(2 * np.pi * df['day'] / 31)
    df['group'] = (df['year'] - 2020) * 48 + df['month'] * 4 + df['day'] // 7
    
    df.drop('date', axis=1, inplace=True)

    df['cos_year'] = np.cos(df['year'] * (2 * np.pi) / 100)
    df['sin_year'] = np.sin(df['year'] * (2 * np.pi) / 100)
    df['year_lag_1'] = df['year'].shift(1)
    df['year_diff'] = df['year'] - df['year_lag_1']

    return df


def ohe_transform(train, test, cat_cols):
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore') # 예를 들어, 열 color에 값 ['red', 'blue', 'green']이 있다면, red, blue, green 각각에 해당하는 새로운 열이 만들어집니다.

    train_ohe = pd.DataFrame(ohe.fit_transform(train[cat_cols]), 
                                columns=ohe.get_feature_names_out(cat_cols), 
                                index=train.index)
    
    test_ohe = pd.DataFrame(ohe.transform(test[cat_cols]), 
                            columns=ohe.get_feature_names_out(cat_cols), 
                            index=test.index)
    
    print(f"train_ohe: {len(train_ohe.columns)}")
    print(f"test_ohe: {len(test_ohe.columns)}")

    train = train.drop(columns=cat_cols).reset_index(drop=True)
    test = test.drop(columns=cat_cols).reset_index(drop=True)

    train = pd.concat([train, train_ohe.reset_index(drop=True)], axis=1)
    test = pd.concat([test, test_ohe.reset_index(drop=True)], axis=1)

    return train, test

In [7]:
train = date(df=train)
test = date(df=test)

cat_c = ['country', 'store', 'product', 'month_name', 'day_of_week']
ohe_cols = {'cat_c': cat_c}

cat_c = ohe_cols.get('cat_c', [])

train, test = ohe_transform(
    train=train,
    test=test,
    cat_cols=cat_c
)

train_ohe: 33
test_ohe: 33


In [8]:
X_train_ = train.drop('num_sold', axis=1) # num_sold 컬럼을 제외한 나머지를 반환
y_train_ = train['num_sold']
X_test = test

In [9]:
y_log = True
n_splits = 5
SEED = 114514
e_stop = 200
early_stop = True
gpu = False
optuna = False
g_col='group'

In [10]:
def mape(y_true, y_pred):
    return mean_absolute_percentage_error(y_true, y_pred)

In [11]:
import optuna
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import GroupKFold
from tqdm import tqdm

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'subsample': trial.suggest_float('subsample', 0.3, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'random_state': SEED,
        'device': 'gpu' if gpu else 'cpu'
    }

    kfold = GroupKFold(n_splits=n_splits)

    oof_scores = []
    reported_steps = set()

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train_, y_train_, groups=X_train_[g_col])):
        X_train, X_val = X_train_.iloc[train_idx], X_train_.iloc[val_idx]
        y_train, y_val = y_train_.iloc[train_idx], y_train_.iloc[val_idx]

        if y_log:
            y_train = np.log1p(y_train)
            y_val = np.log1p(y_val)

        model = lgb.LGBMRegressor(**params, verbose=-1)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric="l2",
        )

        y_val_pred = model.predict(X_val)

        if y_log:
            y_val_pred = np.expm1(y_val_pred)
            y_val = np.expm1(y_val)

        for step in range(len(y_val_pred)):
            if step not in reported_steps:
                trial.report(mape(y_val, y_val_pred), step)
                reported_steps.add(step)

        oof_scores.append(mape(y_val, y_val_pred))

    mean_oof_score = np.mean(oof_scores)
    return mean_oof_score

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5, show_progress_bar=True)

# 최적의 하이퍼파라미터 출력
print("Best trial:")
best_trial = study.best_trial
print(f"  Value: {best_trial.value}")
print("  Params:")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")

# 최적의 파라미터로 모델 학습
def train_best_model(params):
    kfold = GroupKFold(n_splits=n_splits)

    train_scores = []
    oof_scores = []
    all_models = []
    oof_predictions = np.zeros(len(y_train_))
    test_preds = (
        np.zeros((len(X_test), n_splits))
    )

    for fold, (train_idx, val_idx) in enumerate(tqdm(kfold.split(X_train_, y_train_, groups=X_train_[g_col]), desc="Training Folds", total=n_splits)):
        X_train, X_val = X_train_.iloc[train_idx], X_train_.iloc[val_idx]
        y_train, y_val = y_train_.iloc[train_idx], y_train_.iloc[val_idx]

        if y_log:
            y_train = np.log1p(y_train)
            y_val = np.log1p(y_val)

        callbacks = [early_stopping(stopping_rounds=e_stop, verbose=False)] if early_stop else None

        device = 'gpu' if gpu else 'cpu'

        model = lgb.LGBMRegressor(**params, random_state=SEED, verbose=-1, device=device)
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric=None, callbacks=callbacks)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        if y_log:
            y_train_pred = np.expm1(y_train_pred)
            y_val_pred = np.expm1(y_val_pred)
            y_train = np.expm1(y_train)
            y_val = np.expm1(y_val)

        oof_predictions[val_idx] = y_val_pred

        train_scores.append(mape(y_train, y_train_pred))
        oof_scores.append(mape(y_val, y_val_pred))

        test_preds[:, fold] = model.predict(X_test)

        print(f"Fold {fold + 1} - Train MAPE: {train_scores[-1]:.4f}, OOF MAPE: {oof_scores[-1]:.4f}")
        all_models.append(model)

    mean_train_scores = f"{np.mean(train_scores):.4f}"
    mean_off_scores = f"{np.mean(oof_scores):.4f}"

    print(f"Overall Train MAPE: {mean_train_scores}")
    print(f"Overall OOF MAPE: {mean_off_scores}")

    mean_test_preds = test_preds.mean(axis=1) if X_test is not None else None

    if y_log:
        mean_test_preds = np.expm1(mean_test_preds)

    return {
        "oof_predictions": oof_predictions,
        "mean_test_preds": mean_test_preds,
        "all_models": all_models,
        "mean_train_scores": mean_train_scores,
        "mean_off_scores": mean_off_scores
    }

best_params = study.best_trial.params
final_results = train_best_model(best_params)

[I 2025-01-15 01:38:26,303] A new study created in memory with name: no-name-1bf0c359-1ff6-49a8-9b50-ed2e2dbe5a0f


  0%|          | 0/5 [00:00<?, ?it/s]

[I 2025-01-15 01:38:43,569] Trial 0 finished with value: 0.04813832373297207 and parameters: {'n_estimators': 782, 'max_depth': 3, 'colsample_bytree': 0.3104180922503826, 'subsample': 0.42728404663044894, 'learning_rate': 0.16840381276654476, 'min_child_samples': 84}. Best is trial 0 with value: 0.04813832373297207.
[I 2025-01-15 01:39:03,140] Trial 1 finished with value: 0.05086232418784435 and parameters: {'n_estimators': 652, 'max_depth': 7, 'colsample_bytree': 0.6901407323157852, 'subsample': 0.8597853597755878, 'learning_rate': 0.021448427417877846, 'min_child_samples': 96}. Best is trial 0 with value: 0.04813832373297207.
[I 2025-01-15 01:39:22,908] Trial 2 finished with value: 0.046177525762663554 and parameters: {'n_estimators': 729, 'max_depth': 9, 'colsample_bytree': 0.3834726548784321, 'subsample': 0.36812219233164606, 'learning_rate': 0.14973910338099403, 'min_child_samples': 64}. Best is trial 2 with value: 0.046177525762663554.
[I 2025-01-15 01:39:42,995] Trial 3 finished

Training Folds:  20%|██        | 1/5 [00:01<00:05,  1.47s/it]

Fold 1 - Train MAPE: 0.0409, OOF MAPE: 0.0454


Training Folds:  40%|████      | 2/5 [00:02<00:04,  1.48s/it]

Fold 2 - Train MAPE: 0.0409, OOF MAPE: 0.0467


Training Folds:  60%|██████    | 3/5 [00:04<00:02,  1.49s/it]

Fold 3 - Train MAPE: 0.0407, OOF MAPE: 0.0466


Training Folds:  80%|████████  | 4/5 [00:05<00:01,  1.49s/it]

Fold 4 - Train MAPE: 0.0407, OOF MAPE: 0.0460


Training Folds: 100%|██████████| 5/5 [00:07<00:00,  1.49s/it]

Fold 5 - Train MAPE: 0.0407, OOF MAPE: 0.0462
Overall Train MAPE: 0.0408
Overall OOF MAPE: 0.0462





In [13]:
sample["num_sold"] = final_results['mean_test_preds']
sample.to_csv(f"submission_{final_results['mean_off_scores']}.csv", index=False)