In [68]:
import os
import numpy as np
import pandas as pd
import optuna
import category_encoders as ce
import joblib

from glob import glob
from warnings import simplefilter
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, LabelEncoder

In [69]:
# Suppress pandas PerformanceWarning and FutureWarning messages for the rest of the session.
for _ignored_category in (pd.errors.PerformanceWarning, FutureWarning):
    simplefilter(action="ignore", category=_ignored_category)

In [70]:
# All data and result paths are built relative to the kernel's working directory.
PATH = os.getcwd()

# Load the training split, then discard its ID column.
train = pd.read_csv(f'{PATH}/data/train.csv')
train = train.drop(columns=['ID'], axis=1)

# Same treatment for the test split.
test = pd.read_csv(f'{PATH}/data/test.csv')
test = test.drop(columns=['ID'], axis=1)

In [71]:
# Mapping from the Korean column headers in the CSV files to English identifiers.
rename_columns = {
    "제조사": "Manufacturer",
    "모델": "Model",
    "차량상태": "VehicleCondition",
    "배터리용량": "BatteryCapacity",
    "구동방식": "DriveType",
    "주행거리(km)": "MileageKm",
    "보증기간(년)": "WarrantyYears",
    "사고이력": "AccidentHistory",
    "연식(년)": "Year",
    "가격(백만원)": "Price",
}

# Apply the same renaming to both frames (train first, then test).
train, test = (
    frame.rename(columns=rename_columns)
    for frame in (train, test)
)

In [72]:
# Handle missing battery-capacity values.
# Group means are taken from the training frame only (before any filling) and
# split on whether the warranty is at least 7 years.
upper7 = train.loc[train['WarrantyYears'] >= 7, 'BatteryCapacity'].mean()
lower7 = train.loc[train['WarrantyYears'] < 7, 'BatteryCapacity'].mean()


def fill_battery(row):
    """Return the row's battery capacity, replacing the -1 sentinel with a group mean.

    Rows whose ``BatteryCapacity`` equals -1 receive ``upper7`` when
    ``WarrantyYears`` is at least 7 and ``lower7`` otherwise; every other row
    keeps its existing value.
    """
    capacity = row['BatteryCapacity']
    if capacity != -1:
        return capacity
    return upper7 if row['WarrantyYears'] >= 7 else lower7


for _frame in (train, test):
    # Every NaN in the frame (not only battery capacity) becomes the -1 sentinel ...
    _frame.fillna(-1, inplace=True)
    # ... and the sentinel in BatteryCapacity is then swapped for the training mean.
    _frame['BatteryCapacity'] = _frame.apply(fill_battery, axis=1)

In [73]:
# Bucket battery capacity into left-closed bins: [0, 60) -> "s", [60, 80) -> "m", [80, 100) -> "l".
# Values outside these bins are left as NaN by pd.cut.
bins = [0, 60, 80, 100]
labels = ["s", "m", "l"]

_cut_options = dict(bins=bins, labels=labels, right=False, include_lowest=True)
train["BatteryCapacity_cut"] = pd.cut(train["BatteryCapacity"], **_cut_options)
test["BatteryCapacity_cut"] = pd.cut(test["BatteryCapacity"], **_cut_options)

In [74]:
# Derived features, applied identically to both frames.
# NOTE: the Year / WarrantyYears lines overwrite their own input, so running
# this cell a second time flips the values back (2024 - (2024 - x) == x).
for _frame in (train, test):
    # Battery capacity per kilometre driven.
    _frame['f1'] = _frame['BatteryCapacity'] / _frame['MileageKm']

    # Re-express both year-valued columns as their distance from 2024.
    _frame['Year'] = 2024 - _frame['Year']
    _frame['WarrantyYears'] = 2024 - _frame['WarrantyYears']

In [75]:
# Columns that are never treated as features.
without_columns = ['ID', 'Price']

# Split the remaining columns by dtype: object/category -> categorical, the rest -> numerical.
categorical_columns = []
numerical_columns = []
for _col in train.columns:
    if _col in without_columns:
        continue
    if train[_col].dtype in ['object', 'category']:
        categorical_columns.append(_col)
    else:
        numerical_columns.append(_col)

# Add every pairwise product (squares included) of the numerical columns to both frames.
for _pos, _left in enumerate(numerical_columns):
    for _right in numerical_columns[_pos:]:
        _product_name = f'{_left}*{_right}'
        train[_product_name] = train[_left] * train[_right]
        test[_product_name] = test[_left] * test[_right]

# Refresh the numerical list so it also contains the product columns just created.
numerical_columns = [
    col
    for col in train.columns
    if col not in categorical_columns and (col not in without_columns)
]

In [76]:
# https://github.com/rapidsai/deeplearning/blob/main/RecSys2020Tutorial/03_3_TargetEncoding.ipynb
def target_encode(X_train, X_valid, X_test, encode_col, target_col, smooth=0.0, agg="mean"):
    """Add a smoothed target-encoding column computed from ``X_train`` to all three frames.

    For every distinct combination of ``encode_col`` values in ``X_train`` the
    ``agg`` statistic of ``target_col`` is computed and blended with the global
    statistic::

        (group_stat * group_count + global_stat * smooth) / (group_count + smooth)

    Combinations that do not occur in ``X_train`` (or whose statistic is NaN)
    receive the global statistic.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame the statistics are learned from; must contain ``encode_col`` and
        ``target_col``. It is not modified; an encoded copy is returned with
        its original index intact.
    X_valid, X_test : pandas.DataFrame
        Frames to encode; must contain ``encode_col``. The new column is added
        to these objects in place, and they are also returned.
    encode_col : list of str
        Column(s) that define the groups.
    target_col : str
        Name of the target column in ``X_train``.
    smooth : float, default 0.0
        Weight of the global statistic in the blend (0 disables smoothing).
    agg : {"mean", "median", "std", "min", "max"}, default "mean"
        Statistic to encode.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, X_valid, X_test)``, each with a new column named
        ``TE_<AGG>_<encode columns joined by "_">``.

    Raises
    ------
    ValueError
        If ``agg`` is not one of the supported statistics.

    Notes
    -----
    Training rows are encoded with statistics that include their own target
    value; no out-of-fold scheme is applied inside this function.
    """
    supported_aggs = ("mean", "median", "std", "min", "max")
    if agg not in supported_aggs:
        # Fail early with a clear message instead of hitting an unbound local later on.
        raise ValueError(f"agg must be one of {supported_aggs}, got {agg!r}")

    encoded_col = f'TE_{agg.upper()}_' + '_'.join(encode_col)

    # Global statistic: used for smoothing and as the fallback for unseen groups.
    mn = getattr(X_train[target_col], agg)()

    # Per-group statistic and group size, flattened to plain column names.
    df_tmp = X_train[encode_col + [target_col]].groupby(encode_col).agg([agg, 'count']).reset_index()
    df_tmp.columns = encode_col + [agg, 'count']
    df_tmp['TE_tmp'] = ((df_tmp[agg] * df_tmp['count']) + (mn * smooth)) / (df_tmp['count'] + smooth)
    lookup = df_tmp[encode_col + ['TE_tmp']]

    def _encode(frame):
        # A left merge keeps the row order of `frame`; returning raw values lets the
        # caller assign positionally, so the frame's own index is never touched.
        merged = frame[encode_col].merge(lookup, how='left', on=encode_col)
        return merged['TE_tmp'].fillna(mn).values

    # Work on a copy so the caller's training frame is not mutated and its index
    # stays aligned with any separately held target Series.
    X_train = X_train.copy()
    X_train[encoded_col] = _encode(X_train)

    X_valid[encoded_col] = _encode(X_valid)
    X_test[encoded_col] = _encode(X_test)

    return X_train, X_valid, X_test

In [77]:
def trainer(params, log=False):
    """Cross-validate a RandomForestRegressor on the notebook's ``train`` frame.

    Runs a shuffled 10-fold split over the module-level ``train`` frame. In
    each fold it target-encodes ``Manufacturer`` and ``Model``,
    polynomial-encodes the remaining categorical columns, standardises the
    columns listed in the module-level ``numerical_columns``, fits a random
    forest and predicts both the held-out fold and the module-level ``test``
    frame. All encoders and the scaler are fitted on the fold's training part
    only.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``RandomForestRegressor``
        (``random_state`` is set by this function).
    log : bool, default False
        When true, print the parameters and the per-fold / overall RMSE.

    Returns
    -------
    rmse : float
        Mean of the per-fold RMSE values.
    pred : numpy.ndarray
        Test predictions averaged over all folds.
    model : RandomForestRegressor
        The estimator fitted in the last fold only.
    """
    target = "Price"

    SEED = 909
    FOLDS = 10
    kf = KFold(n_splits=FOLDS, random_state=SEED, shuffle=True)

    # Column groups do not change between folds, so they are defined once.
    target_encoded_columns = ['Manufacturer', 'Model']
    category_encoded_columns = ['VehicleCondition', 'DriveType', 'AccidentHistory', 'BatteryCapacity_cut']
    scaler_columns = numerical_columns

    # Out-of-fold predictions, accumulated test predictions and accumulated fold RMSE.
    oof = np.zeros(len(train))
    pred = np.zeros(len(test))
    rmse = 0.0

    if log: print(f"parameters : {params}")

    for fold, (train_idx, valid_idx) in enumerate(kf.split(train)):

        if log: print(f"### Fold {fold+1} / {FOLDS} ###")

        # KFold yields positional indices, so rows are selected with .iloc; label-based
        # .loc would only pick the right rows while `train` has a default RangeIndex.
        X_train = train.iloc[train_idx].copy()
        y_train = train[target].iloc[train_idx]

        X_valid = train.iloc[valid_idx].copy()
        y_valid = train[target].iloc[valid_idx]

        X_test = test.copy()

        # Target encoding, learned from this fold's training rows.
        for column in target_encoded_columns:
            X_train, X_valid, X_test = target_encode(X_train, X_valid, X_test, encode_col=[column], target_col=target, smooth=0.0, agg="mean")

        # Drop the target and the raw columns that were just target-encoded.
        X_train = X_train.drop(columns=[target] + target_encoded_columns)
        X_valid = X_valid.drop(columns=[target] + target_encoded_columns)
        X_test = X_test.drop(columns=target_encoded_columns)

        # Polynomial contrast encoding for the remaining categorical columns.
        enc = ce.PolynomialEncoder(cols=category_encoded_columns)
        X_train = enc.fit_transform(X_train)
        X_valid = enc.transform(X_valid)
        X_test = enc.transform(X_test)

        # Standardise the numerical columns.
        ss = StandardScaler()
        X_train[scaler_columns] = ss.fit_transform(X_train[scaler_columns])
        X_valid[scaler_columns] = ss.transform(X_valid[scaler_columns])
        X_test[scaler_columns] = ss.transform(X_test[scaler_columns])

        model = RandomForestRegressor(
            **params,
            random_state=SEED,
        )
        model.fit(X_train, y_train)

        # Held-out predictions fill the OOF vector; test predictions are summed and averaged later.
        oof[valid_idx] = model.predict(X_valid)
        pred += model.predict(X_test)

        rmse_fold = np.sqrt(np.mean((y_valid - oof[valid_idx]) ** 2))
        if log: print(f"-> Fold {fold+1} RMSE = {rmse_fold:.5f}")
        rmse += rmse_fold

    pred /= FOLDS
    rmse /= FOLDS
    if log: print(f"-> Folds RMSE = {rmse:.5f}")

    return rmse, pred, model

In [None]:
def rf_objective(trial):
    """Optuna objective: mean cross-validated RMSE of the random forest for one trial.

    Only ``min_samples_leaf`` is searched; ``max_depth`` and
    ``min_samples_split`` are held fixed, with their candidate ranges kept in
    the trailing comments. The objective has to request at least one value
    from ``trial``; otherwise every trial evaluates the same configuration and
    ``best_params`` stays empty.
    """
    params = {
        # "n_estimators": trial.suggest_int("n_estimators", 10, 1000, step=10),
        "max_depth": 10, # trial.suggest_int("max_depth", 2, 16),
        "min_samples_split": 35, # trial.suggest_int("min_samples_split", 30, 64),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 30, 64),
    }

    rmse, _, _ = trainer(params)

    return rmse

# Seed the sampler so the sequence of suggested values is reproducible after a kernel restart.
rf_study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=909),
)
rf_study.optimize(rf_objective, n_trials=100)
rf_study.best_params

[I 2025-01-23 10:05:23,568] A new study created in memory with name: no-name-26adcf83-d007-4b39-8eda-2d1d51dcdee1


[I 2025-01-23 10:06:49,587] Trial 0 finished with value: 1.7816287077285782 and parameters: {'min_samples_leaf': 41}. Best is trial 0 with value: 1.7816287077285782.
[I 2025-01-23 10:08:17,146] Trial 1 finished with value: 1.7816287077285782 and parameters: {'min_samples_leaf': 41}. Best is trial 0 with value: 1.7816287077285782.
[I 2025-01-23 10:09:39,432] Trial 2 finished with value: 1.9757671855653682 and parameters: {'min_samples_leaf': 55}. Best is trial 0 with value: 1.7816287077285782.
[I 2025-01-23 10:10:59,461] Trial 3 finished with value: 1.804072331311776 and parameters: {'min_samples_leaf': 43}. Best is trial 0 with value: 1.7816287077285782.
[I 2025-01-23 10:12:22,537] Trial 4 finished with value: 1.8987207525155605 and parameters: {'min_samples_leaf': 50}. Best is trial 0 with value: 1.7816287077285782.
[I 2025-01-23 10:13:45,761] Trial 5 finished with value: 1.8603517989373408 and parameters: {'min_samples_leaf': 48}. Best is trial 0 with value: 1.7816287077285782.
[I 20

{'min_samples_leaf': 30}

In [None]:
# Plot the estimated importance of each searched hyperparameter for the study's objective.
optuna.visualization.plot_param_importances(rf_study)

In [62]:
# Plot the objective value of every trial together with the best value reached so far.
optuna.visualization.plot_optimization_history(rf_study)

In [63]:
# Show the hyperparameters of the best trial.
# NOTE(review): the recorded output lists parameters that the objective defined in this
# notebook does not suggest; presumably it comes from an earlier run, so re-execute to refresh.
rf_study.best_params

{'n_estimators': 430,
 'max_depth': 28,
 'min_samples_split': 41,
 'min_samples_leaf': 3}

In [84]:
# Hyperparameters for the final cross-validated run.
params = {
    'n_estimators': 1000,
    'max_depth': 32,
    'min_samples_split': 35,
    'min_samples_leaf': 35,
}

# `model` is the estimator fitted in the last CV fold; `pred` is the fold-averaged test prediction.
rmse, pred, model = trainer(params, log=True)

# TODO: keep only the best-scoring checkpoint by comparing against the score encoded in the
# existing file name before writing a new one.
# joblib.dump does not create missing directories, so make sure the target folder exists.
os.makedirs(f'{PATH}/result/rf', exist_ok=True)
joblib.dump(model, f'{PATH}/result/rf/rf_{rmse:.8f}.pkl')

parameters : {'n_estimators': 1000, 'max_depth': 32, 'min_samples_split': 35, 'min_samples_leaf': 35}
### Fold 1 / 10 ###
-> Fold 1 RMSE = 1.63062
### Fold 2 / 10 ###
-> Fold 2 RMSE = 1.70193
### Fold 3 / 10 ###
-> Fold 3 RMSE = 1.64607
### Fold 4 / 10 ###
-> Fold 4 RMSE = 1.75216
### Fold 5 / 10 ###
-> Fold 5 RMSE = 1.49404
### Fold 6 / 10 ###
-> Fold 6 RMSE = 1.79342
### Fold 7 / 10 ###
-> Fold 7 RMSE = 1.86196
### Fold 8 / 10 ###
-> Fold 8 RMSE = 1.56610
### Fold 9 / 10 ###
-> Fold 9 RMSE = 1.74983
### Fold 10 / 10 ###
-> Fold 10 RMSE = 1.62082
-> Folds RMSE = 1.68170


['c:\\Users\\kgw\\Desktop\\kgw\\projects\\dacon\\elec_car_price_forecasting/result/rf/rf_1.68169522.pkl']

In [66]:
# Start from the sample submission file and set its price column to the predictions in `pred`.
submit = (
    pd.read_csv(f'{PATH}/data/sample_submission.csv')
    .assign(**{'가격(백만원)': pred})
)
submit.head()

Unnamed: 0,ID,가격(백만원)
0,TEST_000,130.880257
1,TEST_001,80.156522
2,TEST_002,65.074535
3,TEST_003,34.613418
4,TEST_004,47.945115


In [67]:
# Encode the CV score in the file name with the same 8-decimal precision used for the saved
# model file; nothing is written after "LB-" here.
save_file_name = f'{PATH}/result/rf/rf_CV-{rmse:.8f}_LB-.csv'

# to_csv does not create missing directories, so ensure the target folder exists first.
os.makedirs(os.path.dirname(save_file_name), exist_ok=True)
submit.to_csv(save_file_name, index=False)
print(save_file_name)

c:\Users\kgw\Desktop\kgw\projects\dacon\elec_car_price_forecasting/result/rf/rf_CV-1.3358464328254693_LB-.csv
