In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import optuna
from my_encoders import TargetEncoder, FrequencyEncoder
from my_functions import build_feature_cols, build_preprocessor
from sklearn.model_selection import KFold, cross_val_score
import cloudpickle
from pathlib import Path

In [29]:
ship_method_name = [
    'AMAZON_FREIGHT', 'AMAZON_LTL', 'AMAZON_UPS_PARCEL', 'ESTES',
    'HOUR_LOOP_FEDEX_PARCEL', 'UBER_LTL', 'WWE_PARCEL'
]

# 如果所有檔案都在目前工作目錄
df_train_full = pd.concat(
    [pd.read_csv(f"{m}_train.csv") for m in ship_method_name],
    ignore_index=True
)

df_test_final = pd.concat(
    [pd.read_csv(f"{m}_test.csv") for m in ship_method_name],
    ignore_index=True
)

print(df_train_full.shape, df_test_final.shape)

(12844, 33) (1427, 33)


In [30]:
cols = {
    'AMAZON_FREIGHT' : ['log_weight', 'log_Hdis','to_state'],
    'AMAZON_LTL' : ['log_weight', 'log_Hdis', 'vendor_name','from_state','across_state'],
    'AMAZON_UPS_PARCEL' : ['log_weight', 'log_Hdis', 'log_volume'],
    'ESTES' : ['log_weight', 'to_state'],
    'HOUR_LOOP_FEDEX_PARCEL' : ['log_weight', 'log_Mdis', 'vendor_name', 'to_state'],
    'UBER_LTL': ['log_weight', 'log_Mdis', 'from_state'],
    'WWE_PARCEL' : ['log_weight', 'log_Hdis']
}

In [31]:
vars = {
    'AMAZON_FREIGHT' : {"C": 55.736076560251256,"epsilon": 0.11231822165041438,"gamma": 0.23417860093980628},
    'AMAZON_LTL' : {"C": 81.58311403532339,"epsilon": 0.39059242077416534,"gamma": 0.16158971758347687},
    'AMAZON_UPS_PARCEL' : {"C": 8.904533233040649,"epsilon": 0.1739978188492485,"gamma": 0.1547630356902939},
    'ESTES' : {"C": 755.6857857093129,"epsilon": 0.0228933541579891,"gamma": 0.028654501103860433},
    'HOUR_LOOP_FEDEX_PARCEL' : {"C": 17.86094781544563,"epsilon": 0.014085184521747276,"gamma": 0.03983996372116524},
    'UBER_LTL': {"C": 256.9555810753195,"epsilon": 0.3931105382575026,"gamma": 0.016519129924529242},
    'WWE_PARCEL' : {"C": 3.748705218525931,"epsilon": 0.0018305110274133042,"gamma": 0.040580180904780175}
}

In [32]:
target_col = 'log_cost'

def make_feature_config(cols_dict):
    cfg = {}
    for method, feats in cols_dict.items():
        num_cols = [f for f in feats if f.startswith('log_')]
        te_cols  = [f for f in feats if not f.startswith('log_')]
        fe_cols  = []

        cfg[method] = {
            'numeric_cols'      : num_cols,
            'target_encode_cols': te_cols,
            'freq_encode_cols'  : fe_cols,
            'feature_cols'      : build_feature_cols(num_cols, te_cols, fe_cols),
            'preprocessor'      : build_preprocessor(num_cols, te_cols, fe_cols)
        }
    return cfg

feature_config = make_feature_config(cols)

#### 建立模型並輸出指標結果

In [33]:
results_baseline = []

for method in df_train_full['ship_method'].unique():
    df_method = df_train_full[df_train_full['ship_method'] == method].copy()
    
    cfg = feature_config[method]
    feature_cols = cfg['feature_cols']
    preprocessor = cfg['preprocessor']
    numeric_cols = cfg['numeric_cols']
    target_encode_cols = cfg['target_encode_cols']
    freq_encode_cols = cfg['freq_encode_cols']

    X = df_method[feature_cols]
    y = df_method[target_col]

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # 使用預設參數
    svr = SVR(kernel='rbf')
    svr_model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('scaler_post', StandardScaler(with_mean=False)),
        ('regressor', svr)
    ])

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(svr_model, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')
    mse_cv = -scores.mean()

    svr_model.fit(X_train, y_train)

    y_val_pred = svr_model.predict(X_val)
    mse_val = mean_squared_error(y_val, y_val_pred)
    r2_val = r2_score(y_val, y_val_pred)
    
    df_test_method = df_test_final[df_test_final['ship_method'] == method].copy()
    if not df_test_method.empty:
        X_test = df_test_method[feature_cols]
        y_test = df_test_method[target_col]
        y_test_pred = svr_model.predict(X_test)
        mse_test = mean_squared_error(y_test, y_test_pred)
        r2_test = r2_score(y_test, y_test_pred)
    else:
        mse_test = r2_test = None

    results_baseline.append({
        'ship_method': method,
        'mse_cv': mse_cv,
        'mse_val': mse_val,
        'r2_val': r2_val,
        'mse_test': mse_test,
        'r2_test': r2_test,
        'best_params': {'C': 1.0, 'epsilon': 0.1, 'gamma': 'scale'}
    })

baseline_results= pd.DataFrame(results_baseline)
        

In [34]:
print("\nResults with Baseline:")
print(baseline_results.sort_values(by='mse_test'))


Results with Baseline:
              ship_method    mse_cv   mse_val    r2_val  mse_test   r2_test  \
2       AMAZON_UPS_PARCEL  0.075654  0.083776  0.925854  0.101571  0.901698   
1              AMAZON_LTL  0.190838  0.214385  0.843648  0.154047  0.606238   
4  HOUR_LOOP_FEDEX_PARCEL  0.201951  0.167015  0.850018  0.175911  0.799376   
6              WWE_PARCEL  0.129717  0.128389  0.866717  0.232287  0.715230   
5                UBER_LTL  0.343826  0.237869  0.832960  0.284515  0.821755   
3                   ESTES  0.217613  0.176954  0.904801  0.368477  0.748736   
0          AMAZON_FREIGHT  0.210428  0.146166  0.945446  1.048730  0.577314   

                                    best_params  
2  {'C': 1.0, 'epsilon': 0.1, 'gamma': 'scale'}  
1  {'C': 1.0, 'epsilon': 0.1, 'gamma': 'scale'}  
4  {'C': 1.0, 'epsilon': 0.1, 'gamma': 'scale'}  
6  {'C': 1.0, 'epsilon': 0.1, 'gamma': 'scale'}  
5  {'C': 1.0, 'epsilon': 0.1, 'gamma': 'scale'}  
3  {'C': 1.0, 'epsilon': 0.1, 'gamma': 'sca

In [None]:
results_optuna = []

for method in df_train_full['ship_method'].unique():
    df_method = df_train_full[df_train_full['ship_method'] == method].copy()
    
    cfg = feature_config[method]
    feature_cols = cfg['feature_cols']
    preprocessor = cfg['preprocessor']
    numeric_cols = cfg['numeric_cols']
    target_encode_cols = cfg['target_encode_cols']
    freq_encode_cols = cfg['freq_encode_cols']
    best_params = vars[method]  # 直接使用 vars 中的參數

    X = df_method[feature_cols]
    y = df_method[target_col]

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # ✨ Step 1: 編碼器 fit_transform（注意：TargetEncoder 需要 y）
    X_train_proc = preprocessor.fit_transform(X_train, y_train)
    X_val_proc   = preprocessor.transform(X_val)

    # ✨ Step 2: 標準化（這部分仍包含在模型中）
    scaler = StandardScaler(with_mean=False)
    X_train_scaled = scaler.fit_transform(X_train_proc)
    X_val_scaled   = scaler.transform(X_val_proc)

    # ✨ Step 3: 訓練 SVR 模型
    svr = SVR(kernel='rbf', **best_params)
    svr.fit(X_train_scaled, y_train)

    # ✨ Step 3: 組成純模型 pipeline（不包含 encoder）
    best_svr_model = Pipeline([
        ('scaler_post', scaler),
        ('regressor', svr)
    ])

    # ✨ Step 4: cross_val_score（使用轉換後資料）
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(best_svr_model, X_train_proc, y_train, cv=kf, scoring='neg_mean_squared_error')
    mse_cv = -scores.mean()

    # 儲存模型
    output_dir = Path("model_artifacts_01")
    output_dir.mkdir(parents=True, exist_ok=True)
    model_path = output_dir / f"{method}_model.pkl"
    with open(model_path, "wb") as f:
        cloudpickle.dump(best_svr_model, f)

    # ✨ Step 6: 驗證集預測（需手動 transform）
    y_val_pred = best_svr_model.predict(X_val_proc)
    mse_val = mean_squared_error(y_val, y_val_pred)
    r2_val = r2_score(y_val, y_val_pred) 

    # ✨ Step 7: 測試集評估
    df_test_method = df_test_final[df_test_final['ship_method'] == method].copy()
    if not df_test_method.empty:
        X_test = df_test_method[feature_cols]
        y_test = df_test_method[target_col]
        X_test_proc   = preprocessor.transform(X_test)
        X_test_scaled = scaler.transform(X_test_proc)
        y_test_pred   = best_svr_model.predict(X_test_scaled)

        mse_test = mean_squared_error(y_test, y_test_pred)
        r2_test  = r2_score(y_test, y_test_pred)
    else:
        mse_test = r2_test = None

    # 儲存經過編碼的資料
    encoded_df = pd.DataFrame(X_train_proc)
    encoded_df["log_cost"] = y_train.values  # 加上目標欄位（可選）
    encoded_df.to_csv(f"encoded_{method}.csv", index=False)

    results_optuna.append({
        'ship_method': method,
        'mse_cv': mse_cv,
        'mse_val': mse_val,
        'r2_val': r2_val,
        'mse_test': mse_test,
        'r2_test': r2_test,
        'best_params': best_params
    })


optuna_results= pd.DataFrame(results_optuna)

In [None]:
print("\nResults with Optuna optimization:")
print(optuna_results.sort_values(by='mse_test'))


Results with Optuna optimization:
              ship_method    mse_cv   mse_val    r2_val  mse_test   r2_test  \
2       AMAZON_UPS_PARCEL  0.074693  0.081329  0.928019  0.097274  0.905857   
1              AMAZON_LTL  0.161407  0.170339  0.875771  0.149549  0.617735   
4  HOUR_LOOP_FEDEX_PARCEL  0.162398  0.145677  0.869180  0.160495  0.816957   
6              WWE_PARCEL  0.123628  0.141278  0.853337  0.215929  0.735284   
5                UBER_LTL  0.326313  0.243017  0.829345  0.256660  0.839205   
3                   ESTES  0.202877  0.149603  0.919515  0.350116  0.761256   
0          AMAZON_FREIGHT  0.138616  0.215691  0.919497  0.514058  0.792811   

                                         best_params  
2  {'C': 8.904533233040649, 'epsilon': 0.17399781...  
1  {'C': 81.58311403532339, 'epsilon': 0.39059242...  
4  {'C': 17.86094781544563, 'epsilon': 0.01408518...  
6  {'C': 3.748705218525931, 'epsilon': 0.00183051...  
5  {'C': 256.9555810753195, 'epsilon': 0.39311053...  
3 