In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
import optuna
import itertools
from sklearn.model_selection import cross_val_score, KFold
import pickle
import json

In [2]:
# Shipping methods covered by this notebook. Each entry is also the filename
# prefix of that method's `<name>_train.csv` / `<name>_test.csv` files.
ship_method_name = [
    'AMAZON_FREIGHT',
    'AMAZON_LTL',
    'AMAZON_UPS_PARCEL',
    'ESTES',
    'HOUR_LOOP_FEDEX_PARCEL',
    'UBER_LTL',
    'WWE_LTL',
    'WWE_PARCEL',
]

In [3]:
# Per-shipping-method containers, all keyed by the method name.
train_data_set = {}             # full training frame as read from <name>_train.csv
train_X_data_set = {}           # 80% split: features (plus vendor_name_encoded)
train_Y_data_set = {}           # 80% split: log_cost target
test_X_data_set = {}            # 20% hold-out split: features (plus vendor_name_encoded)
test_Y_data_set = {}            # 20% hold-out split: log_cost target
final_validation_data_set = {}  # frame read from <name>_test.csv

for name in ship_method_name:
    final_validation_data_set[name] = pd.read_csv(name+'_test.csv')
    data = pd.read_csv(name+'_train.csv')
    train_data_set[name] = data
    Y = data[['cost', 'log_cost']]
    X = data.drop(columns=['cost', 'log_cost'])
    X_train, X_test, y_train, y_test = train_test_split(X, Y['log_cost'], test_size=0.2, random_state=42)

    # The split results are selections taken from X. Work on explicit copies so
    # that adding the encoded column below is an unambiguous write and does not
    # trigger pandas' SettingWithCopyWarning.
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Target encoding: replace vendor_name by the mean log_cost of that vendor,
    # computed on the training split only. Rows without a mapping fall back to
    # the overall training-split mean; the same fallback is applied to both
    # splits, matching how the final-training cell encodes its data.
    fallback = y_train.mean()
    vendor_mean = X_train.assign(log_cost=y_train).groupby('vendor_name')['log_cost'].mean()
    X_train['vendor_name_encoded'] = X_train['vendor_name'].map(vendor_mean).fillna(fallback)
    X_test['vendor_name_encoded'] = X_test['vendor_name'].map(vendor_mean).fillna(fallback)

    train_X_data_set[name] = X_train
    train_Y_data_set[name] = y_train
    test_X_data_set[name] = X_test
    test_Y_data_set[name] = y_test

In [4]:
# Feature columns fed to the model of each shipping method.
# 'vendor_name_encoded' is the target-encoded vendor column created at load time.
cols = dict(
    AMAZON_FREIGHT=['log_weight', 'log_Mdis'],
    AMAZON_LTL=['log_weight', 'log_Hdis', 'vendor_name_encoded'],
    AMAZON_UPS_PARCEL=['log_weight', 'log_Hdis', 'log_volume', 'log_TVP', 'vendor_name_encoded'],
    ESTES=['log_weight', 'log_TVP', 'log_volume', 'vendor_name_encoded'],
    HOUR_LOOP_FEDEX_PARCEL=['log_weight', 'log_TVP', 'log_Hdis'],
    UBER_LTL=['log_weight', 'log_Mdis', 'vendor_name_encoded'],
    WWE_LTL=['log_weight', 'log_Hdis'],
    WWE_PARCEL=['log_weight', 'log_Mdis'],
)

In [5]:
# RandomForestRegressor hyper-parameters per shipping method, written as a table:
# one row per method, columns in the order given by _RF_PARAM_NAMES.
# NOTE(review): the name `vars` shadows Python's builtin vars(); it is kept
# because the modelling cells look the parameters up under this name.
_RF_PARAM_NAMES = ('n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'max_features')
_RF_PARAM_ROWS = {
    'AMAZON_FREIGHT':         (2000, 12, 2, 1, None),
    'AMAZON_LTL':             (2000, 17, 3, 1, 'sqrt'),
    'AMAZON_UPS_PARCEL':      (300,  17, 2, 1, 'sqrt'),
    'ESTES':                  (1800, 17, 3, 1, 'sqrt'),
    'HOUR_LOOP_FEDEX_PARCEL': (1400, 8,  2, 1, None),
    'UBER_LTL':               (2000, 5,  5, 2, None),
    'WWE_LTL':                (2000, 9,  5, 3, None),
    'WWE_PARCEL':             (1200, 5,  8, 4, None),
}
vars = {
    method: dict(zip(_RF_PARAM_NAMES, row))
    for method, row in _RF_PARAM_ROWS.items()
}

In [6]:
def adjusted_r2(r2, n, k):
    """Return the adjusted R² of a fit scored on ``n`` samples with ``k`` predictors.

    Computed as ``1 - (1 - r2) * (n - 1) / (n - k - 1)``.

    Parameters
    ----------
    r2 : plain R² score.
    n : number of samples the score was computed on.
    k : number of predictors (features) used by the model.

    Returns
    -------
    The adjusted R², or ``np.nan`` when ``n <= k + 1`` (the denominator would be
    zero or negative).
    """
    residual_dof = n - k - 1
    if residual_dof <= 0:
        return np.nan
    return 1 - (1 - r2) * (n - 1) / residual_dof

rf_val_predictions = {}
rf_val_metrics = []

for method in ship_method_name:
    print(f"\n📦 建立模型並預測：{method}")

    best_features = cols[method]
    best_params = vars[method]

    # Restrict the 80/20 split prepared at load time to this method's features.
    X_train = train_X_data_set[method][best_features]
    X_test = test_X_data_set[method][best_features]
    y_train = train_Y_data_set[method]
    y_test = test_Y_data_set[method]

    # best_params holds exactly the five tuned hyper-parameters
    # (n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features).
    model = RandomForestRegressor(random_state=42, n_jobs=-1, **best_params)

    y_pred = model.fit(X_train, y_train).predict(X_test)
    rf_val_predictions[method] = y_pred

    # Compute MSE, R² and adjusted R² on the hold-out split
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    adjr2 = adjusted_r2(r2, n=len(y_test), k=len(best_features))

    print(f"✅ Val MSE: {mse:.4f}")
    print(f"✅ Val R²: {r2:.4f}")
    print(f"✅ Val Adjusted R²: {adjr2:.4f}")

    rf_val_metrics.append(
        dict(ship_method=method, val_mse=mse, val_r2=r2, val_adj_r2=adjr2)
    )

# Collect the per-method records into a single DataFrame
rf_val_metrics_df = pd.DataFrame(rf_val_metrics)


📦 建立模型並預測：AMAZON_FREIGHT
✅ Val MSE: 0.1802
✅ Val R²: 0.9327
✅ Val Adjusted R²: 0.9303

📦 建立模型並預測：AMAZON_LTL
✅ Val MSE: 0.1407
✅ Val R²: 0.8974
✅ Val Adjusted R²: 0.8970

📦 建立模型並預測：AMAZON_UPS_PARCEL
✅ Val MSE: 0.0674
✅ Val R²: 0.9404
✅ Val Adjusted R²: 0.9402

📦 建立模型並預測：ESTES
✅ Val MSE: 0.1580
✅ Val R²: 0.9150
✅ Val Adjusted R²: 0.9014

📦 建立模型並預測：HOUR_LOOP_FEDEX_PARCEL
✅ Val MSE: 0.1625
✅ Val R²: 0.8541
✅ Val Adjusted R²: 0.8514

📦 建立模型並預測：UBER_LTL
✅ Val MSE: 0.3382
✅ Val R²: 0.7625
✅ Val Adjusted R²: 0.7532

📦 建立模型並預測：WWE_LTL
✅ Val MSE: 0.2648
✅ Val R²: 0.8644
✅ Val Adjusted R²: 0.8574

📦 建立模型並預測：WWE_PARCEL
✅ Val MSE: 0.1528
✅ Val R²: 0.8413
✅ Val Adjusted R²: 0.8379


In [7]:
# Display the per-method hold-out metrics (val_mse, val_r2, val_adj_r2).
rf_val_metrics_df

Unnamed: 0,ship_method,val_mse,val_r2,val_adj_r2
0,AMAZON_FREIGHT,0.180221,0.932736,0.93029
1,AMAZON_LTL,0.140741,0.897357,0.896964
2,AMAZON_UPS_PARCEL,0.067363,0.94038,0.940158
3,ESTES,0.157986,0.915005,0.901406
4,HOUR_LOOP_FEDEX_PARCEL,0.162523,0.854052,0.851415
5,UBER_LTL,0.338237,0.762478,0.753224
6,WWE_LTL,0.26477,0.864359,0.857403
7,WWE_PARCEL,0.152841,0.841333,0.837884


In [8]:
rf_final_predictions = {}
rf_final_metrics = []

print('使用 after 資料做最終驗證：')


for method in ship_method_name:
    print(f"\n🔍 最終驗證：{method}")

    best_features = cols[method]
    best_params = vars[method]

    # Refit on the complete training file (no hold-out split in this cell).
    full_train = train_data_set[method].copy()
    y_train = full_train['log_cost']
    X_train = full_train.drop(columns=['cost', 'log_cost'])

    # Target encoding of vendor_name: per-vendor mean of log_cost over the full
    # training file. Rows without a mapping get the overall training mean.
    # The fallback is computed once, as a plain float, so that the value used
    # for encoding, the pickled encoder and the JSON metadata always agree.
    vendor_mean = full_train.groupby('vendor_name')['log_cost'].mean()
    fallback_value = float(y_train.mean())
    X_train['vendor_name_encoded'] = X_train['vendor_name'].map(vendor_mean).fillna(fallback_value)

    val_data = final_validation_data_set[method].copy()
    y_val = val_data['log_cost']
    X_val = val_data.drop(columns=['cost', 'log_cost'])
    X_val['vendor_name_encoded'] = X_val['vendor_name'].map(vendor_mean).fillna(fallback_value)

    # Bundle everything needed to reproduce the encoding at inference time
    vendor_encoder_package = {
        'map': vendor_mean.to_dict(),           # target-encoding lookup table
        'fallback': fallback_value              # value used when a vendor has no mapping
    }

    # Persist the encoder as a .pkl file
    with open(f'vendor_encoder_{method}.pkl', 'wb') as f:
        pickle.dump(vendor_encoder_package, f)


    model = RandomForestRegressor(
        n_estimators=best_params['n_estimators'],
        max_depth=best_params['max_depth'],
        min_samples_split=best_params['min_samples_split'],
        min_samples_leaf=best_params['min_samples_leaf'],
        max_features=best_params['max_features'],
        random_state=42,
        n_jobs=-1
    )

    model.fit(X_train[best_features], y_train)


    # Persist the fitted model
    with open(f'rf_{method}.pkl', 'wb') as f:
        pickle.dump(model, f)

    # Human-readable description of the saved model, written next to the .pkl.
    # NOTE(review): "trained_at" is a hard-coded date and is not updated when
    # the notebook is re-run — confirm whether that is intended.
    model_info = {
        "ship_method": method,
        "model_type": "RandomForestRegressor",
        "input_features": cols[method],
        "parameters": vars[method],
        "trained_at": "2025-06-11"
    }
    if "vendor_name_encoded" in cols[method]:
        model_info['encoding'] = {
            "vendor_name": {
                "method": "target_encoding",
                "source": f"vendor_encoder_{method}.pkl",
                # Record the fallback actually used for this method instead of
                # a fixed constant, so the metadata matches the pickled encoder.
                "fallback_value": fallback_value,
                "fallback_strategy": "mean(log_cost) in training set"
            }
        }


    with open(f'rf_{method}_info.json', 'w') as f:
        json.dump(model_info, f, indent=4)


    # Score the refitted model on the separate <method>_test.csv data
    y_pred = model.predict(X_val[best_features])
    rf_final_predictions[method] = y_pred

    mse = mean_squared_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    adjr2 = adjusted_r2(r2, n=len(y_val), k=len(best_features))

    print(f"📊 Test MSE: {mse:.4f}")
    print(f"📊 Test R²: {r2:.4f}")
    print(f"📊 Test Adjusted R²: {adjr2:.4f}")

    rf_final_metrics.append({
        'ship_method': method,
        'test_mse': mse,
        'test_r2': r2,
        'test_adj_r2': adjr2
    })

rf_final_metrics_df = pd.DataFrame(rf_final_metrics)

使用 after 資料做最終驗證：

🔍 最終驗證：AMAZON_FREIGHT
📊 Test MSE: 0.3522
📊 Test R²: 0.8580
📊 Test Adjusted R²: 0.8483

🔍 最終驗證：AMAZON_LTL
📊 Test MSE: 0.1407
📊 Test R²: 0.6405
📊 Test Adjusted R²: 0.6380

🔍 最終驗證：AMAZON_UPS_PARCEL
📊 Test MSE: 0.0761
📊 Test R²: 0.9264
📊 Test Adjusted R²: 0.9259

🔍 最終驗證：ESTES
📊 Test MSE: 0.4357
📊 Test R²: 0.7029
📊 Test Adjusted R²: 0.6038

🔍 最終驗證：HOUR_LOOP_FEDEX_PARCEL
📊 Test MSE: 0.1722
📊 Test R²: 0.8036
📊 Test Adjusted R²: 0.7971

🔍 最終驗證：UBER_LTL
📊 Test MSE: 0.3697
📊 Test R²: 0.7684
📊 Test Adjusted R²: 0.7510

🔍 最終驗證：WWE_LTL
📊 Test MSE: 0.4409
📊 Test R²: 0.4353
📊 Test Adjusted R²: 0.3788

🔍 最終驗證：WWE_PARCEL
📊 Test MSE: 0.2363
📊 Test R²: 0.7103
📊 Test Adjusted R²: 0.6985


In [9]:
# Display the per-method final-validation metrics (test_mse, test_r2, test_adj_r2).
rf_final_metrics_df

Unnamed: 0,ship_method,test_mse,test_r2,test_adj_r2
0,AMAZON_FREIGHT,0.352208,0.858044,0.848254
1,AMAZON_LTL,0.140657,0.640464,0.637979
2,AMAZON_UPS_PARCEL,0.076095,0.926355,0.92586
3,ESTES,0.435716,0.702886,0.603848
4,HOUR_LOOP_FEDEX_PARCEL,0.172202,0.803606,0.79706
5,UBER_LTL,0.369656,0.768415,0.751046
6,WWE_LTL,0.440928,0.435252,0.378777
7,WWE_PARCEL,0.236294,0.710318,0.698494


In [None]:
# How to use the saved .pkl files (usage example)

# with open(f'vendor_encoder_{method}.pkl', 'rb') as f:
#     encoder = pickle.load(f)

# vendor_map = encoder['map']
# fallback = encoder['fallback']

# X_val['vendor_name_encoded'] = X_val['vendor_name'].map(vendor_map).fillna(fallback)


# with open(f'rf_{method}.pkl', 'rb') as f:
#     model = pickle.load(f)

# y_pred = model.predict(X_val[features])