In [1]:
%pip install category_encoders
import pandas as pd
import numpy as np

df = pd.read_csv("Shopping Trends And Customer Behaviour Dataset.csv")

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
df['Subscription Status']=df['Subscription Status'].map({'Yes':1,'No':0}).astype('Int64')
df.head()

Unnamed: 0.1,Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,0,1,55,Male,Blouse,Clothing,53,Kentucky,Gray,Winter,3.1,1,Express,Yes,Yes,14,Venmo,Fortnightly
1,1,2,19,Male,Sweater,Clothing,64,Maine,Maroon,Winter,3.1,1,Express,Yes,Yes,2,Cash,Fortnightly
2,2,3,50,Male,Jeans,Clothing,73,Massachusetts,Maroon,Spring,3.1,1,Free Shipping,Yes,Yes,23,Credit Card,Weekly
3,3,4,21,Male,Sandals,Footwear,90,Rhode Island,Maroon,Spring,3.5,1,Next Day Air,Yes,Yes,49,PayPal,Weekly
4,4,5,45,Male,Blouse,Clothing,49,Oregon,Turquoise,Spring,2.7,1,Free Shipping,Yes,Yes,31,PayPal,Annually


In [4]:
# X, y
X = df.drop(columns=['Subscription Status'])
y = df['Subscription Status']  # 0/1（二元）

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.metrics import roc_auc_score
import category_encoders as ce

# 欄位分群
num_cols      = ['Age','Purchase Amount (USD)','Review Rating','Previous Purchases']
low_card_cols = ['Gender','Category','Season','Shipping Type','Payment Method','Discount Applied','Promo Code Used']
high_card_cols= ['Location','Color','Item Purchased']
order_cols    = ['Frequency of Purchases']

# 有序類別順序
ord_categories = [['Weekly','Bi-Weekly','Fortnightly','Monthly','Every 3 Months','Quarterly','Annually']]
ord_enc = OrdinalEncoder(categories=ord_categories, handle_unknown='use_encoded_value', unknown_value=-1)

# 前處理
pre = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
        ]), num_cols),

        ('low_card', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ]), low_card_cols),

        ('high_card', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('ce', ce.CountEncoder(handle_unknown=0))
        ]), high_card_cols),

        ('order', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('ord', ord_enc)
        ]), order_cols),
    ],
    remainder='drop'
)

# 模型
rf = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42)

pipe = Pipeline([
    ('pre', pre),
    ('rf', rf),
])

# 交叉驗證（ROC AUC）
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_auc = cross_val_score(pipe, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
print('CV ROC AUC (mean ± std):', cv_auc.mean(), cv_auc.std())

# OOF 機率與 AUC
y_oof_proba = cross_val_predict(pipe, X, y, cv=cv, method='predict_proba', n_jobs=-1)[:, 1]
print('OOF ROC AUC:', roc_auc_score(y, y_oof_proba))


CV ROC AUC (mean ± std): 0.9022417654260673 0.007290504308973167
OOF ROC AUC: 0.9018424952741777


In [8]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

rf_base=RandomForestClassifier(
    n_estimators=600,
    max_depth=10,
    max_features='sqrt',
    min_samples_split=2,
    min_samples_leaf=2,
    bootstrap=True,
    class_weight='balanced', 
    n_jobs=-1, 
    random_state=42)

pipe_rf_ran=Pipeline([
    ('pre', pre),
    ('rf', rf_base)
])

param_dist = {
    'rf__n_estimators': randint(200, 1201),
    'rf__max_depth': randint(5, 31),
    'rf__max_features': ['sqrt', 'log2', None],
    'rf__min_samples_split': randint(2, 21),
    'rf__min_samples_leaf': randint(1, 11),
    'rf__bootstrap': [True]        
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

ran_search = RandomizedSearchCV(
    pipe_rf_ran, 
    param_distributions=param_dist,
    n_iter=30,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)

ran_search.fit(X,y)
print('Best CV score:',ran_search.best_score_)
print('Best params:',ran_search.best_params_)

best_rf=ran_search.best_estimator_

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best CV score: 0.9028548965792181
Best params: {'rf__bootstrap': True, 'rf__max_depth': 24, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 3, 'rf__min_samples_split': 6, 'rf__n_estimators': 1018}


In [9]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import numpy as np
import pandas as pd

# --- 1) CV 與共同設定 ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
common_gs_kwargs = dict(
    estimator=pipe_rf_ran,   
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
    verbose=1,
    refit=True                  
)

# --- 2) 第一輪：粗格子(鎖定鄰域但保持一定彈性) ---
param_grid_coarse = {
    'rf__bootstrap': [True],               
    'rf__max_features': ['sqrt'],          
    'rf__max_depth': [20,22,24,26,28], 
    'rf__min_samples_split': [2, 3, 4],    
    'rf__min_samples_leaf': [4,5,6,7,8],  
    'rf__n_estimators': [950,1000,1050,1100,1150]
}

gs_coarse = GridSearchCV(param_grid=param_grid_coarse, **common_gs_kwargs)  #**->展開工具箱
gs_coarse.fit(X, y)
print("Coarse best AUC:", gs_coarse.best_score_)
print("Coarse best params:", gs_coarse.best_params_)
best_coarse = gs_coarse.best_estimator_

# --- 3) 第二輪：細格子(以第一輪最佳參數為中心做微調) ---
best = gs_coarse.best_params_
md = best['rf__max_depth']
msl = best['rf__min_samples_leaf']
mss = best['rf__min_samples_split']
nest = best['rf__n_estimators']

param_grid_fine = {
    'rf__bootstrap': [True],
    'rf__max_features': ['sqrt'],                  
    'rf__max_depth': [max(2, md-2), md-1, md, md+1, md+2],
    'rf__min_samples_split': sorted(set([mss-1, mss, mss+1]) - {0}),
    'rf__min_samples_leaf': sorted(set([msl-1, msl, msl+1]) - {0}),
    'rf__n_estimators': [max(100, nest-100), nest-50, nest, nest+50, nest+100]
}

gs_fine = GridSearchCV(param_grid=param_grid_fine, **common_gs_kwargs)
gs_fine.fit(X, y)
print("Fine best AUC:", gs_fine.best_score_)
print("Fine best params:", gs_fine.best_params_)
best_final = gs_fine.best_estimator_


Fitting 5 folds for each of 375 candidates, totalling 1875 fits
Coarse best AUC: 0.9022905860480475
Coarse best params: {'rf__bootstrap': True, 'rf__max_depth': 20, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 7, 'rf__min_samples_split': 2, 'rf__n_estimators': 950}
Fitting 5 folds for each of 225 candidates, totalling 1125 fits


375 fits failed out of a total of 1125.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
375 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\USER\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\USER\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\USER\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\pipeline.py", line 663, in fit
    self._final_estimator.fit(Xt, y, **last_

Fine best AUC: 0.9025276510143583
Fine best params: {'rf__bootstrap': True, 'rf__max_depth': 18, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 7, 'rf__min_samples_split': 2, 'rf__n_estimators': 900}


In [11]:
from sklearn.model_selection import cross_val_predict
best_oof_proba = cross_val_predict(best_final, X, y, cv=5, method='predict_proba',n_jobs=-1)[:, 1]


In [17]:
from sklearn.metrics import precision_score, recall_score

target_recall = 0.9
best_thr, best_prec = 0.5, 0

for thr in np.linspace(0, 1, 101):
    preds = (best_oof_proba >= thr).astype(int)
    rec = recall_score(y, preds)
    if rec >= target_recall:
        prec = precision_score(y, preds)
        if prec > best_prec:
            best_prec, best_thr = prec, thr

print(f"Best threshold with recall ≥ {target_recall}:",
      best_thr, "Precision =", best_prec)



Best threshold with recall ≥ 0.9: 0.02 Precision = 0.627906976744186


In [18]:
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    precision_score, recall_score, f1_score,
    confusion_matrix
)

# ===== 1) 設定找到的閾值 =====
THR = 0.02   

# ===== 2) 重新在全資料上訓練最終模型 (for 最終部署/之後預測) =====
best_final.fit(X, y)  

# ===== 3) 產生 OOF 機率，避免樂觀偏差 =====
# 用 clone(best_final) 確保 cross_val_predict 內外模型互不干擾
oof_proba = cross_val_predict(
    clone(best_final), X, y,
    cv=cv,                   
    method='predict_proba',
    n_jobs=-1
)[:, 1]

# ===== 4) 依固定閾值產生 OOF 預測並評分 =====
oof_pred = (oof_proba >= THR).astype(int)

metrics = {
    "ROC AUC": roc_auc_score(y, oof_proba),
    "PR AUC (AP)": average_precision_score(y, oof_proba),
    "Precision@THR": precision_score(y, oof_pred, zero_division=0),
    "Recall@THR": recall_score(y, oof_pred, zero_division=0),
    "F1@THR": f1_score(y, oof_pred, zero_division=0),
}
cm = confusion_matrix(y, oof_pred)

print(f"[Fixed threshold = {THR:.4f}] OOF metrics")
for k, v in metrics.items():
    print(f"- {k}: {v:.6f}")
print("\nConfusion matrix @THR (OOF):\n", cm)



[Fixed threshold = 0.0200] OOF metrics
- ROC AUC: 0.901784
- PR AUC (AP): 0.666187
- Precision@THR: 0.627907
- Recall@THR: 1.000000
- F1@THR: 0.771429

Confusion matrix @THR (OOF):
 [[2223  624]
 [   0 1053]]


In [None]:
import pathlib, joblib, json
from datetime import datetime, timezone

THR = 0.02
base = pathlib.Path(r"C:\Users\USER\Desktop\資料分析\作品\購物行為分析\models")
base.mkdir(parents=True, exist_ok=True)

# 存模型
joblib.dump(best_final, base / "model.joblib")

# 存 meta
meta = {"threshold": THR, "exported_at": datetime.now(timezone.utc).isoformat()}
with open(base / "meta.json", "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)


