In [21]:
%pip install category_encoders
import pandas as pd
import numpy as np

df = pd.read_csv("Shopping Trends And Customer Behaviour Dataset.csv")

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [22]:
# Encode the target column as a nullable integer: 'Yes' -> 1, 'No' -> 0.
#
# The mapping also sends 1 -> 1 and 0 -> 0 so that re-running this cell on the
# already-encoded column leaves it unchanged. With only the string keys, a
# second execution would map every value to a missing entry and silently turn
# the whole target into <NA>.
status_map = {'Yes': 1, 'No': 0, 1: 1, 0: 0}
df['Subscription Status'] = df['Subscription Status'].map(status_map).astype('Int64')
df.head()

Unnamed: 0.1,Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,0,1,55,Male,Blouse,Clothing,53,Kentucky,Gray,Winter,3.1,1,Express,Yes,Yes,14,Venmo,Fortnightly
1,1,2,19,Male,Sweater,Clothing,64,Maine,Maroon,Winter,3.1,1,Express,Yes,Yes,2,Cash,Fortnightly
2,2,3,50,Male,Jeans,Clothing,73,Massachusetts,Maroon,Spring,3.1,1,Free Shipping,Yes,Yes,23,Credit Card,Weekly
3,3,4,21,Male,Sandals,Footwear,90,Rhode Island,Maroon,Spring,3.5,1,Next Day Air,Yes,Yes,49,PayPal,Weekly
4,4,5,45,Male,Blouse,Clothing,49,Oregon,Turquoise,Spring,2.7,1,Free Shipping,Yes,Yes,31,PayPal,Annually


In [23]:
# Separate the target vector from the feature frame.
target_col = 'Subscription Status'
y = df[target_col]  # binary 0/1 target
X = df.drop(columns=[target_col])

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.metrics import roc_auc_score
import category_encoders as ce

# Column groups: each list is routed to its own preprocessing branch.
num_cols = [
    'Age',
    'Purchase Amount (USD)',
    'Review Rating',
    'Previous Purchases',
]
low_card_cols = [
    'Gender',
    'Category',
    'Season',
    'Shipping Type',
    'Payment Method',
    'Discount Applied',
    'Promo Code Used',
]
high_card_cols = ['Location', 'Color', 'Item Purchased']
order_cols = ['Frequency of Purchases']

# Explicit rank order for the ordered categorical (one inner list per column
# in `order_cols`); position in the list becomes the encoded value.
# NOTE(review): 'Bi-Weekly'/'Fortnightly' and 'Every 3 Months'/'Quarterly' look
# like synonyms yet receive different ranks here — confirm this is intended.
ord_categories = [[
    'Weekly',
    'Bi-Weekly',
    'Fortnightly',
    'Monthly',
    'Every 3 Months',
    'Quarterly',
    'Annually',
]]

# Values outside `ord_categories` are encoded as -1 instead of raising.
ord_enc = OrdinalEncoder(
    categories=ord_categories,
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)

# Preprocessing: one sub-pipeline per column group, assembled below.

# Numeric columns: fill missing values with the column median.
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])

# Low-cardinality categoricals: mode imputation, then dense one-hot encoding;
# categories not seen during fit are ignored at transform time.
low_card_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# High-cardinality categoricals: mode imputation, then count encoding.
# handle_unknown=0 presumably fills unseen categories with 0 — confirm against
# the category_encoders documentation.
high_card_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ce', ce.CountEncoder(handle_unknown=0)),
])

# Ordered categorical: mode imputation, then the predefined ordinal encoder.
order_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ord', ord_enc),
])

# Columns that are not listed in any group are dropped (remainder='drop').
pre = ColumnTransformer(
    transformers=[
        ('num', num_pipe, num_cols),
        ('low_card', low_card_pipe, low_card_cols),
        ('high_card', high_card_pipe, high_card_cols),
        ('order', order_pipe, order_cols),
    ],
    remainder='drop',
)

# Model: a seeded random forest that uses all available cores.
rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    n_jobs=-1,
)

# Full estimator: preprocessing followed by the classifier, so the encoders
# are re-fit on the training part of every cross-validation fold.
pipe = Pipeline(steps=[('pre', pre), ('rf', rf)])

# Cross-validation (ROC AUC) on seeded, stratified 5-fold splits.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold positive-class probabilities: each row is scored by the model
# that did not see it during training. This is the only place the pipeline is
# fit, so the 5 folds are trained once instead of once for cross_val_score and
# again for cross_val_predict.
y_oof_proba = cross_val_predict(pipe, X, y, cv=cv, method='predict_proba', n_jobs=-1)[:, 1]

# Per-fold AUC computed from the out-of-fold probabilities. Because `cv` has a
# fixed integer random_state, cv.split() yields the same folds that
# cross_val_predict used, so these match what cross_val_score would report.
cv_auc = np.array([
    roc_auc_score(y.iloc[test_idx], y_oof_proba[test_idx])
    for _, test_idx in cv.split(X, y)
])
print('CV ROC AUC (mean ± std):', cv_auc.mean(), cv_auc.std())

# Pooled AUC over all out-of-fold predictions.
print('OOF ROC AUC:', roc_auc_score(y, y_oof_proba))


CV ROC AUC (mean ± std): 0.9022417654260673 0.007290504308973167
OOF ROC AUC: 0.9018424952741777
