In [5]:
import pandas as pd
import numpy as np

df = pd.read_csv('fraudTrain.csv')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  object 
 2   cc_num                 1296675 non-null  int64  
 3   merchant               1296675 non-null  object 
 4   category               1296675 non-null  object 
 5   amt                    1296675 non-null  float64
 6   first                  1296675 non-null  object 
 7   last                   1296675 non-null  object 
 8   gender                 1296675 non-null  object 
 9   street                 1296675 non-null  object 
 10  city                   1296675 non-null  object 
 11  state                  1296675 non-null  object 
 12  zip                    1296675 non-null  int64  
 13  lat                    1296675 non-null  float64
 14  long              

In [7]:
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])

In [8]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import numpy as np

cat_cols=['category','gender']
city_cols=['city']
num_cols=['amt','unix_time']
y_cols=['is_fraud']

pre=ColumnTransformer(
    transformers=[
        ('num',Pipeline([
            ('imputer',SimpleImputer(strategy='median')),
        ]),num_cols),
        ('cat',Pipeline([
            ('imputer',SimpleImputer(strategy='most_frequent')),
            ('ohe',OneHotEncoder()),
        ]),cat_cols),
    ],
    remainder='drop'
)

xgb=XGBClassifier(
    n_estimators=200,        
    max_depth=6,             
    learning_rate=0.1,      
    subsample=0.8,           
    colsample_bytree=0.8,    
    min_child_weight=5,      
    reg_lambda=1.0,          
    n_jobs=-1,              
    random_state=42,
    tree_method="hist"  
)

pipe=Pipeline([
    ('pre',pre),
    ('xgb',xgb)
])

X=df.copy()
y=df[y_cols[0]]

cv=StratifiedKFold(n_splits=5,shuffle=True,random_state=42)
scores_f1=cross_val_score(pipe,X,y,scoring='f1',cv=cv,n_jobs=-1)

scores_recall=cross_val_score(pipe,X,y,scoring='recall',cv=cv,n_jobs=-1)


print(scores_f1.mean())
print(scores_f1)
print(scores_recall.mean())
print(scores_recall)

0.6731094594963818
[0.68325288 0.67557252 0.66716924 0.67595819 0.66359447]
0.5897967710829265
[0.61292472 0.58960693 0.58960693 0.58161226 0.57523302]


In [9]:
from sklearn.model_selection import StratifiedGroupKFold

groups = df['cc_num'].values

gcv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

scores_f1 = cross_val_score(pipe, X, y, groups=groups, scoring='f1', cv=gcv, n_jobs=-1)
scores_recall = cross_val_score(pipe, X, y, groups=groups, scoring='recall', cv=gcv, n_jobs=-1)

print(scores_f1.mean(), scores_f1)
print(scores_recall.mean(), scores_recall)


0.6460434986902688 [0.63126593 0.65048924 0.66505246 0.63432562 0.64908425]
0.5528626648862438 [0.54273192 0.56072874 0.57461646 0.53075031 0.55548589]


In [10]:
print(df['is_fraud'].value_counts(normalize=True))

is_fraud
0    0.994211
1    0.005789
Name: proportion, dtype: float64


In [11]:
from sklearn.metrics import average_precision_score, roc_auc_score

xgb = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=171,   # 設定權重
    eval_metric='aucpr',    # 改用 PR-AUC
    tree_method='hist',
    random_state=42,
    n_jobs=-1
)

pipe=Pipeline([
    ('pre',pre),
    ('xgb',xgb)
])

scores_pr = cross_val_score(pipe, X, y, scoring='average_precision', cv=gcv, n_jobs=-1, groups=groups)
scores_roc = cross_val_score(pipe, X, y, scoring='roc_auc', cv=gcv, n_jobs=-1, groups=groups)

print("PR-AUC:", scores_pr.mean(), scores_pr)
print("ROC-AUC:", scores_roc.mean(), scores_roc)

PR-AUC: 0.7242123239693096 [0.70285883 0.72345375 0.73410921 0.72669426 0.73394557]
ROC-AUC: 0.9939748027246378 [0.99391673 0.99343074 0.9940852  0.99454076 0.99390058]


In [12]:
from sklearn.metrics import make_scorer, fbeta_score

f2_scorer = make_scorer(fbeta_score, beta=2)

scores_f2 = cross_val_score(pipe, X, y, scoring=f2_scorer, cv=gcv, groups=groups, n_jobs=-1)
print("F2-score:", scores_f2.mean(), scores_f2)


F2-score: 0.4746163002802037 [0.4330814  0.46139111 0.46241396 0.4995423  0.51665273]


In [None]:
from time_transformer_tools import TimeFeaturesTransformer
from sklearn.compose import make_column_selector as selector

tf = TimeFeaturesTransformer(
    datetime_col="trans_date_trans_time",
    group_cols=("cc_num",),
    one_hot=True,
    fill_first_delta=0,
    drop_datetime=False
)

# 1) 明確把 tf 產生的數值特徵加到數值清單
time_num_feats = ["delta_sec_prev_tx", "is_unusual_hour"]
num_cols_final = num_cols + time_num_feats

# 2) 針對四個 one-hot 欄位，直接 passthrough（它們已是 0/1，不要再經過 OneHotEncoder）
bucket_selector = selector(pattern=r"^time_bucket_")  # 動態抓四欄

pre = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
        ]), num_cols_final),

        # 已經是 one-hot 的欄位直接通過
        ('bucket_pass', 'passthrough', bucket_selector),

        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('ohe', OneHotEncoder(handle_unknown='ignore')),
        ]), cat_cols),
    ],
    remainder='drop'
)

pipe=Pipeline([
    ('tf',tf),
    ('pre',pre),
    ('xgb',xgb)
])

scores_pr=cross_val_score(pipe,X,y,scoring='average_precision',cv=gcv,n_jobs=-1,groups=groups)
print("PR-AUC:",scores_pr.mean(),scores_pr)

scores_f2 = cross_val_score(pipe, X, y, scoring=f2_scorer, cv=gcv, groups=groups, n_jobs=-1)
print("F2-score:", scores_f2.mean(), scores_f2)

PR-AUC: 0.8333486352603604 [0.81123381 0.84078191 0.84101291 0.82722396 0.84649059]
F2-score: 0.6335824382253682 [0.59579762 0.61856577 0.62534309 0.6456444  0.68256131]


In [17]:
from importlib import reload
import locat_transformer_tools
reload(locat_transformer_tools)

<module 'locat_transformer_tools' from 'c:\\Users\\USER\\Desktop\\資料分析\\作品\\信用卡交易詐欺偵測\\locat_transformer_tools.py'>

In [18]:
from locat_transformer_tools import GeoTemporalFeatures

geo = GeoTemporalFeatures(
    lat_col="lat", lon_col="long",
    merch_lat_col="merch_lat", merch_lon_col="merch_long",
    city_col="city", cc_col="cc_num", time_col="trans_date_trans_time",
    merchant_key_cols=("merchant"),
    hours_window=24,
    first_speed_fill="nan",
    return_dataframe=True,
    append_original=True,          
    drop_time_col_in_output=False 
)

geo_cols = ["distance_home_to_merchant","travel_speed_kmh","uniq_cities_24h","uniq_merchants_24h"]

pre = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
        ]), num_cols_final),
        ('bucket_pass', 'passthrough', bucket_selector),
        ("geo", "passthrough", geo_cols),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('ohe', OneHotEncoder(handle_unknown='ignore')),
        ]), cat_cols),
    ],
    remainder='drop'
)

pipe=Pipeline([
    ('tf',tf),
    ('geo',geo),
    ('pre',pre),
    ('xgb',xgb)
])

scores_pr=cross_val_score(pipe,X,y,scoring='average_precision',cv=gcv,n_jobs=-1,groups=groups)
print("PR-AUC:",scores_pr.mean(),scores_pr)

scores_f2 = cross_val_score(pipe, X, y, scoring=f2_scorer, cv=gcv, groups=groups, n_jobs=-1)
print("F2-score:", scores_f2.mean(), scores_f2)

PR-AUC: 0.8444498497723737 [0.82381086 0.8521711  0.85077182 0.84291501 0.85258046]
F2-score: 0.6867242150114028 [0.64971464 0.67524427 0.68792457 0.69564427 0.72509334]


In [19]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, precision_recall_curve
from scipy.stats import randint, uniform, loguniform

# ---- scorers ----
def max_fbeta_from_proba(y_true, y_prob, beta=2.0):
    p, r, thr = precision_recall_curve(y_true, y_prob)
    f = (1+beta**2) * (p*r) / (beta**2 * p + r + 1e-12)
    return np.nanmax(f[:-1])

f2_scorer_prob = make_scorer(max_fbeta_from_proba, needs_threshold=True, greater_is_better=True)


# ---- 覆蓋 xgb 參數空間 ----
param_distributions = {
    'xgb__n_estimators': randint(200, 1001),
    'xgb__learning_rate': uniform(0.02, 0.15),      
    'xgb__max_depth': randint(3, 9),
    'xgb__min_child_weight': randint(1, 9),
    'xgb__subsample': uniform(0.6, 0.4),            
    'xgb__colsample_bytree': uniform(0.6, 0.4),    
    'xgb__gamma': uniform(0.0, 5.0),
    'xgb__reg_alpha': loguniform(1e-8, 1e-1),
    'xgb__reg_lambda': loguniform(1e-2, 10), 
    'xgb__scale_pos_weight': [171],  
    'xgb__tree_method': ['hist'],
    'xgb__max_delta_step': randint(0, 5),
}

search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_distributions,
    n_iter=30,                        
    scoring={'pr_auc': 'average_precision', 'f2': f2_scorer_prob},
    refit='pr_auc',                    # 以 PR-AUC 為最終挑選
    cv=gcv,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

search.fit(X, y, groups=groups)
print("Best PR-AUC:", search.best_score_)
print("Best params:", search.best_params_)

best_model = search.best_estimator_


Fitting 5 folds for each of 30 candidates, totalling 150 fits


 nan nan nan nan nan nan nan nan nan nan nan nan]


Best PR-AUC: 0.8467928879219093
Best params: {'xgb__colsample_bytree': np.float64(0.8537405378805455), 'xgb__gamma': np.float64(3.403527257773834), 'xgb__learning_rate': np.float64(0.09964018749757046), 'xgb__max_delta_step': 1, 'xgb__max_depth': 8, 'xgb__min_child_weight': 8, 'xgb__n_estimators': 446, 'xgb__reg_alpha': np.float64(0.000506930978634979), 'xgb__reg_lambda': np.float64(1.926898532522621), 'xgb__scale_pos_weight': 171, 'xgb__subsample': np.float64(0.6950550175969599), 'xgb__tree_method': 'hist'}


In [23]:
import numpy as np
import pandas as pd

res = pd.DataFrame(search.cv_results_)

# 找出所有 split fold 的測試分數欄位
fold_cols = [c for c in res.columns if c.startswith('split') and c.endswith('_test_pr_auc')]

# 轉成 float 再檢查 NaN
res[fold_cols] = res[fold_cols].astype(float)

print("各 split 欄位 NaN 比例：")
print(res[fold_cols].isna().mean())

# 檢查 best params 用到多少有效折
best = res.loc[res['rank_test_pr_auc'].idxmin()]
valid_folds = np.isfinite(best[fold_cols].astype(float)).sum()
print("best params 有效折數：", valid_folds, "/", len(fold_cols))


各 split 欄位 NaN 比例：
split0_test_pr_auc    0.0
split1_test_pr_auc    0.0
split2_test_pr_auc    0.0
split3_test_pr_auc    0.0
split4_test_pr_auc    0.0
dtype: float64
best params 有效折數： 5 / 5


In [25]:
import numpy as np
from sklearn.metrics import precision_recall_curve
import joblib

best_model = search.best_estimator_   # 已用 best params refit 完成
proba = best_model.predict_proba(X)[:,1]

p, r, thr = precision_recall_curve(y, proba)

# --- Max-F2 ---
f2 = (5 * p * r) / (4 * p + r + 1e-12)
best_idx = np.nanargmax(f2[:-1])  # 最後一點沒有對應 threshold
best_th = float(thr[best_idx])
best_f2 = float(f2[best_idx])
print(f"Best threshold (Max-F2): {best_th:.4f}, F2={best_f2:.4f}")

# --- Recall@P≥0.90 ---
mask = p[:-1] >= 0.90   # precision 和 threshold 對齊
if mask.any():
    recall_at_p90 = float(r[:-1][mask].max())
    th_at_p90 = float(thr[mask][r[:-1][mask].argmax()])
else:
    recall_at_p90, th_at_p90 = 0.0, 0.5

print(f"Recall@P≥0.90: {recall_at_p90:.4f} at threshold={th_at_p90:.4f}")


# 3) 持久化
artifact = {"model": best_model, "threshold": best_th, "metrics": {
    "cv_pr_auc": search.best_score_,
    "cv_f2_max": best_f2,
    "recall_at_p90": recall_at_p90
}}
joblib.dump(artifact, "fraud_xgb_artifact.pkl")


Best threshold (Max-F2): 0.9362, F2=0.9605
Recall@P≥0.90: 0.9376 at threshold=0.9701


['fraud_xgb_artifact.pkl']

In [26]:
from sklearn.metrics import average_precision_score, fbeta_score, precision_recall_curve
import numpy as np

oof_proba = np.zeros_like(y, dtype=float)

for tr, te in gcv.split(X, y, groups):
    m = search.best_estimator_  
    m.fit(X.iloc[tr], y[tr])
    oof_proba[te] = m.predict_proba(X.iloc[te])[:,1]

# 固定剛剛找到的門檻
best_th = artifact['threshold'] if isinstance(artifact, dict) else best_th

oof_pred = (oof_proba >= best_th).astype(int)
p, r, _ = precision_recall_curve(y, oof_proba)
pr_auc_oof = average_precision_score(y, oof_proba)
f2_oof = fbeta_score(y, oof_pred, beta=2)

print(f"OOF PR-AUC={pr_auc_oof:.4f}, OOF F2={f2_oof:.4f}")


OOF PR-AUC=0.8477, OOF F2=0.7642
