In [1]:
import pandas as pd
import numpy as np

# Load the raw transaction table. The file's leading unnamed column is read in
# as a regular column (it shows up as `Unnamed: 0` in `df.info()`).
df = pd.read_csv("fraudTrain.csv")

In [2]:
# Summarise column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  object 
 2   cc_num                 1296675 non-null  int64  
 3   merchant               1296675 non-null  object 
 4   category               1296675 non-null  object 
 5   amt                    1296675 non-null  float64
 6   first                  1296675 non-null  object 
 7   last                   1296675 non-null  object 
 8   gender                 1296675 non-null  object 
 9   street                 1296675 non-null  object 
 10  city                   1296675 non-null  object 
 11  state                  1296675 non-null  object 
 12  zip                    1296675 non-null  int64  
 13  lat                    1296675 non-null  float64
 14  long              

In [3]:
# Convert the transaction timestamp column (loaded as `object`) to datetimes.
parsed_ts = pd.to_datetime(df['trans_date_trans_time'])
df['trans_date_trans_time'] = parsed_ts

In [None]:
# Baseline fraud classifier: impute and one-hot encode a few raw columns, fit
# XGBoost, and score it with stratified 5-fold cross-validation.
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import numpy as np

# Column groups. Only `num_cols` and `cat_cols` are wired into the transformer
# below; `city_cols` is declared but not used in this cell.
cat_cols=['category','gender']
city_cols=['city']
num_cols=['amt','unix_time']
y_cols=['is_fraud']

# Preprocessing: every column not listed here is dropped (remainder='drop').
pre=ColumnTransformer(
    transformers=[
        ('num',Pipeline([
            # 'median' is the valid strategy name; the previous spelling
            # ('meadian') is rejected by SimpleImputer when the pipeline is fit.
            ('imputer',SimpleImputer(strategy='median')),
        ]),num_cols),
        ('cat',Pipeline([
            ('imputer',SimpleImputer(strategy='most_frequent')),
            # Ignore categories unseen during fit so a validation fold cannot
            # crash the encoder (matches the encoder used in the later cell).
            ('ohe',OneHotEncoder(handle_unknown='ignore')),
        ]),cat_cols),
    ],
    remainder='drop'
)

xgb=XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=5,
    reg_lambda=1.0,
    n_jobs=-1,
    random_state=42,
    tree_method="hist"
)

pipe=Pipeline([
    ('pre',pre),
    ('xgb',xgb)
])

# Keep the target out of the feature frame so no later transformer can see it.
X=df.drop(columns=y_cols)
y=df[y_cols[0]]

cv=StratifiedKFold(n_splits=5,shuffle=True,random_state=42)

# Compute both metrics from a single cross-validation pass: each fold is fitted
# once instead of once per metric.
cv_results=cross_validate(pipe,X,y,scoring=['f1','recall'],cv=cv,n_jobs=-1)
scores_f1=cv_results['test_f1']
scores_recall=cv_results['test_recall']


print(scores_f1.mean())
print(scores_f1)
print(scores_recall.mean())
print(scores_recall)

0.6731094594963818
[0.68325288 0.67557252 0.66716924 0.67595819 0.66359447]
0.5897967710829265
[0.61292472 0.58960693 0.58960693 0.58161226 0.57523302]


In [5]:
# Re-score the current pipeline with folds grouped by card number (`cc_num`),
# so the same card never appears in both the training and validation side of a
# split.
from sklearn.model_selection import StratifiedGroupKFold, cross_validate

groups = df['cc_num'].values

gcv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

# One cross-validation pass for both metrics: each fold is fitted once instead
# of once per metric.
cv_results = cross_validate(pipe, X, y, groups=groups, scoring=['f1', 'recall'], cv=gcv, n_jobs=-1)
scores_f1 = cv_results['test_f1']
scores_recall = cv_results['test_recall']

print(scores_f1.mean(), scores_f1)
print(scores_recall.mean(), scores_recall)


0.6460434986902688 [0.63126593 0.65048924 0.66505246 0.63432562 0.64908425]
0.5528626648862438 [0.54273192 0.56072874 0.57461646 0.53075031 0.55548589]


In [6]:
# Share of each class in the target column.
class_balance = df['is_fraud'].value_counts(normalize=True)
print(class_balance)

is_fraud
0    0.994211
1    0.005789
Name: proportion, dtype: float64


In [7]:
# Class-weighted XGBoost scored with ranking metrics (PR-AUC and ROC-AUC) under
# the card-grouped folds.
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.model_selection import cross_validate

xgb = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    # Positive-class weight, close to the negative:positive ratio of the data
    # (0.994211 / 0.005789 is about 171.7).
    scale_pos_weight=171,
    # Switch the evaluation metric to PR-AUC.
    # NOTE(review): no eval_set is passed during cross-validation, so this
    # setting likely does not influence the fitted model - confirm.
    eval_metric='aucpr',
    tree_method='hist',
    random_state=42,
    n_jobs=-1
)

pipe=Pipeline([
    ('pre',pre),
    ('xgb',xgb)
])

# One cross-validation pass for both metrics: each fold is fitted once instead
# of once per metric.
cv_results = cross_validate(
    pipe, X, y,
    scoring=['average_precision', 'roc_auc'],
    cv=gcv, n_jobs=-1, groups=groups,
)
scores_pr = cv_results['test_average_precision']
scores_roc = cv_results['test_roc_auc']

print("PR-AUC:", scores_pr.mean(), scores_pr)
print("ROC-AUC:", scores_roc.mean(), scores_roc)

PR-AUC: 0.7242123239693096 [0.70285883 0.72345375 0.73410921 0.72669426 0.73394557]
ROC-AUC: 0.9939748027246378 [0.99391673 0.99343074 0.9940852  0.99454076 0.99390058]


In [8]:
from sklearn.metrics import make_scorer, fbeta_score

# F-beta scorer with beta=2.
f2_scorer = make_scorer(fbeta_score, beta=2)

# Score the current pipeline on the card-grouped folds with the F2 scorer.
scores_f2 = cross_val_score(
    pipe,
    X,
    y,
    groups=groups,
    cv=gcv,
    scoring=f2_scorer,
    n_jobs=-1,
)
print("F2-score:", scores_f2.mean(), scores_f2)


F2-score: 0.4746163002802037 [0.4330814  0.46139111 0.46241396 0.4995423  0.51665273]


In [None]:
# Add per-card time features ahead of the preprocessing step and re-score the
# class-weighted XGBoost model on the card-grouped folds.
from time_transformer_tools import TimeFeaturesTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.model_selection import cross_validate
tf = TimeFeaturesTransformer(
    datetime_col="trans_date_trans_time",
    group_cols=("cc_num",),
    one_hot=True,
    fill_first_delta=0,
    drop_datetime=False
)

# 1) Explicitly add the numeric features produced by tf to the numeric list.
time_num_feats = ["delta_sec_prev_tx", "is_unusual_hour"]
num_cols_final = num_cols + time_num_feats

# 2) Pass the four one-hot columns straight through (they are already 0/1, so
#    they must not go through OneHotEncoder again).
bucket_selector = selector(pattern=r"^time_bucket_")  # picks up the four columns dynamically

pre = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            # 'median' is the valid strategy name; the previous spelling
            # ('meadian') is rejected by SimpleImputer when the pipeline is fit.
            ('imputer', SimpleImputer(strategy='median')),
        ]), num_cols_final),

        # Columns that are already one-hot encoded pass through unchanged.
        ('bucket_pass', 'passthrough', bucket_selector),

        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('ohe', OneHotEncoder(handle_unknown='ignore')),
        ]), cat_cols),
    ],
    remainder='drop'
)

pipe=Pipeline([
    ('tf',tf),
    ('pre',pre),
    ('xgb',xgb)
])

# One cross-validation pass for both metrics: each fold is fitted once instead
# of once per metric.
cv_results = cross_validate(
    pipe, X, y,
    scoring={'pr': 'average_precision', 'f2': f2_scorer},
    cv=gcv, n_jobs=-1, groups=groups,
)

scores_pr = cv_results['test_pr']
print("PR-AUC:",scores_pr.mean(),scores_pr)

scores_f2 = cv_results['test_f2']
print("F2-score:", scores_f2.mean(), scores_f2)

PR-AUC: 0.8333486352603604 [0.81123381 0.84078191 0.84101291 0.82722396 0.84649059]
F2-score: 0.6335824382253682 [0.59579762 0.61856577 0.62534309 0.6456444  0.68256131]
