In [None]:
import sqlite3
import pandas as pd

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, roc_auc_score, roc_curve,f1_score, precision_score,recall_score,precision_recall_curve
import matplotlib.pyplot as plt
import joblib

In [None]:
from scipy.stats import uniform

In [None]:
from sklearn.preprocessing import StandardScaler
import pickle
import json

In [None]:
# Stage the raw CSV in a local SQLite database, then pull back only the
# transactions with a strictly positive amount via SQL.
# No explicit cursor is needed: pandas drives the connection itself for both
# to_sql and read_sql_query.
df = pd.read_csv('creditcard.csv')

query = """
SELECT * FROM transactions 
WHERE Amount > 0
"""

conn = sqlite3.connect('fraud_detection.db')
try:
    # if_exists='replace' drops any table left by a previous run, so the cell
    # can be re-executed safely.
    df.to_sql('transactions', conn, if_exists='replace', index=False)
    data = pd.read_sql_query(query, conn)
finally:
    # Always release the database handle, even when the write or query fails.
    conn.close()

print(data.head())
# Class balance of the filtered data
print(data['Class'].value_counts())

In [None]:
# Data-quality checks: report missing values per column and force the label
# column to an integer dtype.
print(data.isna().sum())
data['Class'] = data['Class'].astype('int')

# Count Amount outliers using the 1.5 * IQR rule.
amount = data['Amount']
Q1, Q3 = amount.quantile(0.25), amount.quantile(0.75)
IQR = Q3 - Q1
below_lower_fence = amount < (Q1 - 1.5 * IQR)
above_upper_fence = amount > (Q3 + 1.5 * IQR)
outliners = data[below_lower_fence | above_upper_fence]
print(f"Number of outliners: {len(outliners)}")
# The outliers are only counted, not removed: unusual amounts may be frauds,
# so they are kept for now.

In [None]:
# Time-based features: hour of day (0-23) derived from the Time column in
# seconds, plus a flag for the 00:00-06:59 window.
data['Hour'] = data['Time'].floordiv(3600).mod(24)
data['Is_Night'] = data['Hour'].between(0, 6).astype('int64')

# Amount-based features: a standardized copy of Amount and a binary flag for
# amounts above 300.
scaler = StandardScaler()
data['Amount_Scaled'] = scaler.fit_transform(data[['Amount']])
data['Amount_high'] = (data['Amount'] > 300).astype('int64')

# Persist the fitted scaler so the same transformation can be re-applied later.
with open("scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

print("scaler.pkl saved！")

# Preview the first rows, including the newly added feature columns
print(data.head(5))

In [None]:
# Resampling using SMOTE, to handle class imbalance, improve model performance,
# and ensure the model is not biased towards the majority class
# but may increase the risk of overfitting, need to compare with scale_pos_weight
x = data.drop(['Class', 'Time'], axis=1)
y = data['Class']
smote = SMOTE(random_state=42)
x_resampled, y_resampled = smote.fit_resample(x, y)
print(f"resampled class distribution:\n{y_resampled.value_counts()}")
x_resampled.to_csv('x_resampled.csv', index=False)
y_resampled.to_csv('y_resampled.csv', index=False)

In [None]:
# 1. load smote data
X = pd.read_csv('x_resampled.csv')
y = pd.read_csv('y_resampled.csv').values.ravel()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("SMOTE train shape：", X_train.shape)
print("SMOTE test shape：", X_test.shape)

# 2. smote model training and evaluation
model_smote = XGBClassifier(random_state=42)  # default：n_estimators=100, max_depth=6, learning_rate=0.3
model_smote.fit(X_train, y_train)
y_pred_smote = model_smote.predict(X_test)
print("\nsolution1：SMOTE（default） - report：")
print(classification_report(y_test, y_pred_smote))
print(f"solution1 AUC-ROC: {roc_auc_score(y_test, model_smote.predict_proba(X_test)[:, 1])}")
# check if overfitting
print("SMOTE train F1：", f1_score(y_train, model_smote.predict(X_train)))
print("SMOTE test F1：", f1_score(y_test, y_pred_smote))



In [None]:
# 3. solution2: original (non-resampled) data + scale_pos_weight
# NOTE(review): unlike the SMOTE pipeline, this reloads the CSV without the
# `Amount > 0` filter, so the two solutions are trained on slightly different
# row sets. Confirm whether that is intended.
data = pd.read_csv('creditcard.csv')
data['Hour'] = (data['Time'] // 3600) % 24
data['Is_Night'] = data['Hour'].between(0, 6).astype('int64')
data['Amount_Scaled'] = StandardScaler().fit_transform(data[['Amount']])
# Flag large transactions from the RAW amount, using the same `Amount > 300`
# rule as the feature-engineering cell. Thresholding the standardized column at
# 500 compared a z-score with a currency value and left the feature at 0 for
# essentially every row.
data['Amount_high'] = (data['Amount'] > 300).astype('int64')
X_orig = data.drop(['Class', 'Time'], axis=1)
y_orig = data['Class']

X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X_orig, y_orig, test_size=0.2, random_state=42)
# Ratio of negative (Class 0) to positive (Class 1) training samples
fraud_ratio = len(y_train_orig[y_train_orig == 0]) / len(y_train_orig[y_train_orig == 1])
# scale_pos_weight is deliberately set to twice the class ratio to weight the
# positive (fraud) class even more heavily.
model_spw = XGBClassifier(scale_pos_weight=2*fraud_ratio, random_state=42)
model_spw.fit(X_train_orig, y_train_orig)
y_pred_spw = model_spw.predict(X_test_orig)
print("\nsolution2：scale_pos_weight（default） - report：")
print(classification_report(y_test_orig, y_pred_spw))
print(f"solution2 AUC-ROC: {roc_auc_score(y_test_orig, model_spw.predict_proba(X_test_orig)[:, 1])}")
# Overfitting check: compare train and test F1
print("scale_pos_weight train F1：", f1_score(y_train_orig, model_spw.predict(X_train_orig)))
print("scale_pos_weight test F1：", f1_score(y_test_orig, y_pred_spw))

In [None]:
# 4. Hyper-parameter optimization with RandomizedSearchCV, scored on recall
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [3, 4, 5, 6],
    # scipy's uniform(loc, scale) samples from [loc, loc + scale] = [0.01, 0.31]
    'learning_rate': uniform(0.01, 0.3),
}
# random_state pins the 20 sampled candidates so the search (and therefore the
# selected model) is reproducible between runs, like the rest of the notebook.
random_search = RandomizedSearchCV(
    XGBClassifier(random_state=42),
    param_grid,
    n_iter=20,
    cv=5,
    scoring='recall',
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)
print("\nBEST prarmater（SMOTE，optimize recall）：", random_search.best_params_)
print("Best Recall score：", random_search.best_score_)
# Best candidate, refit on the whole training set (RandomizedSearchCV default refit=True)
model_best = random_search.best_estimator_

In [None]:
# Evaluate the tuned model with a lowered decision threshold (0.3 instead of
# the default 0.5 used by model_best.predict), trading precision for recall.
decision_threshold = 0.3
test_proba = model_best.predict_proba(X_test)[:, 1]
train_proba = model_best.predict_proba(X_train)[:, 1]
y_pred_best = (test_proba > decision_threshold).astype(int)
y_pred_train_best = (train_proba > decision_threshold).astype(int)

print("\noptimized model（SMOTE，threshold 0.3） - report：")
print(classification_report(y_test, y_pred_best))
print(f"optimized AUC-ROC: {roc_auc_score(y_test, test_proba)}")
# Overfitting check. Both F1 scores use the SAME threshold; comparing a
# 0.5-threshold train score with a 0.3-threshold test score would mix the
# threshold effect into the train/test gap.
print("Optimized train F1：", f1_score(y_train, y_pred_train_best))
print("Optimized test F1：", f1_score(y_test, y_pred_best))

In [None]:
# 5. Feature importance of the tuned model: bar chart of the ten highest
# scores, full table exported to CSV.
feature_names = X_train.columns
feature_scores = model_best.feature_importances_
importance = pd.Series(feature_scores, index=feature_names).sort_values(ascending=False)
top_features = importance[:10]

fig, ax = plt.subplots(figsize=(10, 6))
top_features.plot(kind='bar', ax=ax)
ax.set_title('feature importance（XGBoost - SMOTE optimized）')
ax.set_xlabel('feature')
ax.set_ylabel('importance')
fig.savefig('feature_importance.png')
# The figure is only written to disk, so close it to free memory
plt.close(fig)

# The CSV keeps every feature in the original column order (not sorted)
importance_table = pd.DataFrame({'Feature': feature_names, 'Importance': feature_scores})
importance_table.to_csv('feature_importance.csv', index=False)

print("\nTop 10 importance：")
print(top_features)

In [None]:
# 6. Save the model. The estimator selected by the recall-scored random search
# is the one persisted to disk.
final_model = model_best

joblib.dump(value=final_model, filename='fraud_model.pkl')


In [None]:
# Export the per-column means of the raw CSV features (Class and Time removed)
# as a JSON mapping of column name -> mean.
data = pd.read_csv("creditcard.csv")
X = data.drop(['Class', 'Time'], axis=1)
column_means = X.mean().to_dict()
with open("X_mean.json", "w") as mean_file:
    json.dump(column_means, mean_file)
