# Alles importen

In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from sklearn.model_selection import train_test_split, learning_curve
from xgboost.sklearn import XGBClassifier
from xgboost import plot_importance, to_graphviz
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, average_precision_score
import pickle
import warnings
warnings.filterwarnings("ignore")

# Secundaire data inladen:

In [2]:
df = pd.read_csv('Synthetic_Fraud_Dataset_md2.csv')
df.sample(15)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
7063,689,TRANSFER,79288.673556,C7692094644,84067.763568,4779.090012,M3269934594,165484.908651,244773.582207,0
27802,254,TRANSFER,3623.011255,C5641837462,49506.701301,45883.690046,M8104797644,118024.464764,121647.476019,0
11991,159,CASH_OUT,20919.891541,C8062442760,116551.192617,95631.301076,M5533710989,83040.755488,103960.647029,0
44647,628,PAYMENT,109706.218949,C4543753156,57115.62283,0.0,M7593759283,115816.253349,225522.472298,0
19445,143,PAYMENT,5124.61012,C7063627061,119183.193104,114058.582984,M9294615020,68557.685307,73682.295427,0
11437,280,CASH_OUT,14826.436383,C4191081138,114055.2769,99228.840518,M6142123586,149987.319794,164813.756177,0
9646,280,CASH_OUT,46584.331071,C5815991234,161959.993533,115375.662462,M6773026655,173017.657078,219601.988149,0
44508,274,CASH_OUT,95369.449515,C7533304417,81999.703995,81999.703995,M8781884948,98443.990685,98443.990685,1
10406,192,PAYMENT,110184.176857,C5185887733,105043.608001,0.0,M5918606153,81501.010434,191685.187291,0
32567,674,TRANSFER,72928.623658,C1321425564,52132.963853,0.0,M5591908781,257732.541038,330661.164695,0


# Onnodige data droppen en feature engineering toepassen:
## Feature engineering: bereken balansafwijkingen voor verzender en ontvanger. Deze waarden zouden nul moeten zijn als het saldo correct wordt bijgewerkt na de transactie. Niet-nul waarden kunnen wijzen op frauduleuze transacties.

In [3]:
# drop not important features
df = df.drop(['step', 'nameOrig', 'nameDest'], axis=1)

# feature engineering
df['errorBalanceOrig'] = df.newbalanceOrig + df.amount - df.oldbalanceOrg
df['errorBalanceDest'] = df.oldbalanceDest + df.amount - df.newbalanceDest

# encode categorical labels
encoder = LabelEncoder()
df["type"] = encoder.fit_transform(df["type"])

# define x en y
X = df.drop(columns=["isFraud"])  # Features
y = df["isFraud"]  # Goal feature

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# Opgeslagen beste XGBoost model laden en toepassen op een 5-foudige cross validation en scores opslaan

In [4]:
import pickle
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# Load the saved model & threshold
with open("fraud_detection_model_mandy_nieuw.pkl", "rb") as file:
    loaded_data = pickle.load(file)

loaded_model = loaded_data["model"]
loaded_threshold = loaded_data["threshold"]

# Force the model to use CPU (disable GPU)
loaded_model.set_params(tree_method='hist')  # Ensure XGBoost runs on CPU
print(f"Loaded Model with Saved Threshold: {loaded_threshold:.4f}")

# Perform 5-fold cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Store scores
precision_scores, recall_scores, f1_scores, roc_auc_scores = [], [], [], []

for train_idx, test_idx in cv.split(X_train, y_train):
    X_fold_train, X_fold_test = X_train.iloc[train_idx], X_train.iloc[test_idx]
    y_fold_train, y_fold_test = y_train.iloc[train_idx], y_train.iloc[test_idx]

    # Train on the current fold
    loaded_model.fit(X_fold_train, y_fold_train)

    # Predict probabilities
    y_fold_proba = loaded_model.predict_proba(X_fold_test)[:, 1]

    # Apply the saved threshold
    y_fold_pred = (y_fold_proba >= loaded_threshold).astype(int)

    # Calculate metrics
    precision_scores.append(precision_score(y_fold_test, y_fold_pred))
    recall_scores.append(recall_score(y_fold_test, y_fold_pred))
    f1_scores.append(f1_score(y_fold_test, y_fold_pred))
    roc_auc_scores.append(roc_auc_score(y_fold_test, y_fold_proba))  # ROC-AUC uses probabilities

# Print mean and standard deviation of scores
print(f"Precision: Mean = {np.mean(precision_scores):.4f}, Std = {np.std(precision_scores):.4f}")
print(f"Recall: Mean = {np.mean(recall_scores):.4f}, Std = {np.std(recall_scores):.4f}")
print(f"F1 Score: Mean = {np.mean(f1_scores):.4f}, Std = {np.std(f1_scores):.4f}")
print(f"ROC-AUC: Mean = {np.mean(roc_auc_scores):.4f}, Std = {np.std(roc_auc_scores):.4f}")




Loaded Model with Saved Threshold: 0.9974
Precision: Mean = 0.9181, Std = 0.0113
Recall: Mean = 0.8981, Std = 0.0235
F1 Score: Mean = 0.9079, Std = 0.0145
ROC-AUC: Mean = 0.9583, Std = 0.0088
