In [1]:
import os

import joblib
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold, train_test_split

# Load data
# The CSV location can be overridden through the CREDITCARD_DATA_PATH
# environment variable so the notebook is not tied to one machine; when the
# variable is unset the original local path is used unchanged.
data_path = os.environ.get(
    "CREDITCARD_DATA_PATH",
    r"C:\Job_og_eksamensbevis\Github\projekter\RF_project\data\creditcard_preprocessed.csv",
)
df = pd.read_csv(data_path)

# Features are every column except the 'Class' label column
X = df.drop(columns='Class')
y = df['Class']

# Split the data into train+validation and test (20% test, stratified on y)
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

# Stratified K-Fold on the train+validation portion (80%)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold scores, collected so they can be averaged after the loop
precisions, recalls, f1s = [], [], []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_val, y_train_val), start=1):
    # Positional row selection for this fold's training and validation parts
    X_train = X_train_val.iloc[train_idx]
    y_train = y_train_val.iloc[train_idx]
    X_val = X_train_val.iloc[val_idx]
    y_val = y_train_val.iloc[val_idx]

    # Oversampling is fitted on the training part only; the validation
    # part is passed to predict() as-is
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    rf.fit(X_train_resampled, y_train_resampled)
    y_val_pred = rf.predict(X_val)

    precision, recall, f1 = (
        metric(y_val, y_val_pred)
        for metric in (precision_score, recall_score, f1_score)
    )
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)

    # One print call with a newline separator emits the same lines as
    # printing each item separately
    print(
        f"\nFold {fold}:",
        "Confusion Matrix:",
        confusion_matrix(y_val, y_val_pred),
        f"Precision: {precision:.4f}",
        f"Recall:    {recall:.4f}",
        f"F1-score:  {f1:.4f}",
        "Classification report:",
        classification_report(y_val, y_val_pred),
        sep="\n",
    )

# Arithmetic mean of each metric across the folds
mean_precision = sum(precisions) / len(precisions)
mean_recall = sum(recalls) / len(recalls)
mean_f1 = sum(f1s) / len(f1s)

print("\n=== Gennemsnitlig performance over 5 folds ===")
print(f"Gns. Precision: {mean_precision:.4f}")
print(f"Gns. Recall:    {mean_recall:.4f}")
print(f"Gns. F1-score:  {mean_f1:.4f}")

# --- Final evaluation on the hold-out test set ---

# Refit on the complete train+validation data; SMOTE is applied to that
# data only, and the test set is passed to predict() without resampling
smote = SMOTE(random_state=42)
X_train_val_resampled, y_train_val_resampled = smote.fit_resample(X_train_val, y_train_val)

final_rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
final_rf.fit(X_train_val_resampled, y_train_val_resampled)

# Predict on the independent test set
y_test_pred = final_rf.predict(X_test)

test_precision = precision_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

# One print call with a newline separator emits the same lines as
# printing each item separately
print(
    "\n=== Endelig evaluering på test-sæt ===",
    "Confusion Matrix:",
    confusion_matrix(y_test, y_test_pred),
    f"Precision: {test_precision:.4f}",
    f"Recall:    {test_recall:.4f}",
    f"F1-score:  {test_f1:.4f}",
    "Classification report:",
    classification_report(y_test, y_test_pred),
    sep="\n",
)

# Save the trained model to "rf_model.pkl" (relative to the current working
# directory). The dump call is left as the final expression so the notebook
# still displays the list of written file names.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
joblib.dump(final_rf, "rf_model.pkl")






Fold 1:
Confusion Matrix:
[[45479    12]
 [   15    63]]
Precision: 0.8400
Recall:    0.8077
F1-score:  0.8235
Classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     45491
           1       0.84      0.81      0.82        78

    accuracy                           1.00     45569
   macro avg       0.92      0.90      0.91     45569
weighted avg       1.00      1.00      1.00     45569






Fold 2:
Confusion Matrix:
[[45484     6]
 [   21    58]]
Precision: 0.9062
Recall:    0.7342
F1-score:  0.8112
Classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     45490
           1       0.91      0.73      0.81        79

    accuracy                           1.00     45569
   macro avg       0.95      0.87      0.91     45569
weighted avg       1.00      1.00      1.00     45569






Fold 3:
Confusion Matrix:
[[45485     5]
 [   20    59]]
Precision: 0.9219
Recall:    0.7468
F1-score:  0.8252
Classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     45490
           1       0.92      0.75      0.83        79

    accuracy                           1.00     45569
   macro avg       0.96      0.87      0.91     45569
weighted avg       1.00      1.00      1.00     45569






Fold 4:
Confusion Matrix:
[[45477    13]
 [    8    71]]
Precision: 0.8452
Recall:    0.8987
F1-score:  0.8712
Classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     45490
           1       0.85      0.90      0.87        79

    accuracy                           1.00     45569
   macro avg       0.92      0.95      0.94     45569
weighted avg       1.00      1.00      1.00     45569






Fold 5:
Confusion Matrix:
[[45484     6]
 [   14    65]]
Precision: 0.9155
Recall:    0.8228
F1-score:  0.8667
Classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     45490
           1       0.92      0.82      0.87        79

    accuracy                           1.00     45569
   macro avg       0.96      0.91      0.93     45569
weighted avg       1.00      1.00      1.00     45569


=== Gennemsnitlig performance over 5 folds ===
Gns. Precision: 0.8858
Gns. Recall:    0.8020
Gns. F1-score:  0.8395





=== Endelig evaluering på test-sæt ===
Confusion Matrix:
[[56853    11]
 [   18    80]]
Precision: 0.8791
Recall:    0.8163
F1-score:  0.8466
Classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.88      0.82      0.85        98

    accuracy                           1.00     56962
   macro avg       0.94      0.91      0.92     56962
weighted avg       1.00      1.00      1.00     56962



['rf_model.pkl']