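Her er en fuldt integreret løsning, som bevarer SMOTE-oversampling i træningsfoldene og samtidig bruger GridSearchCV med F1-score som scoringsmetrik. SMOTE kan ikke gives direkte til GridSearchCV som et selvstændigt trin, så vi pakker SMOTE og Random Forest-modellen ind i en imblearn-Pipeline; dermed anvendes oversamplingen kun på træningsdelen af hver fold, mens valideringsdelen forbliver urørt.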

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline  # NB: from imblearn, not sklearn!
# from tqdm.notebook import tqdm  # was fused into the comment above; unused in this cell

# --- Data loading ---
# Read the credit-card dataset. The 'Class' column is used as the target
# and every remaining column as a feature.
data_path = r"C:\Job_og_eksamensbevis\Github\projekter\RF_project\data\creditcard.csv"
df = pd.read_csv(data_path)

y = df['Class']
X = df.drop(columns='Class')

# --- Hold-out split ---
# 80 % is used for model selection (cross-validated below) and 20 % is kept
# aside as the final test set. The split is stratified on y.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# --- Pipeline: SMOTE followed by a random forest ---
# The imblearn Pipeline (not sklearn's) is required because SMOTE is a
# sampler: it must run only while fitting, i.e. on the training part of each
# cross-validation fold, never on the data being predicted.
smote_step = ('smote', SMOTE(random_state=42))
forest_step = ('rf', RandomForestClassifier(n_jobs=-1, random_state=42))
pipeline = Pipeline(steps=[smote_step, forest_step])

# --- Hyperparameter grid ---
# The 'rf__' prefix routes each parameter to the 'rf' pipeline step.
# 2 * 3 * 2 * 2 * 2 = 48 candidate combinations.
param_grid = {
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [None, 10, 20],
    'rf__min_samples_split': [2, 5],
    'rf__min_samples_leaf': [1, 2],
    'rf__max_features': ['sqrt', 'log2'],
}

# --- Grid search with stratified 5-fold cross-validation, scored by F1 ---
# NOTE(review): both the forest and the grid search request n_jobs=-1;
# check scikit-learn's notes on nested parallelism / oversubscription if
# this runs slowly or exhausts memory.
cv_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=cv_folds,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train_val, y_train_val)

# --- Best parameters and cross-validated score ---
print("\n=== Bedste parametre fundet af GridSearchCV ===")
print(grid_search.best_params_)
print(f"Bedste F1-score på valideringsfolds: {grid_search.best_score_:.4f}")

# --- Final evaluation on the untouched test set ---
# best_estimator_ is the pipeline refitted with the best parameters.
best_model = grid_search.best_estimator_
y_test_pred = best_model.predict(X_test)

print("\n=== Endelig evaluering på test-sæt ===")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))
print(f"Precision: {precision_score(y_test, y_test_pred):.4f}")
print(f"Recall:    {recall_score(y_test, y_test_pred):.4f}")
print(f"F1-score:  {f1_score(y_test, y_test_pred):.4f}")
print("Classification report:")
print(classification_report(y_test, y_test_pred))

Fitting 5 folds for each of 48 candidates, totalling 240 fits


min_samples_split: 2, 5, 10

min_samples_leaf: 1, 2, 5

max_features: 'sqrt', 'log2', None

bootstrap: True eller False

class_weight: {0: 1, 1: 5}

random_state: f.eks. 42

n_jobs: -1 = alle kerner

criterion: 'gini' (default) eller 'entropy', 'log_loss'