In [5]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import mlflow
import mlflow.sklearn

# --- MLflow configuration: tracking server address and target experiment ---
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("fraud_detection_hyperparameter_tuning")

# --- Data: read the CSV and separate the 'Class' label from the features ---
data_path = r"C:\Job_og_eksamensbevis\Github\projekter\RF_project\data\creditcard.csv"
df = pd.read_csv(data_path)

y = df['Class']
X = df.drop(columns='Class')

# Hold out 20% of the rows as a test set, stratified on the label. The other
# 80% (train + validation) is what the grid search is fitted on.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# --- Model: SMOTE oversampling followed by a random forest, as one pipeline ---
pipeline = Pipeline(
    steps=[
        ('smote', SMOTE(random_state=42)),
        ('rf', RandomForestClassifier(n_jobs=-1, random_state=42)),
    ]
)

# --- Hyperparameter grid; the 'rf__' prefix targets the 'rf' pipeline step ---
# 2 x 3 = 6 candidate combinations are searched.
# Currently disabled candidates, kept for reference:
#   'rf__min_samples_split': [2, 5]
#   'rf__min_samples_leaf': [1, 2]
#   'rf__max_features': ['auto', 'sqrt']
# NOTE(review): 'auto' may no longer be accepted for max_features by recent
# scikit-learn releases - verify before re-enabling.
param_grid = {
    'rf__n_estimators': [100, 150],
    'rf__max_depth': [10, 20, None],
}

# --- Cross-validation: 5 shuffled, stratified folds with a fixed seed ---
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# --- Grid search over the pipeline, scored on F1, using all CPU cores ---
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=skf,
    scoring='f1',
    n_jobs=-1,
    return_train_score=True,
    verbose=2,
)

In [None]:
# Run the cross-validated grid search on the train/validation split.
grid_search.fit(X_train_val, y_train_val)

print(f"Best params: {grid_search.best_params_}")
print(f"Best CV F1-score: {grid_search.best_score_:.4f}")

# Log the search to MLflow as one parent run with one nested child run per
# hyperparameter combination. Each child gets its own params and its own
# mean/std of the cross-validated F1 score.
cv_results = grid_search.cv_results_
with mlflow.start_run(run_name="GridSearchCV"):
    for i, params in enumerate(cv_results['params']):
        with mlflow.start_run(run_name=f"params_run_{i}", nested=True):
            mlflow.log_params(params)
            mlflow.log_metric("mean_test_f1", float(cv_results['mean_test_score'][i]))
            mlflow.log_metric("std_test_f1", float(cv_results['std_test_score'][i]))

    # Log the best model exactly once, inside the parent run.
    # Previously this call sat inside the for-loop but outside any `with`
    # block, so the model was re-logged for every candidate and was not tied
    # to an explicitly opened run. Depending on the MLflow version, that
    # implicitly opens a run that is never closed, which makes the next
    # start_run() fail.
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric("best_cv_f1", float(grid_search.best_score_))
    mlflow.sklearn.log_model(grid_search.best_estimator_, artifact_path="best_rf_model")

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [None]:
# --- Evaluation on the held-out test set ---
# Uses the best estimator found by the grid search.
best_model = grid_search.best_estimator_
y_test_pred = best_model.predict(X_test)

# Precision, recall and F1, computed in that order with the default settings
# of each scikit-learn scorer.
precision, recall, f1 = (
    score_fn(y_test, y_test_pred)
    for score_fn in (precision_score, recall_score, f1_score)
)

# Human-readable summary of the test-set performance.
print("\n=== Endelig evaluering på test-sæt ===")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-score:  {f1:.4f}")
print("Classification report:")
print(classification_report(y_test, y_test_pred))

# Record the test metrics and the evaluated model in a dedicated MLflow run.
final_metrics = {
    "test_precision": precision,
    "test_recall": recall,
    "test_f1_score": f1,
}
with mlflow.start_run(run_name="Final_model_evaluation"):
    for metric_name, metric_value in final_metrics.items():
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(best_model, artifact_path="final_rf_model")


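Nedenstående kode gør det samme som alt ovenfor, blot samlet i én celle (bemærk dog, at `rf__max_depth` er udkommenteret i parametergitteret her, så kun `rf__n_estimators` afsøges).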

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import mlflow
import mlflow.sklearn

# --- MLflow configuration: tracking server address and target experiment ---
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("fraud_detection_hyperparameter_tuning")

# --- Data: read the CSV and separate the 'Class' label from the features ---
data_path = r"C:\Job_og_eksamensbevis\Github\projekter\RF_project\data\creditcard.csv"
df = pd.read_csv(data_path)

y = df['Class']
X = df.drop(columns='Class')

# Hold out 20% of the rows as a test set, stratified on the label. The other
# 80% (train + validation) is what the grid search is fitted on.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# --- Model: SMOTE oversampling followed by a random forest, as one pipeline ---
pipeline = Pipeline(
    steps=[
        ('smote', SMOTE(random_state=42)),
        ('rf', RandomForestClassifier(n_jobs=-1, random_state=42)),
    ]
)

# --- Hyperparameter grid; the 'rf__' prefix targets the 'rf' pipeline step ---
# Only the number of trees is searched here (2 candidates).
# Currently disabled candidates, kept for reference:
#   'rf__max_depth': [10, 20, None]
#   'rf__min_samples_split': [2, 5]
#   'rf__min_samples_leaf': [1, 2]
#   'rf__max_features': ['auto', 'sqrt']
# NOTE(review): 'auto' may no longer be accepted for max_features by recent
# scikit-learn releases - verify before re-enabling.
param_grid = {
    'rf__n_estimators': [100, 150],
}

# --- Cross-validation: 5 shuffled, stratified folds with a fixed seed ---
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# --- Grid search over the pipeline, scored on F1, using all CPU cores ---
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=skf,
    scoring='f1',
    n_jobs=-1,
    return_train_score=True,
    verbose=2,
)
# Run the cross-validated grid search on the train/validation split.
grid_search.fit(X_train_val, y_train_val)

print(f"Best params: {grid_search.best_params_}")
print(f"Best CV F1-score: {grid_search.best_score_:.4f}")

# Log the search to MLflow as one parent run with one nested child run per
# hyperparameter combination.
# Previously every candidate was logged into the same run. MLflow params are
# immutable within a run, so logging 'rf__n_estimators' a second time with a
# different value raises, and the two metrics were overwritten per key. A
# nested run per candidate keeps each combination's params and scores apart.
cv_results = grid_search.cv_results_
with mlflow.start_run(run_name="GridSearchCV"):
    for i, params in enumerate(cv_results['params']):
        with mlflow.start_run(run_name=f"params_run_{i}", nested=True):
            mlflow.log_params(params)
            mlflow.log_metric("mean_test_f1", float(cv_results['mean_test_score'][i]))
            mlflow.log_metric("std_test_f1", float(cv_results['std_test_score'][i]))

    # Log the best configuration and the refitted best model on the parent run.
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric("best_cv_f1", float(grid_search.best_score_))
    mlflow.sklearn.log_model(grid_search.best_estimator_, artifact_path="best_rf_model")
    
# --- Evaluation on the held-out test set ---
# Uses the best estimator found by the grid search.
best_model = grid_search.best_estimator_
y_test_pred = best_model.predict(X_test)

# Precision, recall and F1, computed in that order with the default settings
# of each scikit-learn scorer.
precision, recall, f1 = (
    score_fn(y_test, y_test_pred)
    for score_fn in (precision_score, recall_score, f1_score)
)

# Human-readable summary of the test-set performance.
print("\n=== Endelig evaluering på test-sæt ===")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-score:  {f1:.4f}")
print("Classification report:")
print(classification_report(y_test, y_test_pred))

# Record the test metrics and the evaluated model in a dedicated MLflow run.
final_metrics = {
    "test_precision": precision,
    "test_recall": recall,
    "test_f1_score": f1,
}
with mlflow.start_run(run_name="Final_model_evaluation"):
    for metric_name, metric_value in final_metrics.items():
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(best_model, artifact_path="final_rf_model")