In [2]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import mlflow
import mlflow.sklearn

# === Output-mappe ===
output_dir = "outputs"
os.makedirs(output_dir, exist_ok=True)

# === MLflow setup ===
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("fraud_detection_hyperparameter_tuning")

# === Load data ===
data_path = r"C:\Job_og_eksamensbevis\Github\projekter\RF_project\data\creditcard_preprocessed.csv"
df = pd.read_csv(data_path)
X = df.drop('Class', axis=1)
y = df['Class']

X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

# === Pipeline med SMOTE + RandomForest ===
pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42, n_jobs=-1))
])

# === Grid af hyperparametre ===
param_grid = {
    'rf__n_estimators': [150], # 50 - 200
    'rf__max_depth': [None], # 10, 20, None
    'rf__min_samples_split': [2, 6], # 2-6
    'rf__min_samples_leaf': [1, 2],
    'rf__max_features': ['sqrt']
}

# === Stratified K-Fold ===
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# === GridSearchCV ===
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=skf,
    verbose=2,
    n_jobs=-1,
    return_train_score=True
)

# === Fit model ===
grid_search.fit(X_train_val, y_train_val)

# === Gem bedste parametre og CV-score ===
best_params = grid_search.best_params_
best_cv_score = grid_search.best_score_

with open(os.path.join(output_dir, "best_model_info.txt"), "w") as f:
    f.write("Best Hyperparameters:\n")
    f.write(str(best_params) + "\n\n")
    f.write(f"Best Cross-Validated F1-score: {best_cv_score:.4f}\n")

# === MLflow log af bedste model ===
with mlflow.start_run(run_name="Best_GridSearch_Model"):
    mlflow.log_params(best_params)
    mlflow.log_metric("best_cv_f1", best_cv_score)
    mlflow.sklearn.log_model(grid_search.best_estimator_, artifact_path="best_rf_model")

# === Evaluering på test-sæt ===
best_model = grid_search.best_estimator_
y_test_pred = best_model.predict(X_test)

precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)
clf_report = classification_report(y_test, y_test_pred)
cm = confusion_matrix(y_test, y_test_pred)

# === Gem performance metrics som tekst ===
with open(os.path.join(output_dir, "test_set_results.txt"), "w") as f:
    f.write("=== Final Evaluation on Test Set ===\n")
    f.write(f"Precision: {precision:.4f}\n")
    f.write(f"Recall:    {recall:.4f}\n")
    f.write(f"F1-score:  {f1:.4f}\n\n")
    f.write("Classification Report:\n")
    f.write(clf_report + "\n")

# === Gem confusion matrix som billede ===
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Not Fraud', 'Fraud'],
            yticklabels=['Not Fraud', 'Fraud'])
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.tight_layout()
plt.savefig(os.path.join(output_dir, "confusion_matrix.png"))
plt.close()

# === MLflow log af test-performance ===
with mlflow.start_run(run_name="Final_model_evaluation"):
    mlflow.log_metric("test_precision", precision)
    mlflow.log_metric("test_recall", recall)
    mlflow.log_metric("test_f1_score", f1)
    mlflow.sklearn.log_model(best_model, artifact_path="final_rf_model")


KeyboardInterrupt: 

In [2]:
# tilføj navn til den model jeg vil registrere (tilføj challenger eller champion i alias)
model_name = 'Final_model_evaluation'
run_id=input('Please type RunID')
model_uri = f'runs:/{run_id}/final_rf_model'

with mlflow.start_run(run_id=run_id):
    mlflow.register_model(model_uri=model_uri, name=model_name)

Registered model 'Final_model_evaluation' already exists. Creating a new version of this model...
2025/07/15 14:02:49 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Final_model_evaluation, version 2


🏃 View run Final_model_evaluation at: http://127.0.0.1:5000/#/experiments/4/runs/261625da33554440bac00f10cfdb71f3
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/4


Created version '2' of model 'Final_model_evaluation'.
