In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import mlflow
import mlflow.sklearn

# --- MLflow: tracking server and experiment used by every run in this notebook ---
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("fraud_detection_hyperparameter_tuning")

# --- Data: read the CSV and separate the 'Class' target from the feature columns ---
data_path = r"C:\Job_og_eksamensbevis\Github\projekter\RF_project\data\creditcard.csv"
df = pd.read_csv(data_path)

y = df['Class']
X = df.drop(columns='Class')

# Hold out 20% of the rows as a test set; stratify on the target so both
# partitions keep the same class proportions.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# --- Model: SMOTE oversampling followed by a random forest ---
# NOTE(review): the forest and the grid search below both use n_jobs=-1;
# confirm this nested parallelism does not oversubscribe the machine.
pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42, n_jobs=-1)),
])

# --- Search space: 2 values per parameter -> 2**5 = 32 candidates ---
# Keys use the "<step>__<parameter>" form to address the 'rf' pipeline step.
param_grid = dict(
    rf__n_estimators=[50, 150],
    rf__max_depth=[20, None],
    rf__min_samples_split=[2, 6],
    rf__min_samples_leaf=[1, 2],
    rf__max_features=['sqrt', None],
)

# --- Cross-validation: 5 shuffled, stratified folds with a fixed seed ---
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# --- Grid search over the pipeline, scored by F1 ---
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=skf,
    scoring='f1',
    return_train_score=True,
    n_jobs=-1,
    verbose=2,
)

In [4]:
# Run the cross-validated grid search on the train/validation split.
grid_search.fit(X_train_val, y_train_val)

print(f"Best params: {grid_search.best_params_}")
print(f"Best CV F1-score: {grid_search.best_score_:.4f}")

# One MLflow run per hyperparameter combination, holding that combination's
# parameters plus the mean and standard deviation of its test-fold score.
cv_results = grid_search.cv_results_
for i, params in enumerate(cv_results['params']):
    with mlflow.start_run(run_name=f"CV_Combo_{i}"):
        mlflow.log_params(params)
        mlflow.log_metric("mean_test_f1", cv_results['mean_test_score'][i])
        mlflow.log_metric("std_test_f1", cv_results['std_test_score'][i])

# Log the best model exactly once, after the loop and inside its own run.
# Doing this per candidate would upload the same estimator repeatedly, and
# calling log_model with no explicitly started run can leave an implicit run
# active that makes the next mlflow.start_run() fail.
with mlflow.start_run(run_name="Best_GridSearch_Model"):
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric("best_cv_f1", grid_search.best_score_)
    mlflow.sklearn.log_model(grid_search.best_estimator_, artifact_path="best_rf_model")

Fitting 5 folds for each of 32 candidates, totalling 160 fits


KeyboardInterrupt: 

In [None]:
# --- Evaluation on the held-out test set ---
# Predict with the best estimator found by the grid search.
best_model = grid_search.best_estimator_
y_test_pred = best_model.predict(X_test)

# Precision, recall and F1, each computed from the same
# (true labels, predicted labels) pair.
precision, recall, f1 = (
    scorer(y_test, y_test_pred)
    for scorer in (precision_score, recall_score, f1_score)
)

# Human-readable summary (the printed text itself is kept in Danish).
print("\n=== Endelig evaluering på test-sæt ===")
print("Confusion Matrix:", confusion_matrix(y_test, y_test_pred), sep="\n")
print(
    f"Precision: {precision:.4f}",
    f"Recall:    {recall:.4f}",
    f"F1-score:  {f1:.4f}",
    sep="\n",
)
print("Classification report:", classification_report(y_test, y_test_pred), sep="\n")

# Store the test metrics and the evaluated model in a dedicated MLflow run.
with mlflow.start_run(run_name="Final_model_evaluation"):
    for metric_name, metric_value in (
        ("test_precision", precision),
        ("test_recall", recall),
        ("test_f1_score", f1),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(best_model, artifact_path="final_rf_model")


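Nedenstående celle udfører de samme trin som cellerne ovenfor, blot samlet i én celle. Bemærk dog, at den indlæser `creditcard_preprocessed.csv`, bruger et mindre hyperparameter-grid og kun logger den bedste model til MLflow.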

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import mlflow
import mlflow.sklearn

# MLflow: point at the local tracking server and select the experiment.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("fraud_detection_hyperparameter_tuning")

# Load the CSV, then split it into the 'Class' target and the remaining feature columns.
data_path = r"C:\Job_og_eksamensbevis\Github\projekter\RF_project\data\creditcard_preprocessed.csv"
df = pd.read_csv(data_path)

y = df['Class']
X = df.drop(columns='Class')

# Stratified 80/20 split; the 20% test part is only used for the final evaluation.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Pipeline: SMOTE oversampling, then a random forest classifier.
smote_step = ('smote', SMOTE(random_state=42))
forest_step = ('rf', RandomForestClassifier(random_state=42, n_jobs=-1))
pipeline = Pipeline([smote_step, forest_step])

# Hyperparameter grid for GridSearchCV (only n_estimators varies here).
# The trailing notes and commented-out entries are the author's earlier value ranges.
param_grid = {
    'rf__n_estimators': [100, 200],  # 50 - 200
    'rf__max_depth': [None],  # 10, 20, None
    # 'rf__min_samples_split': [2, 6], # 2-6
    # 'rf__min_samples_leaf': [1, 2],
    'rf__max_features': ['sqrt'],
}

# Stratified 5-fold cross-validation, shuffled with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Grid search over the pipeline, scored by F1.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=skf,
    scoring='f1',
    return_train_score=True,
    n_jobs=-1,
    verbose=2,
)

# Run the grid search on the train/validation split.
grid_search.fit(X_train_val, y_train_val)

# Report the winning parameter combination and its cross-validated score.
print(
    f"Best params: {grid_search.best_params_}",
    f"Best CV F1-score: {grid_search.best_score_:.4f}",
    sep="\n",
)

# Log only the best candidate to MLflow: its hyperparameters, its CV score
# and the best estimator itself.
with mlflow.start_run(run_name="Best_GridSearch_Model"):
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric("best_cv_f1", grid_search.best_score_)
    mlflow.sklearn.log_model(
        grid_search.best_estimator_,
        artifact_path="best_rf_model",
    )

# --- Evaluation on the test set ---
best_model = grid_search.best_estimator_
y_test_pred = best_model.predict(X_test)

# Binary classification metrics on the held-out data.
f1 = f1_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)

# Printed report (output text intentionally left in Danish).
print("\n=== Endelig evaluering på test-sæt ===")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))
print(
    f"Precision: {precision:.4f}",
    f"Recall:    {recall:.4f}",
    f"F1-score:  {f1:.4f}",
    sep="\n",
)
print("Classification report:")
print(classification_report(y_test, y_test_pred))

# Log the test-set evaluation in a separate run, together with the evaluated
# model under the "final_rf_model" artifact path.
with mlflow.start_run(run_name="Final_model_evaluation"):
    for metric_name, metric_value in (
        ("test_precision", precision),
        ("test_recall", recall),
        ("test_f1_score", f1),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(best_model, artifact_path="final_rf_model")


Fitting 5 folds for each of 4 candidates, totalling 20 fits




Best params: {'rf__max_depth': None, 'rf__max_features': 'sqrt', 'rf__n_estimators': 150}
Best CV F1-score: 0.8518




🏃 View run Best_GridSearch_Model at: http://127.0.0.1:5000/#/experiments/4/runs/c5e0e63e9ac347c39b0967f90ad7d218
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/4

=== Endelig evaluering på test-sæt ===
Confusion Matrix:
[[56848    16]
 [   17    81]]
Precision: 0.8351
Recall:    0.8265
F1-score:  0.8308
Classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.84      0.83      0.83        98

    accuracy                           1.00     56962
   macro avg       0.92      0.91      0.92     56962
weighted avg       1.00      1.00      1.00     56962





🏃 View run Final_model_evaluation at: http://127.0.0.1:5000/#/experiments/4/runs/4961c96012e349708a9e66e809e9366d
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/4


In [11]:
# Register a logged model in the MLflow Model Registry under `model_name`.
# (A "challenger" or "champion" alias can be added to the model version afterwards.)
model_name = 'Final_model_evaluation'

# The run ID is typed or pasted by the user: strip stray whitespace so the
# model URI is well-formed, and fail early on an empty entry instead of
# sending an invalid URI to the tracking server.
run_id = input('Please type RunID').strip()
if not run_id:
    raise ValueError("No run ID entered; cannot build the model URI.")

# Points at the "final_rf_model" artifact logged by the evaluation run.
model_uri = f'runs:/{run_id}/final_rf_model'

# register_model only needs the model URI and a name; the finished run is
# deliberately not re-opened with start_run(run_id=...) just to make this call.
mlflow.register_model(model_uri=model_uri, name=model_name)

Registered model 'Final_model_evaluation' already exists. Creating a new version of this model...
2025/07/15 13:20:26 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Final_model_evaluation, version 1


🏃 View run Final_model_evaluation at: http://127.0.0.1:5000/#/experiments/4/runs/4961c96012e349708a9e66e809e9366d
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/4


Created version '1' of model 'Final_model_evaluation'.
