In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import mlflow
import mlflow.sklearn

# Load dataset
file_path = r"C:\Users\torjm\OneDrive\Bureau\Master Degree\first year\ML\project1\Project 1\cybersecurity-dashboard\Datasets\cyberdata_ip_city.csv"
cyberdata = pd.read_csv(file_path)

# Experiment settings, defined once so the values used for training and the
# values logged to MLflow cannot drift apart.
TARGET_COLUMN = "Attack Type"
TEST_SIZE = 0.2
RANDOM_STATE = 42
N_ESTIMATORS = 100
CV_FOLDS = 5

if TARGET_COLUMN in cyberdata.columns:
    # Only numeric columns are used as features; the target is dropped in case
    # it is itself numeric (errors='ignore' covers the non-numeric case).
    features = cyberdata.select_dtypes(include=['number']).drop(
        columns=[TARGET_COLUMN], errors='ignore'
    )
    target = cyberdata[TARGET_COLUMN]

    # Stratified hold-out split keeps the class proportions in both parts.
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE,
        stratify=target,
    )

    # The scaler is fitted on the training split only and then applied to the
    # test split, so no test-set statistics leak into training.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    rf_estimator = RandomForestClassifier(
        n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE
    )
    cv_strategy = StratifiedKFold(
        n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_STATE
    )

    mlflow.set_experiment("CyberAttackDetection")

    with mlflow.start_run(run_name="RandomForest_RFECV"):
        # Recursive feature elimination, dropping one feature per step and
        # scoring each candidate subset by cross-validated accuracy.
        rfecv = RFECV(
            estimator=rf_estimator,
            step=1,
            cv=cv_strategy,
            scoring='accuracy',
            n_jobs=-1,
        )
        rfecv.fit(X_train_scaled, y_train)

        print("Optimal number of features:", rfecv.n_features_)

        # Keep only the columns RFECV selected.
        X_train_selected = rfecv.transform(X_train_scaled)
        X_test_selected = rfecv.transform(X_test_scaled)

        # Train a fresh forest on the reduced feature set and evaluate it on
        # the held-out split.
        final_model = RandomForestClassifier(
            n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE
        )
        final_model.fit(X_train_selected, y_train)

        y_pred = final_model.predict(X_test_selected)
        accuracy = accuracy_score(y_test, y_pred)
        classification_rep = classification_report(y_test, y_pred)
        # NOTE(review): the recorded run of this cell reports ~0.33 accuracy on
        # three roughly equal-sized classes, which looks like chance level --
        # confirm whether the numeric columns alone carry usable signal.

        # Log parameters
        mlflow.log_param("model_type", "RandomForest")
        mlflow.log_param("feature_selection", "RFECV")
        mlflow.log_param("n_estimators", N_ESTIMATORS)
        mlflow.log_param("n_features_selected", rfecv.n_features_)

        # Log metrics
        mlflow.log_metric("accuracy", accuracy)

        # Log model
        mlflow.sklearn.log_model(final_model, "model")

        # Log selected features (support_ is a boolean mask aligned with the
        # column order of `features`).
        selected_features = features.columns[rfecv.support_].tolist()
        # Explicit UTF-8 so the artifact does not depend on the platform's
        # default locale encoding.
        with open("selected_features_rf.txt", "w", encoding="utf-8") as f:
            f.writelines(f"{feat}\n" for feat in selected_features)
        mlflow.log_artifact("selected_features_rf.txt")

        # Log classification report
        with open("classification_report_rf.txt", "w", encoding="utf-8") as f:
            f.write(classification_rep)
        mlflow.log_artifact("classification_report_rf.txt")

        print("Random Forest model with RFECV logged to MLflow")
        print("Accuracy:", accuracy)
        print("Classification Report:\n", classification_rep)

else:
    print(f"Error: '{TARGET_COLUMN}' does not exist in dataset.")


Optimal number of features: 8




✅ Random Forest model with RFECV logged to MLflow
Accuracy: 0.329375
Classification Report:
               precision    recall  f1-score   support

           0       0.33      0.35      0.34      2686
           1       0.32      0.32      0.32      2653
           2       0.33      0.31      0.32      2661

    accuracy                           0.33      8000
   macro avg       0.33      0.33      0.33      8000
weighted avg       0.33      0.33      0.33      8000

