In [2]:
# model_train.py
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import joblib

# Load preprocessed features and target separately
X = pd.read_csv('KDDTrain+_features_processed.csv')
y_df = pd.read_csv('KDDTrain+_target_processed.csv')
y = y_df['attack_label']

# Split data into train and test sets (70/30) with stratification to preserve class distribution
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42)

# Initialize and train Random Forest Classifier
rf = RandomForestClassifier(n_estimators=300, max_depth=10, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

# Train Decision Tree Classifier as a baseline for comparison
baseline = DecisionTreeClassifier(random_state=42)
baseline.fit(X_train, y_train)

# Train Isolation Forest for anomaly detection (unsupervised).
# Fit on the training split only, like the two classifiers above: fitting on
# the full X would expose the detector to the held-out X_test rows, so any
# later evaluation of it on X_test would no longer be on unseen data.
iso_forest = IsolationForest(contamination=0.01, random_state=42)
iso_forest.fit(X_train)

# Evaluate Random Forest accuracy on test data
print("Random Forest Accuracy:", rf.score(X_test, y_test))

# Save the trained Random Forest model for later use (e.g., Flask app)
joblib.dump(rf, 'random_forest_model.joblib')

# Prediction function using trained RF model
def predict(features):
    """Classify a single sample with the module-level Random Forest ``rf``.

    Args:
        features: One sample's feature values as a flat sequence, given
            positionally in the same column order ``rf`` was fitted with.

    Returns:
        tuple: ``(label, confidence)`` where ``label`` is the class with the
        highest predicted probability and ``confidence`` is that probability.
    """
    row = list(features)

    # When rf was fitted on a DataFrame it records the column names; rebuild a
    # one-row frame with those names so inference input matches the fitted
    # schema instead of triggering scikit-learn's missing-feature-names warning.
    fitted_names = getattr(rf, "feature_names_in_", None)
    if fitted_names is None:
        sample = [row]
    else:
        sample = pd.DataFrame([row], columns=list(fitted_names))

    # One pass through the forest: the predicted label is the class with the
    # highest probability, so a separate rf.predict() call is redundant.
    class_probs = rf.predict_proba(sample)[0]
    best = max(range(len(class_probs)), key=class_probs.__getitem__)
    return rf.classes_[best], class_probs[best]


Random Forest Accuracy: 0.9933848433530906


In [3]:
# Score the fitted forest on the rows it was trained on and on the held-out
# rows, then report both so the two accuracies can be compared side by side.
train_accuracy = rf.score(X_train, y_train)
test_accuracy = rf.score(X_test, y_test)
print("Random Forest Training Accuracy:", train_accuracy)
print("Random Forest Test Accuracy:", test_accuracy)

Random Forest Training Accuracy: 0.9938875721527313
Random Forest Test Accuracy: 0.9933848433530906
