In [10]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pickle
import os

# Define the file path
DATA_FILE = 'data/kddcup.data_10_percent' 

print("Attempting ultimate data load fix: Skipping first row and using regex separator...")

# --- Load the Data using the Regex Separator and Skip Rows ---
# We use the Python engine (slower) because it's the only one that handles regex separators.
try:
    df = pd.read_csv(
        DATA_FILE, 
        header=None, 
        sep=r'\s+',               # Use regex to match one or more spaces as the separator (recommended fix)
        engine='python',          # Must use the Python engine for regex separators
        skiprows=1,               # CRITICAL FIX: Skip the very first row
        skipinitialspace=True     
    ) 
except Exception as e:
    print(f"FATAL ERROR during data read: {e}")
    # You should not reach this point.

# --- CRITICAL FIX: Handle the Trailing Label Column ---
# Standard KDD data should have 42 columns (41 features + 1 label).
if df.shape[1] == 43:
    print("Detected 43 columns; dropping the empty trailing column.")
    df = df.iloc[:, :-1]
elif df.shape[1] != 42:
    print(f"WARNING: Unexpected column count: {df.shape[1]}. Expected 42. Proceeding...")
    
print(f"DataFrame successfully loaded with {df.shape[1]} columns.")

# --- 1. Separate Features (X) and Label (y) ---
X = df.iloc[:, :-1]   # Features (index 0 to 40)
y = df.iloc[:, -1]    # Label (index 41)

# --- 2. Label Binarization (Attack vs. Normal) ---
y = y.apply(lambda x: 0 if x.strip().strip('.') == 'normal' else 1)

# --- 3. One-Hot Encoding for Categorical Features ---
categorical_cols = [1, 2, 3] 

print("Applying One-Hot Encoding to categorical features...")
X_processed = pd.get_dummies(X, columns=categorical_cols) 

# Ensure data types are numeric and handle any potential infinity/NaN introduced
X_processed = X_processed.replace([float('inf'), -float('inf')], 0).fillna(0)

print(f"New feature vector shape (features): {X_processed.shape}")

Attempting ultimate data load fix: Skipping first row and using regex separator...
DataFrame successfully loaded with 1 columns.
Applying One-Hot Encoding to categorical features...


KeyError: "None of [Index([1, 2, 3], dtype='int64')] are in the [columns]"

In [11]:
import pandas as pd

# Correct file path
DATA_FILE = "/Users/lakshithamadushan/IDS project FYP/data/kddcup.data_10_percent"

# Load dataset
df = pd.read_csv(DATA_FILE, header=None)
print("Original shape:", df.shape)
print(df.head())

# Drop the last column if it's the difficulty score
if df.shape[1] == 43:
    df = df.iloc[:, :-1]

print("After dropping difficulty column:", df.shape)

# Separate features and labels
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Binary classification: 0 = normal, 1 = attack
y = y.apply(lambda x: 0 if x.strip().strip('.') == 'normal' else 1)

# One-Hot Encoding for categorical columns: protocol_type, service, flag
categorical_cols = [1, 2, 3]
X_processed = pd.get_dummies(X, columns=categorical_cols)

# Ensure no NaNs or infinity
X_processed = X_processed.fillna(0)

print("Processed features shape:", X_processed.shape)
print("Ready to train your model!")


Original shape: (125973, 43)
   0    1         2   3    4     5   6   7   8   9   ...    33    34    35  \
0   0  tcp  ftp_data  SF  491     0   0   0   0   0  ...  0.17  0.03  0.17   
1   0  udp     other  SF  146     0   0   0   0   0  ...  0.00  0.60  0.88   
2   0  tcp   private  S0    0     0   0   0   0   0  ...  0.10  0.05  0.00   
3   0  tcp      http  SF  232  8153   0   0   0   0  ...  1.00  0.00  0.03   
4   0  tcp      http  SF  199   420   0   0   0   0  ...  1.00  0.00  0.00   

     36    37    38    39    40       41  42  
0  0.00  0.00  0.00  0.05  0.00   normal  20  
1  0.00  0.00  0.00  0.00  0.00   normal  15  
2  0.00  1.00  1.00  0.00  0.00  neptune  19  
3  0.04  0.03  0.01  0.00  0.01   normal  21  
4  0.00  0.00  0.00  0.00  0.00   normal  21  

[5 rows x 43 columns]
After dropping difficulty column: (125973, 42)
Processed features shape: (125973, 122)
Ready to train your model!


In [12]:
# Assuming X_processed and y are defined from the successful run above
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pickle
import os

# --- 4. Split Data ---
print("Splitting data into training (80%) and testing (20%)...")
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.2, random_state=42
)

# --- 5. Train Model ---
print("Starting Model Training (Random Forest Classifier)... This may take a few minutes.")
# n_jobs=-1 tells the model to use all CPU cores on your MacBook Pro
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1) 
model.fit(X_train, y_train)

# --- 6. Evaluate and Save ---
accuracy = model.score(X_test, y_test)
print(f"\nModel Accuracy on Test Data: {accuracy*100:.2f}%")

# Aim for 98%+ accuracy!

# Save the trained model and the feature list for the dashboard
MODEL_FILE = 'basic_model.pkl'
FEATURE_FILE = 'feature_columns.pkl'

with open(MODEL_FILE, 'wb') as f:
    pickle.dump(model, f)
    
with open(FEATURE_FILE, 'wb') as f:
    pickle.dump(X_processed.columns.tolist(), f)
    
print(f"\nSuccessfully saved model to: {MODEL_FILE}")
print(f"Successfully saved feature list to: {FEATURE_FILE}")

Splitting data into training (80%) and testing (20%)...
Starting Model Training (Random Forest Classifier)... This may take a few minutes.


TypeError: Feature names are only supported if all input features have string names, but your input has ['int', 'str'] as feature name / column name types. If you want feature names to be stored and validated, you must convert them all to strings, by using X.columns = X.columns.astype(str) for example. Otherwise you can remove feature / column names from your input data, or convert them all to a non-string data type.

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle

# 1️⃣ Split the dataset (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.2, random_state=42
)

print("Training set:", X_train.shape, "Test set:", X_test.shape)

# 2️⃣ Initialize Random Forest Classifier
rf_model = RandomForestClassifier(
    n_estimators=100,    # number of trees
    random_state=42,
    n_jobs=-1            # use all cores for faster training
)

# 3️⃣ Train the model
print("Training the Random Forest model...")
rf_model.fit(X_train, y_train)
print("Model training complete!")

# 4️⃣ Predict on the test set
y_pred = rf_model.predict(X_test)

# 5️⃣ Evaluate accuracy and performance
acc = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {acc*100:.2f}%\n")

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# 6️⃣ Save the trained model for later use
MODEL_FILE = "rf_model.pkl"
with open(MODEL_FILE, "wb") as f:
    pickle.dump(rf_model, f)

print(f"Trained model saved as '{MODEL_FILE}'")


Training set: (100778, 122) Test set: (25195, 122)
Training the Random Forest model...


TypeError: Feature names are only supported if all input features have string names, but your input has ['int', 'str'] as feature name / column name types. If you want feature names to be stored and validated, you must convert them all to strings, by using X.columns = X.columns.astype(str) for example. Otherwise you can remove feature / column names from your input data, or convert them all to a non-string data type.

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle

# 1️⃣ Split dataset
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.2, random_state=42
)

# 2️⃣ Fix column types for scikit-learn
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

# 3️⃣ Initialize Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# 4️⃣ Train
print("Training Random Forest model...")
rf_model.fit(X_train, y_train)
print("Training complete!")

# 5️⃣ Predict & Evaluate
y_pred = rf_model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# 6️⃣ Save model
with open("rf_model.pkl", "wb") as f:
    pickle.dump(rf_model, f)
print("Model saved as rf_model.pkl")


Training Random Forest model...
Training complete!
Accuracy: 99.88%
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     13422
           1       1.00      1.00      1.00     11773

    accuracy                           1.00     25195
   macro avg       1.00      1.00      1.00     25195
weighted avg       1.00      1.00      1.00     25195

Confusion Matrix:
[[13417     5]
 [   24 11749]]
Model saved as rf_model.pkl


In [15]:
def predict_attack(new_data):
    import pickle
    import pandas as pd

    # Load the model
    with open("rf_model.pkl", "rb") as f:
        model = pickle.load(f)
    
    # Ensure categorical columns are one-hot encoded
    new_data_processed = pd.get_dummies(new_data)
    
    # Align columns with training data
    model_features = model.feature_names_in_
    for col in model_features:
        if col not in new_data_processed:
            new_data_processed[col] = 0
    new_data_processed = new_data_processed[model_features]
    
    return model.predict(new_data_processed)
