In [1]:
pip install pandas scikit-learn numpy

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np

# Location of the combined dataset CSV.
csv_path = '/kaggle/input/icd-dataset/combine.csv'

# low_memory=False lets pandas read whole columns before inferring dtypes,
# which avoids mixed-type inference on columns with inconsistent values.
df = pd.read_csv(csv_path, low_memory=False)

# Confirm the load and preview the first rows.
for message in ("Dataset loaded successfully!", "Initial Data:"):
    print(message)
print(df.head())

Dataset loaded successfully!
Initial Data:
   Destination Port   Flow Duration   Total Fwd Packets  \
0             54865             3.0                 2.0   
1             55054           109.0                 1.0   
2             55055            52.0                 1.0   
3             46236            34.0                 1.0   
4             54863             3.0                 2.0   

    Total Backward Packets  Total Length of Fwd Packets  \
0                      0.0                         12.0   
1                      1.0                          6.0   
2                      1.0                          6.0   
3                      1.0                          6.0   
4                      0.0                         12.0   

    Total Length of Bwd Packets   Fwd Packet Length Max  \
0                           0.0                     6.0   
1                           6.0                     6.0   
2                           6.0                     6.0   
3          

In [3]:
import pandas as pd
import numpy as np

# Assuming 'df' is the DataFrame loaded in the previous cell.
# Drop any rows with missing values (NaN).
df.dropna(inplace=True)

# Replace infinite values with 0.
# NOTE(review): this runs after dropna(), so rows holding +/-inf are kept with a
# 0 placeholder rather than being removed - confirm that is the intended treatment.
df.replace([np.inf, -np.inf], 0, inplace=True)

# Rename the ' Label' column (note the leading space) for easier access.
df.rename(columns={' Label': 'Label'}, inplace=True)

# Convert the label to a numerical format: 1 for attack, 0 for benign.
# The dtype guard makes this cell safe to re-run: once the column already holds
# 0/1, comparing it with 'BENIGN' again would flag every single row as an attack.
if not pd.api.types.is_numeric_dtype(df['Label']):
    # Vectorized comparison; equivalent to applying `1 if x != 'BENIGN' else 0` per row.
    df['Label'] = (df['Label'] != 'BENIGN').astype('int64')

print("\nCleaned Data Labels:")
print(df['Label'].value_counts())


Cleaned Data Labels:
Label
0    1672649
1     540682
Name: count, dtype: int64


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Assuming 'df' is the cleaned DataFrame from the previous step.
# Drop non-numerical and irrelevant columns for the model; errors='ignore'
# tolerates any of these columns being absent from the dataset.
features = df.drop(columns=['Flow ID', ' Source IP', ' Destination IP', ' Timestamp', 'Label'], errors='ignore')
labels = df['Label']

# Split the data into 80% for training and 20% for testing BEFORE scaling, so
# that no statistics computed from the test rows leak into the preprocessing.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply that same transform to
# the test rows. Fitting on a DataFrame keeps the column names available in
# scaler.feature_names_in_ for later inspection.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create and train the Random Forest model (n_jobs=-1 uses all available cores).
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

print("\nRandom Forest model trained successfully!")


Random Forest model trained successfully!


In [5]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict labels for the held-out test rows with the trained model.
predictions = model.predict(X_test)

# Score the predictions against the true test labels, one metric at a time.
accuracy, precision, recall, f1 = (
    metric(y_test, predictions)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Assemble the report and emit it in one call; each metric shows four decimals.
report_lines = [
    "\nModel Evaluation:",
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1-Score: {f1:.4f}",
]
print("\n".join(report_lines))


Model Evaluation:
Accuracy: 0.9990
Precision: 0.9975
Recall: 0.9982
F1-Score: 0.9979


In [6]:
import joblib

# Write the trained model and the fitted scaler to disk, one file each.
for artifact, filename in ((model, 'ids_model.pkl'), (scaler, 'scaler.pkl')):
    joblib.dump(artifact, filename)

print("Model and scaler saved successfully!")

Model and scaler saved successfully!


In [7]:
import joblib

# Read the fitted scaler back from disk.
loaded_scaler = joblib.load('scaler.pkl')

# The scaler records the column names it was fitted on.
feature_names = loaded_scaler.feature_names_in_

# Heading and the name array, each on its own line.
print("Feature names the model was trained on:", feature_names, sep="\n")

Feature names the model was trained on:
[' Destination Port' ' Flow Duration' ' Total Fwd Packets'
 ' Total Backward Packets' 'Total Length of Fwd Packets'
 ' Total Length of Bwd Packets' ' Fwd Packet Length Max'
 ' Fwd Packet Length Min' ' Fwd Packet Length Mean'
 ' Fwd Packet Length Std' 'Bwd Packet Length Max' ' Bwd Packet Length Min'
 ' Bwd Packet Length Mean' ' Bwd Packet Length Std' 'Flow Bytes/s'
 ' Flow Packets/s' ' Flow IAT Mean' ' Flow IAT Std' ' Flow IAT Max'
 ' Flow IAT Min' 'Fwd IAT Total' ' Fwd IAT Mean' ' Fwd IAT Std'
 ' Fwd IAT Max' ' Fwd IAT Min' 'Bwd IAT Total' ' Bwd IAT Mean'
 ' Bwd IAT Std' ' Bwd IAT Max' ' Bwd IAT Min' 'Fwd PSH Flags'
 ' Bwd PSH Flags' ' Fwd URG Flags' ' Bwd URG Flags' ' Fwd Header Length'
 ' Bwd Header Length' 'Fwd Packets/s' ' Bwd Packets/s'
 ' Min Packet Length' ' Max Packet Length' ' Packet Length Mean'
 ' Packet Length Std' ' Packet Length Variance' 'FIN Flag Count'
 ' SYN Flag Count' ' RST Flag Count' ' PSH Flag Count' ' ACK Flag Count'
 ' UR

In [8]:
import pandas as pd
import joblib
import numpy as np

# Load the saved model and scaler.
# NOTE: joblib files are pickle-based; only load artifacts from a trusted source.
loaded_model = joblib.load('ids_model.pkl')
loaded_scaler = joblib.load('scaler.pkl')

# Create a new, single piece of network traffic data as a dictionary
# IMPORTANT: All feature names from your training data must be included.
new_data_dict = {
    ' Destination Port': [80], ' Flow Duration': [1000], ' Total Fwd Packets': [10],
    ' Total Backward Packets': [5], 'Total Length of Fwd Packets': [500],
    ' Total Length of Bwd Packets': [200], ' Fwd Packet Length Max': [100],
    ' Fwd Packet Length Min': [50], ' Fwd Packet Length Mean': [75],
    ' Fwd Packet Length Std': [10.5], 'Bwd Packet Length Max': [150], ' Bwd Packet Length Min': [30],
    ' Bwd Packet Length Mean': [70], ' Bwd Packet Length Std': [20.1], 'Flow Bytes/s': [700],
    ' Flow Packets/s': [15], ' Flow IAT Mean': [120], ' Flow IAT Std': [55], ' Flow IAT Max': [500],
    ' Flow IAT Min': [10], 'Fwd IAT Total': [900], ' Fwd IAT Mean': [110], ' Fwd IAT Std': [45],
    ' Fwd IAT Max': [450], ' Fwd IAT Min': [5], 'Bwd IAT Total': [850], ' Bwd IAT Mean': [100],
    ' Bwd IAT Std': [40], ' Bwd IAT Max': [400], ' Bwd IAT Min': [8], 'Fwd PSH Flags': [0],
    ' Bwd PSH Flags': [0], ' Fwd URG Flags': [0], ' Bwd URG Flags': [0], ' Fwd Header Length': [320],
    ' Bwd Header Length': [160], 'Fwd Packets/s': [10], ' Bwd Packets/s': [5],
    ' Min Packet Length': [30], ' Max Packet Length': [150], ' Packet Length Mean': [70],
    ' Packet Length Std': [25], ' Packet Length Variance': [625], 'FIN Flag Count': [0],
    ' SYN Flag Count': [1], ' RST Flag Count': [0], ' PSH Flag Count': [1], ' ACK Flag Count': [1],
    ' URG Flag Count': [0], ' CWE Flag Count': [0], ' ECE Flag Count': [0], ' Down/Up Ratio': [0],
    ' Average Packet Size': [75], ' Avg Fwd Segment Size': [75], ' Avg Bwd Segment Size': [70],
    ' Fwd Header Length.1': [320], 'Fwd Avg Bytes/Bulk': [0], ' Fwd Avg Packets/Bulk': [0],
    ' Fwd Avg Bulk Rate': [0], ' Bwd Avg Bytes/Bulk': [0], ' Bwd Avg Packets/Bulk': [0],
    'Bwd Avg Bulk Rate': [0], 'Subflow Fwd Packets': [10], ' Subflow Fwd Bytes': [500],
    ' Subflow Bwd Packets': [5], ' Subflow Bwd Bytes': [200], 'Init_Win_bytes_forward': [29200],
    ' Init_Win_bytes_backward': [14600], ' act_data_pkt_fwd': [10], ' min_seg_size_forward': [20],
    'Active Mean': [0], ' Active Std': [0], ' Active Max': [0], ' Active Min': [0], 'Idle Mean': [0],
    ' Idle Std': [0], ' Idle Max': [0], ' Idle Min': [0]
}
new_df = pd.DataFrame(new_data_dict)

# Validate the hand-written sample against the columns the scaler was fitted on,
# and fail with a message that names exactly which features are absent.
expected_features = list(loaded_scaler.feature_names_in_)
missing_features = [name for name in expected_features if name not in new_df.columns]
if missing_features:
    raise ValueError(
        f"new_data_dict is missing features the scaler was fitted on: {missing_features}"
    )

# Reorder the columns to the fitted order so the result does not depend on the
# order in which keys were typed above (columns not seen in training are dropped).
new_df = new_df[expected_features]

# Scale the new data using the SAME scaler that was used for training
scaled_new_data = loaded_scaler.transform(new_df)

# Make a prediction
prediction = loaded_model.predict(scaled_new_data)

# Print the result (label 1 = attack, anything else = benign)
if prediction[0] == 1:
    print("\nPrediction: This traffic is likely an ATTACK. 🚨")
else:
    print("\nPrediction: This traffic is likely BENIGN. ✅")


Prediction: This traffic is likely BENIGN. ✅
