# Data Training

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import joblib
import seaborn as sns
import matplotlib.pyplot as plt

# Read the labeled dataset from CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='labeled_network_anomaly_dataset.csv')

## 1. Define feature columns

In [None]:
# Columns standardized as continuous numeric inputs
numerical_features = [
    'packet_count',
    'byte_count',
    'duration',
]

# Columns that are one-hot encoded
categorical_features = [
    'protocol',
    'service',
]

# Raw IP address columns; the last octet of each is extracted later
# and used as a numeric feature
ip_features = [
    'src_ip',
    'dst_ip',
]

# Port columns, scaled like the other numeric inputs
port_features = [
    'src_port',
    'dst_port',
]

# Name of the label column the model predicts
target = 'is_anomaly'

## 2. Function that extracts the last octet of an IP address and converts it to an integer

In [None]:
def extract_ip_octet(ip_series: pd.Series) -> pd.Series:
    """Return the last dot-separated field of each IP string as an integer.

    Uses pandas' vectorized string methods instead of a per-row Python
    lambda, and casts explicitly so the result is always ``int64`` --
    including for an empty input.

    Parameters
    ----------
    ip_series : pd.Series
        Series of dotted strings such as ``'192.168.1.10'``.

    Returns
    -------
    pd.Series
        ``int64`` Series with the same index and name as ``ip_series``.

    Raises
    ------
    AttributeError
        If ``ip_series`` does not hold string values.
    ValueError
        If an entry is missing or its last field is not an integer.
    """
    # Split once from the right: only the final field is needed.
    last_field = ip_series.str.rsplit('.', n=1).str[-1]
    return last_field.astype('int64')

## 3. Feature Selection, Feature Engineering, and Data Splitting (80% train, 20% test)

In [None]:
# Feature matrix: the selected raw columns plus the numeric last-octet
# features derived from the source and destination IP addresses.
X = df[numerical_features + categorical_features + port_features + ip_features].assign(
    src_ip_octet=extract_ip_octet(df['src_ip']),
    dst_ip_octet=extract_ip_octet(df['dst_ip']),
)
y = df[target]

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both splits; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

## 4. Create the Preprocessing Pipeline and the Main Pipeline

In [None]:
# Per-column preprocessing. Each entry is (name, transformer, columns);
# columns not listed here are dropped by ColumnTransformer's default remainder.
preprocessor = ColumnTransformer([
    # standardize the continuous traffic measurements
    ('num', StandardScaler(), numerical_features),
    # dense one-hot encoding; categories unseen during fit are ignored
    # at transform time instead of raising
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
    # ports and the derived IP octets are scaled as plain numbers
    ('port', StandardScaler(), port_features),
    ('ip_src', StandardScaler(), ['src_ip_octet']),
    ('ip_dst', StandardScaler(), ['dst_ip_octet']),
])

# Full model: preprocessing followed by a 100-tree random forest with a
# fixed seed, trained on all available cores.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_jobs=-1, random_state=42, n_estimators=100)),
])

## 5. Model Training and Prediction

In [None]:
# Fit the whole pipeline on the training split, then predict labels for the
# held-out test split (fit() returns the fitted pipeline itself).
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

## 6. Evaluate and save the model

In [None]:
# Score the test-set predictions against the true labels
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
precision = precision_score(y_true=y_test, y_pred=y_pred)
recall = recall_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Text report of the metrics
print("Random Forest Model Evaluation:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print("\nConfusion Matrix:")
print(conf_matrix)

# Draw the confusion matrix as an annotated heatmap and write it to disk.
# The figure is closed right after saving, so it is not shown inline.
class_names = ['Normal', 'Anomaly']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
fig.savefig('confusion_matrix.png')
plt.close(fig)

# Persist the fitted pipeline (preprocessing + classifier) for later reuse
joblib.dump(pipeline, 'random_forest_anomaly_model.joblib')

Random Forest Model Evaluation:
Accuracy: 0.9948
Precision: 0.9759
Recall: 0.9725
F1-Score: 0.9742

Confusion Matrix:
[[17932    48]
 [   55  1943]]


['random_forest_anomaly_model.joblib']