In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pathlib import Path

# Folder holding the ISCX IDS 2012 labeled-flow XML files. This is the only
# machine-specific value in the cell; change it here to run elsewhere.
DATA_DIR = Path(r"C:\Users\Des\Downloads\ISCXIDS2012 dataset\labeled_flows_xml")


def read_flows(filename):
    """Parse one labeled-flow XML file located in DATA_DIR into a DataFrame."""
    return pd.read_xml(DATA_DIR / filename)


# Days that come as a single file.
Saturday = read_flows("TestbedSatJun12Flows.xml")
Sunday = read_flows("TestbedSunJun13Flows.xml")
Monday = read_flows("TestbedMonJun14Flows.xml")

# Days that are split across several files are stacked into one frame per day.
Tuesday1 = read_flows("TestbedTueJun15-1Flows.xml")
Tuesday2 = read_flows("TestbedTueJun15-2Flows.xml")
Tuesday3 = read_flows("TestbedTueJun15-3Flows.xml")
Tuesday = pd.concat([Tuesday1, Tuesday2, Tuesday3], ignore_index=True)

Wednesday1 = read_flows("TestbedWedJun16-1Flows.xml")
Wednesday2 = read_flows("TestbedWedJun16-2Flows.xml")
Wednesday3 = read_flows("TestbedWedJun16-3Flows.xml")
Wednesday = pd.concat([Wednesday1, Wednesday2, Wednesday3], ignore_index=True)

# Only parts 2 and 3 are loaded for Thursday.
Thursday2 = read_flows("TestbedThuJun17-2Flows.xml")
Thursday3 = read_flows("TestbedThuJun17-3Flows.xml")
Thursday = pd.concat([Thursday2, Thursday3], ignore_index=True)

Week = [Saturday, Sunday, Monday, Tuesday, Wednesday, Thursday]
Label = ['Saturday', 'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday']
merged_data = []

for day, label in zip(Week, Label):
    # The column is added in place, so the per-day frames above carry it too.
    day['Tag'] = label
    # info() writes its summary itself and returns None; wrapping it in
    # print() only appended a stray "None" line after every summary.
    day.info()
    merged_data.append(day)

# One frame with every day's flows, each row tagged with its day name.
final_df = pd.concat(merged_data, ignore_index=True)
print(final_df.head())

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor

# Feature matrix: byte/packet counters and ports of each flow.
X = final_df[['totalSourceBytes', 'totalDestinationBytes', 'totalDestinationPackets', 'totalSourcePackets', 'sourcePort', 'destinationPort']]

# Target: the day tag encoded as integer codes. The 1-D column is passed
# (not a one-column DataFrame) because LabelEncoder expects a 1-D y; a column
# vector only works after a DataConversionWarning.
labelencoder = LabelEncoder()
Y = labelencoder.fit_transform(final_df['Tag'])

# 70/30 train/test split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

# NOTE(review): the target is a nominal label turned into integer codes, and a
# regressor is fit to those codes and rounded back to a label. A classifier
# would normally be the natural model for this target - confirm the intent.
clf = DecisionTreeRegressor(random_state=0)
clf.fit(X_train, Y_train)
predictions = clf.predict(X_test)
# Round the continuous outputs to the nearest code so they can be decoded.
rounded_predictions = np.round(predictions).astype(int)
predicted_days = labelencoder.inverse_transform(rounded_predictions)

# 10-fold shuffled cross-validation on the training split only.
cv = KFold(n_splits=10, random_state=0, shuffle=True)
cv_scores = cross_val_score(clf, X_train, Y_train, scoring='r2', cv=cv, n_jobs=-1)
print(f"Cross-validated R^2 scores: {cv_scores.mean()}")

# The fitted tree's test predictions were already computed above; reuse them
# instead of running predict() a second time.
predict = predictions
mse = mean_squared_error(Y_test, predict)
r2 = r2_score(Y_test, predict)
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

In [None]:
import random

# Fixed seed so the same test rows are drawn (and the same figure produced)
# on every run; a private Random instance leaves the global RNG untouched.
SAMPLE_SEED = 42

# random.sample raises ValueError when asked for more items than exist, so
# cap the request at the size of the test split.
sample_size = min(1000, len(Y_test))
indices = random.Random(SAMPLE_SEED).sample(range(len(Y_test)), k=sample_size)

# Decode the sampled true codes in one call and pick the matching decoded
# predictions; both end up as plain lists of day names.
Y_test_sample = labelencoder.inverse_transform(np.asarray(Y_test)[indices]).tolist()
predict_sample = np.asarray(predicted_days)[indices].tolist()

# Overlay actual (blue) and predicted (red) day names for the sampled rows.
plt.figure(figsize=(10, 6))
plt.scatter(range(sample_size), Y_test_sample, color='blue', label='Actual', alpha=0.6, s=10)
plt.scatter(range(sample_size), predict_sample, color='red', label='Predicted', alpha=0.6, s=10)
plt.title('Comparison of Real vs Predicted Traffic (Sampled)', fontsize=16)
plt.xlabel('Sample Index', fontsize=12)
plt.ylabel('Day of the Week', fontsize=12)
plt.xticks(rotation=45)
plt.legend(fontsize=12)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score


def fit_and_score(model):
    """Fit `model` on the training split and score it on the test split.

    Returns a tuple ``(raw, codes, days, mse, r2)``:
    raw   - the model's continuous predictions for X_test,
    codes - those predictions rounded to the nearest integer code,
    days  - the codes decoded back to day names with `labelencoder`,
    mse   - mean squared error of `raw` against Y_test,
    r2    - R^2 of `raw` against Y_test.
    """
    model.fit(X_train, Y_train)
    raw = model.predict(X_test)
    codes = np.round(raw).astype(int)
    days = labelencoder.inverse_transform(codes)
    return raw, codes, days, mean_squared_error(Y_test, raw), r2_score(Y_test, raw)


# Initialize the regressors
rf_regressor = RandomForestRegressor(n_estimators=200, max_depth=20, random_state=42)
ada_regressor = AdaBoostRegressor(n_estimators=200, random_state=42)

# Train and evaluate both ensembles through the same code path.
rf_predictions, rf_rounded_predictions, rf_predicted_days, rf_mse, rf_r2 = fit_and_score(rf_regressor)
ada_predictions, ada_rounded_predictions, ada_predicted_days, ada_mse, ada_r2 = fit_and_score(ada_regressor)

print(f"RandomForestRegressor Mean Squared Error: {rf_mse}")
print(f"RandomForestRegressor R^2 Score: {rf_r2}")

print(f"AdaBoostRegressor Mean Squared Error: {ada_mse}")
print(f"AdaBoostRegressor R^2 Score: {ada_r2}")

# Choose the better of the two ensembles by R^2 (a tie goes to AdaBoost).
# NOTE(review): the scores compared here come from the test split, so the
# winner's score is not an independent estimate - consider selecting on a
# validation split or with cross-validation.
if rf_r2 > ada_r2:
    best_model = rf_regressor
    best_predictions = rf_predictions
    best_predicted_days = rf_predicted_days
    best_mse, best_r2 = rf_mse, rf_r2
    print("RandomForestRegressor is the best model.")
else:
    best_model = ada_regressor
    best_predictions = ada_predictions
    best_predicted_days = ada_predicted_days
    best_mse, best_r2 = ada_mse, ada_r2
    print("AdaBoostRegressor is the best model.")

# Decision-tree baseline, scored from the `predictions` computed in the
# earlier cell. It is reported for reference only and does not take part in
# the selection above.
dt_mse = mean_squared_error(Y_test, predictions)
dt_r2 = r2_score(Y_test, predictions)
print(f"DecisionTreeRegressor Mean Squared Error: {dt_mse}")
print(f"DecisionTreeRegressor R^2 Score: {dt_r2}")

# Print the best model's performance (metrics already computed above).
print(f"Best Model Mean Squared Error: {best_mse}")
print(f"Best Model R^2 Score: {best_r2}")