In [1]:
# Colab-only step: mount Google Drive at /content/drive so files stored there
# can be read and written through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np


In [3]:
# Folder on the mounted Drive that holds the capture CSVs.
DATA_PATH = "/content/drive/MyDrive/SentinelAI"

# Short dataset key -> CSV file name inside DATA_PATH.
FILES = dict(
    benign="Benign-Monday-no-metadata.csv",
    brute="Bruteforce-Tuesday-no-metadata.csv",
    web="WebAttacks-Thursday-no-metadata.csv",
)


In [4]:
def load_csv(file_name, nrows=50000):
    """Read the first ``nrows`` rows of a CSV stored under DATA_PATH.

    Prints the file name together with the shape of the loaded frame and
    returns the frame.
    """
    csv_path = f"{DATA_PATH}/{file_name}"
    frame = pd.read_csv(csv_path, nrows=nrows)
    print(f"{file_name} loaded → {frame.shape}")
    return frame

# Load the three captures in the same order as before: benign, brute, web.
df_benign, df_brute, df_web = (
    load_csv(FILES[key]) for key in ("benign", "brute", "web")
)


Benign-Monday-no-metadata.csv loaded → (50000, 78)
Bruteforce-Tuesday-no-metadata.csv loaded → (50000, 78)
WebAttacks-Thursday-no-metadata.csv loaded → (50000, 78)


In [5]:
# Show which raw labels occur in each capture and how often.
label_reports = (
    ("Benign labels:\n", df_benign),
    ("\nBruteforce labels:\n", df_brute),
    ("\nWebAttack labels:\n", df_web),
)
for heading, frame in label_reports:
    print(heading, frame["Label"].value_counts())


Benign labels:
 Label
Benign    50000
Name: count, dtype: int64

Bruteforce labels:
 Label
Benign         45473
FTP-Patator     4527
Name: count, dtype: int64

WebAttack labels:
 Label
Benign                      48847
Web Attack � Brute Force     1153
Name: count, dtype: int64


In [6]:
def binarize_label(label):
    """Map a traffic label to a binary target.

    Returns 0 when ``label`` is the benign class and 1 for anything else.
    The comparison ignores surrounding whitespace and letter case, so a
    variant such as " BENIGN" is still treated as benign instead of being
    silently counted as an attack. Non-string values (for example NaN)
    return 1, exactly as they did under a plain equality check.
    """
    if isinstance(label, str) and label.strip().lower() == "benign":
        return 0
    return 1

# Add the 0/1 target column to each capture, in place on the same frames.
for frame in (df_benign, df_brute, df_web):
    frame["binary_label"] = frame["Label"].apply(binarize_label)


In [7]:
# Remove the raw string "Label" column from each capture.
# Reassigning instead of using inplace=True, together with errors="ignore",
# keeps this cell re-runnable: a second execution no longer raises KeyError
# because "Label" is already gone.
df_benign = df_benign.drop(columns=["Label"], errors="ignore")
df_brute = df_brute.drop(columns=["Label"], errors="ignore")
df_web = df_web.drop(columns=["Label"], errors="ignore")


In [8]:
# Stack the three captures into one frame with a fresh 0..n-1 index.
capture_frames = [df_benign, df_brute, df_web]
df_all = pd.concat(capture_frames, ignore_index=True)

print("Merged dataset shape:", df_all.shape)


Merged dataset shape: (150000, 78)


In [None]:
# Shuffle every row with a fixed seed, then renumber the index from 0.
# NOTE(review): the cell overwrites df_all with a shuffled copy of itself, so
# running it more than once changes the row order again.
df_all = df_all.sample(frac=1, random_state=42).reset_index(drop=True)

In [9]:
# Separate the binary target from the feature columns.
y = df_all["binary_label"]
X = df_all.drop(columns=["binary_label"])

print("X shape:", X.shape)
print("y distribution:\n", y.value_counts())


X shape: (150000, 77)
y distribution:
 binary_label
0    144320
1      5680
Name: count, dtype: int64


In [10]:
# Keep numeric columns only; any non-numeric column is discarded here.
X = X.select_dtypes(include="number")
print("Numeric feature count:", len(X.columns))


Numeric feature count: 77


In [11]:
# Class balance of the merged target, as absolute counts and as percentages.
label_counts = y.value_counts()
label_percent = y.value_counts(normalize=True) * 100

print("Overall binary label distribution:")
print(label_counts)

print("\nPercentage:")
print(label_percent)


Overall binary label distribution:
binary_label
0    144320
1      5680
Name: count, dtype: int64

Percentage:
binary_label
0    96.213333
1     3.786667
Name: proportion, dtype: float64


**PHASE 2 — Feature selection (variance filter, correlation filter, random-forest importance)**

In [12]:
from sklearn.feature_selection import VarianceThreshold

# Remove features whose variance does not exceed 0.0, i.e. constant columns.
var_thresh = VarianceThreshold(threshold=0.0)
X_var = var_thresh.fit_transform(X)

# fit_transform returns a bare array, so rebuild the frame with the surviving
# column names AND the original row index. Keeping X.index means later
# index-based alignment with y stays correct even if X is not numbered 0..n-1.
kept_columns = X.columns[var_thresh.get_support()]
X_var = pd.DataFrame(X_var, columns=kept_columns, index=X.index)

print("Features after variance filter:", X_var.shape[1])


Features after variance filter: 67


In [13]:
# Absolute pairwise correlations between the remaining features.
corr_matrix = X_var.corr().abs()

# Keep only the strict upper triangle so every feature pair appears once.
upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper = corr_matrix.where(upper_mask)

# A column is dropped when it correlates above 0.95 with any column that
# precedes it in the frame.
exceeds_limit = (upper > 0.95).any(axis=0)
to_drop = exceeds_limit[exceeds_limit].index.tolist()

X_uncorr = X_var.drop(columns=to_drop)

print("Dropped correlated features:", len(to_drop))
print("Remaining features:", X_uncorr.shape[1])


Dropped correlated features: 23
Remaining features: 44


In [14]:
from sklearn.ensemble import RandomForestClassifier

# Throwaway forest used only to rank features by importance.
# NOTE(review): it is fitted on every row of X_uncorr, before the train/test
# split made later in the notebook, so the ranking also sees rows that end up
# in the test set.
rf_temp = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)

rf_temp.fit(X_uncorr, y)


In [15]:
# Importance score per feature, ordered from most to least important.
unsorted_importances = pd.Series(
    data=rf_temp.feature_importances_,
    index=X_uncorr.columns,
)
importances = unsorted_importances.sort_values(ascending=False)

importances.head(20)


Unnamed: 0,0
Bwd Header Length,0.10469
Fwd Packet Length Mean,0.088368
Fwd Packet Length Max,0.07523
Packet Length Max,0.070699
Flow IAT Min,0.05062
Fwd Packet Length Std,0.04781
Packet Length Variance,0.047624
Fwd Packets Length Total,0.043051
Bwd Packet Length Mean,0.040352
Packet Length Mean,0.039462


In [16]:
# Number of top-ranked features that feed the final model.
TOP_N = 15
selected_features = list(importances.index[:TOP_N])

X_selected = X_uncorr[selected_features]

print("Final selected features:")
for feature_name in selected_features:
    print("-", feature_name)

print("Final feature matrix shape:", X_selected.shape)


Final selected features:
- Bwd Header Length
- Fwd Packet Length Mean
- Fwd Packet Length Max
- Packet Length Max
- Flow IAT Min
- Fwd Packet Length Std
- Packet Length Variance
- Fwd Packets Length Total
- Bwd Packet Length Mean
- Packet Length Mean
- Fwd Header Length
- Init Bwd Win Bytes
- Bwd Packet Length Max
- Init Fwd Win Bytes
- Fwd PSH Flags
Final feature matrix shape: (150000, 15)


In [17]:
# Summary statistics of the selected features (displayed as the cell output).
X_selected.describe()

Unnamed: 0,Bwd Header Length,Fwd Packet Length Mean,Fwd Packet Length Max,Packet Length Max,Flow IAT Min,Fwd Packet Length Std,Packet Length Variance,Fwd Packets Length Total,Bwd Packet Length Mean,Packet Length Mean,Fwd Header Length,Init Bwd Win Bytes,Bwd Packet Length Max,Init Fwd Win Bytes,Fwd PSH Flags
count,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0
mean,-24866.74,60.633629,246.394073,619.46464,103267.5,76.979617,111632.9,920.1429,206.768908,127.992466,-359393.6,2785.429187,563.418433,9126.243613,0.056113
std,4091046.0,116.460265,589.309709,1051.038043,1843588.0,181.46128,297830.5,13176.08,328.816262,189.084067,88003560.0,9706.885071,957.173428,15380.52929,0.230141
min,-1073741000.0,0.0,0.0,0.0,-13.0,0.0,0.0,0.0,0.0,0.0,-32212230000.0,-1.0,0.0,-1.0,0.0
25%,20.0,12.666667,30.0,34.0,3.0,0.0,65.33334,41.0,6.0,12.25,40.0,-1.0,6.0,-1.0,0.0
50%,40.0,44.0,48.0,112.0,32.0,0.0,1228.8,82.0,91.0,66.2,64.0,-1.0,110.0,257.0,0.0
75%,160.0,58.715715,325.0,860.0,10120.5,92.664218,82156.83,492.0,209.0,137.560272,192.0,253.0,746.0,8192.0,0.0
max,5838440.0,4638.9233,24820.0,24820.0,108000000.0,7125.5967,19488230.0,1323378.0,3706.2,2295.72,4644908.0,65535.0,13140.0,65535.0,1.0


**PHASE 3 — Data cleaning (inf/NaN handling) and stratified train/test split**

In [18]:
# Turn +inf / -inf into NaN so they can be handled as missing values.
X_selected = X_selected.replace({np.inf: np.nan, -np.inf: np.nan})

print("NaN count after inf replacement:")
nan_per_column = X_selected.isna().sum()
print(nan_per_column)


NaN count after inf replacement:
Bwd Header Length           0
Fwd Packet Length Mean      0
Fwd Packet Length Max       0
Packet Length Max           0
Flow IAT Min                0
Fwd Packet Length Std       0
Packet Length Variance      0
Fwd Packets Length Total    0
Bwd Packet Length Mean      0
Packet Length Mean          0
Fwd Header Length           0
Init Bwd Win Bytes          0
Bwd Packet Length Max       0
Init Fwd Win Bytes          0
Fwd PSH Flags               0
dtype: int64


In [19]:
# Drop every row that still contains a NaN, then keep only the labels whose
# index survived so that X_selected and y remain row-aligned.
X_selected = X_selected.dropna()
y = y.loc[X_selected.index]


In [20]:
from sklearn.model_selection import train_test_split

# 70/30 split with a fixed seed; stratify=y keeps the class ratio the same
# in both parts.
split_parts = train_test_split(
    X_selected, y, test_size=0.30, stratify=y, random_state=42
)
X_train, X_test, y_train, y_test = split_parts


In [21]:
# Confirm that both parts of the split have the same class proportions.
for heading, labels in (
    ("Train label distribution:", y_train),
    ("\nTest label distribution:", y_test),
):
    print(heading)
    print(labels.value_counts(normalize=True))


Train label distribution:
binary_label
0    0.962133
1    0.037867
Name: proportion, dtype: float64

Test label distribution:
binary_label
0    0.962133
1    0.037867
Name: proportion, dtype: float64


In [22]:
# Report the dimensions of each piece produced by the split.
split_pieces = (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
)
for piece_name, piece in split_pieces:
    print(f"{piece_name} shape:", piece.shape)


X_train shape: (105000, 15)
X_test shape: (45000, 15)
y_train shape: (105000,)
y_test shape: (45000,)


**PHASE 4 — Model training, evaluation, and artifact export**

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix
)


In [24]:
# Final classifier: 200 trees, seeded, using all cores, with class weights
# balanced to compensate for the rare positive class.
rf_params = {
    "n_estimators": 200,
    "class_weight": "balanced",
    "random_state": 42,
    "n_jobs": -1,
}
rf_model = RandomForestClassifier(**rf_params)

rf_model.fit(X_train, y_train)


In [25]:
# Predict binary labels for the held-out test rows.
y_pred = rf_model.predict(X_test)


In [26]:
from sklearn.metrics import f1_score

# F1 of the positive class (label 1) on the test set.
f1 = f1_score(y_true=y_test, y_pred=y_pred)
print("F1 Score:", f1)


F1 Score: 0.9877622377622378


In [27]:
from sklearn.metrics import confusion_matrix

# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:", cm, sep="\n")


Confusion Matrix:
[[43263    33]
 [    9  1695]]


In [28]:
from sklearn.metrics import precision_score, recall_score

# Precision and recall of the positive class (label 1) on the test set.
for metric_label, metric_fn in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
):
    print(metric_label, metric_fn(y_test, y_pred))


Precision: 0.9809027777777778
Recall: 0.9947183098591549


In [29]:
import os

# Drive folder that receives the saved model and its companion files.
ARTIFACT_DIR = "/content/drive/MyDrive/SentinelAI_Artifacts"
# exist_ok=True keeps the cell re-runnable when the folder already exists.
os.makedirs(ARTIFACT_DIR, exist_ok=True)

print("Artifact directory ready:", ARTIFACT_DIR)


Artifact directory ready: /content/drive/MyDrive/SentinelAI_Artifacts


In [30]:
import joblib

# Persist the fitted classifier inside the artifact folder.
MODEL_PATH = ARTIFACT_DIR + "/sentinel_rf_model.pkl"
joblib.dump(rf_model, MODEL_PATH)

print("Model saved at:", MODEL_PATH)


Model saved at: /content/drive/MyDrive/SentinelAI_Artifacts/sentinel_rf_model.pkl


In [31]:
FEATURE_PATH = ARTIFACT_DIR + "/selected_features.txt"

# One feature name per line, in the column order of X_selected.
with open(FEATURE_PATH, "w") as handle:
    handle.writelines(feature + "\n" for feature in X_selected.columns)

print("Feature list saved at:", FEATURE_PATH)


Feature list saved at: /content/drive/MyDrive/SentinelAI_Artifacts/selected_features.txt


In [32]:
from sklearn.metrics import f1_score, precision_score, recall_score

METADATA_PATH = f"{ARTIFACT_DIR}/model_metadata.txt"

# Derive every reported value from the fitted model and the test-set
# predictions instead of typing the numbers by hand, so the metadata file
# cannot disagree with the metrics this run actually produced.
metadata_lines = [
    f"Model: {type(rf_model).__name__}",
    f"Features: {X_train.shape[1]}",
    f"Class imbalance handled: class_weight={rf_model.class_weight}",
    f"Recall (malicious): {recall_score(y_test, y_pred):.4f}",
    f"Precision (malicious): {precision_score(y_test, y_pred):.4f}",
    f"F1-score: {f1_score(y_test, y_pred):.4f}",
]

with open(METADATA_PATH, "w") as f:
    f.write("\n".join(metadata_lines) + "\n")

print("Metadata saved at:", METADATA_PATH)


Metadata saved at: /content/drive/MyDrive/SentinelAI_Artifacts/model_metadata.txt


In [33]:
# Round-trip check: load the saved model back and predict a few test rows.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that come from a trusted source.
loaded_model = joblib.load(MODEL_PATH)

first_rows = X_test.head(5)
test_pred = loaded_model.predict(first_rows)
print("Reload test predictions:", test_pred)


Reload test predictions: [0 0 0 0 0]
