In [None]:
# 1) Data Import
# Ask for a file upload in Colab and load the first uploaded file into `df` as CSV.
from google.colab import files
import io, pandas as pd, numpy as np
uploaded = files.upload()  # mapping: file name -> file content (wrapped in BytesIO below)
# If nothing was uploaded (e.g. the dialog was dismissed) there is no first key;
# fail with an explicit message instead of an IndexError from indexing an empty list.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose a CSV file.")
fname = next(iter(uploaded))  # only the first uploaded file is read
df = pd.read_csv(io.BytesIO(uploaded[fname]))
print(df.shape)
df.head()

In [None]:
# 2) Data Preprocessing
# Drop exact duplicate rows and renumber the index 0..n-1, so that positional
# arrays (e.g. model predictions) assigned back to `df` later line up with its rows.
df = df.drop_duplicates(ignore_index=True)
# Numeric columns: values that cannot be parsed become NaN, then NaN is replaced
# by the column median.
for col in ['bytes_in','bytes_out','dst_port']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
        # Assign the filled column back rather than calling fillna(inplace=True) on
        # df[col]: that form acts on the object returned by df[col], which under
        # pandas Copy-on-Write is a temporary, so `df` itself would stay unfilled.
        df[col] = df[col].fillna(df[col].median())
# Timestamp columns: values that cannot be parsed become NaT.
for col in ['creation_time','end_time']:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')
# info() writes its summary to stdout and returns None, so wrapping it in print()
# would only append a stray "None" line.
df.info()

In [None]:
# 3) Feature Engineering
# Session length in seconds (end_time - creation_time); rows where the difference
# is missing get 600.
elapsed = df['end_time'] - df['creation_time']
df['session_duration'] = elapsed.dt.total_seconds().fillna(600)

# Total bytes per row (an absent column contributes 0) divided by the duration.
# Zero durations are replaced by NaN first, so those rows get NaN instead of inf.
total_bytes = df.get('bytes_in', 0) + df.get('bytes_out', 0)
nonzero_duration = df['session_duration'].replace(0, np.nan)
bytes_per_second = total_bytes / nonzero_duration
# NOTE(review): both columns below hold the same quantity (total bytes / duration).
# 'avg_packet_size' presumably should be bytes per packet, but no packet-count
# column is visible here -- confirm the intended definition.
df['avg_packet_size'] = bytes_per_second
df['throughput'] = bytes_per_second

# is_suspicious = 1 when rule_names contains "attack", "waf" or "rule"
# (case-insensitive); 0 for every row when the column does not exist.
if 'rule_names' not in df.columns:
    df['is_suspicious'] = 0
else:
    rule_text = df['rule_names'].astype(str)
    df['is_suspicious'] = rule_text.str.contains('attack|waf|rule',case=False,na=False).astype(int)
df[['session_duration','avg_packet_size','throughput','is_suspicious']].head()

In [None]:
# 4) EDA - distributions
import matplotlib.pyplot as plt
# One 50-bin histogram per byte-count column, each drawn and shown as its own figure.
for column, title in (('bytes_in', 'Bytes_in'), ('bytes_out', 'Bytes_out')):
    values = df[column].dropna()
    plt.hist(values, bins=50)
    plt.title(title)
    plt.show()

In [None]:
# 5) EDA - Protocol/Ports
# Bar chart of the 10 most frequent values for each column that is present in df.
for column, title in (('protocol', 'Protocol counts'), ('dst_port', 'Top Ports')):
    if column not in df.columns:
        continue
    top_counts = df[column].value_counts().head(10)
    top_counts.plot(kind='bar')
    plt.title(title)
    plt.show()

In [None]:
# 6) Correlation heatmap
import seaborn as sns
# Pairwise correlations between all numeric columns of df, drawn as a heatmap.
num_cols = df.select_dtypes(include=[np.number]).columns
corr_matrix = df[num_cols].corr()
sns.heatmap(corr_matrix, cmap='coolwarm')
plt.show()

In [None]:
# 7) Anomaly Detection - IsolationForest & LOF
# Fit two unsupervised outlier detectors on the same feature matrix and store a
# 0/1 flag per row for each of them (1 = flagged as an outlier).
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
features = ['bytes_in','bytes_out','session_duration','avg_packet_size']
X = df[features].fillna(0)  # the estimators are fitted on NaN-free input

# fit_predict() returns one label per row of X, in row order: -1 for outliers and
# +1 for inliers. Both results are converted to plain arrays of 0/1, so that
#  * the two columns share the same encoding, and
#  * the assignment is positional. Wrapping the labels in a new pd.Series would
#    give them a fresh 0..n-1 index, and assigning that to df aligns on index
#    labels -- after drop_duplicates() the labels have gaps, which shifts values
#    onto the wrong rows and leaves NaN in the others.
iso = IsolationForest(contamination=0.05, random_state=42)
iso_labels = iso.fit_predict(X)
df['iso_anomaly'] = np.where(iso_labels == -1, 1, 0)

lof = LocalOutlierFactor(contamination=0.05)
lof_labels = lof.fit_predict(X)
df['lof_anomaly'] = np.where(lof_labels == -1, 1, 0)

# Cross-count of the two flags (how often the detectors agree / disagree).
print(df[['iso_anomaly','lof_anomaly']].value_counts())

In [None]:
# 8) Classification - RandomForest
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
# Inputs: the engineered feature columns (NaN -> 0); target: the is_suspicious flag.
X = df[features].fillna(0)
y = df['is_suspicious']
# 70/30 split, stratified on the target, with a fixed seed.
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
rf = RandomForestClassifier(n_estimators=200, random_state=42)
# fit() returns the fitted estimator, so the prediction can be chained onto it.
y_pred = rf.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# 9) Neural Network - MLP
# Small fully connected binary classifier trained on the same train/test split
# as the RandomForest cell (X_train, X_test, y_train, y_test).
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input, Normalization
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout and
# batch shuffling -- and therefore the printed accuracy -- repeat across runs.
tf.keras.utils.set_random_seed(42)
# Standardise each input feature using statistics learned from the training rows
# only. The layer is part of the model, so callers keep passing the raw,
# unscaled feature frame to fit / evaluate / predict.
norm = Normalization(axis=-1)
norm.adapt(X_train.to_numpy(dtype='float32'))
mlp = Sequential([Input(shape=(X_train.shape[1],)),  # explicit Input layer instead of input_shape= on Dense
                  norm,
                  Dense(64,activation='relu'),
                  Dense(32,activation='relu'),Dropout(0.3),
                  Dense(1,activation='sigmoid')])  # single sigmoid unit for the binary target
mlp.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
mlp.fit(X_train,y_train,epochs=10,batch_size=32,verbose=0)
# evaluate() returns [loss, accuracy] given the metrics compiled above.
print("MLP Accuracy:", mlp.evaluate(X_test,y_test,verbose=0)[1])