# Packet Classification & Anomaly Detection
Use this notebook to load features exported by `detection.py` and classify/analyze packets using ML models.

In [75]:
import pandas as pd
import numpy as np
import logging
import json
import os
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
from collections import Counter

# Root logger: show INFO and above, each record prefixed with its level in brackets.
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')

# File names used by the pipeline (no directory component, so they resolve
# against the current working directory).
# Raw capture read by the entry-point cell.
RAW_CSV = "captured_packets.csv"
# Cleaned/encoded frame written by clean_dataframe().
CLEAN_CSV = "cleaned_packets.csv"
# JSON summary written by analyze_and_export().
JSON_OUTPUT = "SCAM.json"

## 1. Clean and encode captured packet features

In [76]:
import ipaddress

def ip_to_int(ip):
    """Convert a dotted-quad IPv4 address to its 32-bit integer value.

    Args:
        ip: The address to convert. It is passed through ``str()`` first, so
            any object is accepted; surrounding whitespace is ignored.

    Returns:
        int: The integer form of the address (e.g. ``"192.168.1.1"`` ->
        ``3232235777``), or ``0`` when the value is not a valid IPv4 address.
        Note that ``0`` is also the integer form of ``"0.0.0.0"``.
    """
    try:
        # Strip first: IPv4Address rejects padded text such as " 10.0.0.1",
        # which would otherwise silently collapse to the 0 fallback.
        return int(ipaddress.IPv4Address(str(ip).strip()))
    except ValueError:
        # AddressValueError (the only parse failure) is a ValueError subclass.
        return 0  # fallback for malformed IPs

def clean_dataframe(df):
    """Normalise a raw packet frame into all-numeric model features.

    Processing steps, in order:
      1. Column names are stripped and lower-cased; rows with any missing
         value are dropped.
      2. If a ``label`` column exists it is encoded as good=0 / bad=1
         (case- and whitespace-insensitive). Values that are already numeric
         are kept. Rows whose label cannot be interpreted are dropped with a
         warning instead of crashing the integer cast.
      3. If a ``label`` column exists, rows labelled good (0) whose source or
         destination address is ``0.0.0.0`` are removed.
      4. ``src_ip`` / ``dst_ip`` are converted to integers via ``ip_to_int``.
      5. ``protocol`` and ``flags`` are label-encoded to integer codes.
      6. ``length``, ``ttl``, ``window_size``, ``src_port`` and ``dst_port``
         (when present) are coerced to int, with unparsable values becoming 0.

    The input frame is not modified. The cleaned frame is written to
    ``CLEAN_CSV`` as a side effect.

    Args:
        df: Raw packet DataFrame. Must contain ``src_ip``, ``dst_ip``,
            ``protocol`` and ``flags``; ``label`` is optional.

    Returns:
        pandas.DataFrame: A new, cleaned frame.
    """
    # set_axis returns a new object, so the caller's column names stay intact;
    # the trailing copy() makes the later column assignments unambiguous.
    df = df.set_axis(df.columns.str.strip().str.lower(), axis=1).dropna().copy()

    if 'label' in df.columns:
        raw_label = df['label']
        mapped = raw_label.astype(str).str.strip().str.lower().map({"good": 0, "bad": 1})
        # Anything that is not good/bad may already be numeric ("1", 0.0, ...).
        already_numeric = pd.to_numeric(raw_label, errors='coerce')
        encoded = pd.to_numeric(mapped.fillna(already_numeric), errors='coerce')

        recognised = encoded.notna()
        if not recognised.all():
            logging.warning(
                "Dropping %d row(s) with unrecognised label values",
                int((~recognised).sum()),
            )
        df = df[recognised].copy()
        df['label'] = encoded[recognised].astype(int).to_numpy()

        # Remove 0.0.0.0 only if label is 'good' (encoded as 0 at this point,
        # so pre-encoded numeric labels are covered as well).
        unspecified = (
            df['src_ip'].astype(str).str.strip().eq('0.0.0.0')
            | df['dst_ip'].astype(str).str.strip().eq('0.0.0.0')
        )
        df = df[~(unspecified & df['label'].eq(0))].copy()

    # Convert IPs to integers using ipaddress for robustness
    for ip_col in ('src_ip', 'dst_ip'):
        df[ip_col] = df[ip_col].astype(str).str.strip().apply(ip_to_int)

    # Encode protocol and flags.
    # NOTE(review): the fitted encoders are discarded here, so the integer
    # codes cannot be decoded later from this function's output alone.
    for cat_col in ('protocol', 'flags'):
        df[cat_col] = LabelEncoder().fit_transform(df[cat_col].astype(str))

    # Ensure numeric columns
    for col in ['length', 'ttl', 'window_size', 'src_port', 'dst_port']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)

    df.to_csv(CLEAN_CSV, index=False)
    return df


In [77]:
def train_model(df):
    """Fit a gradient-boosting classifier on the packet features and report hold-out metrics.

    The frame is split 70/30 (stratified on ``label``, fixed seed 42). The
    classifier is trained on the larger part and evaluated on the smaller
    one. A classification report, the accuracy and the precision are printed.

    Args:
        df: DataFrame holding the nine feature columns listed below plus a
            ``label`` column.

    Returns:
        The fitted ``GradientBoostingClassifier``.
    """
    feature_cols = [
        'src_ip', 'dst_ip', 'src_port', 'dst_port',
        'protocol', 'flags', 'length', 'ttl', 'window_size',
    ]
    inputs = df[feature_cols]
    target = df['label']

    # Stratify so both parts keep the same class proportions.
    parts = train_test_split(inputs, target, test_size=0.3, stratify=target, random_state=42)
    train_inputs, holdout_inputs, train_target, holdout_target = parts

    clf = GradientBoostingClassifier(random_state=42, learning_rate=0.1, n_estimators=150)
    clf.fit(train_inputs, train_target)
    holdout_pred = clf.predict(holdout_inputs)

    logging.info("\n--- Model Evaluation ---")
    report = classification_report(holdout_target, holdout_pred, digits=4)
    print(report)
    print("Accuracy:", accuracy_score(holdout_target, holdout_pred))
    print("Precision:", precision_score(holdout_target, holdout_pred))

    return clf

In [78]:
from collections import Counter
import json
import logging

def analyze_and_export(df, model, protocol_encoder=None, output_path=None):
    """Predict on every row, summarise the anomalies and write the summary as JSON.

    Side effects: a ``predicted`` column is added to (or overwritten in) the
    frame that was passed in, and the summary is written to ``output_path``.

    Args:
        df: Cleaned packet DataFrame containing the nine feature columns and a
            ``protocol`` column. A ``timestamp`` column is optional.
        model: Any object exposing ``predict(features_frame)``; rows predicted
            as ``1`` are treated as anomalies.
        protocol_encoder: Optional fitted encoder exposing
            ``inverse_transform``. When given, the protocol-distribution keys
            are decoded back to their original values.
        output_path: Destination of the JSON summary. Defaults to
            ``JSON_OUTPUT`` when omitted.

    Returns:
        dict: The summary that was written, with keys ``total_packets``,
        ``total_anomalies``, ``anomalies_by_type``, ``anomaly_ips``,
        ``protocol_distribution`` and ``anomalies_last_seen``.
    """
    features = ['src_ip', 'dst_ip', 'src_port', 'dst_port', 'protocol', 'flags', 'length', 'ttl', 'window_size']
    if output_path is None:
        output_path = JSON_OUTPUT

    if len(df):
        df['predicted'] = model.predict(df[features])
    else:
        # scikit-learn estimators raise on zero-row input; an empty frame
        # simply has no predictions.
        df['predicted'] = pd.Series(dtype=int, index=df.index)

    total_packets = len(df)
    anomalies = df[df['predicted'] == 1]
    total_anomalies = len(anomalies)

    anomalies_by_type = Counter(anomalies['predicted'])

    # Protocol distribution counts (encoded)
    proto_dist = df['protocol'].value_counts().to_dict()

    # Decode the keys in one batch when an encoder is supplied. Test against
    # None explicitly rather than relying on the encoder object's truthiness.
    if protocol_encoder is not None and proto_dist:
        decoded = protocol_encoder.inverse_transform(list(proto_dist.keys()))
        proto_dist = {
            # .item() turns numpy scalars into native values usable as JSON keys
            (name.item() if hasattr(name, 'item') else name): count
            for name, count in zip(decoded, proto_dist.values())
        }

    # NOTE(review): src_ip holds whatever the frame stores (integers after
    # cleaning), so these are stringified integers, not dotted-quad text.
    anomaly_ips = {
        "1": anomalies['src_ip'].astype(str).unique().tolist(),
        "2": [],
        "3": []
    }

    # The timestamp column is optional; report None when it is absent or
    # when there are no anomalies.
    has_timestamps = 'timestamp' in anomalies.columns and not anomalies.empty
    anomalies_last_seen = {
        "1": anomalies['timestamp'].max() if has_timestamps else None
    }

    summary = {
        "total_packets": total_packets,
        "total_anomalies": total_anomalies,
        "anomalies_by_type": dict(anomalies_by_type),
        "anomaly_ips": anomaly_ips,
        "protocol_distribution": proto_dist,
        "anomalies_last_seen": anomalies_last_seen
    }

    with open(output_path, 'w') as f:
        # default=str stringifies values json cannot encode natively
        json.dump(summary, f, indent=4, default=str)

    logging.info(f"Summary saved to {output_path}")
    return summary


In [68]:
# def notify_admin():
#     """Notify administrators about detected network anomalies via email"""
#     try:
#         import Email  # Import the Email.py module
#         success = Email.trigger_email()
#         if success:
#             logging.info("Alert email triggered successfully.")
#         else:
#             logging.warning("Alert email was triggered but may not have been sent to all recipients.")
#         return success
#     except Exception as e:
#         logging.error(f"Email trigger failed: {e}")
#         return False


In [80]:
if __name__ == "__main__":
    # Stop early with a clear message when the capture file is absent.
    raw_csv_missing = not os.path.exists(RAW_CSV)
    if raw_csv_missing:
        raise FileNotFoundError(f"{RAW_CSV} not found!")

    # Read with header=None (every row is treated as data), then attach the
    # column names by position.
    packet_columns = [
        'timestamp',
        'src_ip', 'dst_ip',
        'src_port', 'dst_port',
        'protocol', 'flags',
        'length', 'ttl', 'window_size',
        'label',
    ]
    df_raw = pd.read_csv(RAW_CSV, header=None)
    df_raw.columns = packet_columns

    # Pipeline: clean/encode -> train and evaluate -> predict on all rows and export.
    df_clean = clean_dataframe(df_raw)
    model = train_model(df_clean)
    summary = analyze_and_export(df_clean, model)



[INFO] 
--- Model Evaluation ---
[INFO] Summary saved to SCAM.json


              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000       306
           1     1.0000    1.0000    1.0000        39

    accuracy                         1.0000       345
   macro avg     1.0000    1.0000    1.0000       345
weighted avg     1.0000    1.0000    1.0000       345

Accuracy: 1.0
Precision: 1.0


In [None]:
##### end of `network_traffic_analysis.py` #####
# tata bye bye