In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import plotly.express as px
import time

filepath = "/content/drive/MyDrive/Wednesday-workingHours.pcap_ISCX.csv"

# STEP 4: Load and preprocess dataset
df = pd.read_csv(filepath)
# Normalise header names in case the CSV headers carry surrounding whitespace,
# so that columns such as 'Label' can be addressed by their bare name.
df.columns = df.columns.str.strip()

# Keep only attack samples. The explicit .copy() makes `df` an independent
# frame, so adding the 'Relevance' column below is not a write into a slice of
# the unfiltered frame (which triggers pandas' SettingWithCopyWarning).
df = df[df['Label'] != 'BENIGN'].copy()
if df.empty:
    raise ValueError("No attack samples found in dataset.")

print("Unique attack types:", df['Label'].unique())

# Encode target: each attack label becomes an integer used as the ranking
# relevance grade.
encoder = LabelEncoder()
df['Relevance'] = encoder.fit_transform(df['Label'])

# Clean features: everything except the raw label and its encoded form.
X = df.drop(columns=['Label', 'Relevance'])
y = df['Relevance']

# Encode non-numeric features as integer codes (values are stringified first
# so mixed-type columns encode without errors).
for col in X.select_dtypes(include=['object']).columns:
    X[col] = LabelEncoder().fit_transform(X[col].astype(str))
# Missing values and +/-inf are both replaced by 0.
X = X.fillna(0).replace([np.inf, -np.inf], 0)

# Subsample for speed (optional). X and y are re-selected by the sampled index
# so that df, X and y stay aligned row-for-row.
# NOTE(review): the sample is not stratified, so a rare label may end up with
# fewer than two rows, which the stratified split below would reject.
if len(df) > 30000:
    df = df.sample(30000, random_state=42)
    X = X.loc[df.index]
    y = y.loc[df.index]

# Train/Test split (80/20, stratified on the encoded label)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

def build_groups(data_len, max_size=10000):
    """Split ``data_len`` rows into consecutive group sizes of at most ``max_size``.

    Args:
        data_len: Total number of rows to cover.
        max_size: Size of every full group.

    Returns:
        A list of group sizes: as many ``max_size`` entries as fit, followed by
        the remainder when it is non-zero. Empty when ``data_len`` is 0.
    """
    whole, leftover = divmod(data_len, max_size)
    sizes = [max_size] * whole
    if leftover:
        sizes.append(leftover)
    return sizes

# Group-size lists for the train and test splits. Groups are consecutive
# chunks of rows in split order, not groups derived from a column of the data.
group_train, group_test = (
    build_groups(len(split)) for split in (X_train, X_test)
)


def run_model(name, params):
    """Train one LightGBM ranking model and rank every row of ``X`` by score.

    The model is fitted on the module-level train split and validated on the
    test split, then used to score the full feature matrix ``X`` (training
    rows included). Scores are attached to a copy of ``df``, which must be
    aligned row-for-row with ``X``.

    Args:
        name: Display name used in log lines and in the returned dict.
        params: LightGBM parameter dict forwarded to ``lgb.train``.

    Returns:
        A dict with keys 'name', 'model', 'ranked' (copy of ``df`` with
        'Score' and 'Rank' columns, sorted by descending score), 'top_attack',
        'top_score', 'train_time' (seconds, rounded to 2 decimals) and
        'params'.
    """
    print(f"\nTraining {name}...")
    start = time.time()

    train_data = lgb.Dataset(X_train, label=y_train, group=group_train)
    valid_data = lgb.Dataset(X_test, label=y_test, group=group_test)

    # LightGBM does not support early stopping with DART boosting: the
    # callback only emits "Early stopping is not available in dart mode" and
    # disables itself. Register it only for the other boosting types.
    # 'boosting' is the canonical parameter; 'boosting_type' and 'boost' are
    # its aliases.
    boosting = 'gbdt'
    for key in ('boosting', 'boosting_type', 'boost'):
        if key in params:
            boosting = params[key]
            break

    callbacks = [lgb.log_evaluation(0)]
    if boosting != 'dart':
        callbacks.append(lgb.early_stopping(20))

    model = lgb.train(
        params,
        train_data,
        valid_sets=[valid_data],
        num_boost_round=250,
        callbacks=callbacks
    )

    elapsed = round(time.time() - start, 2)
    print(f" {name} training completed in {elapsed} seconds.")

    # Score the whole dataset and order it from highest to lowest score.
    # Dense ranking gives tied scores the same rank with no gaps.
    df_pred = df.copy()
    df_pred['Score'] = model.predict(X)
    df_pred['Rank'] = df_pred['Score'].rank(ascending=False, method='dense')
    df_pred = df_pred.sort_values('Score', ascending=False)

    top_attack = df_pred.iloc[0]['Label']
    top_score = df_pred.iloc[0]['Score']
    print(f"Top attack for {name}: {top_attack} (Score: {top_score:.4f})")

    return {
        'name': name,
        'model': model,
        'ranked': df_pred,
        'top_attack': top_attack,
        'top_score': top_score,
        'train_time': elapsed,
        'params': params
    }


# Configuration reported as "RankNet".
# NOTE(review): 'rank_xendcg' is documented by LightGBM as its XE_NDCG_MART
# objective - confirm that the "RankNet" label is intended.
ranknet_params = dict(
    objective='rank_xendcg',
    metric='ndcg',
    learning_rate=0.05,
    num_leaves=127,
    ndcg_eval_at=[1, 3, 5],
    verbose=-1,
)

# Configuration reported as "LambdaRank": lambdarank objective, default booster.
lambdarank_params = dict(
    objective='lambdarank',
    metric='ndcg',
    learning_rate=0.05,
    num_leaves=127,
    ndcg_eval_at=[1, 3, 5],
    verbose=-1,
)

# Configuration reported as "LambdaMART (DART)": same lambdarank objective,
# but boosted with DART.
lambdamart_params = dict(
    objective='lambdarank',
    metric='ndcg',
    boosting_type='dart',
    learning_rate=0.05,
    num_leaves=127,
    ndcg_eval_at=[1, 3, 5],
    verbose=-1,
)

# Train the three configurations one after another, keeping each result dict.
ranknet_result = run_model("RankNet", ranknet_params)
lambdarank_result = run_model("LambdaRank", lambdarank_params)
lambdamart_result = run_model("LambdaMART", lambdamart_params)

def visualize(result):
    """Show three Plotly figures for one model's ranked output.

    Args:
        result: Dict holding at least 'name' (used in the figure titles) and
            'ranked' (a DataFrame with 'Label', 'Score' and 'Rank' columns).
    """
    algo = result['name']
    ranked = result['ranked']

    # Bar chart of the first ten rows of the ranked frame.
    px.bar(
        ranked[['Label', 'Score']].head(10),
        x='Label',
        y='Score',
        color='Score',
        title=f"Top 10 Critical Alerts - {algo}",
        color_continuous_scale='Reds',
    ).show()

    # Histogram of all scores.
    px.histogram(
        ranked,
        x='Score',
        nbins=50,
        title=f"Score Distribution - {algo}",
    ).show()

    # Rank-vs-score scatter on at most 1000 randomly drawn rows (unseeded, so
    # the plotted subset differs between runs).
    n_points = min(1000, len(ranked))
    subset = ranked.sample(n_points)
    px.scatter(
        subset,
        x='Rank',
        y='Score',
        color='Label',
        title=f"Score vs Rank - {algo}",
    ).show()

print("\nGenerating visualizations...")
# Plot each model's results in the same order they were trained.
for result in (ranknet_result, lambdarank_result, lambdamart_result):
    visualize(result)


# One summary row per algorithm: top-ranked label, its score (4 decimals) and
# the training time recorded by run_model.
summary_df = pd.DataFrame([
    {
        'Algorithm': label,
        'Top Attack': outcome['top_attack'],
        'Top Score': round(outcome['top_score'], 4),
        'Training Time (s)': outcome['train_time'],
    }
    for label, outcome in (
        ('RankNet', ranknet_result),
        ('LambdaRank', lambdarank_result),
        ('LambdaMART (DART)', lambdamart_result),
    )
])

print("\nCOMPARISON SUMMARY")
display(summary_df)

import plotly.graph_objects as go

# Overlay the rank-vs-score scatter of all three models in a single figure,
# one marker trace per model.
fig = go.Figure()
for result in (ranknet_result, lambdarank_result, lambdamart_result):
    ranked = result['ranked']
    # At most 1000 randomly drawn rows per model (unseeded sample).
    subset = ranked.sample(min(1000, len(ranked)))
    marker_trace = go.Scatter(
        x=subset['Rank'],
        y=subset['Score'],
        mode='markers',
        name=result['name'],
        opacity=0.6,
    )
    fig.add_trace(marker_trace)

fig.update_layout(
    title="Overlay: Rank vs Score Comparison",
    xaxis_title="Rank",
    yaxis_title="Score",
)
fig.show()


Unique attack types: ['DoS slowloris' 'DoS Slowhttptest' 'DoS Hulk' 'DoS GoldenEye'
 'Heartbleed']

Training RankNet...
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[1]	valid_0's ndcg@1: 1	valid_0's ndcg@3: 1	valid_0's ndcg@5: 1
 RankNet training completed in 0.58 seconds.
Top attack for RankNet: DoS slowloris (Score: 0.1830)

Training LambdaRank...
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[1]	valid_0's ndcg@1: 1	valid_0's ndcg@3: 1	valid_0's ndcg@5: 1
 LambdaRank training completed in 0.82 seconds.
Top attack for LambdaRank: DoS slowloris (Score: 0.1000)

Training LambdaMART...



Early stopping is not available in dart mode



 LambdaMART training completed in 22.75 seconds.
Top attack for LambdaMART: Heartbleed (Score: 4.9646)

Generating visualizations...



COMPARISON SUMMARY


Unnamed: 0,Algorithm,Top Attack,Top Score,Training Time (s)
0,RankNet,DoS slowloris,0.183,0.58
1,LambdaRank,DoS slowloris,0.1,0.82
2,LambdaMART (DART),Heartbleed,4.9646,22.75
