In [None]:
# Execute the local setup script inside this kernel's namespace.
# Later cells use `username` and `agent` without defining them, so they are
# presumably created by this script — TODO confirm.
%run  "./env_setup.py"

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
# Load every row and column of the phishing table from the user's schema.
# `username` and `agent` are not defined in this cell; presumably they come
# from env_setup.py — TODO confirm.
table = "phishing_data"
qualified_table = f"{username}.{table}"

sql = f"""
select *
from {qualified_table} pd
"""

df = agent.execute_dml(sql)
df

In [None]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score, confusion_matrix, f1_score,
    roc_auc_score, matthews_corrcoef, cohen_kappa_score
)
import pandas as pd
import numpy as np

# Prepare dataset: `is_phishing` is the label, everything else starts as a feature.
y = df["is_phishing"]

# Drop non-numeric or large categorical columns.
# All columns (label included) are removed in a single call so only one copy of
# the frame is made, and a missing column raises before anything is changed.
excluded_columns = [
    "attack_state", "severity_score", "timestamp",
    "source_ip", "dest_ip", "source_port", "dest_port",
]
X = df.drop(columns=["is_phishing", *excluded_columns])

# Encode categorical columns as integer codes.
# The same LabelEncoder object is re-fitted for every column, so after the loop
# `encoder` only remembers the classes of the last column.
encoder = LabelEncoder()
for col in ["protocol", "tcp_flags", "service", "is_weekend"]:
    X[col] = encoder.fit_transform(X[col])

# Compute scale_pos_weight for extreme imbalance (negatives per positive).
# Vectorized counts avoid iterating the Series element by element in Python.
neg_count = int((y == 0).sum())
pos_count = int((y == 1).sum())
if pos_count == 0:
    # Without positives the ratio is undefined and the classifier cannot be trained.
    raise ValueError("No rows with is_phishing == 1; cannot compute scale_pos_weight.")
scale_pos_weight = neg_count / pos_count  # ~998k / 1.4k ≈ 710

# Stratified K-Fold CV: each fold keeps the class ratio of the full data set,
# and the fixed seed makes the fold assignment repeatable.
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Storage for per-fold metrics (one entry appended per fold)
acc_scores, roc_auc_scores, mcc_scores, kappa_scores, nir_scores = [], [], [], [], []
sensitivity_scores, specificity_scores, f1_scores = [], [], []

for fold, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Initialize XGBoost with class weighting.
    # `use_label_encoder` is deprecated in recent XGBoost releases, so it is no
    # longer passed; `random_state` pins the model seed alongside the fold seed.
    xgb_model = xgb.XGBClassifier(
        objective="binary:logistic",
        n_estimators=200,
        learning_rate=0.1,
        max_depth=5,
        scale_pos_weight=scale_pos_weight,
        eval_metric="logloss",
        random_state=42,
    )

    # Fit model
    xgb_model.fit(X_train, y_train)

    # Predict hard labels and the probability of the positive class
    y_pred = xgb_model.predict(X_test)
    y_prob = xgb_model.predict_proba(X_test)[:, 1]

    # Confusion matrix. Pinning labels=[0, 1] guarantees a 2x2 matrix, so the
    # four-way unpack works even if one class is absent from y_test or y_pred.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_prob)
    mcc = matthews_corrcoef(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0   # recall for phishing
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0   # true negative rate
    # No Information Rate: accuracy of always predicting the most frequent class
    most_freq_class = y_test.mode()[0]
    nir = (y_test == most_freq_class).mean()

    # Store metrics
    acc_scores.append(acc)
    roc_auc_scores.append(roc_auc)
    mcc_scores.append(mcc)
    kappa_scores.append(kappa)
    nir_scores.append(nir)
    f1_scores.append(f1)
    sensitivity_scores.append(sensitivity)
    specificity_scores.append(specificity)

    print(f"Fold {fold} Metrics:")
    print(f"Accuracy: {acc:.4f}, F1: {f1:.4f}, Sensitivity: {sensitivity:.4f}, Specificity: {specificity:.4f}")
    print(f"ROC-AUC: {roc_auc:.4f}, MCC: {mcc:.4f}, Kappa: {kappa:.4f}, NIR: {nir:.4f}")
    print("-" * 50)

# Summary across folds: print the mean of every per-fold metric list,
# in the same order and with the same labels as the per-fold report.
fold_metric_table = [
    ("Accuracy", acc_scores),
    ("F1-score", f1_scores),
    ("Sensitivity", sensitivity_scores),
    ("Specificity", specificity_scores),
    ("ROC-AUC", roc_auc_scores),
    ("MCC", mcc_scores),
    ("Cohen's Kappa", kappa_scores),
    ("No Information Rate", nir_scores),
]

print("\n=== Cross-Validation Summary ===")
for metric_label, per_fold_values in fold_metric_table:
    print(f"Mean {metric_label}: {np.mean(per_fold_values):.4f}")


In [None]:
# Visualize confusion matrix for the last fold
import seaborn as sns

# `y_test` and `y_pred` still hold the values left by the final iteration of
# the cross-validation loop, so this is the matrix of the last fold only.
cm = confusion_matrix(y_test, y_pred)
class_names = ['Normal', 'Phishing']

# Draw the heatmap on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    cbar_kws={'label': 'Count'},
    ax=ax,
)
ax.set_title('Confusion Matrix - XGBoost Model (Final Fold)', fontsize=14, fontweight='bold')
ax.set_ylabel('Actual', fontsize=12)
ax.set_xlabel('Predicted', fontsize=12)
fig.tight_layout()
plt.show()

# Print detailed confusion matrix metrics.
# tn / fp / fn / tp are the counts left over from the last fold of the loop.
total_predictions = tn + fp + fn + tp
detection_rate = tp / (tp + fn)
false_alarm_rate = fp / (fp + tn)

print("\n=== Confusion Matrix Breakdown (Final Fold) ===")
print(f"True Negatives (TN): {tn:,}")
print(f"False Positives (FP): {fp:,}")
print(f"False Negatives (FN): {fn:,}")
print(f"True Positives (TP): {tp:,}")
print(f"\nTotal Predictions: {total_predictions:,}")
print(f"Phishing Detection Rate: {detection_rate:.2%}")
print(f"False Alarm Rate: {false_alarm_rate:.2%}")

# Insider Threat Detection


In [1]:
# Execute the local setup script inside this kernel's namespace.
# The cells below use `username` and `agent` without defining them, so they are
# presumably created by this script — TODO confirm.
# NOTE(review): the captured output of this cell prints the database URL; make
# sure no credentials are left in saved outputs.
%run  "./env_setup.py"

/home/jonahs23/networkInstrusion/.env
User:  jonahs23
Database:  postgresql://jonahs23:<REDACTED>@ads1.datasci.vt.edu:5432/ads_db5


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
# Load every row and column of the insider-threat traffic table from the
# user's schema. `username` and `agent` are not defined in this cell;
# presumably they come from env_setup.py — TODO confirm.
table = "network_traffic_history_itd"
qualified_table = f"{username}.{table}"

sql = f"""
select *
from {qualified_table} pd
"""

df = agent.execute_dml(sql)


  df = pd.read_sql_query(query, conn)


In [3]:
# Separate the insider-threat label from the candidate feature columns.
y = df["is_itd"]
X = df.drop(columns="is_itd")

In [4]:
# Remove the columns that are not used as model features.
# A single drop call makes one copy of the frame instead of one per column,
# and if any column is missing it raises before X is modified at all.
excluded_columns = [
    "attack_state", "severity_score", "timestamp",
    "source_ip", "dest_ip", "source_port", "dest_port",
]
X = X.drop(columns=excluded_columns)


In [5]:
from sklearn.preprocessing import LabelEncoder
# Replace each categorical column with integer codes.
# One LabelEncoder object is re-fitted per column, so once the loop finishes
# `encoder` only holds the classes of the last column processed.
encoder = LabelEncoder()
categorical_columns = ("protocol", "tcp_flags", "service", "is_weekend")
for column_name in categorical_columns:
    X[column_name] = encoder.fit_transform(X[column_name])

In [6]:
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold 
from sklearn.metrics import (
    accuracy_score, confusion_matrix, f1_score,
    roc_auc_score, matthews_corrcoef, cohen_kappa_score,
    classification_report, precision_score, recall_score
)
import numpy as np

# Sanity-check the modelling inputs: matrix shape, class balance, feature names.
class_counts = y.value_counts()
feature_names = list(X.columns)

print(f"Feature shape: {X.shape}")
print(f"Target distribution:\n{class_counts}")
print(f"\nFeatures used:\n{feature_names}")

Feature shape: (4999980, 21)
Target distribution:
is_itd
0    4998141
1       1839
Name: count, dtype: int64

Features used:
['protocol', 'duration', 'packets', 'bytes', 'bytes_per_packet', 'packets_per_second', 'tcp_flags', 'service', 'is_weekend', 'hour_of_day', 'day_of_week', 'bytes_ratio', 'packet_size_variance', 'connection_frequency', 'unique_ports_per_source', 'off_hours', 'is_internal_source', 'is_internal_dest', 'internal_only', 'external_only', 'high_data_volume_off_hours_internal']


In [7]:
# Hold out 20% of the rows as a test set. Stratifying on y keeps the class
# ratio the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

train_rows = X_train.shape[0]
test_rows = X_test.shape[0]
train_class_counts = y_train.value_counts()
test_class_counts = y_test.value_counts()

print(f"Training set size: {train_rows:,}")
print(f"Test set size: {test_rows:,}")
print(f"\nTraining set target distribution:\n{train_class_counts}")
print(f"\nTest set target distribution:\n{test_class_counts}")

Training set size: 3,999,984
Test set size: 999,996

Training set target distribution:
is_itd
0    3998513
1       1471
Name: count, dtype: int64

Test set target distribution:
is_itd
0    999628
1       368
Name: count, dtype: int64


In [8]:
# Cross-validation plan: five shuffled, stratified folds with a fixed seed so
# the fold assignment is the same on every run.
n_splits = 5
skf = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=42,
)


In [None]:
# Storage for metrics across all folds (one entry appended per fold)
acc_scores, precision_scores_list, recall_scores_list, f1_scores = [], [], [], []
mcc_scores, kappa_scores, nir_scores = [], [], []
sensitivity_scores, specificity_scores = [], []

# Per-fold confusion-matrix counts. The aggregated confusion-matrix cell reads
# `results` as a list of dicts with the keys 'tn', 'fp', 'fn', 'tp'.
results = []

# Optional cap on the number of rows used to fit each SVM.
# NOTE(review): kernel SVM training cost grows at least quadratically with the
# number of rows, and the captured output of this cell stops at
# "Training One-Class SVM..." on fold 1 with ~4M rows. Set this to an integer
# (e.g. 100_000) to fit on a random subsample; None keeps the full training fold.
max_svm_train_rows = None

for fold, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
    print(f"\n{'='*60}")
    print(f"FOLD {fold}/{n_splits}")
    print(f"{'='*60}")

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    print(f"Training set size: {X_train.shape[0]:,}")
    print(f"Test set size: {X_test.shape[0]:,}")
    print(f"Training ITD cases: {y_train.sum():,}")
    print(f"Test ITD cases: {y_test.sum():,}")

    # Scale features (important for SVM). The scaler is fitted on the training
    # fold only, so no test-fold statistics leak into training.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Calculate nu parameter (upper bound on fraction of outliers) from the
    # labelled ITD fraction of the training fold.
    nu = y_train.sum() / len(y_train)
    print(f"Nu parameter (expected outlier fraction): {nu:.6f}")

    # Optionally subsample the rows used for fitting (see max_svm_train_rows).
    X_fit = np.asarray(X_train_scaled)
    if max_svm_train_rows is not None and X_fit.shape[0] > max_svm_train_rows:
        rng = np.random.default_rng(42 + fold)
        sampled_rows = rng.choice(X_fit.shape[0], size=max_svm_train_rows, replace=False)
        X_fit = X_fit[sampled_rows]
        print(f"Fitting on a random subsample of {X_fit.shape[0]:,} training rows")

    # Initialize One-Class SVM model
    # nu: approximation of the fraction of outliers
    # kernel: RBF kernel for non-linear decision boundary
    # gamma: kernel coefficient (scale = 1/(n_features * X.var()))
    oc_svm = OneClassSVM(
        nu=nu,
        kernel='rbf',
        gamma='scale',
        verbose=False
    )

    print("Training One-Class SVM...")
    oc_svm.fit(X_fit)

    # Make predictions
    # One-Class SVM returns +1 for inliers (normal) and -1 for outliers (ITD)
    y_pred_svm = oc_svm.predict(X_test_scaled)
    # Convert: -1 (outlier/ITD) -> 1, +1 (inlier/normal) -> 0
    y_pred = np.where(y_pred_svm == -1, 1, 0)

    # Calculate metrics. Pinning labels=[0, 1] guarantees a 2x2 matrix, so the
    # four-way unpack works even when the model predicts a single class.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()

    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    mcc = matthews_corrcoef(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)

    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

    # No Information Rate: accuracy of always predicting the most frequent class
    most_freq_class = y_test.mode()[0]
    nir = (y_test == most_freq_class).mean()

    # Store metrics
    acc_scores.append(acc)
    precision_scores_list.append(precision)
    recall_scores_list.append(recall)
    f1_scores.append(f1)
    mcc_scores.append(mcc)
    kappa_scores.append(kappa)
    nir_scores.append(nir)
    sensitivity_scores.append(sensitivity)
    specificity_scores.append(specificity)
    results.append({
        "fold": fold,
        "tn": int(tn),
        "fp": int(fp),
        "fn": int(fn),
        "tp": int(tp),
    })

    # Print fold results
    print(f"\nFold {fold} Results:")
    print(f"Accuracy: {acc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")
    print(f"Sensitivity: {sensitivity:.4f}, Specificity: {specificity:.4f}")
    print(f"MCC: {mcc:.4f}, Kappa: {kappa:.4f}, NIR: {nir:.4f}")
    print(f"Confusion Matrix: TN={tn:,}, FP={fp:,}, FN={fn:,}, TP={tp:,}")
    print(f"ITD Detection Rate: {sensitivity:.2%}")

print(f"\n{'='*60}")
print("CROSS-VALIDATION SUMMARY")
print(f"{'='*60}")
# Mean of each per-fold metric.
print(f"Mean Accuracy: {np.mean(acc_scores):.4f}")
print(f"Mean Precision: {np.mean(precision_scores_list):.4f}")
print(f"Mean Recall: {np.mean(recall_scores_list):.4f}")
print(f"Mean F1-score: {np.mean(f1_scores):.4f}")
print(f"Mean Sensitivity: {np.mean(sensitivity_scores):.4f}")
print(f"Mean Specificity: {np.mean(specificity_scores):.4f}")
print(f"Mean MCC: {np.mean(mcc_scores):.4f}")
print(f"Mean Cohen's Kappa: {np.mean(kappa_scores):.4f}")
print(f"Mean No Information Rate: {np.mean(nir_scores):.4f}")


FOLD 1/5
Training set size: 3,999,984
Test set size: 999,996
Training ITD cases: 1,472
Test ITD cases: 367
Nu parameter (expected outlier fraction): 0.000368
Training One-Class SVM...


In [None]:
# Calculate comprehensive metrics for the predictions currently held in
# `y_test` / `y_pred` (the values left by the most recent cell that set them).
# labels=[0, 1] guarantees a 2x2 matrix for the four-way unpack.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()

acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
mcc = matthews_corrcoef(y_test, y_pred)
kappa = cohen_kappa_score(y_test, y_pred)

# Calculate sensitivity, specificity and false-alarm rate; each falls back to 0
# when its denominator is empty instead of dividing by zero.
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0  # recall for ITD
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0  # true negative rate
false_alarm_rate = fp / (fp + tn) if (fp + tn) > 0 else 0

# No Information Rate: accuracy of always predicting the most frequent class
most_freq_class = y_test.mode()[0]
nir = (y_test == most_freq_class).mean()

# The model evaluated in this notebook section is the One-Class SVM, so the
# report header names that model.
print("=== One-Class SVM Model Performance ===")
print(f"\nAccuracy: {acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall (Sensitivity): {recall:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"Specificity: {specificity:.4f}")
print(f"\nMatthews Correlation Coefficient: {mcc:.4f}")
print(f"Cohen's Kappa: {kappa:.4f}")
print(f"No Information Rate: {nir:.4f}")
print(f"\n{'='*50}")
print("\nConfusion Matrix:")
print(f"True Negatives (TN): {tn:,}")
print(f"False Positives (FP): {fp:,}")
print(f"False Negatives (FN): {fn:,}")
print(f"True Positives (TP): {tp:,}")
print(f"\nITD Detection Rate: {sensitivity:.2%}")
print(f"False Alarm Rate: {false_alarm_rate:.2%}")

In [None]:
# Create aggregated confusion matrix from all folds
import seaborn as sns

# Sum up confusion matrices from all folds.
# `results` must be a list with one dict per fold holding the integer counts
# under the keys 'tn', 'fp', 'fn' and 'tp'.
total_tn = sum(r['tn'] for r in results)
total_fp = sum(r['fp'] for r in results)
total_fn = sum(r['fn'] for r in results)
total_tp = sum(r['tp'] for r in results)

# Create confusion matrix array (rows = actual class, columns = predicted class)
cm_total = np.array([[total_tn, total_fp],
                     [total_fn, total_tp]])

# Visualize aggregated confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm_total, annot=True, fmt='d', cmap='Reds',
            xticklabels=['Normal', 'Insider Threat'],
            yticklabels=['Normal', 'Insider Threat'],
            cbar_kws={'label': 'Count'})

plt.title('Aggregated Confusion Matrix - One-Class SVM (All Folds)', fontsize=14, fontweight='bold')
plt.ylabel('Actual', fontsize=12)
plt.xlabel('Predicted', fontsize=12)
plt.tight_layout()
plt.show()

# Overall rates. Every ratio falls back to 0 when its denominator is empty, so
# all four printed rates are guarded the same way.
actual_positives = total_tp + total_fn
actual_negatives = total_fp + total_tn
predicted_positives = total_tp + total_fp
overall_recall = total_tp / actual_positives if actual_positives > 0 else 0
overall_false_alarm = total_fp / actual_negatives if actual_negatives > 0 else 0
overall_precision = total_tp / predicted_positives if predicted_positives > 0 else 0

# Print detailed confusion matrix breakdown
print("\n=== Aggregated Confusion Matrix (All Folds Combined) ===")
print(f"True Negatives (TN): {total_tn:,}")
print(f"False Positives (FP): {total_fp:,}")
print(f"False Negatives (FN): {total_fn:,}")
print(f"True Positives (TP): {total_tp:,}")
print(f"\nTotal Predictions: {total_tn + total_fp + total_fn + total_tp:,}")
print(f"Overall ITD Detection Rate: {overall_recall * 100:.2f}%")
print(f"Overall False Alarm Rate: {overall_false_alarm * 100:.2f}%")
print(f"Overall Precision: {overall_precision:.4f}")
print(f"Overall Recall: {overall_recall:.4f}")

In [None]:
# Visualize confusion matrix for the predictions currently held in
# `y_test` / `y_pred` (the values left by the most recent cell that set them).
import seaborn as sns

plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Reds',
            xticklabels=['Normal', 'Insider Threat'],
            yticklabels=['Normal', 'Insider Threat'],
            cbar_kws={'label': 'Count'})

# The model evaluated in this notebook section is the One-Class SVM, so the
# figure title names that model.
plt.title('Confusion Matrix - One-Class SVM Model', fontsize=14, fontweight='bold')
plt.ylabel('Actual', fontsize=12)
plt.xlabel('Predicted', fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
# Analyze anomaly scores distribution
# Score the test rows of the last fold with the last fitted One-Class SVM.
# decision_function returns the signed distance to the learned boundary:
# positive for inliers (normal), negative for outliers (flagged as ITD).
anomaly_scores = oc_svm.decision_function(X_test_scaled)

# Positional boolean masks as plain NumPy arrays, so indexing the score array
# does not depend on the index labels of y_test.
normal_mask = (y_test == 0).to_numpy()
threat_mask = (y_test == 1).to_numpy()
normal_scores = anomaly_scores[normal_mask]
threat_scores = anomaly_scores[threat_mask]

plt.figure(figsize=(12, 5))

# Plot 1: Anomaly score distribution by class
plt.subplot(1, 2, 1)
plt.hist(normal_scores, bins=50, alpha=0.7, label='Normal', color='blue')
plt.hist(threat_scores, bins=50, alpha=0.7, label='Insider Threat', color='red')
plt.xlabel('Anomaly Score', fontsize=11)
plt.ylabel('Frequency', fontsize=11)
plt.title('Anomaly Score Distribution by Class', fontsize=12, fontweight='bold')
plt.legend()
plt.grid(alpha=0.3)

# Plot 2: Box plot of anomaly scores
plt.subplot(1, 2, 2)
data_for_box = [normal_scores, threat_scores]
plt.boxplot(data_for_box)
# Boxes are drawn at positions 1..n by default. Labelling them through xticks
# avoids the `labels=` keyword, which newer Matplotlib releases deprecate.
plt.xticks([1, 2], ['Normal', 'Insider Threat'])
plt.ylabel('Anomaly Score', fontsize=11)
plt.title('Anomaly Score Box Plot', fontsize=12, fontweight='bold')
plt.grid(alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\nAnomaly Score Statistics:")
print(f"Normal - Mean: {normal_scores.mean():.4f}, Std: {normal_scores.std():.4f}")
print(f"Insider Threat - Mean: {threat_scores.mean():.4f}, Std: {threat_scores.std():.4f}")

## Model Summary: One-Class SVM

The One-Class SVM model leverages the following key features from `network_traffic_history_itd`:

**Engineered Features (from data_preparation):**
- `off_hours`: Traffic outside business hours (weekends or before 9am/after 5pm)
- `is_internal_source`, `is_internal_dest`: Internal IP address indicators
- `internal_only`, `external_only`: Traffic direction flags
- `high_data_volume_off_hours_internal`: High data volume during off-hours from internal sources
- `ext_transfer`: External data transfer relay patterns

**Original Network Features:**
- `duration`, `packets`, `bytes`, `bytes_per_packet`
- `protocol`, `tcp_flags`, `service`
- `bytes_ratio`, `packet_size_variance`, `connection_frequency`

**One-Class SVM Approach:**
- **Unsupervised anomaly detection** using RBF kernel for non-linear decision boundaries
- **Feature scaling** applied via StandardScaler (critical for SVM performance)
- **Nu parameter** set to expected proportion of outliers (insider threats)
- Learns a closed boundary around normal traffic in the RBF kernel's feature space; anything outside it is classified as an anomaly
- More robust to high-dimensional data compared to Isolation Forest
- Better at capturing complex, non-linear relationships in the feature space

The One-Class SVM learns the boundary of normal network behavior and flags insider threats as outliers that fall outside this learned boundary.