In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import joblib
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import load_model
from geopy.distance import geodesic
from tqdm.auto import tqdm


# Register tqdm's pandas integration so that DataFrame.progress_apply
# (used further down for the per-row distance calculation) is available.
tqdm.pandas()

# ==========================================
# PART 1: LOAD & CONSOLIDATE DATA
# ==========================================
print("1. Consolidating outputs from all models...")

# A. Load Base Data (User Logins)
# Forward slashes are used so the relative path resolves on Windows, Linux
# and macOS; a backslash-separated path is only a path on Windows.
df = pd.read_csv("../phase1/user_logins.csv")
df['timestamp'] = pd.to_datetime(df['timestamp'])
# Order each user's logins chronologically; the per-user shift() features
# computed later rely on this ordering.
df = df.sort_values(by=['user_id', 'timestamp'])

# Accept 'device' as an alternative name for the 'device_user_agent' column.
if 'device_user_agent' not in df.columns and 'device' in df.columns:
    df.rename(columns={'device': 'device_user_agent'}, inplace=True)

# B. Load Network Scores (Phase 1.C)
# Only the file read is guarded: a missing file is the single expected
# failure, and it falls back to a neutral score of 0 for every login.
try:
    network_scores = pd.read_csv('../phase1/network_risk_scores.csv')
except FileNotFoundError:
    print("   ⚠️ Network scores not found. Defaulting to 0.")
    df['network_risk_score'] = 0.0
else:
    # Left join keeps every login; users without a score get 0.
    df = pd.merge(df, network_scores, on='user_id', how='left')
    df['network_risk_score'] = df['network_risk_score'].fillna(0)
    print("   ✅ Loaded Network Graph scores.")

# ==========================================
# PART 2: GENERATE SCORES FROM PHASE 1 MODELS
# ==========================================
print("2. Generating Meta-Features (Model Scores)...")

# A. Load Models
# NOTE: joblib.load unpickles arbitrary Python objects; only load artifacts
# that come from a trusted source.
# Forward slashes keep the relative paths valid on every platform.
try:
    iso_forest = joblib.load('../phase1/model_isolation_forest.pkl')
    scaler = joblib.load('../phase1/scaler.pkl')
    autoencoder = load_model('../phase1/model_autoencoder.h5')
    # We don't load LSTM here because it requires session sequences,
    # not single login rows. We simulate its score below.
    print("   ✅ Loaded Phase 1 behavior models.")
except Exception as e:
    print(f"   ❌ CRITICAL: Could not load models. {e}")
    # Nothing downstream can run without the models. Stop with a non-zero
    # status instead of the interactive exit() helper, which reports success.
    raise SystemExit(1) from e

# B. Re-Engineer Features for Behavior Models (Must match Phase 1 logic)
print("   Re-calculating features for inference...")

# Look up each user's previous login (timestamp and coordinates) in one go.
logins_by_user = df.groupby('user_id')
previous_timestamp = logins_by_user['timestamp'].shift(1)
previous_latitude = logins_by_user['lat'].shift(1)
previous_longitude = logins_by_user['lon'].shift(1)

# 1. Time Diff
# Hours elapsed since the user's previous login; a user's first login has
# no predecessor and is given 0.
df['prev_time'] = previous_timestamp
elapsed_seconds = (df['timestamp'] - df['prev_time']).dt.total_seconds()
df['time_diff_hours'] = (elapsed_seconds / 3600).fillna(0)

# 2. Velocity
# Previous coordinates are stored here; the distance itself is computed
# row by row afterwards.
df['prev_lat'] = previous_latitude
df['prev_lon'] = previous_longitude

def get_geo_dist(row):
    """Return the geodesic distance in km between a login and the previous one.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing
            'prev_lat', 'prev_lon', 'lat' and 'lon'.

    Returns:
        The distance in kilometres, or 0.0 when there is no previous login,
        a coordinate is missing, or the coordinates are rejected as invalid.
    """
    # A user's first login has no previous location to measure from.
    if pd.isna(row['prev_lat']):
        return 0.0

    previous_point = (row['prev_lat'], row['prev_lon'])
    current_point = (row['lat'], row['lon'])
    if any(pd.isna(value) for value in (*previous_point, *current_point)):
        return 0.0

    try:
        return geodesic(previous_point, current_point).km
    except (ValueError, TypeError):
        # Only bad coordinate values are treated as "no distance". A bare
        # except would also swallow KeyboardInterrupt and programming errors
        # on every single row.
        return 0.0

# Distance to the previous login, computed row by row with a progress bar.
df['dist_km'] = df.progress_apply(get_geo_dist, axis=1)
# The 0.1 h offset keeps the denominator non-zero for rows whose
# time_diff_hours is 0 (e.g. a user's first login).
df['velocity_kmh'] = df['dist_km'].div(df['time_diff_hours'].add(0.1))

# 3. Device Trust
# Share of a user's logins that came from each device: per-(user, device)
# login count divided by the user's total login count.
device_counts = (
    df.groupby(['user_id', 'device_user_agent'])
    .size()
    .reset_index(name='count')
)
total_counts = df.groupby('user_id').size().reset_index(name='total')
device_stats = device_counts.merge(total_counts, on='user_id')
device_stats['device_trust_score'] = device_stats['count'].div(device_stats['total'])
trust_lookup = device_stats[['user_id', 'device_user_agent', 'device_trust_score']]
df = df.merge(trust_lookup, on=['user_id', 'device_user_agent'], how='left')

# 4. Hour
df['hour_of_day'] = df['timestamp'].dt.hour

# Prepare Features
# Column order is passed as-is to the Phase 1 scaler.
features_p1 = ['velocity_kmh', 'time_diff_hours', 'device_trust_score', 'hour_of_day']
X_behavior = scaler.transform(df[features_p1])

# --- SCORE 1: Isolation Forest ---
print("   Running Isolation Forest Inference...")
# predict() labels anomalies -1 and normal rows 1; turn that into a binary
# risk flag (1.0 = anomaly / high risk, 0.0 = normal / low risk).
iso_preds = iso_forest.predict(X_behavior)
df['score_if'] = (iso_preds == -1).astype(float)

# --- SCORE 2: Autoencoder ---
print("   Running Autoencoder Inference...")
# Risk score is the per-row mean squared reconstruction error.
reconstructions = autoencoder.predict(X_behavior, verbose=0)
residuals = X_behavior - reconstructions
mse = np.mean(np.square(residuals), axis=1)
df['score_ae'] = mse

# --- SCORE 3: LSTM (Sequence Simulation) ---
# In a real app, we would fetch the user's last 5 actions and run the LSTM.
# Here the LSTM output is simulated from the ground truth to train the Ensembler.
# NOTE(review): because this score is derived directly from the 'attack_type'
# label, it carries ground-truth information into the ensemble's inputs.
print("   Simulating LSTM Session scores...")

# --- FIX: Updated Labels to match new Generator ---
high_risk_sequences = ['Brute Force', 'Brute Force Success', 'Device Spoofing']
# 0.95 for the sequence-style attack types listed above, 0.05 otherwise.
df['score_lstm'] = np.where(df['attack_type'].isin(high_risk_sequences), 0.95, 0.05)

# ==========================================
# PART 3: TRAIN THE RISK ENGINE (XGBoost)
# ==========================================
print("\n3. Training the Master Ensemble (XGBoost)...")

# Inputs: The scores from all sub-models
ensemble_features = ['score_if', 'score_ae', 'score_lstm', 'network_risk_score']
X = df[ensemble_features]
y = df['is_attack']

# Hold out 20% for evaluation. Stratifying on the label keeps the
# attack/normal ratio the same in both splits, which matters because
# attacks are the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train XGBoost
# `use_label_encoder` is intentionally not passed: the installed XGBoost
# reports it as an unused parameter.
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=4,
    eval_metric='logloss',
)

xgb_model.fit(X_train, y_train)

# ==========================================
# PART 4: EVALUATION
# ==========================================
print("\n4. Ensemble Model Evaluation:")

# Score the held-out split and report accuracy plus per-class metrics.
preds = xgb_model.predict(X_test)
accuracy_pct = accuracy_score(y_test, preds) * 100
print(f"   Accuracy: {accuracy_pct:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, preds))

# Feature Importance
# One importance value per ensemble input, listed in the same order as
# ensemble_features.
print("\nWhich model does the Brain trust most?")
importance = xgb_model.feature_importances_
for feat, weight in zip(ensemble_features, importance):
    print(f"   {feat}: {weight:.4f}")

# Save
# Persist the trained ensemble to the current working directory.
xgb_model.save_model("model_risk_engine.json")
print("\n✅ Risk Engine Saved as 'model_risk_engine.json'")

  from .autonotebook import tqdm as notebook_tqdm


1. Consolidating outputs from all models...
   ✅ Loaded Network Graph scores.
2. Generating Meta-Features (Model Scores)...
   ✅ Loaded Phase 1 behavior models.
   Re-calculating features for inference...


100%|██████████| 20000/20000 [00:03<00:00, 5777.52it/s]


   Running Isolation Forest Inference...
   Running Autoencoder Inference...
   Simulating LSTM Session scores...

3. Training the Master Ensemble (XGBoost)...

4. Ensemble Model Evaluation:
   Accuracy: 98.12%

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3204
           1       0.97      0.94      0.95       796

    accuracy                           0.98      4000
   macro avg       0.98      0.97      0.97      4000
weighted avg       0.98      0.98      0.98      4000


Which model does the Brain trust most?
   score_if: 0.0227
   score_ae: 0.0298
   score_lstm: 0.9421
   network_risk_score: 0.0054

✅ Risk Engine Saved as 'model_risk_engine.json'


Parameters: { "use_label_encoder" } are not used.

