In [None]:
import pandas as pd
import numpy as np
import pickle
import joblib
from pathlib import Path
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
from sklearn.model_selection import ParameterGrid
import warnings
warnings.filterwarnings('ignore')

class AnomalyDetector:
    def __init__(self):
        self.trained_models = {}
        self.best_model_info = None
        self.scaler = None
        self.label_encoders = {}
        self.feature_columns = None
        self.is_fitted = False
        
    def preprocess_data(self, data, is_training=True):
        """Preprocess data for training or prediction"""
        data = data.copy()
        
        # Handle missing values
        data = data.fillna(-1)
        
        # Convert data types
        def convert_float_int_or_numeric_placeholder(val):
            try:
                f_val = float(val)
                if f_val.is_integer():
                    return int(f_val)
                return f_val
            except ValueError:
                return str(val)
        
        data = data.applymap(convert_float_int_or_numeric_placeholder)
        
        # Drop non-useful columns
        cols_to_drop = ['Time', 'ip_source', 'AMF_UE_NGAP_ID','EPD_2', 'spare_2', 'SecHdr_2', 'NASSecAlgo', 
                           'EPD_3', 'spare_3', 'SecHdr_3', 'Type_2',
                           'PayloadContainerType', 'PayloadContainer', 'NAS_KSI', '5GSRegType',
                            'RES',  'DeregistrationType' ]
        
        data_clean = data.drop(columns=cols_to_drop, errors='ignore')

         # Store feature columns during training
        if is_training:
            self.feature_columns = data_clean.columns.tolist()
        else:
            # Ensure test data has same columns as training data
            missing_cols = set(self.feature_columns) - set(data_clean.columns)
            extra_cols = set(data_clean.columns) - set(self.feature_columns)
            
            if missing_cols:
                print(f"Warning: Missing columns in test data: {missing_cols}")
                for col in missing_cols:
                    data_clean[col] = -1
            
            if extra_cols:
                print(f"Warning: Extra columns in test data (will be dropped): {extra_cols}")
                data_clean = data_clean.drop(columns=list(extra_cols))
            
            # Reorder columns to match training order
            data_clean = data_clean[self.feature_columns]
        
        # Encode categorical variables
        categorical_cols = data_clean.select_dtypes(include='object').columns
        
        for col in categorical_cols:
            if is_training:
                le = LabelEncoder()
                data_clean[col] = le.fit_transform(data_clean[col].astype(str))
                self.label_encoders[col] = le
            else:
                if col in self.label_encoders:
                    le = self.label_encoders[col]
                    def map_unknown(x):
                        x_str = str(x)
                        if x_str in le.classes_:
                            return le.transform([x_str])[0]
                        else:
                            return -1
                    data_clean[col] = data_clean[col].map(map_unknown)
                else:
                    data_clean[col] = -1
        
        # Scale features
        if is_training:
            self.scaler = RobustScaler()
            X_scaled = self.scaler.fit_transform(data_clean)
        else:
            if self.scaler is None:
                raise ValueError("Model must be fitted before prediction")
            X_scaled = self.scaler.transform(data_clean)
        
        return X_scaled, data_clean
    
    def calculate_anomaly_quality_score(self, anomaly_indices, X_data):
        """Calculate quality score for detected anomalies"""
        if len(anomaly_indices) == 0:
            return 0
        
        normal_indices = [i for i in range(len(X_data)) if i not in anomaly_indices]
        if len(normal_indices) == 0:
            return 0
        
        anomaly_data = X_data[anomaly_indices]
        normal_data = X_data[normal_indices]
        
        # Statistical separation
        anomaly_mean = np.mean(anomaly_data, axis=0)
        normal_mean = np.mean(normal_data, axis=0)
        separation_score = np.linalg.norm(anomaly_mean - normal_mean)
        
        # Anomaly cohesion
        if len(anomaly_indices) > 1:
            anomaly_variance = np.mean(np.var(anomaly_data, axis=0))
            cohesion_score = 1 / (1 + anomaly_variance)
        else:
            cohesion_score = 1
        
        # Rarity score
        rarity_score = 1 / (1 + len(anomaly_indices) / len(X_data))
        
        # Feature diversity
        feature_differences = np.abs(anomaly_mean - normal_mean)
        diversity_score = np.sum(feature_differences > np.std(feature_differences))
        
        total_score = (
            0.3 * separation_score + 
            0.2 * cohesion_score + 
            0.3 * rarity_score + 
            0.2 * diversity_score
        )
        
        return total_score

    def get_parameter_grids(self, n_samples, n_features):
        """Get adaptive parameter grids based on data characteristics"""
        return {
            'lof': {
                'n_neighbors': [max(10, n_samples//100), max(20, n_samples//50), max(30, n_samples//30)],
                'contamination': [0.001, 0.005, 0.01, 0.02, 0.05]
            },
            'isolation_forest': {
                'n_estimators': [100, 200],
                'contamination': [0.001, 0.005, 0.01, 0.02, 0.05],
                'max_features': [1.0, 0.8, min(n_features, 10)/n_features]
            }
        }

    def train_isolation_forest(self, X_train, params):
        """Train and evaluate Isolation Forest models"""
        results = []
        print(f"   Training {len(list(ParameterGrid(params)))} Isolation Forest configurations...")
        
        for i, param_set in enumerate(ParameterGrid(params)):
            try:
                model = IsolationForest(
                    n_estimators=param_set['n_estimators'],
                    contamination=param_set['contamination'],
                    max_features=param_set['max_features'],
                    random_state=42
                )
                model.fit(X_train)
                
                # Evaluate on training data to get quality score
                train_predictions = model.predict(X_train)
                anomaly_indices = np.where(train_predictions == -1)[0]
                
                if len(anomaly_indices) > 0:
                    quality_score = self.calculate_anomaly_quality_score(anomaly_indices, X_train)
                    results.append({
                        'method': 'IsolationForest',
                        'model': model,
                        'params': param_set,
                        'quality_score': quality_score,
                        'train_anomalies': len(anomaly_indices)
                    })
                    print(f"     Config {i+1}: Found {len(anomaly_indices)} anomalies, score: {quality_score:.4f}")
                else:
                    print(f"     Config {i+1}: No anomalies detected")
                    
            except Exception as e:
                print(f"     Config {i+1}: Error - {str(e)}")
                continue

        print(f"   Successfully trained {len(results)} Isolation Forest models")
        return results
    
    def train_lof(self, X_train, params):
        """Train and evaluate LOF models"""
        results = []
        print(f"   Training {len(list(ParameterGrid(params)))} LOF configurations...")
        
        for i, param_set in enumerate(ParameterGrid(params)):
            try:
                # Check if we have enough samples for the n_neighbors parameter
                if param_set['n_neighbors'] >= X_train.shape[0]:
                    print(f"     Config {i+1}: n_neighbors ({param_set['n_neighbors']}) >= n_samples ({X_train.shape[0]})")
                    continue
                
                model = LocalOutlierFactor(
                    n_neighbors=param_set['n_neighbors'],
                    contamination=param_set['contamination'],
                    novelty=True  # Important for prediction on new data
                )
                model.fit(X_train)
                
                # Use decision function for evaluation (more reliable than outlier factors)
                decision_scores = model.decision_function(X_train)
                threshold = np.percentile(decision_scores, param_set['contamination'] * 100)
                anomaly_indices = np.where(decision_scores < threshold)[0]
                
                if len(anomaly_indices) > 0:
                    quality_score = self.calculate_anomaly_quality_score(anomaly_indices, X_train)
                    results.append({
                        'method': 'LOF',
                        'model': model,
                        'params': param_set,
                        'quality_score': quality_score,
                        'train_anomalies': len(anomaly_indices)
                    })
                    print(f"     Config {i+1}: Found {len(anomaly_indices)} anomalies, score: {quality_score:.4f}")
                else:
                    print(f"     Config {i+1}: No anomalies detected")
                    
            except Exception as e:
                print(f"     Config {i+1}: Error - {str(e)}")
                continue
        
        print(f"   Successfully trained {len(results)} LOF models")
        return results
    
    def print_anomaly_rows(self, anomaly_indices, X_data):
        if len(anomaly_indices) == 0:
            print("No anomalies detected.")
            return
        
        print(f"Anomalies detected at rows: {anomaly_indices}")
        for idx in anomaly_indices:
            print(f"Row {idx}: {X_data[idx]}")
    
    def detect_and_print_best_model_anomalies(self, results, X_scaled, X_clean): 
        if not results:
            print("No models to evaluate.")
            return

        # First, evaluate all models on test data to see which ones actually detect anomalies
        model_evaluations = []
        
        print(" Evaluating all models on test data...")
        for result in results:
            try:
                model = result['model']
                method = result['method']
                params = result['params']
                train_quality = result['quality_score']
                
                # Test on actual test data
                predictions = model.predict(X_scaled)
                anomaly_indices = np.where(predictions == -1)[0]
                test_anomaly_count = len(anomaly_indices)
                
                # Calculate test quality score if anomalies found
                test_quality_score = 0
                if test_anomaly_count > 0:
                    test_quality_score = self.calculate_anomaly_quality_score(anomaly_indices, X_scaled)
                
                model_evaluations.append({
                    'method': method,
                    'model': model,
                    'params': params,
                    'train_quality': train_quality,
                    'test_anomaly_count': test_anomaly_count,
                    'test_quality_score': test_quality_score,
                    'test_anomaly_indices': anomaly_indices
                })
                
                # print(f"   {method}: {test_anomaly_count} anomalies, test quality: {test_quality_score:.4f}")
                
            except Exception as e:
                print(f"   {result['method']}: Error - {str(e)}")
                continue
        
        # Filter models that actually detected anomalies
        detecting_models = [m for m in model_evaluations if m['test_anomaly_count'] > 0]
        
        if not detecting_models:
            print("\n No models detected any anomalies on test data!")
            print("This could indicate:")
            print("   - The test data contains no anomalies")
            print("   - The contamination parameters are too strict")
            print("   - The models need different parameter tuning")
            return
        
        # Choose best model among those that actually detected anomalies
        # Primary criterion: test quality score, secondary: fewer but higher quality anomalies
        chosen_model = max(detecting_models, key=lambda x: (x['test_quality_score'], -x['test_anomaly_count']/1000))
        
        print(f"\nBest performing model: {chosen_model['method']}")
        print(f"   - Test anomalies found: {chosen_model['test_anomaly_count']}")
        print(f"   - Test quality score: {chosen_model['test_quality_score']:.4f}")
        print(f"   - Training quality score: {chosen_model['train_quality']:.4f}")
        
        # Show the anomalies from the best model
        anomaly_indices = chosen_model['test_anomaly_indices']
        print(f"\n ANOMALIES DETECTED BY BEST MODEL ({chosen_model['method']}):")
        print(f"Anomaly indices: {list(anomaly_indices)}")
        print("\nAnomalous rows:")
        print(X_clean.iloc[anomaly_indices])
        
        # Show comparison with other detecting models (group by method to avoid duplicates)
        other_detecting = [m for m in detecting_models if m['method'] != chosen_model['method']]
        if other_detecting:
            print(f"\n" + "="*50)
            print("COMPARISON WITH OTHER DETECTING MODELS:")
            print("="*50)
            
            # Group by method to avoid showing duplicates
            methods_shown = set()
            for model_eval in other_detecting:
                method_name = model_eval['method']
                if method_name in methods_shown:
                    continue
                methods_shown.add(method_name)
                
                print(f"\n🔍 {method_name}:")
                print(f"   - Anomalies: {model_eval['test_anomaly_count']}")
                print(f"   - Quality score: {model_eval['test_quality_score']:.4f}")
                print(f"   - Indices: {list(model_eval['test_anomaly_indices'])}")
                
                # Show if there are different anomalies
                if not np.array_equal(model_eval['test_anomaly_indices'], anomaly_indices):
                    print("   - Different anomalies detected:")
                    print(X_clean.iloc[model_eval['test_anomaly_indices']])
                else:
                    print("   - Same anomalies as best model")
        
        # Show models that didn't detect anything (group by method to avoid duplicates)
        non_detecting = [m for m in model_evaluations if m['test_anomaly_count'] == 0]
        if non_detecting:
            print(f"\n📋 Models that found no anomalies: {len(non_detecting)}")
            methods_shown = set()
            count_shown = 0
            for m in non_detecting:
                if m['method'] not in methods_shown and count_shown < 3:
                    methods_shown.add(m['method'])
                    print(f"   - {m['method']} (train quality: {m['train_quality']:.4f})")
                    count_shown += 1
            remaining = len([m for m in non_detecting if m['method'] not in methods_shown])
            if remaining > 0:
                print(f"   - ... and {remaining} other configurations")



In [155]:
if __name__ == "__main__":
    print("Phase 1: Training")
    train_data = pd.read_csv("replaydata.csv", dtype=str, sep=";")
    detector = AnomalyDetector()

    X_train, X_test = train_test_split(train_data, test_size=0.3, random_state=42)

    # Preprocess training data first - this sets feature_columns, scaler, label encoders
    X_train_scaled, X_train_clean = detector.preprocess_data(X_train, is_training=True)

    # Now preprocess test data
    X_test_scaled, X_test_clean = detector.preprocess_data(X_test, is_training=False)


    # Get parameter grids based on training data shape
    param_grids = detector.get_parameter_grids(n_samples=X_train_scaled.shape[0], n_features=X_train_scaled.shape[1])

    # Train models
    isolation_results = detector.train_isolation_forest(X_train_scaled, param_grids['isolation_forest'])
    lof_results = detector.train_lof(X_train_scaled, param_grids['lof'])

Phase 1: Training
   Training 30 Isolation Forest configurations...
     Config 1: Found 1 anomalies, score: 16.2429
     Config 2: Found 1 anomalies, score: 16.2429
     Config 3: Found 1 anomalies, score: 3.3592
     Config 4: Found 1 anomalies, score: 3.3592
     Config 5: Found 1 anomalies, score: 16.2429
     Config 6: Found 1 anomalies, score: 16.2429
     Config 7: Found 2 anomalies, score: 9.5967
     Config 8: Found 2 anomalies, score: 9.5967
     Config 9: Found 2 anomalies, score: 9.5967
     Config 10: Found 2 anomalies, score: 9.5967
     Config 11: Found 2 anomalies, score: 9.5967
     Config 12: Found 2 anomalies, score: 9.5967
     Config 13: Found 2 anomalies, score: 9.5967
     Config 14: Found 2 anomalies, score: 9.5967
     Config 15: Found 2 anomalies, score: 9.5967
     Config 16: Found 2 anomalies, score: 9.5967
     Config 17: Found 2 anomalies, score: 9.5967
     Config 18: Found 2 anomalies, score: 9.5967
     Config 19: Found 2 anomalies, score: 9.5967
     C

In [156]:

if __name__ == "__main__":
    print("="*60)
    print("ANOMALY DETECTION SYSTEM")
    print("="*60)
    
    print("\nPhase 1: Loading and Splitting Data")
    print("-" * 40)
    
    try:
        train_data = pd.read_csv("replaydata.csv", dtype=str, sep=";")
        print(f" Data loaded successfully: {train_data.shape[0]} rows, {train_data.shape[1]} columns")
        
        detector = AnomalyDetector()
        X_train, X_test = train_test_split(train_data, test_size=0.3, random_state=42)
        print(f" Data split: {X_train.shape[0]} training samples, {X_test.shape[0]} test samples")
        
    except FileNotFoundError:
        print(" Error: 'replaydata.csv' not found in current directory")
        exit(1)
    except Exception as e:
        print(f" Error loading data: {e}")
        exit(1)

    

ANOMALY DETECTION SYSTEM

Phase 1: Loading and Splitting Data
----------------------------------------
 Data loaded successfully: 910 rows, 25 columns
 Data split: 637 training samples, 273 test samples


In [157]:
print("\nPhase 2: Data Preprocessing")
print("-" * 40)
    
try:
    # Preprocess training data first - this sets feature_columns, scaler, label encoders
    X_train_scaled, X_train_clean = detector.preprocess_data(X_train, is_training=True)
    print(f" Training data preprocessed: {X_train_scaled.shape[1]} features")
    
    # Now preprocess test data
    X_test_scaled, X_test_clean = detector.preprocess_data(X_test, is_training=False)
    print(f" Test data preprocessed: {X_test_scaled.shape[1]} features")
    
except Exception as e:
    print(f" Error during preprocessing: {e}")
    exit(1)





Phase 2: Data Preprocessing
----------------------------------------
 Training data preprocessed: 8 features
 Test data preprocessed: 8 features


In [158]:
print("\nPhase 4: Anomaly Detection and Evaluation")
print("-" * 40)

try:
    # Combine all results and detect anomalies
    all_results = isolation_results + lof_results
    
    if all_results:
        # print(f" Total trained models: {len(all_results)}")
        detector.detect_and_print_best_model_anomalies(all_results, X_test_scaled, X_test_clean)
    else:
        print(" No models were successfully trained")
        
except Exception as e:
    print(f" Error during anomaly detection: {e}")
    exit(1)

print("\n" + "="*60)
print("ANALYSIS COMPLETE")
print("="*60)


Phase 4: Anomaly Detection and Evaluation
----------------------------------------
 Evaluating all models on test data...

Best performing model: IsolationForest
   - Test anomalies found: 2
   - Test quality score: 16.3516
   - Training quality score: 9.5967

 ANOMALIES DETECTED BY BEST MODEL (IsolationForest):
Anomaly indices: [np.int64(8), np.int64(148)]

Anomalous rows:
     procedureCode  EPD  spare  SecHdr  Type  5GSID  UESecCap  Seqn
905             46  126      0       4    94     54         2     0
904             46  126      0       4    94     54         2     0

COMPARISON WITH OTHER DETECTING MODELS:

🔍 LOF:
   - Anomalies: 2
   - Quality score: 16.3516
   - Indices: [np.int64(8), np.int64(148)]
   - Same anomalies as best model

📋 Models that found no anomalies: 6
   - IsolationForest (train quality: 16.2429)

ANALYSIS COMPLETE
