In [None]:
# Install required packages
!pip install dask[complete] scikit-learn joblib



In [None]:

# Import all required libraries
import os
import re
import pandas as pd
import numpy as np
import dask.bag as db
import dask.dataframe as dd
from datetime import datetime
import logging
import gzip
from glob import glob
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import joblib
import warnings
warnings.filterwarnings('ignore')

print("✅ All dependencies installed and imported successfully!")

✅ All dependencies installed and imported successfully!


In [None]:
def setup_logging(log_file='/content/vulnerability_detection.log'):
    """Configure root logging for the pipeline and return this module's logger.

    Records at INFO and above go to the console and, when the file can be
    opened, to ``log_file`` as well.

    Args:
        log_file: Path of the log file. Defaults to the location the notebook
            has always used.

    Returns:
        logging.Logger: The logger named after the current module.
    """
    log_format = '%(asctime)s - %(levelname)s - %(funcName)s - %(message)s'

    handlers = [logging.StreamHandler()]
    file_error = None
    try:
        # UTF-8 so the emoji used in the pipeline's messages are always writable.
        handlers.insert(0, logging.FileHandler(log_file, encoding='utf-8'))
    except OSError as exc:
        # Typically the target directory does not exist (e.g. outside Colab);
        # keep console logging instead of aborting the whole notebook.
        file_error = exc

    # basicConfig() silently does nothing when the root logger already has
    # handlers, which is common in hosted notebooks. force=True (Python 3.8+)
    # removes the existing handlers so this configuration really takes effect.
    logging.basicConfig(
        level=logging.INFO,
        format=log_format,
        handlers=handlers,
        force=True
    )

    module_logger = logging.getLogger(__name__)
    if file_error is not None:
        module_logger.warning(f"⚠️ Could not open log file {log_file}: {file_error}; logging to console only")
    return module_logger

# Create the module-level `logger` that the functions in the following cells
# use for their progress, warning and error messages.
logger = setup_logging()
logger.info("🔧 Logging system initialized")

In [None]:
def mount_drive():
    """Mount Google Drive at /content/drive.

    Returns:
        bool: True when the mount call completed, False when importing
        google.colab or mounting raised. The failure is logged, not re-raised.
    """
    mounted = False
    try:
        # Imported lazily: google.colab only exists inside a Colab runtime.
        from google.colab import drive
        drive.mount('/content/drive')
        logger.info("✅ Google Drive mounted successfully")
        mounted = True
    except Exception as exc:
        logger.error(f"❌ Failed to mount Google Drive: {exc}")
    return mounted

# Mount Google Drive and report the outcome
print("✅ Google Drive is ready!" if mount_drive() else "❌ Please check your Google Drive connection")

Mounted at /content/drive
✅ Google Drive is ready!


In [None]:
# NOTE: this cell redefines mount_drive() with the same behaviour as the
# definition in the previous cell; the later definition is the one in effect.
def mount_drive():
    """Mount Google Drive at /content/drive.

    Returns:
        bool: True when the mount call completed, False when importing
        google.colab or mounting raised. The failure is logged, not re-raised.
    """
    mounted = False
    try:
        # Imported lazily: google.colab only exists inside a Colab runtime.
        from google.colab import drive
        drive.mount('/content/drive')
        logger.info("✅ Google Drive mounted successfully")
        mounted = True
    except Exception as exc:
        logger.error(f"❌ Failed to mount Google Drive: {exc}")
    return mounted

# Mount Google Drive and report the outcome
print("✅ Google Drive is ready!" if mount_drive() else "❌ Please check your Google Drive connection")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Google Drive is ready!


In [None]:
def verify_directory(log_dir):
    """Check that `log_dir` exists and holds at least one log-like file.

    A file counts as a log file when its name ends in .log, .gz or .txt.

    Args:
        log_dir: Path of the directory to inspect.

    Returns:
        bool: True when the directory exists and contains at least one
        matching file; False otherwise, including when any error occurs
        (errors are logged, not raised).
    """
    log_suffixes = ('.log', '.gz', '.txt')
    try:
        if not os.path.exists(log_dir):
            logger.error(f"❌ Directory not found: {log_dir}")
            return False

        entries = os.listdir(log_dir)
        matching = [name for name in entries if name.endswith(log_suffixes)]

        logger.info(f"📁 Directory: {log_dir}")
        logger.info(f"📊 Total files: {len(entries)}, Log files: {len(matching)}")

        if not matching:
            logger.warning("⚠️ No log files found in directory")
            return False

        # Show a handful of names so the user can confirm it is the right folder.
        preview = matching[:5]
        logger.info(f"📄 Sample files: {preview}")
        return True
    except Exception as err:
        logger.error(f"❌ Error verifying directory: {err}")
        return False

def get_log_files(log_dir):
    """Return the full paths of the log-like files in `log_dir`, sorted by path.

    A file counts as a log file when its name ends in .log, .gz or .txt.
    As a side effect, the total data size is estimated from the first ten
    files and logged.

    Args:
        log_dir: Directory to scan (not recursive).

    Returns:
        list[str]: Sorted full paths; an empty list when the directory cannot
        be listed (the error is logged, not raised).
    """
    log_suffixes = ('.log', '.gz', '.txt')
    try:
        # os.listdir() returns entries in arbitrary order; sorting makes the
        # processing order (and therefore every run) reproducible.
        full_paths = sorted(
            os.path.join(log_dir, name)
            for name in os.listdir(log_dir)
            if name.endswith(log_suffixes)
        )

        logger.info(f"🔍 Found {len(full_paths)} log files for processing")

        # Estimate the total size from a small sample instead of stat-ing every file.
        sampled_sizes = []
        for path in full_paths[:10]:
            try:
                sampled_sizes.append(os.path.getsize(path))
            except OSError as size_err:
                # Unreadable/vanished file: leave it out of the average.
                logger.debug(f"Could not read size of {path}: {size_err}")

        avg_size = sum(sampled_sizes) / len(sampled_sizes) if sampled_sizes else 0
        estimated_total = (avg_size * len(full_paths)) / (1024*1024)  # MB
        logger.info(f"📏 Estimated total data size: {estimated_total:.2f} MB")

        return full_paths
    except Exception as e:
        logger.error(f"❌ Error getting log files: {e}")
        return []

# Verify your log directory (update the path as needed)
log_dir = '/content/drive/MyDrive/self_logs'
if verify_directory(log_dir):
    file_paths = get_log_files(log_dir)
    print(f"✅ Found {len(file_paths)} log files to process")
else:
    # Keep `file_paths` defined so the following cells report "nothing to
    # parse" instead of failing with a NameError.
    file_paths = []
    print("❌ Please check your log directory path")

✅ Found 2741 log files to process


In [None]:
def parse_log_line(line):
    """Split one access-log line into a fixed 9-field tuple.

    The line is stripped and matched (anchored at its start) against three
    layouts, from most to least detailed: Combined Log Format, Common Log
    Format, and a loose fallback. The first layout that matches wins.

    Args:
        line: A single raw log line.

    Returns:
        tuple | None: (ip, timestamp, method, url, protocol, status, bytes,
        referer, user_agent) as strings, with '' for fields the matching
        layout does not capture; None when no layout matches.
    """
    field_count = 9
    layouts = (
        # Combined Log Format: ... status bytes "referer" "user agent"
        r'(\S+) \S+ \S+ \[(.*?)\] "(\S+) (.*?) (\S+)" (\d+) (\d+|-) "(.*?)" "(.*?)"',
        # Common Log Format: no referer / user agent
        r'(\S+) \S+ \S+ \[(.*?)\] "(\S+) (.*?) (\S+)" (\d+) (\d+|-)',
        # Loose fallback: ip, bracketed timestamp, quoted request, status
        r'(\S+).*?\[(.*?)\].*?"(\S+) (.*?) (\S+)" (\d+)',
    )

    text = line.strip()
    for layout in layouts:
        hit = re.match(layout, text)
        if hit is None:
            continue
        captured = hit.groups()
        # Right-pad with empty strings, then cut to exactly nine fields.
        padding = ('',) * (field_count - len(captured))
        return (captured + padding)[:field_count]
    return None

def process_log_file(file_path):
    """Parse one (optionally gzip-compressed) log file into a Dask bag.

    The file is streamed line by line, so only the successfully parsed tuples
    are held in memory, never the whole raw file. Undecodable bytes are
    dropped (errors='ignore').

    Args:
        file_path: Path of the log file; names ending in '.gz' are read
            through gzip.

    Returns:
        dask.bag.Bag: A single-partition bag of the 9-field tuples produced by
        parse_log_line. An empty bag is returned when the file cannot be read
        (the error is logged, not raised).
    """
    try:
        logger.info(f"📖 Processing: {os.path.basename(file_path)}")

        # Handle different file types
        opener = gzip.open if file_path.endswith('.gz') else open

        parsed_lines = []
        total_lines = 0
        with opener(file_path, 'rt', encoding='utf-8', errors='ignore') as f:
            for line in f:
                total_lines += 1
                parsed = parse_log_line(line)
                if parsed:
                    parsed_lines.append(parsed)

        logger.info(f"📝 File contains {total_lines} lines")

        success_rate = len(parsed_lines) / total_lines * 100 if total_lines else 0
        logger.info(f"✅ Parsed {len(parsed_lines)}/{total_lines} lines ({success_rate:.1f}% success)")

        # One partition per file: from_sequence's default would split every
        # file into up to ~100 partitions, which only adds scheduler overhead
        # once thousands of files are concatenated.
        return db.from_sequence(parsed_lines, partition_size=max(1, len(parsed_lines)))

    except Exception as e:
        logger.error(f"❌ Error processing {file_path}: {e}")
        return db.from_sequence([])

print("✅ Log parsing functions defined")

✅ Log parsing functions defined


In [None]:
def parse_all_logs(file_paths):
    """Parse every log file and combine the rows into one pandas DataFrame.

    Args:
        file_paths: Sequence of log file paths (see process_log_file).

    Returns:
        pandas.DataFrame: One row per parsed log line with the string columns
        ip, timestamp, method, url, protocol, status, bytes, referer and
        user_agent, and a fresh 0..n-1 index. An empty DataFrame is returned
        when there is nothing to parse or when an error occurs (the error is
        logged, not raised).
    """
    columns = ['ip', 'timestamp', 'method', 'url', 'protocol', 'status', 'bytes', 'referer', 'user_agent']
    try:
        if not file_paths:
            # Without this guard the batch size would be 0 and range() would raise.
            logger.warning("⚠️ No log files supplied; nothing to parse")
            return pd.DataFrame()

        logger.info(f"🚀 Starting to parse {len(file_paths)} log files...")
        start_time = datetime.now()

        # Files are handled in groups of 50 purely for progress reporting;
        # every parsed bag is retained until the final concat.
        batch_size = 50
        total_batches = (len(file_paths) - 1) // batch_size + 1
        all_bags = []

        for batch_no, start in enumerate(range(0, len(file_paths), batch_size), start=1):
            logger.info(f"📦 Processing batch {batch_no}/{total_batches}")
            all_bags.extend(process_log_file(path) for path in file_paths[start:start + batch_size])

        # Combine all bags
        logger.info("🔄 Combining all parsed data...")
        combined_bag = db.concat(all_bags)

        # Pass the schema explicitly: without `meta`, Dask infers it from the
        # first partition and raises if that partition happens to be empty
        # (e.g. the first file contained no parsable line).
        meta = pd.DataFrame(columns=columns, dtype=object)
        combined_ddf = combined_bag.to_dataframe(meta=meta)

        # Compute to pandas; partitions each carry their own 0-based index, so
        # rebuild a unique index for the combined frame.
        logger.info("💾 Converting to pandas DataFrame...")
        combined_df = combined_ddf.compute().reset_index(drop=True)

        processing_time = datetime.now() - start_time
        logger.info(f"✅ Successfully parsed {len(combined_df)} log entries in {processing_time}")

        return combined_df

    except Exception as e:
        logger.error(f"❌ Error in parse_all_logs: {e}")
        return pd.DataFrame()

# Parse all log files (this may take several minutes)
print("🚀 Starting log parsing process...")
combined_df = parse_all_logs(file_paths)

if combined_df.empty:
    print("❌ Failed to parse logs")
else:
    print(f"✅ Successfully parsed {len(combined_df)} log entries")
    # Persist the raw parsed rows so later steps can be re-run without re-parsing
    combined_df.to_parquet('/content/drive/My Drive/parsed_logs.parquet', index=False)
    print("💾 Raw data saved to parsed_logs.parquet")

    # Preview the first rows
    print("\n📊 Sample of parsed data:")
    print(combined_df.head())

🚀 Starting log parsing process...
✅ Successfully parsed 4296797 log entries
💾 Raw data saved to parsed_logs.parquet

📊 Sample of parsed data:
               ip                   timestamp method  \
0  34.206.152.150  03/Jul/2022:02:32:54 -0700   HEAD   
1  34.206.152.150  03/Jul/2022:02:32:54 -0700   HEAD   
2    85.208.98.17  03/Jul/2022:02:39:07 -0700    GET   
0    85.208.98.17  03/Jul/2022:02:39:10 -0700    GET   
1   51.222.253.17  03/Jul/2022:02:39:28 -0700    GET   

                                               url  protocol status  bytes  \
0                                                /  HTTP/1.1    301    186   
1                                                /  HTTP/1.1    200    359   
2                                                /  HTTP/1.1    301    419   
0                                                /  HTTP/1.1    200  13142   
1  /Datasets%20Description/HTML_Bro_log_2/?C=M;O=D  HTTP/1.1    200    788   

                referer                             

In [None]:
def _parse_log_timestamps(raw_timestamps, formats):
    """Parse timestamp strings, trying each strptime format on the still-unparsed rows.

    Args:
        raw_timestamps: pandas Series of raw timestamp values with a unique index.
        formats: strptime format strings, tried in the given order.

    Returns:
        tuple: (parsed, matched_formats). `parsed` is a timezone-naive datetime
        Series expressed in UTC, NaT where no format matched; `matched_formats`
        lists the formats that parsed at least one row.
    """
    parsed = None
    matched_formats = []
    for fmt in formats:
        if parsed is None:
            pending = raw_timestamps
        else:
            unresolved = parsed.isna()
            if not unresolved.any():
                break
            pending = raw_timestamps[unresolved]

        # utc=True converts every UTC offset to one common timezone so the
        # result has a single datetime dtype.
        attempt = pd.to_datetime(pending, format=fmt, errors='coerce', utc=True)
        if attempt.notna().any():
            matched_formats.append(fmt)
        parsed = attempt if parsed is None else parsed.fillna(attempt)

    if parsed is None:  # no formats supplied
        parsed = pd.to_datetime(pd.Series(pd.NaT, index=raw_timestamps.index), utc=True)
    return parsed.dt.tz_localize(None), matched_formats


def extract_vulnerability_features(df):
    """Build one row of vulnerability-oriented features per client IP.

    Each request is flagged (CMS probing, sensitive-file access, attack
    patterns, error statuses, bot / empty / outdated user agents); the flags
    are then aggregated per IP together with request counts, byte totals and
    the first/last time the IP was seen.

    Args:
        df: DataFrame of parsed log lines with at least the columns ip, url,
            status and bytes. timestamp, method and user_agent are used when
            present. The input frame is not modified.

    Returns:
        pandas.DataFrame: One row per IP with, among others, request_count,
        unique_urls, count_404, cms_probes, sensitive_files, attack_patterns,
        error_rate, 404_rate, diversity_score, first_seen, last_seen (naive
        UTC), session_duration (hours), requests_per_hour and
        vulnerability_score. An empty DataFrame is returned when extraction
        fails (the error and traceback are logged).
    """
    try:
        logger.info("🔍 Extracting vulnerability-focused features...")
        start_time = datetime.now()

        # Basic cleaning. reset_index returns a new frame (the caller's data is
        # untouched) and gives rows a unique index, which the frame coming out
        # of the Dask partitions does not have.
        df = df.reset_index(drop=True)
        df['status'] = pd.to_numeric(df['status'], errors='coerce').fillna(0).astype(int)
        df['bytes'] = pd.to_numeric(df['bytes'].replace('-', '0'), errors='coerce').fillna(0)

        # Parse timestamp with better error handling
        logger.info("📅 Parsing timestamps...")

        # Try multiple timestamp formats
        timestamp_formats = [
            '%d/%b/%Y:%H:%M:%S %z',  # Standard Apache format with timezone
            '%d/%b/%Y:%H:%M:%S',     # Without timezone
            '%Y-%m-%d %H:%M:%S',     # Standard format
            '%Y/%m/%d %H:%M:%S'      # Alternative format
        ]

        # Parse from the ORIGINAL strings; the column must not be blanked
        # before parsing or every format fails and real times are lost.
        if 'timestamp' in df.columns:
            raw_timestamps = df['timestamp']
        else:
            raw_timestamps = pd.Series(None, index=df.index, dtype=object)
        df['timestamp'], matched_formats = _parse_log_timestamps(raw_timestamps, timestamp_formats)
        for fmt in matched_formats:
            logger.info(f"✅ Successfully parsed timestamps with format: {fmt}")

        # If timestamp parsing still fails, create a dummy timestamp
        if df['timestamp'].isna().all():
            logger.warning("⚠️ Timestamp parsing failed, using sequential timestamps")
            df['timestamp'] = pd.date_range(start='2024-01-01', periods=len(df), freq='1min')
        else:
            unparsed = int(df['timestamp'].isna().sum())
            if unparsed:
                logger.warning(f"⚠️ {unparsed} timestamps could not be parsed and are ignored in time-based features")

        # Vulnerability-specific features
        logger.info("🎯 Creating vulnerability indicators...")

        # 1. CMS and Framework Probing
        cms_patterns = [
            'wp-login', 'wp-admin', 'wp-content', 'wp-includes',  # WordPress
            'joomla', 'administrator', 'com_',  # Joomla
            'drupal', 'node/', 'admin/',  # Drupal
            'magento', 'app/etc', 'downloader',  # Magento
            'udd.php', 'shell.php', 'c99.php',  # Common shells
        ]
        # These are literal substrings, so escape them: an unescaped '.' would
        # match any character (e.g. 'shellxphp').
        cms_regex = '|'.join(re.escape(pattern) for pattern in cms_patterns)
        df['is_cms_probe'] = df['url'].str.contains(cms_regex, case=False, na=False)

        # 2. Sensitive File Access
        sensitive_patterns = [
            r'\.gz$', r'\.zip$', r'\.tar$', r'\.log$', r'\.sql$',  # Archives & logs
            r'\.env$', r'\.config$', r'\.ini$', r'\.conf$',  # Config files
            r'backup', r'dump', r'export',  # Backup files
            r'passwd', r'shadow', r'htpasswd',  # System files
        ]
        df['is_sensitive_file'] = df['url'].str.contains('|'.join(sensitive_patterns), case=False, na=False)

        # 3. Attack Patterns
        attack_patterns = [
            r'union.*select', r'script.*alert', r'javascript:',  # SQLi, XSS
            r'\.\./\.\./\.\.',  # Directory traversal
            r'<script', r'eval\(', r'exec\(',  # Code injection
            r'cmd=', r'exec=', r'shell=',  # Command injection
        ]
        df['is_attack_pattern'] = df['url'].str.contains('|'.join(attack_patterns), case=False, na=False)

        # 4. Error Patterns (Reconnaissance)
        df['is_404'] = (df['status'] == 404)
        df['is_403'] = (df['status'] == 403)
        df['is_500'] = (df['status'] == 500)
        df['is_error'] = df['status'].isin([400, 401, 403, 404, 500, 502, 503])

        # 5. Bot Detection
        bot_patterns = [
            'bot', 'crawler', 'spider', 'scraper',
            'curl', 'wget', 'python', 'php',
            'scanner', 'nikto', 'sqlmap'
        ]

        # Handle missing user_agent column
        if 'user_agent' not in df.columns:
            logger.warning("⚠️ user_agent column not found, creating dummy column")
            df['user_agent'] = ''

        df['is_bot'] = df['user_agent'].str.contains('|'.join(bot_patterns), case=False, na=False)

        # 6. Suspicious User Agents
        df['is_empty_ua'] = (df['user_agent'] == '') | (df['user_agent'] == '-') | df['user_agent'].isna()
        df['is_old_browser'] = df['user_agent'].str.contains('MSIE [1-6]|Windows 95|Windows 98', case=False, na=False)

        logger.info("📊 Aggregating features by IP address...")

        # IP-based aggregations for vulnerability assessment
        agg_dict = {
            'url': ['count', 'nunique'],
            'status': ['mean', 'std'],
            'bytes': ['sum', 'mean'],
            'is_404': 'sum',
            'is_403': 'sum',
            'is_500': 'sum',
            'is_error': 'sum',
            'is_cms_probe': 'sum',
            'is_sensitive_file': 'sum',
            'is_attack_pattern': 'sum',
            'is_bot': 'any',
            'is_empty_ua': 'any',
            'is_old_browser': 'any',
            'timestamp': ['min', 'max', 'count']
        }

        # Add method aggregation if column exists (most frequent method per IP)
        if 'method' in df.columns:
            agg_dict['method'] = lambda x: x.value_counts().index[0] if len(x) > 0 else 'GET'

        ip_features = df.groupby('ip').agg(agg_dict).reset_index()

        # Flatten the (column, aggregation) MultiIndex into "<column>_<aggregation>" names
        new_columns = ['ip']
        for col in ip_features.columns[1:]:
            if isinstance(col, tuple):
                if col[1] == '<lambda>':
                    new_columns.append(f"common_{col[0]}")
                else:
                    new_columns.append(f"{col[0]}_{col[1]}")
            else:
                new_columns.append(col)

        ip_features.columns = new_columns

        # Ensure we have the expected columns (with fallbacks)
        expected_cols = {
            'request_count': 'url_count',
            'unique_urls': 'url_nunique',
            'avg_status': 'status_mean',
            'status_std': 'status_std',
            'total_bytes': 'bytes_sum',
            'avg_bytes': 'bytes_mean',
            'count_404': 'is_404_sum',
            'count_403': 'is_403_sum',
            'count_500': 'is_500_sum',
            'count_errors': 'is_error_sum',
            'cms_probes': 'is_cms_probe_sum',
            'sensitive_files': 'is_sensitive_file_sum',
            'attack_patterns': 'is_attack_pattern_sum',
            'is_bot': 'is_bot_any',
            'empty_user_agent': 'is_empty_ua_any',
            'old_browser': 'is_old_browser_any',
            'first_seen': 'timestamp_min',
            'last_seen': 'timestamp_max',
            'total_requests': 'timestamp_count'
        }

        # Rename columns to expected names
        rename_dict = {
            actual: expected
            for expected, actual in expected_cols.items()
            if actual in ip_features.columns
        }
        ip_features = ip_features.rename(columns=rename_dict)

        # Fill missing columns with defaults
        for expected_col in expected_cols.keys():
            if expected_col not in ip_features.columns:
                if expected_col in ['is_bot', 'empty_user_agent', 'old_browser']:
                    ip_features[expected_col] = False
                elif 'rate' in expected_col:
                    ip_features[expected_col] = 0.0
                else:
                    ip_features[expected_col] = 0

        # Calculate derived features (replace(0, 1) guards against division by zero)
        safe_request_count = ip_features['request_count'].replace(0, 1)
        ip_features['error_rate'] = ip_features['count_errors'] / safe_request_count
        ip_features['404_rate'] = ip_features['count_404'] / safe_request_count
        ip_features['diversity_score'] = ip_features['unique_urls'] / safe_request_count

        # Time-based features (with error handling)
        try:
            if 'first_seen' in ip_features.columns and 'last_seen' in ip_features.columns:
                # Ensure timestamps are datetime
                ip_features['first_seen'] = pd.to_datetime(ip_features['first_seen'])
                ip_features['last_seen'] = pd.to_datetime(ip_features['last_seen'])

                duration_seconds = (ip_features['last_seen'] - ip_features['first_seen']).dt.total_seconds()
                ip_features['session_duration'] = duration_seconds / 3600  # hours
                # +0.01 h keeps single-request IPs (duration 0) from dividing by zero
                ip_features['requests_per_hour'] = ip_features['request_count'] / (ip_features['session_duration'] + 0.01)
            else:
                ip_features['session_duration'] = 1.0  # Default 1 hour
                ip_features['requests_per_hour'] = ip_features['request_count']
        except Exception as e:
            logger.warning(f"⚠️ Time-based feature calculation failed: {e}, using defaults")
            ip_features['session_duration'] = 1.0
            ip_features['requests_per_hour'] = ip_features['request_count']

        # Vulnerability scores: weighted sum of probe/attack counts plus a
        # small contribution from the number of 404 responses.
        ip_features['vulnerability_score'] = (
            ip_features['cms_probes'] * 3 +
            ip_features['sensitive_files'] * 2 +
            ip_features['attack_patterns'] * 5 +
            ip_features['404_rate'] * ip_features['request_count'] * 0.1
        )

        processing_time = datetime.now() - start_time
        logger.info(f"✅ Feature extraction completed in {processing_time}")
        logger.info(f"📈 Generated features for {len(ip_features)} unique IP addresses")

        # Log feature statistics
        vulnerable_ips = ip_features[ip_features['vulnerability_score'] > 0]
        logger.info(f"🚨 Found {len(vulnerable_ips)} IPs with vulnerability indicators")
        logger.info(f"🤖 Bot traffic detected from {ip_features['is_bot'].sum()} IPs")
        logger.info(f"🔍 CMS probing detected: {ip_features['cms_probes'].sum()} attempts")
        logger.info(f"📁 Sensitive file access: {ip_features['sensitive_files'].sum()} attempts")

        return ip_features

    except Exception as e:
        logger.error(f"❌ Error in feature extraction: {e}")
        import traceback
        logger.error(f"Full traceback: {traceback.format_exc()}")
        return pd.DataFrame()

# Build the per-IP feature table from the parsed log rows
print("🔍 Extracting vulnerability features...")
features_df = extract_vulnerability_features(combined_df)

if features_df.empty:
    print("❌ Failed to extract features")
else:
    print(f"✅ Features extracted for {len(features_df)} unique IPs")
    # Persist the features next to the parsed logs
    features_df.to_parquet('/content/drive/My Drive/vulnerability_features.parquet', index=False)
    print("💾 Features saved to vulnerability_features.parquet")

    # Preview a few key columns (only those that actually exist)
    print("\n📊 Sample features:")
    display_cols = ['ip', 'request_count', 'vulnerability_score', 'cms_probes', 'attack_patterns']
    available_cols = list(filter(features_df.columns.__contains__, display_cols))
    print(features_df[available_cols].head())

🔍 Extracting vulnerability features...




✅ Features extracted for 259629 unique IPs
💾 Features saved to vulnerability_features.parquet

📊 Sample features:
            ip  request_count  vulnerability_score  cms_probes  \
0   1.0.138.21              1                  0.0           0   
1   1.0.144.68              1                  0.0           0   
2  1.0.197.183              2                  3.0           1   
3  1.0.200.186              2                  3.0           1   
4   1.0.201.89              2                  6.2           2   

   attack_patterns  
0                0  
1                0  
2                0  
3                0  
4                0  


In [None]:
def analyze_vulnerabilities(features_df):
    """Summarise the per-IP features into a small vulnerability report.

    Logs the ten IPs with the highest vulnerability_score, the total number of
    CMS probes, sensitive-file requests and attack patterns, the number of
    bot IPs, and the number of IPs whose error_rate exceeds 0.5.

    Args:
        features_df: Per-IP feature table containing ip, vulnerability_score,
            request_count, cms_probes, sensitive_files, attack_patterns,
            is_bot and error_rate.

    Returns:
        dict | None: Keys 'top_vulnerable_ips' (DataFrame of up to ten rows),
        'attack_summary' (dict of int totals), 'bot_ips' (int) and
        'high_error_ips' (int); None when the analysis fails (the error is
        logged, not raised).
    """
    try:
        logger.info("🔬 Performing detailed vulnerability analysis...")

        # Ten highest-scoring IPs, restricted to the columns worth reporting
        report_columns = ['ip', 'vulnerability_score', 'request_count', 'cms_probes', 'attack_patterns']
        top_vulnerable = features_df.nlargest(10, 'vulnerability_score')[report_columns]
        logger.info("🚨 Top 10 most suspicious IPs:")
        for _, row in top_vulnerable.iterrows():
            logger.info(f"   {row['ip']}: Score={row['vulnerability_score']:.2f}, Requests={row['request_count']}, CMS={row['cms_probes']}, Attacks={row['attack_patterns']}")

        # Totals across all IPs
        totals = {
            name: features_df[name].sum()
            for name in ('cms_probes', 'sensitive_files', 'attack_patterns')
        }

        logger.info("📈 Attack Summary:")
        logger.info(f"   🎯 CMS Probing Attempts: {totals['cms_probes']}")
        logger.info(f"   📁 Sensitive File Access: {totals['sensitive_files']}")
        logger.info(f"   ⚔️ Attack Patterns Detected: {totals['attack_patterns']}")

        # Count bot IPs
        bot_ip_count = int((features_df['is_bot'] == True).sum())
        logger.info(f"🤖 Bot Activity: {bot_ip_count} IPs identified as bots")

        # Count IPs where more than half of the requests were errors
        high_error_count = int((features_df['error_rate'] > 0.5).sum())
        logger.info(f"❗ High Error Rate: {high_error_count} IPs with >50% error rate")

        return {
            'top_vulnerable_ips': top_vulnerable,
            'attack_summary': {name: int(total) for name, total in totals.items()},
            'bot_ips': bot_ip_count,
            'high_error_ips': high_error_count
        }

    except Exception as err:
        logger.error(f"❌ Error in vulnerability analysis: {err}")
        return None

# Summarise the features into a vulnerability report
print("🔬 Analyzing vulnerabilities...")
analysis_results = analyze_vulnerabilities(features_df)

if not analysis_results:
    print("❌ Vulnerability analysis failed")
else:
    print("✅ Vulnerability analysis completed")
    print(f"📊 Top vulnerable IPs identified: {len(analysis_results['top_vulnerable_ips'])}")
    print(f"🎯 Total attack patterns: {analysis_results['attack_summary']['attack_patterns']}")

🔬 Analyzing vulnerabilities...
✅ Vulnerability analysis completed
📊 Top vulnerable IPs identified: 10
🎯 Total attack patterns: 1782


In [None]:
def train_vulnerability_models(features_df):
    """Train a Random Forest classifier and an Isolation Forest on per-IP features.

    Labels are produced by heuristics on the features themselves (any CMS
    probe, any attack pattern, vulnerability_score > 5, or a 404 rate above
    0.3 on more than 10 requests). The data is split 70/30 with
    stratification; the scaler is fitted on the training part only.

    Args:
        features_df: Per-IP feature table containing every column listed in
            `feature_columns` below.

    Returns:
        dict | None: Keys 'random_forest', 'isolation_forest', 'scaler',
        'feature_columns' and 'feature_importance'; the same dict is also
        written to '/content/drive/My Drive/vulnerability_models.pkl'.
        None when no IP is labelled malicious or when training/saving fails
        (the error is logged, not raised).
    """
    try:
        logger.info("🤖 Training vulnerability detection models...")
        start_time = datetime.now()

        # Prepare features for modeling
        feature_columns = [
            'request_count', 'unique_urls', 'avg_status', 'status_std',
            'total_bytes', 'avg_bytes', 'count_404', 'count_403', 'count_500',
            'count_errors', 'cms_probes', 'sensitive_files', 'attack_patterns',
            'error_rate', '404_rate', 'diversity_score', 'session_duration',
            'requests_per_hour', 'vulnerability_score'
        ]

        X = features_df[feature_columns].fillna(0)

        # Create labels using multiple heuristics.
        # NOTE(review): every column used here is also a model input, so the
        # classifier is learning to reproduce these rules; the reported scores
        # measure agreement with the heuristic, not real-world detection.
        labels = (
            (features_df['cms_probes'] > 0) |
            (features_df['attack_patterns'] > 0) |
            (features_df['vulnerability_score'] > 5) |
            ((features_df['404_rate'] > 0.3) & (features_df['request_count'] > 10))
        ).astype(int)

        logger.info(f"🏷️ Label distribution - Malicious: {labels.sum()}, Benign: {len(labels) - labels.sum()}")

        if labels.sum() == 0:
            logger.warning("⚠️ No malicious IPs detected with current heuristics")
            return None

        # Split first, then scale: fitting the scaler on all rows would let
        # statistics of the held-out test rows leak into training.
        X_train_raw, X_test_raw, y_train, y_test = train_test_split(
            X, labels, test_size=0.3, random_state=42, stratify=labels
        )

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train_raw)
        X_test = scaler.transform(X_test_raw)

        # Train Random Forest
        logger.info("🌲 Training Random Forest classifier...")
        rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
        rf_model.fit(X_train, y_train)

        # Evaluate Random Forest
        rf_pred = rf_model.predict(X_test)
        logger.info("📊 Random Forest Results:")
        logger.info(f"\n{classification_report(y_test, rf_pred)}")

        # Train Isolation Forest for anomaly detection
        logger.info("🔍 Training Isolation Forest for anomaly detection...")
        iso_model = IsolationForest(contamination=0.1, random_state=42)
        benign_mask = (y_train == 0).to_numpy()  # plain boolean array for NumPy indexing
        iso_model.fit(X_train[benign_mask])  # Train only on benign data

        # Feature importance
        feature_importance = pd.DataFrame({
            'feature': feature_columns,
            'importance': rf_model.feature_importances_
        }).sort_values('importance', ascending=False)

        logger.info("🎯 Top 5 most important features:")
        for _, row in feature_importance.head().iterrows():
            logger.info(f"   {row['feature']}: {row['importance']:.4f}")

        # Save models
        models = {
            'random_forest': rf_model,
            'isolation_forest': iso_model,
            'scaler': scaler,
            'feature_columns': feature_columns,
            'feature_importance': feature_importance
        }

        joblib.dump(models, '/content/drive/My Drive/vulnerability_models.pkl')
        logger.info("💾 Models saved to /content/drive/My Drive/vulnerability_models.pkl")

        processing_time = datetime.now() - start_time
        logger.info(f"✅ Model training completed in {processing_time}")

        return models

    except Exception as e:
        logger.error(f"❌ Error in model training: {e}")
        return None

# Fit the detection models on the per-IP features
print("🤖 Training vulnerability detection models...")
models = train_vulnerability_models(features_df)

if not models:
    print("❌ Model training failed")
else:
    print("✅ Models trained successfully")
    print("📊 Feature importance:")
    print(models['feature_importance'].head())

🤖 Training vulnerability detection models...
✅ Models trained successfully
📊 Feature importance:
                feature  importance
18  vulnerability_score    0.395290
10           cms_probes    0.132072
2            avg_status    0.128353
11      sensitive_files    0.102298
6             count_404    0.048026


In [None]:
# Generate final summary
print("="*60)
print("🎉 VULNERABILITY DETECTION PIPELINE COMPLETED!")
print("="*60)

if not combined_df.empty:
    print(f"📊 Processed {len(combined_df)} log entries from {len(file_paths)} files")

if not features_df.empty:
    print(f"🎯 Analyzed {len(features_df)} unique IP addresses")

    # Summary statistics
    vulnerable_ips = features_df[features_df['vulnerability_score'] > 0]
    print(f"🚨 Vulnerable IPs detected: {len(vulnerable_ips)}")
    print(f"🤖 Bot IPs detected: {features_df['is_bot'].sum()}")
    print(f"🔍 CMS probe attempts: {features_df['cms_probes'].sum()}")
    print(f"⚔️ Attack patterns: {features_df['attack_patterns'].sum()}")

# Only list an output file when the step that writes it actually succeeded.
print("\n📁 Output files saved to Google Drive:")
if not combined_df.empty:
    print("   - parsed_logs.parquet (raw parsed data)")
if not features_df.empty:
    print("   - vulnerability_features.parquet (extracted features)")
if models:
    print("   - vulnerability_models.pkl (trained ML models)")

# The log file is configured under /content, i.e. on the Colab VM's local disk,
# not on Google Drive, so it disappears when the runtime is recycled.
print("\n📝 Execution log (local to the Colab VM, not on Drive):")
print("   - /content/vulnerability_detection.log")

print("\n💡 Next steps:")
print("   1. Review the vulnerability analysis results")
print("   2. Investigate suspicious IP addresses")
print("   3. Use trained models for real-time detection")
print("   4. Implement security measures based on findings")

🎉 VULNERABILITY DETECTION PIPELINE COMPLETED!
📊 Processed 4296797 log entries from 2741 files
🎯 Analyzed 259629 unique IP addresses
🚨 Vulnerable IPs detected: 149207
🤖 Bot IPs detected: 37228
🔍 CMS probe attempts: 187227
⚔️ Attack patterns: 1782

📁 Output files saved to Google Drive:
   - parsed_logs.parquet (raw parsed data)
   - vulnerability_features.parquet (extracted features)
   - vulnerability_models.pkl (trained ML models)
   - vulnerability_detection.log (execution log)

💡 Next steps:
   1. Review the vulnerability analysis results
   2. Investigate suspicious IP addresses
   3. Use trained models for real-time detection
   4. Implement security measures based on findings
