In [1]:
!pip install scikit-learn pandas numpy tqdm matplotlib seaborn joblib
!pip install sentence-transformers



In [11]:
def smart_sample_data(df, max_samples=50000, anomaly_ratio=0.3):
    """
    Down-sample ``df`` to at most ``max_samples`` rows with a target anomaly share.

    Rows are split on the raw ``is_anomaly`` column (1 = anomaly, 0 = normal).
    At most ``int(max_samples * anomaly_ratio)`` anomalies are kept and the
    remaining budget is filled with normal rows; a class that already fits its
    quota is kept whole. A summary line is printed, then the combined frame is
    shuffled with a fixed seed and returned with a fresh 0..n-1 index.
    """
    # The raw 'is_anomaly' column is used because 'label' does not exist
    # until preprocessing has run.
    anomaly_rows = df[df['is_anomaly'] == 1]
    normal_rows = df[df['is_anomaly'] == 0]

    # Per-class budgets: anomalies first, normals take whatever is left.
    anomaly_quota = int(max_samples * anomaly_ratio)
    n_anomalies = min(len(anomaly_rows), anomaly_quota)
    n_normal = min(len(normal_rows), max_samples - n_anomalies)

    # Only draw a random subset when a class exceeds its budget.
    if len(anomaly_rows) > n_anomalies:
        sampled_anomalies = anomaly_rows.sample(n=n_anomalies, random_state=42)
    else:
        sampled_anomalies = anomaly_rows

    if len(normal_rows) > n_normal:
        sampled_normal = normal_rows.sample(n=n_normal, random_state=42)
    else:
        sampled_normal = normal_rows

    combined = pd.concat([sampled_anomalies, sampled_normal], ignore_index=True)
    print(f"Sampled {len(combined)} rows from {len(df)} (Anomalies: {len(sampled_anomalies)}, Normal: {len(sampled_normal)})")

    # Shuffle so the two classes are interleaved rather than stacked.
    shuffled = combined.sample(frac=1, random_state=42)
    return shuffled.reset_index(drop=True)

print("Smart sampling function defined!")

def fast_preprocess(df):
    """
    Build the model input columns with vectorized pandas string operations.

    The frame is modified in place and also returned. Columns written:
    'tag' and 'text' (missing values replaced by '' and cast to str),
    'message' (the part of 'text' after its first ': ', or the whole text
    when there is no such separator), 'input_text' ('<tag>: <message>') and
    'label' ('is_anomaly' cast to int).
    """
    # Normalise both text columns so the .str accessors never see NaN.
    for column in ('tag', 'text'):
        df[column] = df[column].fillna('').astype(str)

    # Split once on ': ' and keep the last piece.
    pieces = df['text'].str.split(pat=': ', n=1)
    df['message'] = pieces.str[-1]

    tag_prefix = df['tag'] + ': '
    df['input_text'] = tag_prefix + df['message']

    df['label'] = df['is_anomaly'].astype(int)
    return df

print("Fast preprocessing function defined!")

class FastAnomalyDetector:
    """
    TF-IDF text vectorizer followed by a fast binary classifier.

    ``method`` selects the classifier: 'sgd' (SGDClassifier with logistic
    loss), 'rf' (RandomForestClassifier); any other value falls back to
    LogisticRegression.
    """

    def __init__(self, method='sgd'):  # SGD is much faster than LogisticRegression for large data
        self.method = method
        self.vectorizer = TfidfVectorizer(
            max_features=10000,  # Reduced from 50000
            ngram_range=(1, 1),  # Only unigrams for speed
            max_df=0.95,
            min_df=2,
            strip_accents='ascii',
            lowercase=True,
            stop_words='english'  # Remove common words
        )

        if method == 'sgd':
            self.classifier = SGDClassifier(
                loss='log_loss',
                class_weight='balanced',
                random_state=42,
                max_iter=100,  # Upper bound on epochs; `tol` is the stopping criterion
                tol=1e-3,
                n_jobs=-1
            )
        elif method == 'rf':
            self.classifier = RandomForestClassifier(
                n_estimators=50,  # Reduced from default 100
                max_depth=10,
                class_weight='balanced',
                random_state=42,
                n_jobs=-1
            )
        else:
            self.classifier = LogisticRegression(
                class_weight='balanced',
                max_iter=100,
                random_state=42,
                n_jobs=-1
            )

    def fit(self, X, y):
        """Fit the vectorizer on the texts ``X``, train the classifier on labels ``y``, return self."""
        print(f"Fitting TF-IDF vectorizer on {len(X)} samples...")
        X_vec = self.vectorizer.fit_transform(X)
        print(f"TF-IDF shape: {X_vec.shape}")

        print(f"Training {self.method} classifier...")
        self.classifier.fit(X_vec, y)
        return self

    def predict_proba(self, X):
        """
        Return a 1-D array of anomaly scores in [0, 1], one per text in ``X``.

        Unlike scikit-learn estimators this returns only the second column of
        the classifier's ``predict_proba`` output, not an (n, 2) matrix.
        """
        X_vec = self.vectorizer.transform(X)
        if hasattr(self.classifier, 'predict_proba'):
            return self.classifier.predict_proba(X_vec)[:, 1]

        # Fallback for classifiers that only expose a signed margin: squash it
        # with a logistic sigmoid so the 0.5 cut-off in predict() corresponds
        # to margin 0. Written as exp(-log(1 + exp(-m))) to avoid overflow.
        margins = np.asarray(self.classifier.decision_function(X_vec), dtype=float)
        return np.exp(-np.logaddexp(0.0, -margins))

    def predict(self, X):
        """Return 0/1 predictions by thresholding the anomaly score at 0.5."""
        scores = self.predict_proba(X)
        return (scores >= 0.5).astype(int)

print("FastAnomalyDetector class defined!")

def fast_transformer_approach(train_texts, train_labels, val_texts, val_labels, random_state=42):
    """
    Quick baseline: sentence embeddings + logistic regression on a sub-sample.

    Embeds up to 5000 randomly chosen training texts with the
    'all-MiniLM-L6-v2' sentence-transformer, fits a class-balanced
    LogisticRegression on them, and scores up to 1000 randomly chosen
    validation texts.

    Parameters:
        train_texts, train_labels: training texts and their 0/1 labels.
        val_texts, val_labels: validation texts and their 0/1 labels.
        random_state: seed for the sub-sampling, so repeated runs draw the
            same rows (matching the fixed seeds used elsewhere in this file).

    Returns:
        dict with 'roc_auc' and 'pr_auc', or None when the optional
        sentence-transformers dependency cannot be imported.
    """
    # Only the optional dependency is guarded; errors from the actual
    # embedding/training work are allowed to surface.
    try:
        from sentence_transformers import SentenceTransformer
        from sklearn.linear_model import LogisticRegression

        # Use a tiny, fast model
        model = SentenceTransformer('all-MiniLM-L6-v2')  # Very fast, small model
    except ImportError:
        print("Sentence transformers not available, skipping...")
        return None

    rng = np.random.default_rng(random_state)

    # Sample for embedding (embeddings are expensive)
    sample_size = min(5000, len(train_texts))
    indices = rng.choice(len(train_texts), size=sample_size, replace=False)

    print(f"Creating embeddings for {sample_size} samples...")
    train_embeddings = model.encode(np.array(train_texts)[indices], show_progress_bar=True)

    # Train a simple classifier on embeddings
    clf = LogisticRegression(class_weight='balanced', max_iter=100)
    clf.fit(train_embeddings, np.array(train_labels)[indices])

    # Evaluate on small validation set
    val_sample = min(1000, len(val_texts))
    val_indices = rng.choice(len(val_texts), size=val_sample, replace=False)
    val_embeddings = model.encode(np.array(val_texts)[val_indices])

    val_scores = clf.predict_proba(val_embeddings)[:, 1]
    val_true = np.array(val_labels)[val_indices]

    return {
        'roc_auc': roc_auc_score(val_true, val_scores),
        'pr_auc': average_precision_score(val_true, val_scores)
    }

print("Transformer function defined!")

def train_fast_anomaly_detector(csv_files):
    """
    Main training pipeline with all optimizations.

    Steps: load and concatenate the CSVs that exist, down-sample to 50000 rows
    when the data is larger than that, preprocess, make a stratified 80/20
    split, then train and evaluate an SGD and a Random Forest detector. The
    SGD detector is saved to 'fast_anomaly_detector_sgd.joblib'. Finally the
    sentence-transformer baseline is run and its scores printed.

    Parameters:
        csv_files: iterable of CSV paths; each file must provide the 'text',
            'tag' and 'is_anomaly' columns used below. Missing files are
            reported and skipped.

    Returns:
        dict mapping model name to a dict with 'train_time' (seconds spent in
        fit only), 'roc_auc', 'pr_auc' and 'classification_report';
        None when none of the files could be loaded.
    """
    print("=== FAST LOG ANOMALY DETECTION ===")

    # Load and combine data
    print("Loading CSV files...")
    dfs = []
    for file in csv_files:
        # Paths are used exactly as given (relative to the working directory).
        if os.path.exists(file):
            df = pd.read_csv(file)
            print(f"Loaded {file}: {df.shape[0]} rows")
            dfs.append(df)
        else:
            print(f"File not found: {file}")

    if not dfs:
        print("No CSV files found!")
        return None

    df = pd.concat(dfs, ignore_index=True)
    print(f"Total dataset: {df.shape[0]} rows")

    # Check available columns
    print("Available columns:", list(df.columns))
    print("Sample of is_anomaly column:", df['is_anomaly'].value_counts().to_dict())

    # OPTIMIZATION: Smart sampling for large datasets BEFORE preprocessing
    if len(df) > 50000:
        print("Large dataset detected, applying intelligent sampling...")
        df = smart_sample_data(df, max_samples=50000)

    # Fast preprocessing AFTER sampling
    print("Preprocessing...")
    df = fast_preprocess(df)

    print("Class distribution after preprocessing:", df['label'].value_counts().to_dict())

    # Train-test split
    X = df['input_text']
    y = df['label']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Train multiple fast models and compare
    models = {
        'SGD': FastAnomalyDetector('sgd'),
        'Random Forest': FastAnomalyDetector('rf'),
    }

    results = {}

    for name, model in models.items():
        print(f"\n=== Training {name} ===")

        # Time the fit alone so 'train_time' is not inflated by scoring.
        start_time = pd.Timestamp.now()
        model.fit(X_train, y_train)
        train_time = (pd.Timestamp.now() - start_time).total_seconds()

        # Evaluate on the held-out split
        test_scores = model.predict_proba(X_test)
        test_preds = model.predict(X_test)

        # Metrics
        metrics = {
            'train_time': train_time,
            'roc_auc': roc_auc_score(y_test, test_scores),
            'pr_auc': average_precision_score(y_test, test_scores),
            'classification_report': classification_report(y_test, test_preds, output_dict=True)
        }

        results[name] = metrics

        print(f"Training time: {train_time:.2f} seconds")
        print(f"ROC-AUC: {metrics['roc_auc']:.4f}")
        print(f"PR-AUC: {metrics['pr_auc']:.4f}")
        print("\nClassification Report:")
        print(classification_report(y_test, test_preds))

        # Only the SGD detector is persisted (usually fastest and good enough);
        # no metric-based "best model" selection happens here.
        if name == 'SGD':
            joblib.dump(model, f'fast_anomaly_detector_{name.lower()}.joblib')

    # Optional: Try lightweight transformer approach
    print("\n=== Trying Lightweight Transformer ===")
    transformer_results = fast_transformer_approach(
        X_train.tolist(), y_train.tolist(),
        X_test.tolist(), y_test.tolist()
    )
    if transformer_results:
        print(f"Transformer ROC-AUC: {transformer_results['roc_auc']:.4f}")
        print(f"Transformer PR-AUC: {transformer_results['pr_auc']:.4f}")

    return results

print("Main training pipeline defined!")

Smart sampling function defined!
Fast preprocessing function defined!
FastAnomalyDetector class defined!
Transformer function defined!
Main training pipeline defined!


In [2]:
# -*- coding: utf-8 -*-
"""
Optimized Log Anomaly Detection - Fast Training Version
Major optimizations for 100k+ line datasets
"""

import os
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

print("All libraries imported successfully!")

All libraries imported successfully!


In [4]:
# ADDITIONAL OPTIMIZATION: Incremental Learning for Very Large Datasets
class IncrementalAnomalyDetector:
    """
    Batch-wise (partial_fit) anomaly detector for datasets too large to train
    on in one go.

    Limitation: the TF-IDF vectorizer is fitted on the first batch only, so
    the vocabulary and IDF weights come from that batch; later batches are
    merely transformed with it.
    """
    def __init__(self):
        self.vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,1))
        # SGDClassifier.partial_fit rejects class_weight='balanced', so class
        # balancing is applied through per-sample weights in partial_fit().
        self.classifier = SGDClassifier(
            loss='log_loss',
            random_state=42
        )
        # Per-class weights (indexed by label), set on the first partial_fit call.
        self.class_weights_ = None
        self.fitted = False

    @staticmethod
    def _balanced_class_weights(labels):
        """
        Return an array ``w`` where ``w[c] = n_samples / (n_present_classes * count[c])``.

        Classes absent from ``labels`` (and the empty-input case) get weight 1.
        """
        label_arr = np.asarray(labels).astype(int)
        counts = np.bincount(label_arr, minlength=2)
        present = counts > 0
        weights = np.ones(len(counts), dtype=float)
        if present.any():
            weights[present] = len(label_arr) / (present.sum() * counts[present])
        return weights

    def partial_fit(self, texts, labels, batch_size=1000):
        """
        Train incrementally on consecutive batches of ``batch_size`` items.

        ``labels`` must be 0/1 integers aligned with ``texts``. Class weights
        are estimated from the labels of the first non-empty call and reused
        for all later calls.
        """
        label_arr = np.asarray(labels).astype(int)
        if getattr(self, 'class_weights_', None) is None and len(label_arr):
            self.class_weights_ = self._balanced_class_weights(label_arr)

        for i in tqdm(range(0, len(texts), batch_size), desc="Training batches"):
            batch_texts = texts[i:i+batch_size]
            batch_labels = label_arr[i:i+batch_size]
            batch_weights = self.class_weights_[batch_labels]

            if not self.fitted:
                # First batch - fit vectorizer
                X_batch = self.vectorizer.fit_transform(batch_texts)
                self.classifier.partial_fit(
                    X_batch, batch_labels, classes=[0, 1], sample_weight=batch_weights
                )
                self.fitted = True
            else:
                # Subsequent batches
                X_batch = self.vectorizer.transform(batch_texts)
                self.classifier.partial_fit(X_batch, batch_labels, sample_weight=batch_weights)

    def predict_proba(self, texts):
        """Return a 1-D array with the class-1 probability for each text."""
        X = self.vectorizer.transform(texts)
        return self.classifier.predict_proba(X)[:, 1]

print("IncrementalAnomalyDetector class defined!")

IncrementalAnomalyDetector class defined!


In [5]:
import os
import requests

# List of GitHub raw CSV URLs (all 10 datasets)
DATASET_BASE_URL = "https://raw.githubusercontent.com/Legend-Recalls/log-anamoly/main"
github_files = [
    f"{DATASET_BASE_URL}/realistic_android_log_dataset_{i}.csv"
    for i in range(1, 11)
]

# Local filenames list (so rest of your code works without changes)
csv_files = []

# Download files if not already present
for url in github_files:
    filename = os.path.basename(url)
    if not os.path.exists(filename):
        print(f"⬇️ Downloading {filename} ...")
        try:
            # Bound the wait and reject HTTP errors so an error page is never
            # saved to disk under a .csv name.
            r = requests.get(url, timeout=60)
            r.raise_for_status()
        except requests.RequestException as exc:
            print(f"✗ Download failed for {filename}: {exc}")
        else:
            with open(filename, "wb") as f:
                f.write(r.content)
    else:
        print(f"✓ Already exists: {filename}")
    # Always recorded; a failed download shows up as "Not found" below.
    csv_files.append(filename)

# Check files exist
for file in csv_files:
    if os.path.exists(file):
        print(f"✓ Found: {file}")
    else:
        print(f"✗ Not found: {file}")


⬇️ Downloading realistic_android_log_dataset_1.csv ...
⬇️ Downloading realistic_android_log_dataset_2.csv ...
⬇️ Downloading realistic_android_log_dataset_3.csv ...
⬇️ Downloading realistic_android_log_dataset_4.csv ...
⬇️ Downloading realistic_android_log_dataset_5.csv ...
⬇️ Downloading realistic_android_log_dataset_6.csv ...
⬇️ Downloading realistic_android_log_dataset_7.csv ...
⬇️ Downloading realistic_android_log_dataset_8.csv ...
⬇️ Downloading realistic_android_log_dataset_9.csv ...
⬇️ Downloading realistic_android_log_dataset_10.csv ...
✓ Found: realistic_android_log_dataset_1.csv
✓ Found: realistic_android_log_dataset_2.csv
✓ Found: realistic_android_log_dataset_3.csv
✓ Found: realistic_android_log_dataset_4.csv
✓ Found: realistic_android_log_dataset_5.csv
✓ Found: realistic_android_log_dataset_6.csv
✓ Found: realistic_android_log_dataset_7.csv
✓ Found: realistic_android_log_dataset_8.csv
✓ Found: realistic_android_log_dataset_9.csv
✓ Found: realistic_android_log_dataset_10.csv

In [12]:
# Kick off the full training pipeline on the downloaded CSVs
results = train_fast_anomaly_detector(csv_files)

# One summary line per trained model (skipped when nothing was trained)
if results:
    print("\n=== FINAL SUMMARY ===")
    for model_name in results:
        metrics = results[model_name]
        elapsed = metrics['train_time']
        auc = metrics['roc_auc']
        print(f"{model_name}: {elapsed:.2f}s, ROC-AUC: {auc:.4f}")

=== FAST LOG ANOMALY DETECTION ===
Loading CSV files...
Loaded realistic_android_log_dataset_1.csv: 100000 rows
Loaded realistic_android_log_dataset_2.csv: 100000 rows
Loaded realistic_android_log_dataset_3.csv: 100000 rows
Loaded realistic_android_log_dataset_4.csv: 100000 rows
Loaded realistic_android_log_dataset_5.csv: 100000 rows
Loaded realistic_android_log_dataset_6.csv: 100000 rows
Loaded realistic_android_log_dataset_7.csv: 100000 rows
Loaded realistic_android_log_dataset_8.csv: 100000 rows
Loaded realistic_android_log_dataset_9.csv: 100000 rows
Loaded realistic_android_log_dataset_10.csv: 100000 rows
Total dataset: 1000000 rows
Available columns: ['log_id', 'line_no', 'text', 'tag', 'is_anomaly']
Sample of is_anomaly column: {0: 960656, 1: 39344}
Large dataset detected, applying intelligent sampling...
Sampled 50000 rows from 1000000 (Anomalies: 15000, Normal: 35000)
Preprocessing...
Class distribution after preprocessing: {0: 35000, 1: 15000}

=== Training SGD ===
Fitting TF-

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Creating embeddings for 5000 samples...


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Transformer ROC-AUC: 1.0000
Transformer PR-AUC: 1.0000

=== FINAL SUMMARY ===
SGD: 0.82s, ROC-AUC: 1.0000
Random Forest: 1.34s, ROC-AUC: 1.0000


In [None]:
# For extremely large datasets, switch to incremental learning.
# Example usage (uncomment to run):

# detector = IncrementalAnomalyDetector()
# detector.partial_fit(all_texts, all_labels, batch_size=5000)

for note in (
    "Incremental learning example ready!",
    "Uncomment the lines above to use with very large datasets",
):
    print(note)

In [16]:
from google.colab import files
import joblib
import pandas as pd
import numpy as np

# Upload raw text log file
uploaded = files.upload()  # Choose your log file (e.g. "system_log.txt")

# Minimum anomaly score for a line to be reported
ANOMALY_THRESHOLD = 0.7

if not uploaded:
    print("No log file uploaded.")
else:
    # Get uploaded file name (first one if several were selected)
    log_filename = next(iter(uploaded))

    # Load the saved model. Only this step is guarded, so a missing log file
    # is not misreported as a missing model.
    try:
        loaded_model = joblib.load('fast_anomaly_detector_sgd.joblib')
    except FileNotFoundError:
        loaded_model = None
        print("No saved model found. Run the training pipeline first!")

    if loaded_model is not None:
        print("Model loaded successfully!")

        # Read the raw log, dropping blank lines
        with open(log_filename, "r", encoding="utf-8", errors="ignore") as f:
            log_lines = [line.strip() for line in f if line.strip()]

        df_logs = pd.DataFrame({"text": log_lines})

        # NOTE(review): the training pipeline feeds the model 'input_text'
        # ("<tag>: <message>" built by fast_preprocess), while full raw lines
        # are scored here — confirm this mismatch is intended.
        if log_lines:
            # Score once; a line is an anomaly when its score exceeds the
            # threshold (which is above the 0.5 cut-off used by predict()).
            probabilities = np.asarray(loaded_model.predict_proba(df_logs["text"]))
            # Accept both a 1-D score vector and an (n, 2) probability matrix
            if probabilities.ndim == 2:
                probabilities = probabilities[:, 1]
            flagged = probabilities > ANOMALY_THRESHOLD
            anomalies = list(zip(df_logs["text"][flagged], probabilities[flagged]))
        else:
            anomalies = []

        # Show results
        if anomalies:
            print("\n⚠️ Anomalies Detected (prob > 0.7):")
            for text, anomaly_prob in anomalies:
                print(f"{text[:80]:<80} | ANOMALY (prob: {anomaly_prob:.3f})")
        else:
            print("\n✅ No anomalies detected above threshold 0.7.")

        # Save anomalies to CSV
        df_anomalies = pd.DataFrame(anomalies, columns=["log_line", "anomaly_prob"])
        df_anomalies.to_csv("anomalies_filtered.csv", index=False)
        #files.download("anomalies_filtered.csv")


Saving android_log_dataset_8_raw.txt to android_log_dataset_8_raw.txt
Model loaded successfully!

⚠️ Anomalies Detected (prob > 0.7):
07-23 14:26:44.427  1702  1709 E WifiStateMachine: Connection to network failed: | ANOMALY (prob: 0.709)
07-23 14:26:55.511  6372  6372 E AndroidRuntime: FATAL EXCEPTION: main           | ANOMALY (prob: 0.749)
07-23 14:26:55.833  6372  6372 E AndroidRuntime: Process: com.facebook.katana, P | ANOMALY (prob: 0.882)
07-23 14:26:56.208  6372  6372 E AndroidRuntime: java.lang.IllegalStateException | ANOMALY (prob: 0.855)
07-23 14:27:14.622  1702  1706 W KeyguardUpdateMonitor: Multiple failed unlock a | ANOMALY (prob: 0.743)
07-23 14:27:27.807  1702  1708 W KeyguardUpdateMonitor: Multiple failed unlock a | ANOMALY (prob: 0.776)
07-23 14:27:54.695  1702  1706 E WifiStateMachine: Connection to network failed: | ANOMALY (prob: 0.720)
07-23 14:27:55.878  1702  1708 E WifiStateMachine: Connection to network failed: | ANOMALY (prob: 0.751)
07-23 14:28:56.197  1702  

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>