In [2]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import logging

# Reduce logging verbosity: ask the root logger to emit WARNING and above only.
# NOTE: logging.basicConfig() does nothing when the root logger already has
# handlers configured, so in that case this line has no effect.
logging.basicConfig(level=logging.WARNING)

class CodeBertTransformer(nn.Module):
    """Pretrained encoder followed by two parallel classification heads.

    The hidden state at sequence position 0 (the CLS token) is passed through
    dropout and then given to a service head and an activity head; both heads
    return raw logits.
    """

    def __init__(
        self,
        service_num_labels,
        activity_num_labels,
        model_name="microsoft/codebert-base"
    ):
        """
        Args:
            service_num_labels: Number of classes produced by the service head.
            activity_num_labels: Number of classes produced by the activity head.
            model_name: Checkpoint name handed to ``AutoModel.from_pretrained``.
        """
        super().__init__()

        # Encoder backbone
        self.transformer = AutoModel.from_pretrained(model_name)

        # Freeze the first six *parameter tensors* of the encoder.
        # NOTE(review): this counts tensors, not transformer layers; for this
        # architecture they are presumably the embedding tables, the embedding
        # LayerNorm and the first attention weight - confirm that this is the
        # intended amount of freezing.
        for position, tensor in enumerate(self.transformer.parameters()):
            if position >= 6:
                break
            tensor.requires_grad_(False)

        # Regularisation applied to the CLS vector before both heads
        self.dropout = nn.Dropout(0.3)

        # Heads are created service-first so that sub-module names and the
        # order of random initialisation stay the same.
        self.service_classifier = self._build_head(
            self.transformer.config.hidden_size, service_num_labels
        )
        self.activity_classifier = self._build_head(
            self.transformer.config.hidden_size, activity_num_labels
        )

    @staticmethod
    def _build_head(in_features, num_labels):
        """Return the Linear -> BatchNorm -> ReLU -> Dropout -> Linear stack."""
        return nn.Sequential(
            nn.Linear(in_features, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, num_labels)
        )

    def forward(self, input_ids, attention_mask):
        """Return ``(service_logits, activity_logits)`` for a tokenized batch."""
        encoded = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # CLS token representation, regularised
        cls_vector = self.dropout(encoded.last_hidden_state[:, 0, :])

        service_logits = self.service_classifier(cls_vector)
        activity_logits = self.activity_classifier(cls_vector)
        return service_logits, activity_logits

class CodeBertDataset(Dataset):
    """Pre-tokenized texts paired with service and activity label tensors.

    All texts are tokenized once in the constructor; ``__getitem__`` only
    slices the resulting tensors.
    """

    def __init__(
        self,
        texts,
        service_labels,
        activity_labels,
        tokenizer,
        max_length=128
    ):
        """
        Args:
            texts: Input strings. Anything with a ``tolist()`` method (pandas
                Series, numpy array) or any plain iterable such as a list.
            service_labels: Integer class ids, one per text.
            activity_labels: Integer class ids, one per text.
            tokenizer: Callable returning a mapping with ``'input_ids'`` and
                ``'attention_mask'`` tensors.
            max_length: Truncation length passed to the tokenizer.

        Raises:
            ValueError: If texts and label sequences differ in length.
        """
        # Accept plain Python sequences as well as pandas/numpy containers.
        text_list = texts.tolist() if hasattr(texts, "tolist") else list(texts)

        self.service_labels = torch.tensor(service_labels, dtype=torch.long)
        self.activity_labels = torch.tensor(activity_labels, dtype=torch.long)

        # A silent length mismatch would only surface later as an IndexError
        # (or as misaligned labels) inside a DataLoader worker.
        if not (len(text_list) == len(self.service_labels) == len(self.activity_labels)):
            raise ValueError(
                "texts, service_labels and activity_labels must have the same "
                f"length (got {len(text_list)}, {len(self.service_labels)}, "
                f"{len(self.activity_labels)})"
            )

        # Tokenize everything up front; padding=True pads to the longest
        # sequence in this dataset.
        self.encodings = tokenizer(
            text_list,
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors='pt'
        )

    def __len__(self):
        """Return the number of samples."""
        return len(self.service_labels)

    def __getitem__(self, idx):
        """Return the tensors of sample ``idx`` as a dict."""
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'service_label': self.service_labels[idx],
            'activity_label': self.activity_labels[idx]
        }

class CodeBertClassifier:
    """Fine-tune a two-headed CodeBERT classifier from a labelled CSV file.

    The CSV must provide ``service`` and ``activityType`` columns (the two
    targets). The model input is a single string per row assembled by
    ``_prepare_text_features``.
    """

    def __init__(
        self,
        training_data_path,
        model_name="microsoft/codebert-base"
    ):
        """Select a device, load the CSV and prepare tokenizer, labels and texts.

        Args:
            training_data_path: Path of the CSV read with ``pandas.read_csv``.
            model_name: Checkpoint name used for the tokenizer and, in
                ``train``, for the encoder.
        """
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu'
        )
        print(f"Using device: {self.device}")

        # Remembered so that train() builds the encoder matching the tokenizer.
        self.model_name = model_name

        self.training_df = pd.read_csv(
            training_data_path,
            low_memory=False
        )

        self._prepare_data(model_name)

    def _prepare_data(self, model_name):
        """Fill missing labels, load the tokenizer, encode labels, build texts."""
        # Missing targets become their own 'Unknown' class.
        self.training_df['service'] = self.training_df['service'].fillna('Unknown')
        self.training_df['activityType'] = self.training_df['activityType'].fillna('Unknown')

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        self.service_encoder = LabelEncoder()
        self.activity_encoder = LabelEncoder()

        self.encoded_services = self.service_encoder.fit_transform(
            self.training_df['service']
        )
        self.encoded_activities = self.activity_encoder.fit_transform(
            self.training_df['activityType']
        )

        # One input string per row
        self.texts = self.training_df.apply(
            self._prepare_text_features,
            axis=1
        )

    def _prepare_text_features(self, row):
        """Join the request columns of ``row`` into one string (max 512 chars).

        Columns missing from ``row`` contribute an empty string. Values are
        passed through ``str()``, so a NaN cell contributes the text ``'nan'``.
        The result is cut to 512 characters (not tokens).
        """
        technical_features = [
            str(row.get('url', '')),
            str(row.get('method', '')),
            str(row.get('headers_Host', '')),
            str(row.get('requestHeaders_Content_Type', '')),
            str(row.get('responseHeaders_Content_Type', ''))
        ]
        return " ".join(technical_features)[:512]

    @staticmethod
    def _must_drop_last(num_samples, batch_size):
        """Return True when the final training batch would hold one sample.

        The classifier heads contain ``BatchNorm1d``, which raises in training
        mode when it receives a batch with a single row.
        """
        return num_samples % batch_size == 1

    def _evaluate(self, model, loader):
        """Return ``(service_accuracy, activity_accuracy)`` of ``model`` on ``loader``."""
        model.eval()
        service_hits, activity_hits = [], []

        with torch.no_grad():
            for batch in loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)

                service_pred, activity_pred = model(input_ids, attention_mask)

                # Labels stay on the CPU; predictions are moved there to compare.
                service_hits.extend(
                    torch.argmax(service_pred, dim=1).cpu().numpy()
                    == batch['service_label'].numpy()
                )
                activity_hits.extend(
                    torch.argmax(activity_pred, dim=1).cpu().numpy()
                    == batch['activity_label'].numpy()
                )

        return np.mean(np.array(service_hits)), np.mean(np.array(activity_hits))

    def train(
        self,
        test_size=0.2,
        batch_size=32,
        epochs=10,
        learning_rate=2e-5
    ):
        """Train the model and checkpoint the best epoch.

        After every epoch the summed validation accuracy of both heads drives
        the learning-rate scheduler and decides whether the weights are saved
        to ``best_codebert_model.pth``.

        Args:
            test_size: Fraction of rows held out for validation.
            batch_size: Batch size of both loaders; must be at least 2.
            epochs: Number of passes over the training split.
            learning_rate: Initial AdamW learning rate.

        Returns:
            The model with the weights of the *last* epoch; the best weights
            are in the checkpoint file.

        Raises:
            ValueError: If ``batch_size`` is smaller than 2.
        """
        if batch_size < 2:
            raise ValueError(
                "batch_size must be at least 2 because the classifier heads "
                "use BatchNorm1d"
            )

        # Split data
        (train_texts, val_texts,
         train_service_labels, val_service_labels,
         train_activity_labels, val_activity_labels) = train_test_split(
            self.texts,
            self.encoded_services,
            self.encoded_activities,
            test_size=test_size,
            random_state=42
        )

        # Create datasets
        train_dataset = CodeBertDataset(
            train_texts,
            train_service_labels,
            train_activity_labels,
            self.tokenizer
        )
        val_dataset = CodeBertDataset(
            val_texts,
            val_service_labels,
            val_activity_labels,
            self.tokenizer
        )

        # Pinned host memory only helps (and only avoids a warning) on CUDA.
        use_pinned_memory = self.device.type == 'cuda'

        train_loader = DataLoader(
            train_dataset,
            batch_size=batch_size,
            shuffle=True,
            pin_memory=use_pinned_memory,
            num_workers=2,
            # Skip a trailing single-sample batch, which BatchNorm1d rejects.
            drop_last=self._must_drop_last(len(train_dataset), batch_size)
        )
        val_loader = DataLoader(
            val_dataset,
            batch_size=batch_size,
            pin_memory=use_pinned_memory,
            num_workers=2
        )

        # Model initialization; the encoder uses the tokenizer's checkpoint.
        service_num_labels = len(self.service_encoder.classes_)
        activity_num_labels = len(self.activity_encoder.classes_)

        model = CodeBertTransformer(
            service_num_labels,
            activity_num_labels,
            model_name=self.model_name
        ).to(self.device)

        # Loss and optimizer
        service_criterion = nn.CrossEntropyLoss()
        activity_criterion = nn.CrossEntropyLoss()
        optimizer = optim.AdamW(
            model.parameters(),
            lr=learning_rate,
            weight_decay=0.01
        )

        # Halve the learning rate when the summed accuracy stops improving.
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='max',
            factor=0.5,
            patience=2
        )

        best_val_accuracy = 0
        for epoch in range(epochs):
            model.train()

            for batch in train_loader:
                optimizer.zero_grad()

                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                service_labels = batch['service_label'].to(self.device)
                activity_labels = batch['activity_label'].to(self.device)

                service_pred, activity_pred = model(
                    input_ids, attention_mask
                )

                # Both tasks are weighted equally.
                total_loss = (
                    service_criterion(service_pred, service_labels)
                    + activity_criterion(activity_pred, activity_labels)
                )
                total_loss.backward()
                optimizer.step()

            # Validation phase
            service_accuracy, activity_accuracy = self._evaluate(
                model, val_loader
            )

            print(f"Epoch {epoch+1}: "
                  f"Service Accuracy: {service_accuracy:.4f}, "
                  f"Activity Accuracy: {activity_accuracy:.4f}")

            current_accuracy = service_accuracy + activity_accuracy

            # Update learning rate
            scheduler.step(current_accuracy)

            # Save best model
            if current_accuracy > best_val_accuracy:
                best_val_accuracy = current_accuracy
                torch.save(model.state_dict(), 'best_codebert_model.pth')
                print("Saved best model")

        return model

def main():
    """Train the classifier on the Kaggle training CSV.

    Any exception is reported on stdout together with its traceback instead
    of being propagated.
    """
    import traceback

    training_data_path = '/kaggle/input/network-dataset/shuffled_train.csv'

    try:
        # Build the classifier and run training; the returned model is not
        # needed here because the best checkpoint is written during training.
        CodeBertClassifier(training_data_path).train()
        print("Training completed successfully!")
    except Exception as exc:
        print(f"An error occurred: {exc}")
        traceback.print_exc()


if __name__ == "__main__":
    main()

Using device: cuda


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Epoch 1: Service Accuracy: 0.9729, Activity Accuracy: 0.9910
Saved best model
Epoch 2: Service Accuracy: 0.9785, Activity Accuracy: 0.9958
Saved best model
Epoch 3: Service Accuracy: 0.9771, Activity Accuracy: 0.9986
Saved best model
Epoch 4: Service Accuracy: 0.9833, Activity Accuracy: 0.9986
Saved best model
Epoch 5: Service Accuracy: 0.9861, Activity Accuracy: 0.9986
Saved best model
Epoch 6: Service Accuracy: 0.9833, Activity Accuracy: 0.9979


KeyboardInterrupt: 

In [11]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import LabelEncoder

class CodeBertTransformer(nn.Module):
    """Pretrained encoder feeding a service head and an activity head.

    Both heads read the dropout-regularised hidden state at sequence
    position 0 (the CLS token) and return raw logits.
    """

    def __init__(
        self,
        service_num_labels,
        activity_num_labels,
        model_name="microsoft/codebert-base"
    ):
        """
        Args:
            service_num_labels: Number of classes produced by the service head.
            activity_num_labels: Number of classes produced by the activity head.
            model_name: Checkpoint name handed to ``AutoModel.from_pretrained``.
        """
        super().__init__()

        def make_head(num_labels):
            # Linear -> BatchNorm -> ReLU -> Dropout -> Linear; the layer order
            # fixes the sub-module indices used in saved state dicts.
            return nn.Sequential(
                nn.Linear(self.transformer.config.hidden_size, 512),
                nn.BatchNorm1d(512),
                nn.ReLU(),
                nn.Dropout(0.2),
                nn.Linear(512, num_labels)
            )

        # Encoder backbone
        self.transformer = AutoModel.from_pretrained(model_name)

        # Regularisation applied to the CLS vector
        self.dropout = nn.Dropout(0.3)

        # Heads, service first
        self.service_classifier = make_head(service_num_labels)
        self.activity_classifier = make_head(activity_num_labels)

    def forward(self, input_ids, attention_mask):
        """Return ``(service_logits, activity_logits)`` for a tokenized batch."""
        encoded = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # CLS token representation
        cls_vector = self.dropout(encoded.last_hidden_state[:, 0, :])

        service_logits = self.service_classifier(cls_vector)
        activity_logits = self.activity_classifier(cls_vector)
        return service_logits, activity_logits

class CodeBertPredictor:
    """Apply a trained two-headed CodeBERT checkpoint to new request rows.

    The label encoders are rebuilt by re-fitting on the training CSV, so that
    file must be the one the checkpoint was trained on.
    """

    def __init__(
        self,
        model_path,
        training_data_path,
        model_name="microsoft/codebert-base"
    ):
        """Select a device, rebuild the label encoders and load the weights.

        Args:
            model_path: Path of the state dict written with ``torch.save``.
            training_data_path: CSV used to re-fit the label encoders.
            model_name: Checkpoint name used for the tokenizer and the encoder.
        """
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu'
        )
        print(f"Using device: {self.device}")

        # Remembered so that _load_model builds the matching encoder.
        self.model_name = model_name

        # Training data is needed only for the label vocabulary.
        self.training_df = pd.read_csv(
            training_data_path,
            low_memory=False
        )

        self._prepare_data(model_name)
        self._load_model(model_path)

    def _prepare_data(self, model_name):
        """Fill missing labels, load the tokenizer and fit both label encoders."""
        self.training_df['service'] = self.training_df['service'].fillna('Unknown')
        self.training_df['activityType'] = self.training_df['activityType'].fillna('Unknown')

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        self.service_encoder = LabelEncoder()
        self.activity_encoder = LabelEncoder()

        self.service_encoder.fit(self.training_df['service'])
        self.activity_encoder.fit(self.training_df['activityType'])

    def _load_model(self, model_path):
        """Build the network, load the saved weights and switch to eval mode."""
        service_num_labels = len(self.service_encoder.classes_)
        activity_num_labels = len(self.activity_encoder.classes_)

        self.model = CodeBertTransformer(
            service_num_labels,
            activity_num_labels,
            model_name=getattr(self, 'model_name', "microsoft/codebert-base")
        ).to(self.device)

        # weights_only=True restricts unpickling to tensors and plain
        # containers, which is all a state dict holds, and avoids the
        # FutureWarning that torch.load otherwise emits.
        state_dict = torch.load(
            model_path,
            map_location=self.device,
            weights_only=True
        )
        self.model.load_state_dict(state_dict)
        self.model.eval()  # Set to evaluation mode

    def _prepare_text_features(self, row):
        """Join the request columns of ``row`` into one string (max 512 chars).

        Columns missing from ``row`` contribute an empty string. Values are
        passed through ``str()``, so a NaN cell contributes the text ``'nan'``.
        """
        technical_features = [
            str(row.get('url', '')),
            str(row.get('method', '')),
            str(row.get('headers_Host', '')),
            str(row.get('requestHeaders_Content_Type', '')),
            str(row.get('responseHeaders_Content_Type', ''))
        ]
        return " ".join(technical_features)[:512]

    def predict(self, test_df, confidence_threshold=0.5, batch_size=64):
        """Predict service and activity for every row of ``test_df``.

        Rows are tokenized and scored in chunks of ``batch_size`` so that
        memory use does not grow with the size of the test set. Padding is
        therefore per chunk rather than over the whole frame.

        Args:
            test_df: DataFrame with the request columns; four prediction
                columns are added to it **in place**.
            confidence_threshold: If either head's top probability is below
                this value, both labels of the row become ``'Unknown'``.
            batch_size: Number of rows per forward pass.

        Returns:
            ``test_df`` itself, with ``predicted_service``,
            ``service_confidence``, ``predicted_activity`` and
            ``activity_confidence`` columns.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        # One input string per row; an empty frame yields an empty list.
        test_texts = [
            self._prepare_text_features(row) for _, row in test_df.iterrows()
        ]

        service_ids, activity_ids = [], []
        service_conf, activity_conf = [], []

        with torch.no_grad():
            for start in range(0, len(test_texts), batch_size):
                encodings = self.tokenizer(
                    test_texts[start:start + batch_size],
                    truncation=True,
                    padding=True,
                    max_length=128,
                    return_tensors='pt'
                )
                input_ids = encodings['input_ids'].to(self.device)
                attention_mask = encodings['attention_mask'].to(self.device)

                service_pred, activity_pred = self.model(
                    input_ids, attention_mask
                )

                # Top class and its softmax probability for each head
                service_max_probs, service_preds = torch.max(
                    F.softmax(service_pred, dim=1), dim=1
                )
                activity_max_probs, activity_preds = torch.max(
                    F.softmax(activity_pred, dim=1), dim=1
                )

                service_ids.extend(service_preds.cpu().tolist())
                activity_ids.extend(activity_preds.cpu().tolist())
                service_conf.extend(service_max_probs.cpu().tolist())
                activity_conf.extend(activity_max_probs.cpu().tolist())

        # Decode all class ids in one call per encoder.
        if test_texts:
            services = self.service_encoder.inverse_transform(
                np.asarray(service_ids)
            )
            activities = self.activity_encoder.inverse_transform(
                np.asarray(activity_ids)
            )
        else:
            services, activities = [], []

        predicted_service, predicted_activity = [], []
        for service, activity, s_conf, a_conf in zip(
            services, activities, service_conf, activity_conf
        ):
            # A weak prediction on either head invalidates the whole row.
            if s_conf < confidence_threshold or a_conf < confidence_threshold:
                service = 'Unknown'
                activity = 'Unknown'
            predicted_service.append(service)
            predicted_activity.append(activity)

        # Add predictions to original DataFrame
        test_df['predicted_service'] = predicted_service
        test_df['service_confidence'] = service_conf
        test_df['predicted_activity'] = predicted_activity
        test_df['activity_confidence'] = activity_conf

        return test_df

    def save_predictions(self, predictions_df, output_path='predictions.csv'):
        """Save predictions to a CSV file"""
        predictions_df.to_csv(output_path, index=False)
        print(f"Predictions saved to {output_path}")

def main():
    """Score the Kaggle test CSV with the saved checkpoint and write a CSV.

    Any exception is reported on stdout together with its traceback instead
    of being propagated.
    """
    import traceback

    # Input and output locations
    training_data_path = '/kaggle/input/network-dataset/shuffled_train.csv'
    test_data_path = '/kaggle/input/network-dataset/shuffled_test.csv'
    model_path = 'best_codebert_model.pth'
    output_path = 'test_data_with_predictions_code_bert_1.csv'

    # Columns shown in the console preview
    summary_columns = [
        'predicted_service',
        'service_confidence',
        'predicted_activity',
        'activity_confidence',
    ]

    try:
        # The test data is read before the predictor is built.
        frame = pd.read_csv(test_data_path, low_memory=False)

        predictor = CodeBertPredictor(
            model_path=model_path,
            training_data_path=training_data_path
        )

        # Rows below the threshold are labelled 'Unknown'; adjust as needed.
        scored = predictor.predict(frame, confidence_threshold=0.5)

        print("\nPrediction Summary:")
        print(scored[summary_columns].head())

        predictor.save_predictions(scored, output_path)
    except Exception as exc:
        print(f"An error occurred: {exc}")
        traceback.print_exc()


if __name__ == "__main__":
    main()

Using device: cuda


  self.model.load_state_dict(torch.load(model_path, map_location=self.device))



Prediction Summary:
  predicted_service  service_confidence predicted_activity  \
0          OneDrive            0.985593              Login   
1           4shared            0.992092             Upload   
2         MediaFire            0.994190             Upload   
3           Dropbox            0.994486             Upload   
4           4shared            0.989075           Download   

   activity_confidence  
0             0.997256  
1             0.996235  
2             0.996443  
3             0.995549  
4             0.995101  
Predictions saved to test_data_with_predictions_code_bert_1.csv


In [12]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
import re

class CodeBertTransformer(nn.Module):
    """Pretrained encoder with two classification heads that also exposes
    the sentence embedding.

    The hidden state at sequence position 0 (the CLS token) is passed through
    dropout, fed to both heads, and additionally returned to the caller.
    """

    def __init__(
        self,
        service_num_labels,
        activity_num_labels,
        model_name="microsoft/codebert-base"
    ):
        """
        Args:
            service_num_labels: Number of classes produced by the service head.
            activity_num_labels: Number of classes produced by the activity head.
            model_name: Checkpoint name handed to ``AutoModel.from_pretrained``.
        """
        super().__init__()

        # Encoder backbone
        self.transformer = AutoModel.from_pretrained(model_name)

        # Regularisation applied to the CLS vector
        self.dropout = nn.Dropout(0.3)

        # Heads are created service-first so that sub-module names and the
        # order of random initialisation stay the same.
        self.service_classifier = self._build_head(
            self.transformer.config.hidden_size, service_num_labels
        )
        self.activity_classifier = self._build_head(
            self.transformer.config.hidden_size, activity_num_labels
        )

    @staticmethod
    def _build_head(in_features, num_labels):
        """Return the Linear -> BatchNorm -> ReLU -> Dropout -> Linear stack."""
        return nn.Sequential(
            nn.Linear(in_features, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, num_labels)
        )

    def forward(self, input_ids, attention_mask):
        """Return ``(service_logits, activity_logits, cls_embedding)``."""
        encoded = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # CLS token representation; dropout is the identity in eval mode
        cls_vector = self.dropout(encoded.last_hidden_state[:, 0, :])

        service_logits = self.service_classifier(cls_vector)
        activity_logits = self.activity_classifier(cls_vector)

        # The embedding is returned as well so callers can do similarity search.
        return service_logits, activity_logits, cls_vector

class CodeBertPredictor:
    """Apply a trained CodeBERT checkpoint and remap unfamiliar activities.

    Activity labels predicted by the model that are not in
    ``predefined_activities`` are replaced by the predefined label whose
    embedding is closest to the row's embedding, with a rule-based fallback.
    The label encoders are rebuilt by re-fitting on the training CSV.
    """

    def __init__(
        self,
        model_path,
        training_data_path,
        model_name="microsoft/codebert-base"
    ):
        """Select a device, rebuild the label encoders and load the weights.

        Args:
            model_path: Path of the state dict written with ``torch.save``.
            training_data_path: CSV used to re-fit the label encoders.
            model_name: Checkpoint name used for the tokenizer and the encoder.
        """
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu'
        )
        print(f"Using device: {self.device}")

        # Remembered so that _load_model builds the matching encoder.
        self.model_name = model_name

        # Activity vocabulary accepted in the output
        self.predefined_activities = [
            # Primary action types
            "Login", "Logout", "Access", "Create", "Update", "Delete",
            "View", "Edit", "Share", "Download", "Upload",

            # Secondary action types
            "Sync", "Connect", "Disconnect", "Authorize",
            "Request", "Attempt", "Validate",

            # Anomaly and special states
            "Timeout", "Anomaly", "Error", "Suspend", "Resume"
        ]

        # Replacement candidates for unknown/vague labels; only the first
        # entry of the list is used by _advanced_activity_mapping.
        self.activity_mapping = {
            "unknown": ["Attempt", "Access", "Request", "Unknown"]
        }

        # Training data is needed only for the label vocabulary.
        self.training_df = pd.read_csv(
            training_data_path,
            low_memory=False
        )

        self._prepare_data(model_name)
        self._load_model(model_path)

    def _prepare_data(self, model_name):
        """Fill missing labels, load the tokenizer and fit both label encoders."""
        self.training_df['service'] = self.training_df['service'].fillna('Unknown')
        self.training_df['activityType'] = self.training_df['activityType'].fillna('Unknown')

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        self.service_encoder = LabelEncoder()
        self.activity_encoder = LabelEncoder()

        self.service_encoder.fit(self.training_df['service'])
        self.activity_encoder.fit(self.training_df['activityType'])

    def _load_model(self, model_path):
        """Build the network, load the saved weights and switch to eval mode."""
        service_num_labels = len(self.service_encoder.classes_)
        activity_num_labels = len(self.activity_encoder.classes_)

        self.model = CodeBertTransformer(
            service_num_labels,
            activity_num_labels,
            model_name=getattr(self, 'model_name', "microsoft/codebert-base")
        ).to(self.device)

        # weights_only=True restricts unpickling to tensors and plain
        # containers, which is all a state dict holds, and avoids the
        # FutureWarning that torch.load otherwise emits.
        state_dict = torch.load(
            model_path,
            map_location=self.device,
            weights_only=True
        )
        self.model.load_state_dict(state_dict)
        self.model.eval()  # Set to evaluation mode

    def _prepare_text_features(self, row):
        """Join the request columns of ``row`` into one string (max 512 chars).

        Columns missing from ``row`` contribute an empty string. Values are
        passed through ``str()``, so a NaN cell contributes the text ``'nan'``.
        """
        technical_features = [
            str(row.get('url', '')),
            str(row.get('method', '')),
            str(row.get('headers_Host', '')),
            str(row.get('requestHeaders_Content_Type', '')),
            str(row.get('responseHeaders_Content_Type', ''))
        ]
        return " ".join(technical_features)[:512]

    def _advanced_activity_mapping(self, original_activity):
        """
        Rule-based mapping of an activity label onto the predefined set

        Args:
            original_activity (str): Original activity type

        Returns:
            tuple: (mapped_activity, confidence_score)
        """
        norm_activity = str(original_activity).lower().strip()

        # Already a predefined label (case-insensitive): keep it as given.
        if norm_activity in {a.lower() for a in self.predefined_activities}:
            return original_activity, 1.0

        # Unknown/empty labels take the first configured replacement.
        if norm_activity in ('unknown', 'unspecified', ''):
            candidates = self.activity_mapping.get('unknown', ['Access'])
            if candidates:
                return candidates[0], 0.7

        # Pattern rules. The first rule is only reached when the configured
        # replacement list above is empty.
        pattern_mapping = [
            (r'^unknown$|^unspecified$|^\s*$', 'Access'),
            (r'generic|undefined|null', 'Request')
        ]
        for pattern, activity in pattern_mapping:
            if re.search(pattern, norm_activity, re.IGNORECASE):
                return activity, 0.5

        # Fallback to most generic activity
        return "Access", 0.3

    def _activity_label_embeddings(self):
        """Return unit-length embeddings of the predefined activity labels.

        The labels are embedded once and cached on the instance; the cache is
        rebuilt if ``predefined_activities`` changes. Assumes the model is in
        eval mode (as set by ``_load_model``) so the embeddings are
        deterministic.
        """
        labels = tuple(self.predefined_activities)
        cached = getattr(self, '_activity_embedding_cache', None)
        if cached is not None and cached[0] == labels:
            return cached[1]

        activity_encodings = self.tokenizer(
            list(labels),
            truncation=True,
            padding=True,
            max_length=128,
            return_tensors='pt'
        )
        with torch.no_grad():
            activity_input_ids = activity_encodings['input_ids'].to(self.device)
            activity_attention_mask = activity_encodings['attention_mask'].to(self.device)
            _, _, activity_embeddings = self.model(
                activity_input_ids, activity_attention_mask
            )

        # Normalising up front turns cosine similarity into a dot product.
        label_matrix = F.normalize(activity_embeddings.float(), dim=1)
        self._activity_embedding_cache = (labels, label_matrix)
        return label_matrix

    def _find_closest_activity(self, embedding):
        """
        Find the closest activity label using cosine similarity

        Args:
            embedding (torch.Tensor): Embedding of a single input row

        Returns:
            tuple: (closest activity, similarity score as float)
        """
        label_matrix = self._activity_label_embeddings()

        query = embedding.detach().reshape(1, -1).float().to(label_matrix.device)
        query = F.normalize(query, dim=1)

        # Cosine similarity against every predefined label
        similarities = (query @ label_matrix.T).squeeze(0).cpu().numpy()

        closest_idx = int(np.argmax(similarities))
        return self.predefined_activities[closest_idx], float(similarities[closest_idx])

    def predict(self, test_df, confidence_threshold=0.5, semantic_threshold=0.3,
                batch_size=64):
        """Predict service and activity for every row of ``test_df``.

        Rows are tokenized and scored in chunks of ``batch_size`` so that
        memory use does not grow with the size of the test set. Padding is
        therefore per chunk rather than over the whole frame.

        Args:
            test_df: DataFrame with the request columns; four prediction
                columns are added to it **in place**.
            confidence_threshold: Activities whose final confidence is below
                this value are reported as ``"Unknown"`` with confidence 0.1.
            semantic_threshold: Minimum cosine similarity for accepting the
                embedding-based replacement of a non-predefined activity.
            batch_size: Number of rows per forward pass.

        Returns:
            ``test_df`` itself, with ``predicted_service``,
            ``service_confidence``, ``predicted_activity`` and
            ``activity_confidence`` columns.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        # One input string per row; an empty frame yields an empty list.
        test_texts = [
            self._prepare_text_features(row) for _, row in test_df.iterrows()
        ]

        # Built once instead of once per row.
        known_activities = {a.lower() for a in self.predefined_activities}

        predictions = {
            'predicted_service': [],
            'service_confidence': [],
            'predicted_activity': [],
            'activity_confidence': []
        }

        with torch.no_grad():
            for start in range(0, len(test_texts), batch_size):
                encodings = self.tokenizer(
                    test_texts[start:start + batch_size],
                    truncation=True,
                    padding=True,
                    max_length=128,
                    return_tensors='pt'
                )
                input_ids = encodings['input_ids'].to(self.device)
                attention_mask = encodings['attention_mask'].to(self.device)

                service_pred, activity_pred, embeddings = self.model(
                    input_ids, attention_mask
                )

                # Top class and its softmax probability for each head
                service_max_probs, service_preds = torch.max(
                    F.softmax(service_pred, dim=1), dim=1
                )
                activity_max_probs, activity_preds = torch.max(
                    F.softmax(activity_pred, dim=1), dim=1
                )

                # Decode the whole chunk with one call per encoder.
                services = self.service_encoder.inverse_transform(
                    service_preds.cpu().numpy()
                )
                activities = self.activity_encoder.inverse_transform(
                    activity_preds.cpu().numpy()
                )
                service_conf = service_max_probs.cpu().tolist()
                activity_conf = activity_max_probs.cpu().tolist()

                for i, original_activity in enumerate(activities):
                    norm_original_activity = str(original_activity).lower().strip()

                    if norm_original_activity in known_activities:
                        # Known activity: keep the model's prediction.
                        mapped_activity = original_activity
                        confidence = activity_conf[i]
                    else:
                        # Unknown or non-predefined: try the nearest label.
                        semantic_activity, semantic_score = self._find_closest_activity(
                            embeddings[i]
                        )
                        if semantic_score >= semantic_threshold:
                            mapped_activity = semantic_activity
                            confidence = semantic_score
                        else:
                            mapped_activity, confidence = self._advanced_activity_mapping(
                                original_activity
                            )

                    # Too uncertain: report as Unknown with a token confidence.
                    if confidence < confidence_threshold:
                        mapped_activity = "Unknown"
                        confidence = 0.1

                    predictions['predicted_service'].append(services[i])
                    predictions['service_confidence'].append(service_conf[i])
                    predictions['predicted_activity'].append(mapped_activity)
                    predictions['activity_confidence'].append(float(confidence))

        # Add predictions to original DataFrame
        test_df['predicted_service'] = predictions['predicted_service']
        test_df['service_confidence'] = predictions['service_confidence']
        test_df['predicted_activity'] = predictions['predicted_activity']
        test_df['activity_confidence'] = predictions['activity_confidence']

        return test_df

    def save_predictions(self, predictions_df, output_path='predictions.csv'):
        """Save predictions to a CSV file"""
        predictions_df.to_csv(output_path, index=False)
        print(f"Predictions saved to {output_path}")

def main():
    """Score the test CSV with the trained CodeBERT predictor and save the result.

    Any exception raised along the way is printed together with its traceback
    instead of propagating, so the function always returns normally.
    """
    # Input / output locations
    paths = {
        'train': '/kaggle/input/network-dataset/shuffled_train.csv',
        'test': '/kaggle/input/network-dataset/shuffled_test.csv',
        'weights': 'best_codebert_model.pth',
        'output': 'test_data_with_predictions_code_bert_2.csv',
    }
    # Columns shown in the console preview
    summary_columns = [
        'predicted_service',
        'service_confidence',
        'predicted_activity',
        'activity_confidence',
    ]

    try:
        frame = pd.read_csv(paths['test'], low_memory=False)

        predictor = CodeBertPredictor(
            model_path=paths['weights'],
            training_data_path=paths['train'],
        )

        # 0.5 is the model-confidence cut-off, 0.3 the semantic-matching cut-off
        scored = predictor.predict(
            frame,
            confidence_threshold=0.5,
            semantic_threshold=0.3,
        )

        # Preview the first rows, then persist everything
        print("\nPrediction Summary:")
        print(scored[summary_columns].head())

        predictor.save_predictions(scored, paths['output'])

    except Exception as e:
        import traceback
        print(f"An error occurred: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    main()

Using device: cuda


  self.model.load_state_dict(torch.load(model_path, map_location=self.device))



Prediction Summary:
  predicted_service  service_confidence predicted_activity  \
0          OneDrive            0.985593              Login   
1           4shared            0.992092             Upload   
2         MediaFire            0.994190             Upload   
3           Dropbox            0.994486             Upload   
4           4shared            0.989075           Download   

   activity_confidence  
0             0.997256  
1             0.996235  
2             0.996443  
3             0.995549  
4             0.995101  
Predictions saved to test_data_with_predictions_code_bert_2.csv


In [14]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

class ZeroShotActivityPredictor:
    """Nearest-label activity matcher based on first-token encoder embeddings.

    Each candidate label is embedded once when the object is created.
    ``predict`` embeds the query text the same way and returns the label whose
    embedding has the highest cosine similarity to it.
    """

    def __init__(self, activity_labels, model_name="microsoft/codebert-base"):
        # Encoder and tokenizer shared by label and query embedding
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

        # Candidate labels and their cached embeddings
        self.activity_labels = activity_labels
        self.activity_embeddings = self._get_activity_embeddings()

    def _embed(self, text):
        """Return the first-token ([CLS]) hidden state for ``text`` as a NumPy array."""
        encoded = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
        with torch.no_grad():
            hidden_states = self.model(**encoded).last_hidden_state
        return hidden_states[:, 0, :].cpu().numpy()

    def _get_activity_embeddings(self):
        """Build the ``{label: embedding}`` lookup used for similarity comparison."""
        return {label: self._embed(label) for label in self.activity_labels}

    def predict(self, activity_text):
        """Return ``(best_label, similarity)`` for ``activity_text``.

        ``similarity`` is the raw value returned by ``cosine_similarity`` for
        the winning label (an array, not a Python float), so callers that need
        a scalar must extract it themselves.
        """
        query = self._embed(activity_text)

        # Score the query against every cached label embedding
        scores = {
            label: cosine_similarity(query, reference)
            for label, reference in self.activity_embeddings.items()
        }

        winner = max(scores, key=scores.get)
        return winner, scores[winner]

class CodeBertTransformer(nn.Module):
    """Pretrained encoder feeding two classification heads (service and activity)."""

    def __init__(self, service_num_labels, activity_num_labels, model_name="microsoft/codebert-base"):
        super().__init__()

        # Pretrained encoder backbone
        self.transformer = AutoModel.from_pretrained(model_name)
        hidden_size = self.transformer.config.hidden_size

        # Regularisation applied to the first-token vector before both heads
        self.dropout = nn.Dropout(0.3)

        # Heads share one layout; the service head is created first, as before
        self.service_classifier = self._build_head(hidden_size, service_num_labels)
        self.activity_classifier = self._build_head(hidden_size, activity_num_labels)

    @staticmethod
    def _build_head(in_features, num_labels):
        """Return a Linear -> BatchNorm -> ReLU -> Dropout -> Linear head."""
        return nn.Sequential(
            nn.Linear(in_features, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, num_labels),
        )

    def forward(self, input_ids, attention_mask):
        """Return ``(service_logits, activity_logits, embedding)``.

        ``embedding`` is the first-token hidden state after dropout, i.e. the
        exact tensor both heads receive.
        """
        encoded = self.transformer(input_ids=input_ids, attention_mask=attention_mask)

        # First token of every sequence serves as the pooled representation
        cls_vector = self.dropout(encoded.last_hidden_state[:, 0, :])

        service_logits = self.service_classifier(cls_vector)
        activity_logits = self.activity_classifier(cls_vector)
        return service_logits, activity_logits, cls_vector

class CodeBertPredictor:
    """Inference wrapper around a trained ``CodeBertTransformer`` checkpoint.

    The label encoders are rebuilt by fitting them on the training CSV, so the
    class order (and therefore the number of output units) is derived from that
    file rather than stored with the checkpoint.
    """

    def __init__(self, model_path, training_data_path, model_name="microsoft/codebert-base"):
        """Load the training labels, the tokenizer and the trained weights.

        Args:
            model_path: Path to the saved ``state_dict`` of the model.
            training_data_path: CSV with ``service`` and ``activityType`` columns.
            model_name: Hugging Face identifier used for the tokenizer.
        """
        # Device configuration
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu'
        )
        print(f"Using device: {self.device}")

        # Optional zero-shot fallback used by predict(); stays None unless a
        # subclass or caller attaches an object with predict(text) -> (label, score).
        self.zsl_model = None

        # Predefined activities
        self.predefined_activities = [
            "Login", "Logout", "Access", "Create", "Update", "Delete",
            "View", "Edit", "Share", "Download", "Upload",
            "Request", "Timeout", "Error"
        ]

        # Load training data for label encoding
        self.training_df = pd.read_csv(
            training_data_path,
            low_memory=False
        )

        # Prepare data and tokenizer
        self._prepare_data(model_name)

        # Load trained model
        self._load_model(model_path)

    def _prepare_data(self, model_name):
        """Fill missing labels, load the tokenizer and fit both label encoders."""
        # Missing labels become the explicit class 'Unknown'
        self.training_df['service'] = self.training_df['service'].fillna('Unknown')
        self.training_df['activityType'] = self.training_df['activityType'].fillna('Unknown')

        # Tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Label encoding
        self.service_encoder = LabelEncoder()
        self.activity_encoder = LabelEncoder()

        # Fit encoders on training data
        self.service_encoder.fit(self.training_df['service'])
        self.activity_encoder.fit(self.training_df['activityType'])

    def _load_model(self, model_path):
        """Instantiate the network, load the weights and switch to eval mode."""
        # Head sizes follow the number of classes seen in the training data
        service_num_labels = len(self.service_encoder.classes_)
        activity_num_labels = len(self.activity_encoder.classes_)

        # Create model
        self.model = CodeBertTransformer(
            service_num_labels,
            activity_num_labels
        ).to(self.device)

        # weights_only=True restricts unpickling to tensors/plain containers,
        # which is all a state_dict needs, and avoids torch.load's FutureWarning.
        state_dict = torch.load(model_path, map_location=self.device, weights_only=True)
        self.model.load_state_dict(state_dict)
        self.model.eval()  # Set to evaluation mode

    def _prepare_text_features(self, row):
        """Join the request fields of ``row`` into one string of at most 512 characters.

        Missing keys contribute an empty string; every value is passed through ``str``.
        """
        technical_features = [
            str(row.get('url', '')),
            str(row.get('method', '')),
            str(row.get('headers_Host', '')),
            str(row.get('requestHeaders_Content_Type', '')),
            str(row.get('responseHeaders_Content_Type', ''))
        ]

        # Join features, limit length
        return " ".join(technical_features)[:512]

    def predict(self, test_df, batch_size=32):
        """Predict service and activity for every row of ``test_df``.

        Rows are tokenized and scored in chunks of ``batch_size`` so memory use
        does not grow with the size of the test set. When the decoded activity
        is ``'unknown'`` (case-insensitive) and ``self.zsl_model`` is set, the
        zero-shot model's label and score replace the classifier's; without a
        zero-shot model the classifier's own label and probability are kept.

        Args:
            test_df: DataFrame of requests; it is modified in place.
            batch_size: Number of rows per forward pass (must be >= 1).

        Returns:
            ``test_df`` itself, with ``predicted_service``, ``service_confidence``,
            ``predicted_activity`` and ``activity_confidence`` columns added.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        # A plain list keeps text lookup positional, independent of test_df's
        # index, and also copes with an empty DataFrame.
        texts = [self._prepare_text_features(row) for _, row in test_df.iterrows()]
        zsl_model = getattr(self, 'zsl_model', None)

        # Prediction results
        predictions = {
            'predicted_service': [],
            'service_confidence': [],
            'predicted_activity': [],
            'activity_confidence': []
        }

        # Disable gradient computation
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                batch_texts = texts[start:start + batch_size]

                # Tokenize only the current chunk
                encodings = self.tokenizer(
                    batch_texts,
                    truncation=True,
                    padding=True,
                    max_length=128,
                    return_tensors='pt'
                )
                input_ids = encodings['input_ids'].to(self.device)
                attention_mask = encodings['attention_mask'].to(self.device)

                # Forward pass (the returned embedding is not needed here)
                service_pred, activity_pred, _ = self.model(input_ids, attention_mask)

                # Softmax probabilities and their arg-max per row
                service_max_probs, service_preds = torch.max(F.softmax(service_pred, dim=1), dim=1)
                activity_max_probs, activity_preds = torch.max(F.softmax(activity_pred, dim=1), dim=1)

                # Decode the whole chunk at once
                services = self.service_encoder.inverse_transform(service_preds.cpu().numpy())
                activities = self.activity_encoder.inverse_transform(activity_preds.cpu().numpy())
                service_max_probs = service_max_probs.cpu().numpy()
                activity_max_probs = activity_max_probs.cpu().numpy()

                rows = zip(batch_texts, services, service_max_probs, activities, activity_max_probs)
                for text, service, service_prob, activity, activity_prob in rows:
                    mapped_activity = activity
                    activity_confidence = float(activity_prob)

                    # Route 'unknown' activities through the zero-shot model if present
                    if zsl_model is not None and str(activity).lower() == 'unknown':
                        mapped_activity, activity_confidence = zsl_model.predict(text)
                        # The zero-shot score may arrive as a one-element array/tensor
                        if isinstance(activity_confidence, (torch.Tensor, np.ndarray)):
                            activity_confidence = activity_confidence.item()

                    predictions['predicted_service'].append(service)
                    predictions['service_confidence'].append(float(service_prob))
                    predictions['predicted_activity'].append(mapped_activity)
                    predictions['activity_confidence'].append(float(activity_confidence))

        # Add predictions to original DataFrame
        for column, values in predictions.items():
            test_df[column] = values

        return test_df

    def save_predictions(self, predictions_df, output_path='predictions.csv'):
        """Save predictions to a CSV file"""
        predictions_df.to_csv(output_path, index=False)
        print(f"Predictions saved to {output_path}")

class CodeBertPredictorWithZSL(CodeBertPredictor):
    """``CodeBertPredictor`` that carries a zero-shot model in ``self.zsl_model``."""

    def __init__(self, model_path, training_data_path, zsl_model, model_name="microsoft/codebert-base"):
        # Regular predictor setup first, then attach the zero-shot learner
        super().__init__(
            model_path=model_path,
            training_data_path=training_data_path,
            model_name=model_name,
        )
        self.zsl_model = zsl_model

# Main Execution
def main():
    """Score the test CSV with the ZSL-enabled predictor and save the result.

    Any exception raised along the way is printed together with its traceback
    instead of propagating, so the function always returns normally.
    """
    # Input / output locations
    paths = {
        'train': '/kaggle/input/network-dataset/shuffled_train.csv',
        'test': '/kaggle/input/network-dataset/shuffled_test.csv',
        'weights': 'best_codebert_model.pth',
        'output': 'test_data_with_predictions_code_bert_zsl.csv',
    }
    # Candidate labels handed to the zero-shot model
    predefined_activities = [
        "Login", "Logout", "Access", "Create", "Update", "Delete",
        "View", "Edit", "Share", "Download", "Upload",
        "Request", "Timeout", "Error"
    ]
    # Columns shown in the console preview
    summary_columns = [
        'predicted_service',
        'service_confidence',
        'predicted_activity',
        'activity_confidence',
    ]

    try:
        # The zero-shot model is built before any data is read
        zsl_model = ZeroShotActivityPredictor(predefined_activities)

        frame = pd.read_csv(paths['test'], low_memory=False)

        predictor = CodeBertPredictorWithZSL(
            model_path=paths['weights'],
            training_data_path=paths['train'],
            zsl_model=zsl_model,
        )

        # Run inference over the full test set
        scored = predictor.predict(frame)

        # Preview the first rows, then persist everything
        print("\nPrediction Summary:")
        print(scored[summary_columns].head())

        predictor.save_predictions(scored, paths['output'])

    except Exception as e:
        import traceback
        print(f"An error occurred: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    main()




Using device: cuda


  self.model.load_state_dict(torch.load(model_path, map_location=self.device))



Prediction Summary:
  predicted_service  service_confidence predicted_activity  \
0          OneDrive            0.985593              Login   
1           4shared            0.992092             Upload   
2         MediaFire            0.994190             Upload   
3           Dropbox            0.994486             Upload   
4           4shared            0.989075           Download   

   activity_confidence  
0             0.997256  
1             0.996235  
2             0.996443  
3             0.995549  
4             0.995101  
Predictions saved to test_data_with_predictions_code_bert_zsl.csv


In [16]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

class ZeroShotActivityPredictor:
    """Nearest-label activity matcher based on first-token encoder embeddings.

    Each candidate label is embedded once when the object is created.
    ``predict`` embeds the query text the same way and returns the label whose
    embedding has the highest cosine similarity to it.
    """

    def __init__(self, activity_labels, model_name="microsoft/codebert-base"):
        # Encoder and tokenizer shared by label and query embedding
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

        # Candidate labels and their cached embeddings
        self.activity_labels = activity_labels
        self.activity_embeddings = self._get_activity_embeddings()

    def _embed(self, text):
        """Return the first-token ([CLS]) hidden state for ``text`` as a NumPy array."""
        encoded = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
        with torch.no_grad():
            hidden_states = self.model(**encoded).last_hidden_state
        return hidden_states[:, 0, :].cpu().numpy()

    def _get_activity_embeddings(self):
        """Build the ``{label: embedding}`` lookup used for similarity comparison."""
        return {label: self._embed(label) for label in self.activity_labels}

    def predict(self, activity_text):
        """Return ``(best_label, similarity)`` for ``activity_text``.

        ``similarity`` is the raw value returned by ``cosine_similarity`` for
        the winning label (an array, not a Python float), so callers that need
        a scalar must extract it themselves.
        """
        query = self._embed(activity_text)

        # Score the query against every cached label embedding
        scores = {
            label: cosine_similarity(query, reference)
            for label, reference in self.activity_embeddings.items()
        }

        winner = max(scores, key=scores.get)
        return winner, scores[winner]

class CodeBertTransformer(nn.Module):
    """Pretrained encoder feeding two classification heads (service and activity)."""

    def __init__(self, service_num_labels, activity_num_labels, model_name="microsoft/codebert-base"):
        super().__init__()

        # Pretrained encoder backbone
        self.transformer = AutoModel.from_pretrained(model_name)
        hidden_size = self.transformer.config.hidden_size

        # Regularisation applied to the first-token vector before both heads
        self.dropout = nn.Dropout(0.3)

        # Heads share one layout; the service head is created first, as before
        self.service_classifier = self._build_head(hidden_size, service_num_labels)
        self.activity_classifier = self._build_head(hidden_size, activity_num_labels)

    @staticmethod
    def _build_head(in_features, num_labels):
        """Return a Linear -> BatchNorm -> ReLU -> Dropout -> Linear head."""
        return nn.Sequential(
            nn.Linear(in_features, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, num_labels),
        )

    def forward(self, input_ids, attention_mask):
        """Return ``(service_logits, activity_logits, embedding)``.

        ``embedding`` is the first-token hidden state after dropout, i.e. the
        exact tensor both heads receive.
        """
        encoded = self.transformer(input_ids=input_ids, attention_mask=attention_mask)

        # First token of every sequence serves as the pooled representation
        cls_vector = self.dropout(encoded.last_hidden_state[:, 0, :])

        service_logits = self.service_classifier(cls_vector)
        activity_logits = self.activity_classifier(cls_vector)
        return service_logits, activity_logits, cls_vector

class CodeBertPredictor:
    """Inference wrapper around a trained ``CodeBertTransformer`` checkpoint.

    The label encoders are rebuilt by fitting them on the training CSV, so the
    class order (and therefore the number of output units) is derived from that
    file rather than stored with the checkpoint.
    """

    def __init__(self, model_path, training_data_path, model_name="microsoft/codebert-base"):
        """Load the training labels, the tokenizer and the trained weights.

        Args:
            model_path: Path to the saved ``state_dict`` of the model.
            training_data_path: CSV with ``service`` and ``activityType`` columns.
            model_name: Hugging Face identifier used for the tokenizer.
        """
        # Device configuration
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu'
        )
        print(f"Using device: {self.device}")

        # Optional zero-shot fallback used by predict(); stays None unless a
        # subclass or caller attaches an object with predict(text) -> (label, score).
        self.zsl_model = None

        # Predefined activities
        self.predefined_activities = [
            "Login", "Logout", "Access", "Create", "Update", "Delete",
            "View", "Edit", "Share", "Download", "Upload",
            "Request", "Timeout", "Error"
        ]

        # Load training data for label encoding
        self.training_df = pd.read_csv(
            training_data_path,
            low_memory=False
        )

        # Prepare data and tokenizer
        self._prepare_data(model_name)

        # Load trained model
        self._load_model(model_path)

    def _prepare_data(self, model_name):
        """Fill missing labels, load the tokenizer and fit both label encoders."""
        # Missing labels become the explicit class 'Unknown'
        self.training_df['service'] = self.training_df['service'].fillna('Unknown')
        self.training_df['activityType'] = self.training_df['activityType'].fillna('Unknown')

        # Tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Label encoding
        self.service_encoder = LabelEncoder()
        self.activity_encoder = LabelEncoder()

        # Fit encoders on training data
        self.service_encoder.fit(self.training_df['service'])
        self.activity_encoder.fit(self.training_df['activityType'])

    def _load_model(self, model_path):
        """Instantiate the network, load the weights and switch to eval mode."""
        # Head sizes follow the number of classes seen in the training data
        service_num_labels = len(self.service_encoder.classes_)
        activity_num_labels = len(self.activity_encoder.classes_)

        # Create model
        self.model = CodeBertTransformer(
            service_num_labels,
            activity_num_labels
        ).to(self.device)

        # weights_only=True restricts unpickling to tensors/plain containers,
        # which is all a state_dict needs, and avoids torch.load's FutureWarning.
        state_dict = torch.load(model_path, map_location=self.device, weights_only=True)
        self.model.load_state_dict(state_dict)
        self.model.eval()  # Set to evaluation mode

    def _prepare_text_features(self, row):
        """Join the request fields of ``row`` into one string of at most 512 characters.

        Missing keys contribute an empty string; every value is passed through ``str``.
        """
        technical_features = [
            str(row.get('url', '')),
            str(row.get('method', '')),
            str(row.get('headers_Host', '')),
            str(row.get('requestHeaders_Content_Type', '')),
            str(row.get('responseHeaders_Content_Type', ''))
        ]

        # Join features, limit length
        return " ".join(technical_features)[:512]

    def predict(self, test_df, batch_size=32):
        """Predict service and activity for every row of ``test_df``.

        Rows are tokenized and scored in chunks of ``batch_size`` so memory use
        does not grow with the size of the test set. When the decoded activity
        is ``'unknown'`` (case-insensitive) and ``self.zsl_model`` is set, the
        zero-shot model's label and score replace the classifier's; without a
        zero-shot model the classifier's own label and probability are kept.

        Args:
            test_df: DataFrame of requests; it is modified in place.
            batch_size: Number of rows per forward pass (must be >= 1).

        Returns:
            ``test_df`` itself, with ``predicted_service``, ``service_confidence``,
            ``predicted_activity`` and ``activity_confidence`` columns added.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        # A plain list keeps text lookup positional, independent of test_df's
        # index, and also copes with an empty DataFrame.
        texts = [self._prepare_text_features(row) for _, row in test_df.iterrows()]
        zsl_model = getattr(self, 'zsl_model', None)

        # Prediction results
        predictions = {
            'predicted_service': [],
            'service_confidence': [],
            'predicted_activity': [],
            'activity_confidence': []
        }

        # Disable gradient computation
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                batch_texts = texts[start:start + batch_size]

                # Tokenize only the current chunk
                encodings = self.tokenizer(
                    batch_texts,
                    truncation=True,
                    padding=True,
                    max_length=128,
                    return_tensors='pt'
                )
                input_ids = encodings['input_ids'].to(self.device)
                attention_mask = encodings['attention_mask'].to(self.device)

                # Forward pass (the returned embedding is not needed here)
                service_pred, activity_pred, _ = self.model(input_ids, attention_mask)

                # Softmax probabilities and their arg-max per row
                service_max_probs, service_preds = torch.max(F.softmax(service_pred, dim=1), dim=1)
                activity_max_probs, activity_preds = torch.max(F.softmax(activity_pred, dim=1), dim=1)

                # Decode the whole chunk at once
                services = self.service_encoder.inverse_transform(service_preds.cpu().numpy())
                activities = self.activity_encoder.inverse_transform(activity_preds.cpu().numpy())
                service_max_probs = service_max_probs.cpu().numpy()
                activity_max_probs = activity_max_probs.cpu().numpy()

                rows = zip(batch_texts, services, service_max_probs, activities, activity_max_probs)
                for text, service, service_prob, activity, activity_prob in rows:
                    mapped_activity = activity
                    activity_confidence = float(activity_prob)

                    # Route 'unknown' activities through the zero-shot model if present
                    if zsl_model is not None and str(activity).lower() == 'unknown':
                        mapped_activity, activity_confidence = zsl_model.predict(text)
                        # The zero-shot score may arrive as a one-element array/tensor
                        if isinstance(activity_confidence, (torch.Tensor, np.ndarray)):
                            activity_confidence = activity_confidence.item()

                    predictions['predicted_service'].append(service)
                    predictions['service_confidence'].append(float(service_prob))
                    predictions['predicted_activity'].append(mapped_activity)
                    predictions['activity_confidence'].append(float(activity_confidence))

        # Add predictions to original DataFrame
        for column, values in predictions.items():
            test_df[column] = values

        return test_df

    def save_predictions(self, predictions_df, output_path='predictions.csv'):
        """Save predictions to a CSV file"""
        predictions_df.to_csv(output_path, index=False)
        print(f"Predictions saved to {output_path}")

class CodeBertPredictorWithZSL(CodeBertPredictor):
    """``CodeBertPredictor`` that carries a zero-shot model in ``self.zsl_model``."""

    def __init__(self, model_path, training_data_path, zsl_model, model_name="microsoft/codebert-base"):
        # Regular predictor setup first, then attach the zero-shot learner
        super().__init__(
            model_path=model_path,
            training_data_path=training_data_path,
            model_name=model_name,
        )
        self.zsl_model = zsl_model

# Main Execution
def main():
    """Score the test CSV with the ZSL-enabled predictor and save the result.

    Any exception raised along the way is printed together with its traceback
    instead of propagating, so the function always returns normally.
    """
    # Input / output locations
    paths = {
        'train': '/kaggle/input/network-dataset/shuffled_train.csv',
        'test': '/kaggle/input/network-dataset/shuffled_test.csv',
        'weights': 'best_codebert_model.pth',
        'output': 'test_data_with_predictions_code_bert_zsl.csv',
    }
    # Candidate labels handed to the zero-shot model
    predefined_activities = [
        "Login", "Logout", "Access", "Create", "Update", "Delete",
        "View", "Edit", "Share", "Download", "Upload",
        "Request", "Timeout", "Error"
    ]
    # Columns shown in the console preview
    summary_columns = [
        'predicted_service',
        'service_confidence',
        'predicted_activity',
        'activity_confidence',
    ]

    try:
        # The zero-shot model is built before any data is read
        zsl_model = ZeroShotActivityPredictor(predefined_activities)

        frame = pd.read_csv(paths['test'], low_memory=False)

        predictor = CodeBertPredictorWithZSL(
            model_path=paths['weights'],
            training_data_path=paths['train'],
            zsl_model=zsl_model,
        )

        # Run inference over the full test set
        scored = predictor.predict(frame)

        # Preview the first rows, then persist everything
        print("\nPrediction Summary:")
        print(scored[summary_columns].head())

        predictor.save_predictions(scored, paths['output'])

    except Exception as e:
        import traceback
        print(f"An error occurred: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    main()




Using device: cuda


  self.model.load_state_dict(torch.load(model_path, map_location=self.device))



Prediction Summary:
  predicted_service  service_confidence predicted_activity  \
0          OneDrive            0.985593              Login   
1           4shared            0.992092             Upload   
2         MediaFire            0.994190             Upload   
3           Dropbox            0.994486             Upload   
4           4shared            0.989075           Download   

   activity_confidence  
0             0.997256  
1             0.996235  
2             0.996443  
3             0.995549  
4             0.995101  
Predictions saved to test_data_with_predictions_code_bert_zsl.csv


In [32]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

class ZeroShotActivityPredictor:
    """Nearest-label activity matcher based on first-token encoder embeddings.

    Each candidate label is embedded once when the object is created.
    ``predict`` embeds the query text the same way and returns the label whose
    embedding has the highest cosine similarity to it.
    """

    def __init__(self, activity_labels, model_name="microsoft/codebert-base"):
        # Encoder and tokenizer shared by label and query embedding
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

        # Candidate labels and their cached embeddings
        self.activity_labels = activity_labels
        self.activity_embeddings = self._get_activity_embeddings()

    def _embed(self, text):
        """Return the first-token ([CLS]) hidden state for ``text`` as a NumPy array."""
        encoded = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
        with torch.no_grad():
            hidden_states = self.model(**encoded).last_hidden_state
        return hidden_states[:, 0, :].cpu().numpy()

    def _get_activity_embeddings(self):
        """Build the ``{label: embedding}`` lookup used for similarity comparison."""
        return {label: self._embed(label) for label in self.activity_labels}

    def predict(self, activity_text):
        """Return ``(best_label, similarity)`` for ``activity_text``.

        ``similarity`` is the raw value returned by ``cosine_similarity`` for
        the winning label (an array, not a Python float), so callers that need
        a scalar must extract it themselves.
        """
        query = self._embed(activity_text)

        # Score the query against every cached label embedding
        scores = {
            label: cosine_similarity(query, reference)
            for label, reference in self.activity_embeddings.items()
        }

        winner = max(scores, key=scores.get)
        return winner, scores[winner]

class CodeBertTransformer(nn.Module):
    """Pretrained encoder feeding two classification heads (service and activity)."""

    def __init__(self, service_num_labels, activity_num_labels, model_name="microsoft/codebert-base"):
        super().__init__()

        # Pretrained encoder backbone
        self.transformer = AutoModel.from_pretrained(model_name)
        hidden_size = self.transformer.config.hidden_size

        # Regularisation applied to the first-token vector before both heads
        self.dropout = nn.Dropout(0.3)

        # Heads share one layout; the service head is created first, as before
        self.service_classifier = self._build_head(hidden_size, service_num_labels)
        self.activity_classifier = self._build_head(hidden_size, activity_num_labels)

    @staticmethod
    def _build_head(in_features, num_labels):
        """Return a Linear -> BatchNorm -> ReLU -> Dropout -> Linear head."""
        return nn.Sequential(
            nn.Linear(in_features, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, num_labels),
        )

    def forward(self, input_ids, attention_mask):
        """Return ``(service_logits, activity_logits, embedding)``.

        ``embedding`` is the first-token hidden state after dropout, i.e. the
        exact tensor both heads receive.
        """
        encoded = self.transformer(input_ids=input_ids, attention_mask=attention_mask)

        # First token of every sequence serves as the pooled representation
        cls_vector = self.dropout(encoded.last_hidden_state[:, 0, :])

        service_logits = self.service_classifier(cls_vector)
        activity_logits = self.activity_classifier(cls_vector)
        return service_logits, activity_logits, cls_vector

class CodeBertPredictor:
    """Load a trained ``CodeBertTransformer`` and report prediction confidence.

    The label encoders are re-fitted on the training CSV so that the number of
    output classes matches the checkpoint being loaded.
    """

    def __init__(self, model_path, training_data_path, model_name="microsoft/codebert-base"):
        """Read the training CSV, build tokenizer/encoders and load the weights.

        Args:
            model_path: Path to the saved ``state_dict`` of the model.
            training_data_path: CSV with ``service`` and ``activityType`` columns.
            model_name: Hugging Face model id used for the tokenizer.
        """
        # Device configuration
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")

        # Predefined activities
        self.predefined_activities = [
            "Login", "Logout", "Access", "Create", "Update", "Delete",
            "View", "Edit", "Share", "Download", "Upload",
            "Request", "Timeout", "Error"
        ]

        # Load training data for label encoding
        self.training_df = pd.read_csv(training_data_path, low_memory=False)

        # Prepare data and tokenizer
        self._prepare_data(model_name)

        # Load trained model
        self._load_model(model_path)

    def _prepare_data(self, model_name):
        """Fill missing labels with ``'Unknown'`` and fit tokenizer and encoders."""
        # Missing labels become their own 'Unknown' class
        self.training_df['service'] = self.training_df['service'].fillna('Unknown')
        self.training_df['activityType'] = self.training_df['activityType'].fillna('Unknown')

        # Tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Label encoders fitted on the training labels
        self.service_encoder = LabelEncoder()
        self.activity_encoder = LabelEncoder()
        self.service_encoder.fit(self.training_df['service'])
        self.activity_encoder.fit(self.training_df['activityType'])

    def _load_model(self, model_path):
        """Instantiate the network sized to the encoders and load its weights."""
        service_num_labels = len(self.service_encoder.classes_)
        activity_num_labels = len(self.activity_encoder.classes_)

        self.model = CodeBertTransformer(service_num_labels, activity_num_labels).to(self.device)

        # weights_only=True restricts unpickling to tensors and plain
        # containers, so a tampered checkpoint cannot execute arbitrary code
        # (and it silences the torch.load FutureWarning).
        state_dict = torch.load(model_path, map_location=self.device, weights_only=True)
        self.model.load_state_dict(state_dict)
        self.model.eval()

    def _prepare_text_features(self, row):
        """Join the request fields of ``row`` into one string (max 512 chars).

        Missing keys contribute an empty string; present values are passed
        through ``str()``.
        """
        technical_features = [
            str(row.get('url', '')),
            str(row.get('method', '')),
            str(row.get('headers_Host', '')),
            str(row.get('requestHeaders_Content_Type', '')),
            str(row.get('responseHeaders_Content_Type', ''))
        ]

        # Join features, limit length
        return " ".join(technical_features)[:512]

    def predict(self, test_df, batch_size=64):
        """Print the mean top-1 confidence of both heads over ``test_df``.

        Args:
            test_df: DataFrame whose rows provide the request fields.
            batch_size: Number of rows per forward pass; bounds peak memory.

        Returns:
            ``test_df`` itself, unmodified.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        # Nothing to score: avoid tokenizing an empty list / averaging nothing
        if len(test_df) == 0:
            return test_df

        # Prepare text features
        test_texts = test_df.apply(self._prepare_text_features, axis=1)

        # Tokenize once so every mini-batch shares the same padding length
        encodings = self.tokenizer(
            test_texts.tolist(),
            truncation=True,
            padding=True,
            max_length=128,
            return_tensors='pt'
        )

        service_conf_chunks = []
        activity_conf_chunks = []

        # Disable gradient computation
        with torch.no_grad():
            for start in range(0, len(test_texts), batch_size):
                stop = start + batch_size
                input_ids = encodings['input_ids'][start:stop].to(self.device)
                attention_mask = encodings['attention_mask'][start:stop].to(self.device)

                # Forward pass (the embedding output is not needed here)
                service_pred, activity_pred, _ = self.model(input_ids, attention_mask)

                # Top-1 softmax probability per row
                service_conf_chunks.append(F.softmax(service_pred, dim=1).max(dim=1).values.cpu())
                activity_conf_chunks.append(F.softmax(activity_pred, dim=1).max(dim=1).values.cpu())

        service_max_probs = torch.cat(service_conf_chunks).numpy()
        activity_max_probs = torch.cat(activity_conf_chunks).numpy()

        # Calculate overall confidence scores
        overall_service_confidence = float(np.mean(service_max_probs))
        overall_activity_confidence = float(np.mean(activity_max_probs))

        # Print only overall confidence scores
        print("\nOverall Confidence Scores:")
        print(f"Service Confidence: {overall_service_confidence:.4f}")
        print(f"Activity Confidence: {overall_activity_confidence:.4f}")

        return test_df

class CodeBertPredictorWithZSL(CodeBertPredictor):
    """``CodeBertPredictor`` that additionally carries a zero-shot model."""

    def __init__(self, model_path, training_data_path, zsl_model, model_name="microsoft/codebert-base"):
        # Base initialiser reads the training CSV and loads tokenizer,
        # label encoders and model weights
        super().__init__(model_path, training_data_path, model_name=model_name)

        # Zero-shot learner supplied by the caller, stored on the instance
        self.zsl_model = zsl_model

def main():
    """Build the zero-shot model and predictor, then score the test CSV.

    Any exception raised along the way is printed together with its
    traceback instead of propagating to the caller.
    """
    # Input locations
    train_csv = '/kaggle/input/network-dataset/shuffled_train.csv'
    test_csv = '/kaggle/input/network-dataset/shuffled_test.csv'
    weights_file = 'best_codebert_model.pth'

    try:
        # Candidate labels for the zero-shot activity predictor
        activity_names = [
            "Login", "Logout", "Access", "Create", "Update", "Delete",
            "View", "Edit", "Share", "Download", "Upload",
            "Request", "Timeout", "Error"
        ]
        zero_shot = ZeroShotActivityPredictor(activity_names)

        # Rows to score
        test_frame = pd.read_csv(test_csv, low_memory=False)

        # Predictor that also carries the zero-shot model
        predictor = CodeBertPredictorWithZSL(
            model_path=weights_file,
            training_data_path=train_csv,
            zsl_model=zero_shot
        )

        # Run inference; the confidence summary is printed by predict()
        predictor.predict(test_frame)

    except Exception as e:
        print(f"An error occurred: {e}")
        import traceback
        traceback.print_exc()

# Run the pipeline only when this code is executed directly, not on import.
if __name__ == "__main__":
    main()



Using device: cuda


  self.model.load_state_dict(torch.load(model_path, map_location=self.device))



Overall Confidence Scores:
Service Confidence: 0.9790
Activity Confidence: 0.9941


In [33]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

class ZeroShotActivityPredictor:
    """Map free text to the closest predefined activity label.

    Each label and each query text is embedded with the first-token hidden
    state of the encoder; the label with the highest cosine similarity wins.
    """

    def __init__(self, activity_labels, model_name="microsoft/codebert-base"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.activity_labels = activity_labels
        # Label embeddings are computed once up front and reused per query
        self.activity_embeddings = self._get_activity_embeddings()

    def _embed(self, text):
        """Return the first-token embedding of ``text`` as a numpy array."""
        tokens = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
        with torch.no_grad():
            encoded = self.model(**tokens)
        return encoded.last_hidden_state[:, 0, :].cpu().numpy()

    def _get_activity_embeddings(self):
        """Return a ``{label: embedding}`` dict for every configured label."""
        return {label: self._embed(label) for label in self.activity_labels}

    def predict(self, activity_text):
        """Return ``(best_label, similarity)`` for ``activity_text``."""
        query = self._embed(activity_text)

        # [0, 0] pulls the single scalar out of the 1x1 similarity matrix
        scores = {
            label: cosine_similarity(query, reference)[0, 0]
            for label, reference in self.activity_embeddings.items()
        }

        winner = max(scores, key=scores.get)
        return winner, scores[winner]

class CodeBertTransformer(nn.Module):
    """Shared CodeBERT encoder with separate service and activity heads."""

    def __init__(self, service_num_labels, activity_num_labels, model_name="microsoft/codebert-base"):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)

        hidden_size = self.transformer.config.hidden_size

        def classification_head(num_labels):
            # Linear -> BatchNorm -> ReLU -> Dropout -> Linear
            return nn.Sequential(
                nn.Linear(hidden_size, 512),
                nn.BatchNorm1d(512),
                nn.ReLU(),
                nn.Dropout(0.2),
                nn.Linear(512, num_labels),
            )

        # Service head is created before the activity head
        self.service_classifier = classification_head(service_num_labels)
        self.activity_classifier = classification_head(activity_num_labels)

    def forward(self, input_ids, attention_mask):
        """Return service logits, activity logits and the pooled embedding.

        The pooled embedding is the first-token hidden state after dropout.
        """
        hidden_states = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state

        pooled_output = self.dropout(hidden_states[:, 0, :])

        service_logits = self.service_classifier(pooled_output)
        activity_logits = self.activity_classifier(pooled_output)
        return service_logits, activity_logits, pooled_output

class CodeBertPredictor:
    """Predict service and activity labels for request rows.

    The label encoders are re-fitted on the training CSV so that class indices
    and the number of output classes match the checkpoint being loaded.

    If the instance has a ``zsl_model`` attribute (set by a subclass), rows
    whose classifier activity is ``'Unknown'`` are re-labelled through
    ``zsl_model.predict(text)``; without it the classifier output is kept.
    """

    def __init__(self, model_path, training_data_path, model_name="microsoft/codebert-base"):
        """Read the training CSV, build tokenizer/encoders and load the weights.

        Args:
            model_path: Path to the saved ``state_dict`` of the model.
            training_data_path: CSV with ``service`` and ``activityType`` columns.
            model_name: Hugging Face model id used for the tokenizer.
        """
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")

        self.predefined_activities = [
            "Login", "Logout", "Access", "Create", "Update", "Delete",
            "View", "Edit", "Share", "Download", "Upload",
            "Request", "Timeout", "Error"
        ]

        self.training_df = pd.read_csv(training_data_path, low_memory=False)
        self._prepare_data(model_name)
        self._load_model(model_path)

    def _prepare_data(self, model_name):
        """Fill missing labels with ``'Unknown'`` and fit tokenizer and encoders."""
        self.training_df['service'] = self.training_df['service'].fillna('Unknown')
        self.training_df['activityType'] = self.training_df['activityType'].fillna('Unknown')
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.service_encoder = LabelEncoder()
        self.activity_encoder = LabelEncoder()
        self.service_encoder.fit(self.training_df['service'])
        self.activity_encoder.fit(self.training_df['activityType'])

    def _load_model(self, model_path):
        """Instantiate the network sized to the encoders and load its weights."""
        service_num_labels = len(self.service_encoder.classes_)
        activity_num_labels = len(self.activity_encoder.classes_)
        self.model = CodeBertTransformer(service_num_labels, activity_num_labels).to(self.device)

        # weights_only=True restricts unpickling to tensors and plain
        # containers, so a tampered checkpoint cannot execute arbitrary code
        # (and it silences the torch.load FutureWarning).
        state_dict = torch.load(model_path, map_location=self.device, weights_only=True)
        self.model.load_state_dict(state_dict)
        self.model.eval()

    def _prepare_text_features(self, row):
        """Join the request fields of ``row`` into one string (max 512 chars).

        Missing keys contribute an empty string; present values are passed
        through ``str()``.
        """
        technical_features = [
            str(row.get('url', '')),
            str(row.get('method', '')),
            str(row.get('headers_Host', '')),
            str(row.get('requestHeaders_Content_Type', '')),
            str(row.get('responseHeaders_Content_Type', ''))
        ]
        return " ".join(technical_features)[:512]

    def _classify(self, encodings, batch_size):
        """Run the model in mini-batches and return top-1 results per row.

        Returns:
            Four numpy arrays: service probabilities, service class indices,
            activity probabilities, activity class indices.
        """
        service_chunks = []
        activity_chunks = []
        total = encodings['input_ids'].shape[0]

        with torch.no_grad():
            for start in range(0, total, batch_size):
                stop = start + batch_size
                input_ids = encodings['input_ids'][start:stop].to(self.device)
                attention_mask = encodings['attention_mask'][start:stop].to(self.device)
                service_pred, activity_pred, _ = self.model(input_ids, attention_mask)
                # Move results off the device right away to bound its memory use
                service_chunks.append(F.softmax(service_pred, dim=1).cpu())
                activity_chunks.append(F.softmax(activity_pred, dim=1).cpu())

        service_max_probs, service_preds = torch.max(torch.cat(service_chunks), dim=1)
        activity_max_probs, activity_preds = torch.max(torch.cat(activity_chunks), dim=1)
        return (
            service_max_probs.numpy(),
            service_preds.numpy(),
            activity_max_probs.numpy(),
            activity_preds.numpy(),
        )

    @staticmethod
    def _print_group_report(title, grouped_confidences):
        """Print mean confidence and row count for every label, sorted by label."""
        print(f"\n=== {title} ===")
        for label, confidences in sorted(grouped_confidences.items()):
            mean_conf = np.mean(confidences)
            count = len(confidences)
            print(f"{label:30} Confidence: {float(mean_conf):.4f} (Count: {count})")

    def predict(self, test_df, batch_size=64):
        """Label every row of ``test_df`` and print a confidence report.

        Args:
            test_df: DataFrame whose rows provide the request fields. It is
                modified in place: the columns ``predicted_service``,
                ``service_confidence``, ``predicted_activity`` and
                ``activity_confidence`` are added.
            batch_size: Number of rows per forward pass; bounds peak memory.

        Returns:
            ``test_df`` with the four prediction columns.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        all_predictions = {
            'predicted_service': [],
            'service_confidence': [],
            'predicted_activity': [],
            'activity_confidence': []
        }

        # Nothing to score: still return the expected columns, just empty
        if len(test_df) == 0:
            for key, values in all_predictions.items():
                test_df[key] = values
            return test_df

        # A plain list keeps row access positional, independent of the
        # DataFrame's index labels
        test_texts = test_df.apply(self._prepare_text_features, axis=1).tolist()

        # Tokenize once so every mini-batch shares the same padding length
        encodings = self.tokenizer(
            test_texts,
            truncation=True,
            padding=True,
            max_length=128,
            return_tensors='pt'
        )

        service_max_probs, service_preds, activity_max_probs, activity_preds = \
            self._classify(encodings, batch_size)

        # Decode all class indices in one call instead of once per row
        services = self.service_encoder.inverse_transform(service_preds)
        activities = self.activity_encoder.inverse_transform(activity_preds)

        # Only subclasses provide a zero-shot model; the base class must work
        # without one
        zsl_model = getattr(self, 'zsl_model', None)

        service_confidences = defaultdict(list)
        activity_confidences = defaultdict(list)

        rows = zip(test_texts, services, service_max_probs, activities, activity_max_probs)
        for text, service, service_prob, activity, activity_prob in rows:
            # Handle service predictions
            service_conf = float(service_prob)
            service_confidences[service].append(service_conf)
            all_predictions['predicted_service'].append(service)
            all_predictions['service_confidence'].append(service_conf)

            # Handle activity predictions
            if zsl_model is not None and str(activity).lower() == 'unknown':
                mapped_activity, confidence = zsl_model.predict(text)
                # Accept a Python float, numpy scalar or one-element array
                activity_conf = float(np.asarray(confidence).reshape(-1)[0])
            else:
                mapped_activity = activity
                activity_conf = float(activity_prob)

            activity_confidences[mapped_activity].append(activity_conf)
            all_predictions['predicted_activity'].append(mapped_activity)
            all_predictions['activity_confidence'].append(activity_conf)

        # Calculate overall scores
        overall_service_confidence = np.mean(service_max_probs)
        overall_activity_confidence = np.mean(all_predictions['activity_confidence'])

        # Print results
        print("\n=== Overall Confidence Scores ===")
        print(f"Service Confidence: {float(overall_service_confidence):.4f}")
        print(f"Activity Confidence: {float(overall_activity_confidence):.4f}")

        self._print_group_report("Service Confidence Scores", service_confidences)
        self._print_group_report("Activity Confidence Scores", activity_confidences)

        # Add predictions to DataFrame (list assignment is positional)
        for key, values in all_predictions.items():
            test_df[key] = values

        return test_df

class CodeBertPredictorWithZSL(CodeBertPredictor):
    """Predictor variant that owns the zero-shot model consulted by ``predict``."""

    def __init__(self, model_path, training_data_path, zsl_model, model_name="microsoft/codebert-base"):
        super().__init__(
            model_path,
            training_data_path,
            model_name=model_name,
        )
        # Read by the inherited predict() when the classifier answers 'Unknown'
        self.zsl_model = zsl_model

def main():
    """Score the test CSV with the zero-shot-backed predictor.

    Errors are reported (message plus traceback) rather than raised.
    """
    training_data_path = '/kaggle/input/network-dataset/shuffled_train.csv'
    test_data_path = '/kaggle/input/network-dataset/shuffled_test.csv'
    model_path = 'best_codebert_model.pth'

    try:
        # Labels the zero-shot model may assign
        zsl_model = ZeroShotActivityPredictor([
            "Login", "Logout", "Access", "Create", "Update", "Delete",
            "View", "Edit", "Share", "Download", "Upload",
            "Request", "Timeout", "Error"
        ])

        rows_to_score = pd.read_csv(test_data_path, low_memory=False)

        predictor = CodeBertPredictorWithZSL(
            model_path=model_path,
            training_data_path=training_data_path,
            zsl_model=zsl_model
        )

        # predict() prints the confidence report and returns the frame
        predictions_df = predictor.predict(rows_to_score)

    except Exception as e:
        print(f"An error occurred: {e}")
        import traceback
        traceback.print_exc()

# Run the pipeline only when this code is executed directly, not on import.
if __name__ == "__main__":
    main()



Using device: cuda


  self.model.load_state_dict(torch.load(model_path, map_location=self.device))



=== Overall Confidence Scores ===
Service Confidence: 0.9790
Activity Confidence: 0.9898

=== Service Confidence Scores ===
4shared                        Confidence: 0.9895 (Count: 126)
Box                            Confidence: 0.9935 (Count: 95)
Dropbox                        Confidence: 0.9918 (Count: 119)
Icedrive                       Confidence: 0.9478 (Count: 139)
Jumpshare                      Confidence: 0.9874 (Count: 25)
Koofr                          Confidence: 0.9909 (Count: 48)
MediaFire                      Confidence: 0.9919 (Count: 68)
OneDrive                       Confidence: 0.9665 (Count: 143)
Zippyshare                     Confidence: 0.9853 (Count: 21)
pCloud                         Confidence: 0.9838 (Count: 16)

=== Activity Confidence Scores ===
Download                       Confidence: 0.9962 (Count: 203)
Edit                           Confidence: 0.9761 (Count: 4)
Error                          Confidence: 0.9540 (Count: 26)
Login                        