In [5]:
import pandas as pd
import torch
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    pipeline
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Select GPU if available
device = 0 if torch.cuda.is_available() else -1

# Paths for saving fine-tuned models
service_model_dir = "./fine_tuned_service_model"
activity_model_dir = "./fine_tuned_activity_model"

# Define datasets paths
train_path = '/content/shuffled_train.csv'
test_path = '/content/shuffled_test.csv'

# Define SASE Services and Activities
sase_services = [
    "Sync", "Dropbox", "Mediafire", "OneDrive", "Jumpshare",
    "Box", "4shared", "Mega", "pCloud", "ZippyShare",
    "SharePoint", "Salesforce", "Koofr"
]
activity_types = [
    "Login", "Upload", "Download", "Access", "Attempt",
    "Change", "Request", "Timeout", "Anomaly", "Sharing",
    "Editing", "Deleting", "Creating", "Updating", "Syncing",
    "Navigation", "Authentication"
]

def load_dataset(path):
    try:
        return pd.read_csv(path)
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return pd.DataFrame()

def prepare_service_text(row):
    return " ".join([
        str(row.get('headers_Host', '')),
        str(row.get('url', '')),
        str(row.get('requestHeaders_Origin', ''))
    ])

def prepare_activity_text(row):
    return " ".join([
        str(row.get('url', '')),
        str(row.get('method', '')),
        str(row.get('requestHeaders_Content_Type', '')),
        str(row.get('responseHeaders_Content_Type', '')),
        str(row.get('requestHeaders_Referer', ''))
    ])

def fine_tune_model(model_name, texts, labels, output_dir):
    from torch.utils.data import Dataset

    class TextDataset(Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __len__(self):
            return len(self.labels)

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx])
            return item

    # Tokenizer and Model Initialization
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=len(set(labels)), ignore_mismatched_sizes=True
    )

    # Tokenize dataset
    encodings = tokenizer(
        texts.tolist(),
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt"
    )

    # Create dataset
    dataset = TextDataset(encodings, labels)

    # Training Arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=8,
        save_steps=500,
        save_total_limit=2,
        evaluation_strategy="no",
        logging_dir=f"{output_dir}/logs",
        logging_steps=100,
        learning_rate=5e-5
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        tokenizer=tokenizer
    )

    # Train Model
    print(f"Fine-tuning {model_name}...")
    trainer.train()

    # Save Fine-Tuned Model
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Model saved to {output_dir}")

def perform_predictions(test_df, service_model_dir, activity_model_dir):
    # Load Fine-tuned Models
    service_pipeline = pipeline("text-classification", model=service_model_dir, tokenizer=service_model_dir, device=device)
    activity_pipeline = pipeline("text-classification", model=activity_model_dir, tokenizer=activity_model_dir, device=device)

    predictions = []
    for _, row in tqdm(test_df.iterrows(), total=test_df.shape[0], desc="Predicting"):
        service_result = service_pipeline(row['service_text'])[0]
        activity_result = activity_pipeline(row['activity_text'])[0]

        predictions.append({
            'predicted_service': service_result['label'],
            'predicted_service_confidence': service_result['score'],
            'predicted_activity': activity_result['label'],
            'predicted_activity_confidence': activity_result['score']
        })

    return pd.DataFrame(predictions)

def main():
    # Load datasets
    df_train = load_dataset(train_path)
    df_test = load_dataset(test_path)

    if df_train.empty or df_test.empty:
        print("Error: One or more datasets could not be loaded.")
        return

    # Prepare texts
    df_train['service_text'] = df_train.apply(prepare_service_text, axis=1)
    df_train['activity_text'] = df_train.apply(prepare_activity_text, axis=1)
    df_test['service_text'] = df_test.apply(prepare_service_text, axis=1)
    df_test['activity_text'] = df_test.apply(prepare_activity_text, axis=1)

    # Encode labels
    service_labels = df_train['service_text'].astype('category').cat.codes.tolist()
    activity_labels = df_train['activity_text'].astype('category').cat.codes.tolist()

    # Fine-tune service model
    fine_tune_model("MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7", df_train['service_text'], service_labels, service_model_dir)

    # Fine-tune activity model
    fine_tune_model("MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli", df_train['activity_text'], activity_labels, activity_model_dir)

    # Perform predictions
    predictions_df = perform_predictions(df_test, service_model_dir, activity_model_dir)

    # Combine results with test data
    results = pd.concat([df_test.reset_index(drop=True), predictions_df], axis=1)

    # Save results
    results.to_csv("predictions.csv", index=False)
    print("Predictions saved to predictions.csv")

if __name__ == "__main__":
    main()


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([1367]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([1367, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Fine-tuning MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7...


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss
100,6.7735
200,5.8823
300,5.4459
400,5.2361
500,4.5332
600,4.6541
700,4.2307
800,4.0345
900,3.7603
1000,3.8748


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Model saved to ./fine_tuned_service_model


tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([1430]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([1430, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Fine-tuning MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss
100,6.7892
200,6.1553
300,5.6184
400,5.2884
500,4.6952
600,4.5581
700,4.053
800,3.8574
900,3.5418
1000,3.5539


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Model saved to ./fine_tuned_activity_model


Device set to use cuda:0
Device set to use cuda:0
Predicting:   1%|          | 7/800 [00:01<01:38,  8.02it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1013 > 512). Running this sequence through the model will result in indexing errors
Predicting:   1%|          | 9/800 [00:01<01:56,  6.82it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Predicting: 100%|██████████| 800/800 [00:58<00:00, 13.68it/s]


Predictions saved to predictions.csv
