In [1]:
!pip install pandas torch transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

Mounted at /content/drive


In [3]:


# Define the MultiTaskDataset class
class MultiTaskDataset(Dataset):
    def __init__(self, dataframe, encodings, max_length, tasks):
        self.data = dataframe
        self.encodings = encodings
        self.max_length = max_length
        self.tasks = tasks
        for task, info in tasks.items():
            if info['type'] == 'multi-class':
                info['label_map'] = {label: idx for idx, label in enumerate(info['classes'])}
                info['num_classes'] = len(info['classes'])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        inputs = {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx]
        }
        labels = {}
        masks = {}
        for task, info in self.tasks.items():
            if info['type'] == 'binary':
                label_col = info['column']
                label = self.data.iloc[idx][label_col]
                labels[task] = torch.tensor(label if pd.notna(label) else -1, dtype=torch.float)
                masks[task] = 1 if pd.notna(label) else 0
            elif info['type'] == 'multi-class':
                label_col = info['column']
                label = self.data.iloc[idx][label_col]
                if pd.notna(label) and label != 'NaN':
                    labels[task] = torch.tensor(info['label_map'][label], dtype=torch.long)
                else:
                    labels[task] = torch.tensor(-1, dtype=torch.long)
                masks[task] = 1 if pd.notna(label) and label != 'NaN' else 0
            elif info['type'] == 'multi-label':
                label_cols = info['columns']
                label = [self.data.iloc[idx][col] for col in label_cols]
                if all(pd.notna(l) for l in label):
                    labels[task] = torch.tensor(label, dtype=torch.float)
                    masks[task] = 1
                else:
                    labels[task] = torch.tensor([-1] * len(label_cols), dtype=torch.float)
                    masks[task] = 0
        return inputs, labels, masks

# Define the MultiTaskDistilBERT model
class MultiTaskDistilBERT(nn.Module):
    def __init__(self, distilbert_model, tasks):
        super().__init__()
        self.distilbert = distilbert_model
        self.tasks = tasks
        self.heads = nn.ModuleDict()
        for task, info in tasks.items():
            if info['type'] == 'binary':
                self.heads[task] = nn.Linear(distilbert_model.config.dim, 1)
            elif info['type'] == 'multi-class':
                self.heads[task] = nn.Linear(distilbert_model.config.dim, info['num_classes'])
            elif info['type'] == 'multi-label':
                self.heads[task] = nn.Linear(distilbert_model.config.dim, len(info['columns']))

    def forward(self, input_ids, attention_mask):
        outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]  # Use [CLS] token representation
        task_outputs = {}
        for task, head in self.heads.items():
            task_outputs[task] = head(pooled_output)
        return task_outputs

# Define tasks based on dataset structure
tasks = {
    'spam': {'type': 'binary', 'column': 'spam'},
    'sentiment': {'type': 'multi-class', 'column': 'sentiment', 'classes': ['Positive', 'Neutral', 'Negative', 'Irrelevant']},
    'toxicity': {'type': 'multi-label', 'columns': ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']},
    'hate_speech': {'type': 'multi-class', 'column': 'hate_speech', 'classes': ['normal', 'offensive', 'hatespeech']}
}




In [4]:
# Load and preprocess dataset
df = pd.read_csv('/content/drive/MyDrive/MLBD_Project/final_comment_analysis_data.csv')
df['text'] = df['text'].fillna('')
binary_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate', 'spam']
for col in binary_cols:
    df[col] = df[col].fillna(0).astype(int)
df['sentiment'] = df['sentiment'].fillna('NaN')
df['hate_speech'] = df['hate_speech'].fillna('NaN')

  df = pd.read_csv('/content/drive/MyDrive/MLBD_Project/final_comment_analysis_data.csv')


In [5]:
# Re-tokenize dataset
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
encodings = tokenizer(df['text'].tolist(), max_length=128, padding='max_length', truncation=True, return_tensors='pt')
torch.save(encodings, '/content/drive/MyDrive/MLBD_Project/tokenized_dataset.pt')
print(f"Length of encodings['input_ids']: {len(encodings['input_ids'])}")
print(f"Length of dataframe: {len(df)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Length of encodings['input_ids']: 246378
Length of dataframe: 246378


In [6]:
# Initialize model
distilbert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
dataset = MultiTaskDataset(df, encodings, max_length=128, tasks=tasks)
model = MultiTaskDistilBERT(distilbert_model, tasks)

# Create dataset and dataloader
dataloader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=0, pin_memory=False)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [7]:
# Define optimizer and device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

Using device: cuda


In [8]:
# Start training from epoch 1
num_epochs = 3
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for batch_idx, batch in enumerate(dataloader):
        print(f"Epoch {epoch + 1}, Processing batch {batch_idx+1}/{len(dataloader)}")
        try:
            inputs, labels, masks = batch
            inputs = {k: v.to(device) for k, v in inputs.items()}
            labels = {k: v.to(device) for k, v in labels.items()}
            masks = {k: v.to(device) for k, v in masks.items()}
            optimizer.zero_grad()
            outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
            loss = 0
            for task, output in outputs.items():
                task_labels = labels[task]
                task_mask = masks[task]
                if task_mask.sum() > 0:
                    indices = task_mask.nonzero().squeeze(1)
                    if tasks[task]['type'] == 'binary':
                        loss_fn = nn.BCEWithLogitsLoss()
                        loss += loss_fn(output[indices], task_labels[indices].unsqueeze(1))
                    elif tasks[task]['type'] == 'multi-class':
                        loss_fn = nn.CrossEntropyLoss()
                        loss += loss_fn(output[indices], task_labels[indices])
                    elif tasks[task]['type'] == 'multi-label':
                        loss_fn = nn.BCEWithLogitsLoss()
                        loss += loss_fn(output[indices], task_labels[indices])
            print(f"Batch {batch_idx+1} Loss: {loss.item()}")
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        except Exception as e:
            print(f"Error in batch {batch_idx+1}: {e}")
            break
    print(f"Epoch {epoch + 1} Loss: {total_loss / len(dataloader)}")
    torch.save(model.state_dict(), f'/content/drive/MyDrive/MLBD_Project/multi_task_distilbert_epoch_{epoch + 1}.pth')
    tokenizer.save_pretrained(f'/content/drive/MyDrive/MLBD_Project/tokenizer_epoch_{epoch + 1}')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Batch 28299 Loss: 0.08113669604063034
Epoch 3, Processing batch 28300/30798
Batch 28300 Loss: 0.1831081509590149
Epoch 3, Processing batch 28301/30798
Batch 28301 Loss: 0.14321500062942505
Epoch 3, Processing batch 28302/30798
Batch 28302 Loss: 0.06600596755743027
Epoch 3, Processing batch 28303/30798
Batch 28303 Loss: 0.04115360230207443
Epoch 3, Processing batch 28304/30798
Batch 28304 Loss: 0.006646761670708656
Epoch 3, Processing batch 28305/30798
Batch 28305 Loss: 0.7241858839988708
Epoch 3, Processing batch 28306/30798
Batch 28306 Loss: 0.04994899407029152
Epoch 3, Processing batch 28307/30798
Batch 28307 Loss: 0.3716566562652588
Epoch 3, Processing batch 28308/30798
Batch 28308 Loss: 0.4348825216293335
Epoch 3, Processing batch 28309/30798
Batch 28309 Loss: 0.03132839500904083
Epoch 3, Processing batch 28310/30798
Batch 28310 Loss: 0.014272008091211319
Epoch 3, Processing batch 28311/30798
Batch 28311 Loss: 0.19343

In [9]:
# Save the final model and tokenizer
torch.save(model.state_dict(), '/content/drive/MyDrive/MLBD_Project/multi_task_distilbert_final.pth')
tokenizer.save_pretrained('/content/drive/MyDrive/MLBD_Project/tokenizer_final')

('/content/drive/MyDrive/MLBD_Project/tokenizer_final/tokenizer_config.json',
 '/content/drive/MyDrive/MLBD_Project/tokenizer_final/special_tokens_map.json',
 '/content/drive/MyDrive/MLBD_Project/tokenizer_final/vocab.txt',
 '/content/drive/MyDrive/MLBD_Project/tokenizer_final/added_tokens.json')

In [10]:
# Inference function with confidence scores
def predict_with_confidence(model, tokenizer, text, tasks, device):
    model.eval()
    inputs = tokenizer(text, return_tensors='pt', max_length=128, padding='max_length', truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = {}
    confidences = {}
    for task, output in outputs.items():
        if tasks[task]['type'] == 'binary':
            prob = torch.sigmoid(output).item()
            predictions[task] = int(prob > 0.5)
            confidences[task] = prob if predictions[task] == 1 else 1 - prob
        elif tasks[task]['type'] == 'multi-class':
            probs = torch.softmax(output, dim=1)
            class_idx = torch.argmax(probs, dim=1).item()
            predictions[task] = tasks[task]['classes'][class_idx]
            confidences[task] = probs[0, class_idx].item()
        elif tasks[task]['type'] == 'multi-label':
            probs = torch.sigmoid(output)
            preds = (probs > 0.5).int().tolist()[0]
            predictions[task] = {col: pred for col, pred in zip(tasks[task]['columns'], preds)}
            confidences[task] = {col: prob.item() if pred == 1 else 1 - prob.item() for col, pred, prob in zip(tasks[task]['columns'], preds, probs[0])}
    return predictions, confidences


In [11]:
# Load model and tokenizer for inference
def load_model_and_predict(model_path, tokenizer_path, text, tasks, device):
    distilbert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
    model = MultiTaskDistilBERT(distilbert_model, tasks)
    model.load_state_dict(torch.load(model_path))
    model.to(device)
    model.eval()
    tokenizer = DistilBertTokenizer.from_pretrained(tokenizer_path)
    return predict_with_confidence(model, tokenizer, text, tasks, device)

In [12]:
sample_texts = [
        "This video is amazing! I loved every minute of it.",
        "Check out my channel for free iPhone giveaway! Click the link now!",
        "You are so stupid and ugly, nobody likes you.",
        "I respectfully disagree with your opinion on this matter."
    ]

In [13]:
for text in sample_texts:
    predictions, confidences = predict_with_confidence(model, tokenizer, text, tasks, device)
    print(f"Text: {text}")
    print("Predictions:", predictions)
    print("Confidences:", confidences)
    print("-" * 20)  # Separator for clarity

Text: This video is amazing! I loved every minute of it.
Predictions: {'spam': 0, 'sentiment': 'Positive', 'toxicity': {'toxic': 0, 'severe_toxic': 0, 'obscene': 0, 'threat': 0, 'insult': 0, 'identity_hate': 0}, 'hate_speech': 'normal'}
Confidences: {'spam': 0.9995343281771056, 'sentiment': 0.5267459154129028, 'toxicity': {'toxic': 0.9986151882912964, 'severe_toxic': 0.9998073807946639, 'obscene': 0.9994457864086144, 'threat': 0.9998818508465774, 'insult': 0.9996507137548178, 'identity_hate': 0.9998153176129563}, 'hate_speech': 0.9656565189361572}
--------------------
Text: Check out my channel for free iPhone giveaway! Click the link now!
Predictions: {'spam': 1, 'sentiment': 'Irrelevant', 'toxicity': {'toxic': 0, 'severe_toxic': 0, 'obscene': 0, 'threat': 0, 'insult': 0, 'identity_hate': 0}, 'hate_speech': 'normal'}
Confidences: {'spam': 0.9995025396347046, 'sentiment': 0.9373131990432739, 'toxicity': {'toxic': 0.9962404500693083, 'severe_toxic': 0.9849546300247312, 'obscene': 0.9940