In [None]:
# These training steps are the same for first- and second-stage training. There are two differences:

# 1. In first-stage training, human-annotated rationales are included,
#    whereas in second-stage training the rationales suggested by the proposed explainability method are included.

# 2. First-stage training runs for 3 epochs; second-stage training runs for 100 epochs (or until you get the best performance).

In [None]:
import os

# Expose only the GPU with index 5 to this process. This is set here, in a
# cell that runs before the one importing torch; change the index to match
# the machine the notebook runs on.
os.environ.update({"CUDA_VISIBLE_DEVICES": "5"})

In [None]:
import torch
# Sanity check: prints True when PyTorch can use a CUDA device, False otherwise
# (the training code below falls back to the CPU in that case).
print(torch.cuda.is_available())

In [None]:
import pandas as pd
# Load the dataset from an Excel workbook. "read file here" is a placeholder and
# must be replaced with the real path. Later cells read the columns 'text',
# 'clean_text', 'label' and 'pred_rationales' from this frame.
df = pd.read_excel("read file here")

In [None]:
# Split the loaded frame into a training part and a held-out evaluation part.

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# `df` is the pandas DataFrame loaded in the previous cell. The rest of the
# notebook reads its 'text', 'clean_text', 'label' and 'pred_rationales' columns.

# 15% of the rows are held out; random_state pins the shuffle so the split is
# the same on every run. Adjust test_size and random_state as needed.
train_df, test_df = train_test_split(df, test_size=0.15, random_state=42)

# Later cells assign into these frames (train_df['text'] = ...). Taking explicit
# copies makes them independent of `df`, so those assignments cannot be flagged
# by pandas as writes to a slice of another frame.
train_df = train_df.copy()
test_df = test_df.copy()

print(f"Training data size: {len(train_df)}")
print(f"Testing data size: {len(test_df)}")


In [None]:
# Cast every entry of the training split's 'text' column to str and strip
# leading/trailing whitespace.
# NOTE(review): astype(str) turns missing values into the literal string "nan",
# and CustomTextDataset below reads the 'clean_text' column, not 'text' --
# confirm this cleanup is applied to the column that actually reaches the model.
train_df['text'] = train_df['text'].astype(str).str.strip()  # Ensure all entries are strings and strip whitespace
print(f"Training data size: {len(train_df)}")

In [None]:
# Cast every entry of the test split's 'text' column to str and strip
# leading/trailing whitespace (same normalization as for the training split).
test_df['text'] = test_df['text'].astype(str).str.strip()  # Ensure all entries are strings and strip whitespace
# The size printed here is that of the test split, so label it accordingly.
print(f"Testing data size: {len(test_df)}")

In [None]:
# Preview the first five rows of the full (unsplit) frame.
df.head(5)

In [None]:
import ast

import pandas as pd
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification

class CustomTextDataset(Dataset):
    """Dataset yielding tokenized text, its label and a token-level rationale vector.

    Each row of ``dataframe`` is read through three columns:

    * ``clean_text`` -- the input sentence; non-string values are replaced by "".
    * ``label`` -- the class id, converted to a ``torch.long`` tensor.
    * ``pred_rationales`` -- a list of rationale values, usually stored as the
      string form of a Python list (e.g. ``"[0, 1, 1]"``); expected to hold one
      value per whitespace-separated word of the sentence.

    Args:
        dataframe: pandas DataFrame providing the columns above.
        tokenizer: BERT-style WordPiece tokenizer. It must be callable, expose
            ``tokenize`` and mark word-continuation pieces with a ``##`` prefix.
        max_len: fixed length every returned sequence is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Build the model inputs for row ``idx``.

        Returns:
            dict with 1-D tensors of length ``max_len`` under ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``attention_vector``
            (float rationale targets), plus a scalar ``label`` tensor.
        """
        row = self.data.iloc[idx]

        # Get text data, ensure it's a string
        text = row["clean_text"]
        if not isinstance(text, str):
            text = ""

        label = row["label"]
        word_attention = self._parse_rationales(row["pred_rationales"])

        # Tokenize once for the model inputs, once for the plain piece list
        # needed to spread word-level rationales over sub-word pieces.
        inputs = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.max_len, return_tensors="pt")
        tokens = self.tokenizer.tokenize(text)

        token_attention = self.adjust_ground_truth_attention(text, tokens, word_attention)

        # Align the targets with input_ids. Calling a Hugging Face BERT tokenizer
        # wraps the pieces as [CLS] piece_1 ... piece_n [SEP] [PAD]..., whereas
        # tokenize() returns only the pieces. Position 0 is therefore reserved
        # for [CLS], at most max_len - 2 pieces survive truncation, and [SEP] /
        # padding positions keep a target of 0.
        attention_vector = torch.zeros(self.max_len, dtype=torch.float)
        content = token_attention[:max(self.max_len - 2, 0)]
        if content:
            attention_vector[1:1 + len(content)] = torch.tensor(content, dtype=torch.float)

        return {
            'input_ids': inputs['input_ids'].squeeze(0),  # Remove batch dimension
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'token_type_ids': inputs['token_type_ids'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long),
            'attention_vector': attention_vector  # No unsqueeze(0) here
        }

    @staticmethod
    def _parse_rationales(raw):
        """Turn a ``pred_rationales`` cell into a list; return [] when it is unusable.

        ``ast.literal_eval`` is used instead of ``eval`` so that text coming
        from the data file is never executed as code. Cells that are missing
        (e.g. NaN), malformed, or that do not describe a list yield [].
        """
        if isinstance(raw, list):
            return raw
        try:
            parsed = ast.literal_eval(raw)
        except (ValueError, SyntaxError, TypeError):
            return []
        return parsed if isinstance(parsed, list) else []

    def adjust_ground_truth_attention(self, sentence, tokens, ground_truth_attention):
        """
        Adjusts ground truth attention to match the subword tokens generated by the BERT tokenizer.
        If a word is split into subwords, the attention for the original word is repeated for each subword.

        Args:
            sentence: the original text (not used by the alignment itself).
            tokens: tokenizer pieces without special tokens.
            ground_truth_attention: word-level rationale values.

        Returns:
            list with one value per entry of ``tokens``; pieces beyond the
            available word-level values receive 0.
        """
        # NOTE(review): every piece without the '##' prefix consumes the next
        # word-level value. If the tokenizer detaches punctuation into its own
        # piece, that piece also consumes a value -- verify against the data.
        adjusted_attention = []
        word_idx = 0  # Index into ground_truth_attention (one entry per word)

        for token in tokens:
            if token.startswith("##") and adjusted_attention:
                # Continuation piece: repeat the value of the word it belongs to.
                adjusted_attention.append(adjusted_attention[-1])
            elif word_idx < len(ground_truth_attention):
                # First piece of a new word: take the next word-level value.
                # (A '##' piece with nothing before it is treated the same way
                # instead of indexing into an empty list.)
                adjusted_attention.append(ground_truth_attention[word_idx])
                word_idx += 1
            else:
                # More pieces than word-level values: pad with 0.
                adjusted_attention.append(0)

        return adjusted_attention

In [None]:
class BertWithAttentionSupervision(nn.Module):
    """Sequence classifier whose last-layer [CLS] attention can be supervised.

    Wraps ``BertForSequenceClassification`` loaded from
    "google/muril-base-cased" and, next to the classification loss, computes a
    loss between the head-averaged attention of position 0 and a
    caller-supplied target vector.

    Args:
        num_labels: number of output classes of the classification head.
    """

    def __init__(self, num_labels=2):
        super().__init__()
        self.bert_classifier = BertForSequenceClassification.from_pretrained(
            "google/muril-base-cased", num_labels=num_labels
        )
        # NOTE(review): Hugging Face documents the returned attentions as
        # post-softmax weights, while BCEWithLogitsLoss applies a sigmoid to its
        # input -- confirm this pairing is intended.
        self.attention_loss_fn = nn.BCEWithLogitsLoss()

    def forward(self, input_ids, attention_mask, token_type_ids, labels, ground_truth_attention=None):
        """Run the classifier and, optionally, score its attention.

        Args:
            input_ids, attention_mask, token_type_ids: tokenizer outputs,
                forwarded unchanged to the wrapped classifier.
            labels: class ids forwarded to the wrapped classifier; may be None.
            ground_truth_attention: optional target for the attention loss; its
                first dimension must equal the batch size.

        Returns:
            ``(logits, classification_loss, attention_loss)``. The
            classification loss is whatever the wrapped model reports for
            ``labels``; the attention loss is the integer 0 when no target is
            given.

        Raises:
            ValueError: if the target's batch dimension differs from the
                attention scores' batch dimension.
        """
        outputs = self.bert_classifier(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=labels,
            output_attentions=True,
            return_dict=True,
        )

        # Final layer, query position 0 (the [CLS] slot), averaged over heads.
        final_layer_attention = outputs.attentions[-1]
        cls_attention = final_layer_attention[:, :, 0, :].mean(dim=1)

        # Without a target there is nothing to supervise.
        if ground_truth_attention is None:
            return outputs.logits, outputs.loss, 0

        batch_size = cls_attention.size(0)
        if ground_truth_attention.size(0) != batch_size:
            raise ValueError(f"Expected ground_truth_attention to be of size {batch_size}, but got {ground_truth_attention.size(0)}")

        attention_loss = self.attention_loss_fn(cls_attention, ground_truth_attention)
        return outputs.logits, outputs.loss, attention_loss

In [None]:
# Function to train the model
def train_model(model, train_dataloader, eval_dataloader, optimizer, device, alph=0.7, bet=0.3, epochs=30, save_path="model path"):
    """Train ``model`` and checkpoint it whenever evaluation accuracy improves.

    Args:
        model: module returning ``(logits, classification_loss, attention_loss)``.
        train_dataloader: iterable of batches with the keys ``input_ids``,
            ``attention_mask``, ``token_type_ids``, ``label`` and
            ``attention_vector``; must support ``len()``.
        eval_dataloader: batches handed to ``evaluate_model`` after each epoch.
        optimizer: optimizer over ``model``'s parameters.
        device: device the batch tensors are moved to.
        alph: weight of the attention loss in the total loss.
        bet: weight of the classification loss in the total loss.
        epochs: number of passes over ``train_dataloader``.
        save_path: file the best ``state_dict`` is written to. The default is
            the placeholder used so far; pass a real path.

    Returns:
        The best accuracy reported by ``evaluate_model`` (0 if no epoch
        improved on the initial value). ``evaluate_model`` must return the
        accuracy as a number.

    Raises:
        RuntimeError: re-raised from the forward/backward pass after the
            offending batch has been printed.
    """
    best_accuracy = 0

    for epoch in range(epochs):
        # evaluate_model() switches the model to eval mode at the end of every
        # epoch, so training mode (dropout etc.) has to be re-enabled here and
        # not only once before the loop.
        model.train()

        total_loss = 0
        for batch in train_dataloader:
            optimizer.zero_grad()

            # Move data to device (GPU or CPU)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['label'].to(device)
            ground_truth_attention = batch['attention_vector'].to(device)

            try:
                # Forward pass
                logits, classification_loss, attention_loss = model(input_ids=input_ids,
                                                                    attention_mask=attention_mask,
                                                                    token_type_ids=token_type_ids,
                                                                    labels=labels,
                                                                    ground_truth_attention=ground_truth_attention)

                # Total loss with weights
                total_batch_loss = alph * attention_loss + bet * classification_loss

                # Backpropagation
                total_batch_loss.backward()
                optimizer.step()

                # Accumulate total loss
                total_loss += total_batch_loss.item()

            except RuntimeError:
                print(f"Error in batch: {batch}")
                raise

        # Guard against an empty loader so the average never divides by zero.
        avg_loss = total_loss / max(len(train_dataloader), 1)
        print(f"Epoch {epoch + 1}/{epochs}, Total Loss: {avg_loss:.4f}")

        # Evaluate the model after each epoch and keep only the best weights.
        epoch_accuracy = evaluate_model(model, eval_dataloader, device)
        if epoch_accuracy > best_accuracy:
            best_accuracy = epoch_accuracy
            print(epoch_accuracy)

            # Save the trained model
            torch.save(model.state_dict(), save_path)
            print("Model updated")

    return best_accuracy
            
            
# Evaluate model
def evaluate_model(model, dataloader, device):
    """Return the accuracy of ``model`` over all batches of ``dataloader``.

    The model is switched to eval mode and run without gradient tracking. Each
    batch must provide ``input_ids``, ``attention_mask``, ``token_type_ids``
    and ``label``; predictions are the argmax over the returned logits.

    Note: a later cell of this notebook defines another ``evaluate_model``;
    whichever definition was executed last is the one that gets called.
    """
    model.eval()
    gold, predicted = [], []

    with torch.no_grad():
        for batch in dataloader:
            # No labels and no attention target: only the logits are needed.
            logits, _, _ = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                token_type_ids=batch['token_type_ids'].to(device),
                labels=None,
                ground_truth_attention=None,
            )

            predicted.extend(torch.argmax(logits, dim=-1).cpu().numpy())
            gold.extend(batch['label'].cpu().numpy())

    # Calculate metrics
    return accuracy_score(gold, predicted)



# Main function
def main():
    """Build tokenizer, data loaders, model and optimizer, then start training.

    Reads the notebook-level ``train_df`` and ``test_df`` frames. Sequences are
    padded/truncated to 128 tokens; training uses batches of 64 (shuffled),
    evaluation batches of 32. Training runs for 100 epochs with the attention
    loss weighted 0.7 and the classification loss weighted 0.3.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    max_len = 128  # Adjust based on your dataset

    tokenizer = BertTokenizer.from_pretrained("google/muril-base-cased")

    # Training and evaluation loaders over the two splits.
    train_dataloader = DataLoader(
        CustomTextDataset(train_df, tokenizer, max_len=max_len),
        batch_size=64,
        shuffle=True,
    )
    eval_dataloader = DataLoader(
        CustomTextDataset(test_df, tokenizer, max_len=max_len),
        batch_size=32,
        shuffle=False,
    )

    model = BertWithAttentionSupervision(num_labels=2).to(device)

    # Match the embedding matrix to the tokenizer's vocabulary size
    # (no tokens are added to the tokenizer in this function).
    model.bert_classifier.resize_token_embeddings(len(tokenizer))

    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

    train_model(model, train_dataloader, eval_dataloader, optimizer, device, alph=0.7, bet=0.3, epochs=100)


def evaluate_loaded_model(device):
    """Load a saved checkpoint and evaluate it on the notebook-level ``test_df``.

    Both the tokenizer location and the checkpoint file are given as the
    placeholder "path"; replace them with the real locations before use.

    Args:
        device: device the model is placed on and the checkpoint is mapped to.

    Returns:
        Whatever ``evaluate_model`` returns for the loaded model.
    """
    # Load the tokenizer first
    tokenizer = AutoTokenizer.from_pretrained("path")

    # Load the saved model for evaluation
    loaded_model = BertWithAttentionSupervision(num_labels=2).to(device)

    # map_location lets a checkpoint saved on a GPU be restored on whatever
    # device is in use here (e.g. a CPU-only machine).
    state_dict = torch.load("path", map_location=device)
    loaded_model.load_state_dict(state_dict)
    print("Model loaded for evaluation.")

    # Create a separate evaluation dataset and dataloader
    eval_dataset = CustomTextDataset(test_df, tokenizer, max_len=128)
    eval_dataloader = DataLoader(eval_dataset, batch_size=32, shuffle=False)

    # Evaluate the loaded model
    return evaluate_model(loaded_model, eval_dataloader, device)

if __name__ == "__main__":
    # Run the whole training pipeline; main() builds the loaders and the model
    # and trains for 100 epochs.
    main()
    # Notebook-level device for the evaluate_loaded_model call, which is
    # currently disabled below.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     evaluate_loaded_model(device)

# Evaluation

In [None]:
# Device passed to evaluate_loaded_model in the final cell: CUDA when
# available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Evaluate model
def evaluate_model(model, dataloader, device):
    """Evaluate ``model`` on ``dataloader``, print a metric report, return accuracy.

    The model is switched to eval mode and run without gradient tracking. Each
    batch must provide ``input_ids``, ``attention_mask``, ``token_type_ids``
    and ``label``; predictions are the argmax over the returned logits.

    Printed metrics: accuracy, binary precision/recall/F1 and their
    macro-averaged counterparts.

    Returns:
        The accuracy. Returning it keeps this definition interchangeable with
        the ``evaluate_model`` defined in the training cell, whose return value
        ``train_model`` compares against the best accuracy so far.
    """
    model.eval()
    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)

            logits, _, _ = model(input_ids=input_ids,
                                 attention_mask=attention_mask,
                                 token_type_ids=token_type_ids,
                                 labels=None,
                                 ground_truth_attention=None)  # Set to None for evaluation

            predictions = torch.argmax(logits, dim=-1)
            all_labels.extend(batch['label'].cpu().numpy())
            all_predictions.extend(predictions.cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_predictions)
    precision = precision_score(all_labels, all_predictions, average='binary')
    recall = recall_score(all_labels, all_predictions, average='binary')
    f1 = f1_score(all_labels, all_predictions, average='binary')

    macro_precision = precision_score(all_labels, all_predictions, average='macro')
    macro_recall = recall_score(all_labels, all_predictions, average='macro')
    macro_f1 = f1_score(all_labels, all_predictions, average='macro')

    print("Evaluation Metrics:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Macro Precision: {macro_precision:.4f}")
    print(f"Macro Recall: {macro_recall:.4f}")
    print(f"Macro F1 Score: {macro_f1:.4f}")

    return accuracy


def evaluate_loaded_model(device):
    """Load the saved checkpoint and evaluate it on the notebook-level ``test_df``.

    The tokenizer is loaded from "google/muril-base-cased"; the checkpoint file
    name "path" is a placeholder that must be replaced with the file written
    during training.

    Args:
        device: device the model is placed on and the checkpoint is mapped to.

    Returns:
        Whatever ``evaluate_model`` returns for the loaded model.
    """
    # Load the tokenizer first
    tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")

    # Load the saved model for evaluation
    loaded_model = BertWithAttentionSupervision(num_labels=2).to(device)

    # Same embedding resize as in main(), applied before the weights are
    # restored so the parameter shapes match the checkpoint.
    loaded_model.bert_classifier.resize_token_embeddings(len(tokenizer))

    # map_location lets a checkpoint saved on a GPU be restored on whatever
    # device is in use here (e.g. a CPU-only machine).
    state_dict = torch.load("path", map_location=device)
    loaded_model.load_state_dict(state_dict)
    print("Model loaded for evaluation.")

    # Create a separate evaluation dataset and dataloader
    eval_dataset = CustomTextDataset(test_df, tokenizer, max_len=128)
    eval_dataloader = DataLoader(eval_dataset, batch_size=32, shuffle=False)

    # Evaluate the loaded model
    return evaluate_model(loaded_model, eval_dataloader, device)

In [None]:
# %%capture cap
# Restore the saved checkpoint and report its metrics on the held-out split.
evaluate_loaded_model(device)