In [1]:
import pandas as pd
from transformers import BertForSequenceClassification, BertTokenizer, AdamW
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

class ArabicToxicDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()
        return input_ids, attention_mask, label

def collate_fn(batch):
    input_ids = torch.stack([item[0] for item in batch])
    attention_mask = torch.stack([item[1] for item in batch])
    labels = torch.tensor([item[2] for item in batch])
    return input_ids, attention_mask, labels

def preprocess_data(filepath):
    data = pd.read_csv(filepath)
    data['label'] = data['Majority_Label'].apply(lambda x: 1 if x == 'Offensive' else 0)
    return data

def oversample_data(data):
    X = data['Comment']
    y = data['label']
    ros = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = ros.fit_resample(X.values.reshape(-1, 1), y)
    balanced_data = pd.DataFrame({
        'Comment': X_resampled.flatten(),
        'label': y_resampled
    })
    return balanced_data

def undersample_data(data):
    X = data['Comment']
    y = data['label']
    rus = RandomUnderSampler(random_state=42)
    X_resampled, y_resampled = rus.fit_resample(X.values.reshape(-1, 1), y)
    undersampled_data = pd.DataFrame({
        'Comment': X_resampled.flatten(),
        'label': y_resampled
    })
    # store undersampled_data in a csv file
    undersampled_data.to_csv('undersampled_data.csv', index=False)
    return undersampled_data

def train_and_evaluate(data, model, tokenizer, device, epochs=5, batch_size=32):
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

    train_texts = train_data['Comment'].tolist()
    train_labels = train_data['label'].tolist()
    test_texts = test_data['Comment'].tolist()
    test_labels = test_data['label'].tolist()

    train_dataset = ArabicToxicDataset(train_texts, train_labels, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

    optimizer = AdamW(model.parameters(), lr=2e-5)

    model.train()
    for epoch in range(epochs):
        for input_ids, attention_mask, labels in train_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

    test_dataset = ArabicToxicDataset(test_texts, test_labels, tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    model.eval()
 
    all_labels = []
    all_predictions = []
    with torch.no_grad():
        for input_ids, attention_mask, labels in test_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1)
            # correct += (predictions == labels).sum().item()
            # total += labels.size(0)
            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predictions.cpu().numpy())

    # accuracy = correct / total
    # print(f"Test Accuracy: {accuracy:.4f}")
    # Calculate and print metrics
    accuracy = accuracy_score(all_labels, all_predictions)
    precision = precision_score(all_labels, all_predictions)
    recall = recall_score(all_labels, all_predictions)
    f1 = f1_score(all_labels, all_predictions)


    print(f"Test Accuracy: {accuracy:.4f}")
    print(f"Test Precision: {precision:.4f}")
    print(f"Test Recall: {recall:.4f}")
    print(f"Test F1 Score: {f1:.4f}")


def train_and_evaluate30(data, model, tokenizer, device, epochs=5, batch_size=32):
    train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

    train_texts = train_data['Comment'].tolist()
    train_labels = train_data['label'].tolist()
    test_texts = test_data['Comment'].tolist()
    test_labels = test_data['label'].tolist()

    train_dataset = ArabicToxicDataset(train_texts, train_labels, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

    optimizer = AdamW(model.parameters(), lr=2e-5)

    model.train()
    for epoch in range(epochs):
        for input_ids, attention_mask, labels in train_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

    test_dataset = ArabicToxicDataset(test_texts, test_labels, tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    model.eval()
    # correct = 0
    # total = 0
    all_labels = []
    all_predictions = []
    with torch.no_grad():
        for input_ids, attention_mask, labels in test_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1)
            # correct += (predictions == labels).sum().item()
            # total += labels.size(0)
            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predictions.cpu().numpy())

    # accuracy = correct / total
    # print(f"Test Accuracy: {accuracy:.4f}")
    # Calculate and print metrics
    accuracy = accuracy_score(all_labels, all_predictions)
    precision = precision_score(all_labels, all_predictions)
    recall = recall_score(all_labels, all_predictions)
    f1 = f1_score(all_labels, all_predictions)


    print(f"Test Accuracy: {accuracy:.4f}")
    print(f"Test Precision: {precision:.4f}")
    print(f"Test Recall: {recall:.4f}")
    print(f"Test F1 Score: {f1:.4f}")



def main():
    # Load and preprocess data
    data = preprocess_data('ardata.csv')

    # Load the BERT model and tokenizer
    model = BertForSequenceClassification.from_pretrained('asafaya/bert-base-arabic', num_labels=2)
    tokenizer = BertTokenizer.from_pretrained('asafaya/bert-base-arabic')

    # Device configuration
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    model.to(device)

    # Train and evaluate on original data
    print("Training and evaluating on original data...")
    train_and_evaluate(data, model, tokenizer, device)
    #
    # Train and evaluate on 30% test data
    print("Training and evaluating on 30% test data...")
    train_and_evaluate30(data, model, tokenizer, device)
    
    # Oversample the data
    balanced_data = oversample_data(data)
    # Train and evaluate on oversampled data
    print("Training and evaluating on balanced data...")
    train_and_evaluate(balanced_data, model, tokenizer, device)
    #
    # Train and evaluate on 30% test data
    print("Training and evaluating on 30% test data for balanced_data...")
    train_and_evaluate30(balanced_data, model, tokenizer, device)

    # Undersample the data
    undersampled_data = undersample_data(data)

    # Train and evaluate on undersampled data
    print("Training and evaluating on undersampled data...")
    train_and_evaluate(undersampled_data, model, tokenizer, device)

    # Train and evaluate on 30% test data for undersampled data
    print("Training and evaluating on 30% test data for undersampled data...")
    train_and_evaluate30(undersampled_data, model, tokenizer, device)


if __name__ == "__main__":
    main()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at asafaya/bert-base-arabic and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda
Training and evaluating on original data...


  attn_output = torch.nn.functional.scaled_dot_product_attention(


Test Accuracy: 0.8838
Test Precision: 0.6689
Test Recall: 0.6923
Test F1 Score: 0.6804
Training and evaluating on 30% test data...




Test Accuracy: 0.9183
Test Precision: 0.7647
Test Recall: 0.7573
Test F1 Score: 0.7610
Training and evaluating on balanced data...




Test Accuracy: 0.9820
Test Precision: 0.9695
Test Recall: 0.9938
Test F1 Score: 0.9815
Training and evaluating on 30% test data for balanced_data...




Test Accuracy: 0.9875
Test Precision: 0.9819
Test Recall: 0.9929
Test F1 Score: 0.9874
Training and evaluating on undersampled data...




Test Accuracy: 1.0000
Test Precision: 1.0000
Test Recall: 1.0000
Test F1 Score: 1.0000
Training and evaluating on 30% test data for undersampled data...




Test Accuracy: 1.0000
Test Precision: 1.0000
Test Recall: 1.0000
Test F1 Score: 1.0000


In [2]:
def main():
    # Load and preprocess data
    data = preprocess_data('ardata.csv')

    # Load the BERT model and tokenizer
    model = BertForSequenceClassification.from_pretrained('asafaya/bert-base-arabic', num_labels=2)
    tokenizer = BertTokenizer.from_pretrained('asafaya/bert-base-arabic')

    # Device configuration
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    model.to(device)

  
    # Undersample the data
    undersampled_data = undersample_data(data)

    # Train and evaluate on undersampled data
    print("Training and evaluating on undersampled data...")
    train_and_evaluate(undersampled_data, model, tokenizer, device)

    # Train and evaluate on 30% test data for undersampled data
    print("Training and evaluating on 30% test data for undersampled data...")
    train_and_evaluate30(undersampled_data, model, tokenizer, device)


if __name__ == "__main__":
    main()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at asafaya/bert-base-arabic and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda
Training and evaluating on undersampled data...




Test Accuracy: 0.8037
Test Precision: 0.8231
Test Recall: 0.7810
Test F1 Score: 0.8015
Training and evaluating on 30% test data for undersampled data...




Test Accuracy: 0.8593
Test Precision: 0.8565
Test Recall: 0.8689
Test F1 Score: 0.8627
