# Deep Leaning Project

In [1]:
%pip install libretranslatepy
%pip install transformers datasets evaluate

from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torchvision
import torchvision.datasets as datasets
import torchvision.models as models
import torchvision.utils as utils
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import requests
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, RobertaTokenizer, RobertaModel, RobertaForSequenceClassification
import os


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.





**Run libretranslate on http://127.0.0.1:5000**

*Add the languages to translate in --load-only <comma-separated language codes> Set available languages (ar,de,en,es,fr,ga,hi,it,ja,ko,pt,ru,zh)*

**Jupyter does not allow the process to run in the background, thus start a console and run libretranslate --load-only en,de**

**Add the Dataset**

In [2]:

# Don't forget to cite the authors for using their dataset for our project.
# Load the dataset into a pandas dataframe
datasetFilename = "Dynamically_Generated_Hate_Dataset_v0.2.3.csv"
datasetPath = os.path.join(os.getcwd(), datasetFilename)
datasetInDataframe = pd.read_csv(datasetPath)

# Get dataset for training and make textlabels binary
datasetTrain = datasetInDataframe[["text", "label", "split"]]
datasetTrain = datasetTrain[datasetTrain["split"] == "train"]
datasetTrain["label"] = datasetTrain["label"].map({"hate": 1, 'nothate': 0})
datasetTrain = datasetTrain.drop(columns="split")

# Get dataset for testing and make labels binary
datasetTest = datasetInDataframe[["text", "label", "split"]]
datasetTest = datasetTest[datasetTest["split"] == "test"]
datasetTest["label"] = datasetTest["label"].map({"hate": 1, 'nothate': 0})
datasetTest = datasetTest.drop(columns="split")

# Get dev dataset, whatever that is, and make labels binary
datasetDev = datasetInDataframe[["text", "label", "split"]]
datasetDev = datasetDev[datasetDev["split"] == "dev"]
datasetDev["label"] = datasetDev["label"].map({"hate": 1, 'nothate': 0})
datasetDev = datasetDev.drop(columns="split")

# Next step would be tokenization of the text as input for our model.


*Iterate through the dataset and translate, specifiy the source and target language. For batch processing, add multiple requests into the array*

In [None]:
url = "http://127.0.0.1:5000/translate"

params = {"q" : ["Hello, how are you?", "What is your name?"],
          "source" : "en",
          "target" : "ar"
}

response = requests.post(url, json=params)

if response.status_code == 200:
    translated_text = response.json()["translatedText"]
    print(translated_text)
else:
    print(f"Request failed with status code {response.status_code}")

**Load the pretrained Model**

In [4]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = model = RobertaForSequenceClassification.from_pretrained('roberta-base')
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


*Create a Custom Dataset for training and test datasets*

In [7]:
# Define dataset class
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(label, dtype=torch.long),
        }

In [None]:
# Load dataset for training
train_texts = datasetTrain["text"].tolist()
train_labels = datasetTrain["label"].tolist()

# Define training dataset and data loader
train_dataset = CustomDataset(train_texts, train_labels, tokenizer, max_length=128)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Freeze pre-trained model layers
for param in model.roberta.parameters():
    param.requires_grad = False

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

# Training loop
num_epochs = 5

model.train()
for epoch in range(num_epochs):
    for batch in train_loader:
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()
    print("Epoch_number: ", epoch)

# Save fineturned model
torch.save(model.state_dict(), "trained_model.pth")

Validate the trained model with the TestDataset

In [None]:
# Load Datasets
val_texts = datasetTest["text"].tolist()
val_labels = datasetTest["label"].tolist()

# Define validation dataset and data loader
val_dataset = CustomDataset(val_texts, val_labels, tokenizer, max_length=128)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Evaluation loop
model.eval()
val_loss = 0.0
val_correct = 0
counter1 = []


for batch in val_loader:
    counter1.append("0")


print(len(counter1))

counter = 0

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]

        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        val_loss += loss.item()

        # Calculate accuracy
        _, predicted = torch.max(outputs.logits, 1)
        val_correct += (predicted == labels).sum().item()
        print("counter", counter)
        counter = counter + 1

# Calculate average validation loss
avg_val_loss = val_loss / len(val_loader.dataset)

# Calculate validation accuracy
val_accuracy = val_correct / len(val_loader.dataset)

print(f"Validation Loss: {avg_val_loss:.4f}")
print(f"Validation Accuracy: {val_accuracy:.4f}")

## Arabert

In [10]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import os
import re

In [12]:
# Loading the tokenizer and model
model_name = "aubmindlab/bert-large-arabertv02-twitter"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)



tokenizer_config.json:   0%|          | 0.00/456 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/815k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.48G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-large-arabertv02-twitter and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preprocess arabic datasets to match english translation. Only needs to be done once, after that it will be saved as .csv in the same folder.

In [11]:
def remove_emojis(text):
    # Regex to filter out emojis
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F700-\U0001F77F"  # alchemical symbols
                               u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                               u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                               u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                               u"\U0001FA00-\U0001FA6F"  # Chess Symbols
                               u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                               u"\U00002702-\U000027B0"  # Dingbats
                               u"\U000024C2-\U0001F251" 
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def process_file(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split('\t')
            if len(parts) < 6:
                continue  # Skip malformed lines
            id, tweet_text, off_label, hs_label, vulgar_label, violence_label = parts
            # Replace "<LF>" with ".", remove emojis
            tweet_text = tweet_text.replace('<LF>', '.')
            tweet_text = remove_emojis(tweet_text)
            data.append([id, tweet_text, off_label, hs_label, vulgar_label, violence_label])
    return pd.DataFrame(data, columns=['id', 'tweet_text', 'OFF_label', 'HS_label', 'Vulgar_label', 'Violence_label'])

# Example usage:
dataset_train = process_file('arabic-data/OSACT2022-sharedTask-train.txt')
dataset_dev = process_file('arabic-data/OSACT2022-sharedTask-dev.txt')
dataset_test = process_file('arabic-data/OSACT2022-sharedTask-test-tweets.txt')

# Saving to CSV
dataset_train.to_csv('arabic-data/OSACT2022-sharedTask-train.csv', index=False)
dataset_dev.to_csv('arabic-data/OSACT2022-sharedTask-dev.csv', index=False)
dataset_test.to_csv('arabic-data/OSACT2022-sharedTask-test-tweets.csv', index=False)


In [14]:
# Define a function to load the dataset and map labels
def load_and_process_data(filepath):
    # Load dataset
    df = pd.read_csv(filepath, delimiter=',')
    
    # Define a mapping function for binary labels
    def map_label(row):
        if 'NOT_' in row['OFF_label'] and 'NOT_' in row['Vulgar_label'] and 'NOT_' in row['Violence_label'] and 'HS' not in row['HS_label']:
            return 0
        return 1
    
    # Apply the mapping function to create a binary label
    df['binary_label'] = df.apply(map_label, axis=1)
    
    # Select and rename necessary columns
    df = df[['tweet_text', 'binary_label']]
    df.columns = ['text', 'label']
    
    return df

# Paths to the datasets
train_path = 'arabic-data/OSACT2022-sharedTask-train.csv'
dev_path = 'arabic-data/OSACT2022-sharedTask-dev.csv'
test_path = 'arabic-data/OSACT2022-sharedTask-test-tweets.csv'

# Load and process datasets
dataset_train = load_and_process_data(train_path)
dataset_dev = load_and_process_data(dev_path)
dataset_test = load_dataset('csv', data_files='arabic-data/OSACT2022-sharedTask-test-tweets.txt')['test']

# Now dataset_train, dataset_dev, and dataset_test contain 'text' and 'label' where label is binary


Generating train split: 0 examples [00:00, ? examples/s]

Failed to read file '/Users/romino/Documents-local/projects/deeplearningproject_2024/arabic-data/OSACT2022-sharedTask-test-tweets.txt' with error <class 'pandas.errors.ParserError'>: Error tokenizing data. C error: Expected 1 fields in line 26, saw 4



DatasetGenerationError: An error occurred while generating the dataset

In [None]:
# Define a PyTorch dataset class
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Function to encode the texts
def encode_texts(tokenizer, texts, labels, max_length=512):
    encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=max_length)
    return TextDataset(encodings, labels)

In [None]:
# Preparing datasets
train_texts = dataset_train['text']
train_labels = dataset_train['label']
train_dataset = encode_texts(tokenizer, train_texts, train_labels)

dev_texts = dataset_dev['text']
dev_labels = dataset_dev['label']
dev_dataset = encode_texts(tokenizer, dev_texts, dev_labels)

test_texts = dataset_test['text']
test_labels = dataset_test['label']
test_dataset = encode_texts(tokenizer, test_texts, test_labels)

In [None]:
# Dataloader
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
# Model training function
def train_model(model, train_loader, optimizer, criterion, epochs=5):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch+1}: Loss {total_loss/len(train_loader)}')
        
# Training setup
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()

In [None]:
# Run this cell to train the model
train_model(model, train_loader, optimizer, criterion)

# Uncomment below to save the model and weights
# model.save_pretrained('arabert-pretrained')
# torch.save(model.state_dict(), "arabert-weights.pth")

In [None]:
# Model evaluation function
def evaluate_model(model, loader):
    model.eval()
    total_loss, total_accuracy = 0, 0
    for batch in loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        accuracy = (predictions == labels).float().mean()
        total_loss += loss.item()
        total_accuracy += accuracy.item()
    return total_loss / len(loader), total_accuracy / len(loader)

In [None]:
# Run this cell to evaluate the model
val_loss, val_accuracy = evaluate_model(model, dev_loader)
print(f'Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')