# Deep Leaning Project

In [27]:
%pip install libretranslatepy
%pip install transformers datasets evaluate

from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torchvision
import torchvision.datasets as datasets
import torchvision.models as models
import torchvision.utils as utils
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import requests
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, RobertaTokenizer, RobertaModel, RobertaForSequenceClassification
import os


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.



**Run libretranslate on http://127.0.0.1:5000**

*Add the languages to translate in --load-only <comma-separated language codes> Set available languages (ar,de,en,es,fr,ga,hi,it,ja,ko,pt,ru,zh)*

**Jupyter does not allow the process to run in the background, thus start a console and run libretranslate --load-only en,de**

**Add the Dataset**

In [21]:

# Don't forget to cite the authors for using their dataset for our project.
# Load the dataset into a pandas dataframe
datasetFilename = "Dynamically_Generated_Hate_Dataset_v0.2.3.csv"
datasetPath = os.path.join(os.getcwd(), datasetFilename)
datasetInDataframe = pd.read_csv(datasetPath)

# Get dataset for training and make textlabels binary
datasetTrain = datasetInDataframe[["", "label", "split"]]
datasetTrain = datasetTrain[datasetTrain["split"] == "train"]
datasetTrain["label"] = datasetTrain["label"].map({"hate": 1, 'nothate': 0})
datasetTrain = datasetTrain.drop(columns="split")

# Get dataset for testing and make labels binary
datasetTest = datasetInDataframe[["text", "label", "split"]]
datasetTest = datasetTest[datasetTest["split"] == "test"]
datasetTest["label"] = datasetTest["label"].map({"hate": 1, 'nothate': 0})
datasetTest = datasetTest.drop(columns="split")

# Get dev dataset, whatever that is, and make labels binary
datasetDev = datasetInDataframe[["text", "label", "split"]]
datasetDev = datasetDev[datasetDev["split"] == "dev"]
datasetDev["label"] = datasetDev["label"].map({"hate": 1, 'nothate': 0})
datasetDev = datasetDev.drop(columns="split")

# Next step would be tokenization of the text as input for our model.


*Iterate through the dataset and translate, specifiy the source and target language. For batch processing, add multiple requests into the array*

In [17]:
url = "http://127.0.0.1:5000/translate"

params = {"q" : ["Hello, how are you?", "What is your name?"],
          "source" : "en",
          "target" : "de"
}

response = requests.post(url, json=params)

if response.status_code == 200:
    translated_text = response.json()["translatedText"]
    print(translated_text)
else:
    print(f"Request failed with status code {response.status_code}")

ConnectionError: HTTPConnectionPool(host='127.0.0.1', port=5000): Max retries exceeded with url: /translate (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7b599e0cbd90>: Failed to establish a new connection: [Errno 111] Connection refused'))

**Load the pretrained Model**

In [28]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = model = RobertaForSequenceClassification.from_pretrained('roberta-base')
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


*Create a Dataloader for training and test datasets*

In [19]:
train_dataloader = DataLoader(dataset, shuffle=True, batch_size=10)
eval_dataloader = DataLoader(dataset, batch_size=10)


NameError: name 'dataset' is not defined

In [33]:
# Define your dataset class
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(label, dtype=torch.long),
        }

In [38]:


# Prepare your dataset for training
train_texts = datasetTrain["text"].tolist()
train_labels = datasetTrain["label"].tolist()

# Define your training dataset and data loader
train_dataset = CustomDataset(train_texts, train_labels, tokenizer, max_length=128)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Freeze pre-trained model layers
for param in model.roberta.parameters():
    param.requires_grad = False

# Define your loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

# Training loop
num_epochs = 5

model.train()
for epoch in range(num_epochs):
    for batch in train_loader:
        print(batch)
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()

# Save the trained model
torch.save(model.state_dict(), "trained_model.pth")

{'input_ids': tensor([[    0,  1620,    89,  ...,     1,     1,     1],
        [    0, 44412,    24,  ...,     1,     1,     1],
        [    0, 10932,    89,  ...,     1,     1,     1],
        ...,
        [    0, 25616,    86,  ...,     1,     1,     1],
        [    0, 24873,  1137,  ...,    52,   214,     2],
        [    0,   282,  1023,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0,
        0, 1, 0, 0, 0, 1, 0, 1])}
{'input_ids': tensor([[    0,   133,   796,  ...,     1,     1,     1],
        [    0,   133,   266,  ...,     1,     1,     1],
        [    0,  9226,  1437,  ...,     1,     1,     1],
        ...,
        [    0,  1185,    64,  ...,     1,     1,     1],
   

KeyboardInterrupt: 