In [None]:
!pip install transformers[torch]
!pip install accelerate -U


Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

# T5 model without Type

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from tqdm.auto import tqdm
import pandas as pd
from google.colab import drive

# Make Google Drive available under /content/drive
drive.mount("/content/drive")

# Pretrained checkpoint used for both the tokenizer and the model
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Read only the first 1000 data rows of the CSV stored on Drive
csv_file_path = "/content/drive/MyDrive/questions_responses.csv"
df = pd.read_csv(
    csv_file_path,
    nrows=1000,
)

# Define a custom dataset class for question answering
class QADataset(Dataset):
    """Dataset turning rows of a question/response table into T5 training pairs.

    Each item encodes the prompt ``"question: <question>"`` as the model input
    and the row's response as the target.
    """

    def __init__(self, examples, tokenizer):
        """Store the source table and the tokenizer.

        Args:
            examples: Table supporting ``len()`` and ``.iloc[idx]`` row access
                whose rows have "question" and "response" fields (a pandas
                DataFrame in this notebook).
            tokenizer: Object exposing
                ``encode(text, return_tensors=..., max_length=..., truncation=...)``.
        """
        self.examples = examples
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of rows in the underlying table."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Encode row ``idx``.

        Returns:
            dict with "input_ids" (encoded prompt, truncated to 512 tokens) and
            "labels" (encoded response, truncated to 32 tokens), both exactly
            as returned by ``tokenizer.encode(..., return_tensors="pt")``.
        """
        example = self.examples.iloc[idx]
        question = example["question"]
        response = example["response"]
        # Only the question goes into the prompt. Putting the response into the
        # input as well would hand the model its own target, and would not match
        # the "question: ..." prompt used when generating answers.
        input_text = "question: {}".format(question)
        target_text = response
        input_ids = self.tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
        target_ids = self.tokenizer.encode(target_text, return_tensors="pt", max_length=32, truncation=True)
        return {"input_ids": input_ids, "labels": target_ids}

# Wrap the loaded DataFrame in the dataset class used for training
train_dataset = QADataset(examples=df, tokenizer=tokenizer)

# Define the collate function to pad sequences dynamically within each batch
def collate_fn(batch, pad_token_id=None, label_pad_token_id=-100):
    """Pad a list of dataset items to the longest sequence in the batch.

    Args:
        batch: List of dicts with "input_ids" and "labels" tensors; a leading
            dimension of size 1 is squeezed away before padding.
        pad_token_id: Value used to pad "input_ids". Defaults to the global
            ``tokenizer.pad_token_id``.
        label_pad_token_id: Value used to pad "labels". Defaults to -100, the
            index the Hugging Face seq2seq loss ignores, so padded label
            positions do not contribute to the loss.

    Returns:
        dict with batch-first padded "input_ids" and "labels" tensors.
    """
    if pad_token_id is None:
        pad_token_id = tokenizer.pad_token_id
    input_ids = [item['input_ids'].squeeze(0) for item in batch]
    labels = [item['labels'].squeeze(0) for item in batch]
    input_ids_padded = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=pad_token_id)
    labels_padded = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=label_pad_token_id)
    return {"input_ids": input_ids_padded, "labels": labels_padded}

# Seed PyTorch so that batch shuffling and dropout are repeatable across runs
torch.manual_seed(42)

# Prepare the data loader; collate_fn pads every batch to its longest sequence
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)

# Define the optimizer (constant learning rate; no scheduler is used)
optimizer = Adam(model.parameters(), lr=5e-5)

# Training loop
num_epochs = 1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}", unit="batch"):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)
        # Inputs are padded with the tokenizer's pad id; mask those positions so
        # the encoder does not attend to padding.
        attention_mask = input_ids.ne(tokenizer.pad_token_id).long()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    average_loss = total_loss / len(train_loader)
    print(f"Average loss for epoch {epoch + 1}: {average_loss}")

# Save the fine-tuned model, plus the tokenizer so the directory is self-contained
model.save_pretrained("./t5_qa_finetuned")
tokenizer.save_pretrained("./t5_qa_finetuned")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Epoch 1:   0%|          | 0/250 [00:00<?, ?batch/s]

Average loss for epoch 1: 1.3478613460063935


In [None]:
# Reload the fine-tuned checkpoint from the local directory the training cell saved to
model_path = "./t5_qa_finetuned"
model = T5ForConditionalGeneration.from_pretrained(
    model_path
)


In [None]:
# prompt: save the model t5_qa_finetuned  to drive to folder T5_models

!cp -r ./t5_qa_finetuned /content/drive/MyDrive/T5_models/t5_qa_finetuned


# T5 model with Type

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from tqdm.auto import tqdm
import pandas as pd
from google.colab import drive

# Make Google Drive available under /content/drive
drive.mount("/content/drive")

# Pretrained checkpoint used for both the tokenizer and the model
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Read only the first 1000 data rows of the CSV stored on Drive
csv_file_path = "/content/drive/MyDrive/questions_responses.csv"
df = pd.read_csv(
    csv_file_path,
    nrows=1000,
)

# Define a custom dataset class for question answering
class QADataset(Dataset):
    """Dataset turning rows of a question/response/type table into T5 training pairs.

    Each item encodes the prompt ``"question: <question> type: <type>"`` as the
    model input and the row's response as the target.
    """

    def __init__(self, examples, tokenizer):
        """Store the source table and the tokenizer.

        Args:
            examples: Table supporting ``len()`` and ``.iloc[idx]`` row access
                whose rows have "question", "response" and "type" fields (a
                pandas DataFrame in this notebook).
            tokenizer: Object exposing
                ``encode(text, return_tensors=..., max_length=..., truncation=...)``.
        """
        self.examples = examples
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of rows in the underlying table."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Encode row ``idx``.

        Returns:
            dict with "input_ids" (encoded prompt, truncated to 512 tokens) and
            "labels" (encoded response, truncated to 32 tokens), both exactly
            as returned by ``tokenizer.encode(..., return_tensors="pt")``.
        """
        example = self.examples.iloc[idx]
        question = example["question"]
        response = example["response"]
        question_type = example["type"]
        # The prompt holds the question and its type exactly once. The response
        # is deliberately left out: it is the target, so including it in the
        # input would leak the answer to the model.
        input_text = "question: {} type: {}".format(question, question_type)
        target_text = response
        input_ids = self.tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
        target_ids = self.tokenizer.encode(target_text, return_tensors="pt", max_length=32, truncation=True)
        return {"input_ids": input_ids, "labels": target_ids}

# Wrap the loaded DataFrame in the dataset class used for training
train_dataset = QADataset(examples=df, tokenizer=tokenizer)

# Define the collate function to pad sequences dynamically within each batch
def collate_fn(batch, pad_token_id=None, label_pad_token_id=-100):
    """Pad a list of dataset items to the longest sequence in the batch.

    Args:
        batch: List of dicts with "input_ids" and "labels" tensors; a leading
            dimension of size 1 is squeezed away before padding.
        pad_token_id: Value used to pad "input_ids". Defaults to the global
            ``tokenizer.pad_token_id``.
        label_pad_token_id: Value used to pad "labels". Defaults to -100, the
            index the Hugging Face seq2seq loss ignores, so padded label
            positions do not contribute to the loss.

    Returns:
        dict with batch-first padded "input_ids" and "labels" tensors.
    """
    if pad_token_id is None:
        pad_token_id = tokenizer.pad_token_id
    input_ids = [item['input_ids'].squeeze(0) for item in batch]
    labels = [item['labels'].squeeze(0) for item in batch]
    input_ids_padded = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=pad_token_id)
    labels_padded = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=label_pad_token_id)
    return {"input_ids": input_ids_padded, "labels": labels_padded}

# Seed PyTorch so that batch shuffling and dropout are repeatable across runs
torch.manual_seed(42)

# Prepare the data loader; collate_fn pads every batch to its longest sequence
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)

# Define the optimizer (constant learning rate; no scheduler is used)
optimizer = Adam(model.parameters(), lr=5e-5)

# Training loop
num_epochs = 1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}", unit="batch"):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)
        # Inputs are padded with the tokenizer's pad id; mask those positions so
        # the encoder does not attend to padding.
        attention_mask = input_ids.ne(tokenizer.pad_token_id).long()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    average_loss = total_loss / len(train_loader)
    print(f"Average loss for epoch {epoch + 1}: {average_loss}")

# Save the fine-tuned model, plus the tokenizer so the directory is self-contained
model.save_pretrained("./t5_qa_withType_finetuned")
tokenizer.save_pretrained("./t5_qa_withType_finetuned")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Epoch 1:   0%|          | 0/250 [00:00<?, ?batch/s]

Average loss for epoch 1: 1.2973367706537247


In [None]:
# prompt: save the model t5_qa_withType_finetuned to drive to folder T5_models

!cp -r ./t5_qa_withType_finetuned /content/drive/MyDrive/T5_models/t5_qa_withType_finetuned


In [None]:
# Reload the fine-tuned (with type) checkpoint from the local directory the training cell saved to
model_path = "./t5_qa_withType_finetuned"
model = T5ForConditionalGeneration.from_pretrained(
    model_path
)


Model Evaluation by ROUGE

In [None]:
from google.colab import drive

# Attach Google Drive at /content/drive
drive.mount("/content/drive")


Mounted at /content/drive


In [None]:
pip install evaluate



In [None]:
pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=fd93df9941ded6d2852265bf530e0ceaf4b5f4431a8f82a35e1caa8ee15f2ecc
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


Evaluate the model without Question Type

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
import pandas as pd
from evaluate import load
from rouge_score import rouge_scorer
# Load the fine-tuned T5 model from Drive; the tokenizer is taken from the base
# "t5-small" checkpoint.
# NOTE(review): the save cells copy the model to MyDrive/T5_models/, while this
# cell reads from MyDrive/NLP_QA/ - confirm the checkpoint was placed there.
model_path = "/content/drive/MyDrive/NLP_QA/t5_qa_finetuned"
#tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Load the CSV file
csv_file_path = '/content/drive/MyDrive/NLP_QA/questions_responses.csv'
# The training cells read the first 1000 data rows (nrows=1000), i.e. file
# lines 1..1000 after the header. Skip exactly those lines and load the next 20
# rows, so no evaluation row was seen during training (range(1, 1000) would
# leave the 1000th training row in the evaluation sample).
# NOTE(review): assumes this is the same CSV the training cells read - confirm.
df = pd.read_csv(csv_file_path, skiprows=range(1, 1001), nrows=20)

# Define a custom dataset class for question answering
class QADataset(Dataset):
    """Evaluation dataset over a question/response table.

    Each item carries the raw question and response strings together with the
    encoded prompt ``"question: <question>"`` and the encoded response.
    """

    def __init__(self, examples, tokenizer):
        """Store the source table and the tokenizer.

        Args:
            examples: Table supporting ``len()`` and ``.iloc[idx]`` row access
                whose rows have "question" and "response" fields (a pandas
                DataFrame in this notebook).
            tokenizer: Object exposing
                ``encode(text, return_tensors=..., max_length=..., truncation=...)``.
        """
        self.examples = examples
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of rows in the underlying table."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Encode row ``idx``.

        Returns:
            dict with "input_ids" (encoded prompt, truncated to 512 tokens),
            "labels" (encoded response, truncated to 32 tokens), and the raw
            "response" and "question" values of the row.
        """
        example = self.examples.iloc[idx]
        question = example["question"]
        response = example["response"]
        # The prompt must not contain the reference response: it is the text
        # being evaluated against, and generation is prompted with
        # "question: ..." only.
        input_text = "question: {}".format(question)
        target_text = response
        input_ids = self.tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
        target_ids = self.tokenizer.encode(target_text, return_tensors="pt", max_length=32, truncation=True)
        return {"input_ids": input_ids, "labels": target_ids, "response": response,"question": question}

# Wrap the held-out rows in the dataset class used for evaluation
eval_dataset = QADataset(examples=df, tokenizer=tokenizer)

# Define a function to generate responses
def generate_responses(model, tokenizer, questions):
    """Generate one response per question with beam search.

    Each question is wrapped in the prompt ``"question: <question>"``,
    truncated to 512 tokens, and decoded with 4 beams up to 32 tokens.

    Args:
        model: Model exposing ``to``, ``eval`` and ``generate``. It is moved to
            CUDA when available (otherwise CPU) and put in eval mode; both
            changes are made in place on the passed model.
        tokenizer: Tokenizer exposing ``encode`` and ``decode``.
        questions: Iterable of question strings.

    Returns:
        list of decoded response strings, in the order of ``questions``.
    """
    generated_responses = []
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # The inputs below are moved to `device`; the model has to live on the same
    # device, otherwise generate() fails with a device mismatch on a GPU runtime.
    model.to(device)
    model.eval()

    for question in tqdm(questions, desc="Generating Predictions"):
        input_text = "question: {}".format(question)
        input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)
        output_ids = model.generate(input_ids=input_ids, max_length=32, num_beams=4, early_stopping=True)
        generated_response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        generated_responses.append(generated_response)

    return generated_responses

# Generate responses for evaluation
generated_responses = generate_responses(model, tokenizer, df['question'])

# Load the ROUGE scorer
# ROUGE-1 looks at individual words (unigrams), ROUGE-2 at pairs of words
# (bigrams), and ROUGE-L at the longest common subsequence.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Compute the ROUGE scores
questions = df['question'].tolist()
references = df['response'].tolist()

all_scores = []
for i, (question, reference, generated) in enumerate(zip(questions, references, generated_responses)):
    print("Question", i ,":",question)
    print("Response",i,":",reference)
    # RougeScorer.score expects (target, prediction); passing them the other way
    # round swaps precision and recall in the reported scores.
    rouge_scores = scorer.score(reference, generated)
    all_scores.append(rouge_scores)
    print("Generated Response: ",generated)
    print("ROUGE scores:", rouge_scores)
    print("-----------------------------------------")

# Print the mean F1 per metric over all evaluated rows (not only the last row)
if all_scores:
    average_f1 = {
        metric: sum(scores[metric].fmeasure for scores in all_scores) / len(all_scores)
        for metric in all_scores[0]
    }
    print("Average ROUGE F1:", average_f1)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Generating Predictions:   0%|          | 0/20 [00:00<?, ?it/s]

Question 0 : Translate the following sentence to German:
This is why we are critical of the proposals put forward by the European Commission and supported by the Council to use the 'Barcelona Process' to undertake, in the framework of this process of association agreements with these countries and of the creation of a 'free trade zone', the liberalisation of services and of agriculture and, generally, the neoliberal guidelines laid down at the WTO Conference in Doha.

German:
Response 0 : Dies ist der Grund, warum wir die von der Europäischen Kommission vorgelegten und vom Rat unterstützten Vorschläge kritisch sehen, im Rahmen des "Barcelona-Prozesses" im Kontext der Assoziierungsabkommen mit diesen Ländern und der Schaffung einer "Freihandelszone" die Liberalisierung von Dienstleistungen und Landwirtschaft und im Allgemeinen, die neoliberalen Leitlinien, die auf der WTO-Konferenz in Doha festgelegt wurden, umzusetzen.

Step-by-step translation and justification:
1. "This is why we are

Evaluate the model with Question Type

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
import pandas as pd
from evaluate import load
from rouge_score import rouge_scorer
# Load the fine-tuned (with type) T5 model from Drive; the tokenizer is taken
# from the base "t5-small" checkpoint.
# NOTE(review): the save cells copy the model to MyDrive/T5_models/, while this
# cell reads from MyDrive/NLP_QA/ - confirm the checkpoint was placed there.
model_path = "/content/drive/MyDrive/NLP_QA/t5_qa_withType_finetuned"
#tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Load the CSV file
csv_file_path = '/content/drive/MyDrive/NLP_QA/questions_responses.csv'
# The training cells read the first 1000 data rows (nrows=1000), i.e. file
# lines 1..1000 after the header. Skip exactly those lines and load the next 20
# rows, so no evaluation row was seen during training (range(1, 1000) would
# leave the 1000th training row in the evaluation sample).
# NOTE(review): assumes this is the same CSV the training cells read - confirm.
df = pd.read_csv(csv_file_path, skiprows=range(1, 1001), nrows=20)

# Define a custom dataset class for question answering
class QADataset(Dataset):
    """Evaluation dataset over a question/response table with an optional type.

    Each item carries the raw question and response strings together with the
    encoded prompt and the encoded response. The prompt is
    ``"question: <question> type: <type>"`` when the row has a "type" field and
    ``"question: <question>"`` otherwise.
    """

    def __init__(self, examples, tokenizer):
        """Store the source table and the tokenizer.

        Args:
            examples: Table supporting ``len()`` and ``.iloc[idx]`` row access
                whose rows have "question" and "response" fields and optionally
                a "type" field (a pandas DataFrame in this notebook).
            tokenizer: Object exposing
                ``encode(text, return_tensors=..., max_length=..., truncation=...)``.
        """
        self.examples = examples
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of rows in the underlying table."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Encode row ``idx``.

        Returns:
            dict with "input_ids" (encoded prompt, truncated to 512 tokens),
            "labels" (encoded response, truncated to 32 tokens), and the raw
            "response" and "question" values of the row.
        """
        example = self.examples.iloc[idx]
        question = example["question"]
        response = example["response"]
        # Rows without a "type" field fall back to the plain question prompt.
        question_type = example.get("type")
        # The prompt must not contain the reference response: it is the text
        # being evaluated against.
        if question_type is None:
            input_text = "question: {}".format(question)
        else:
            input_text = "question: {} type: {}".format(question, question_type)
        target_text = response
        input_ids = self.tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
        target_ids = self.tokenizer.encode(target_text, return_tensors="pt", max_length=32, truncation=True)
        return {"input_ids": input_ids, "labels": target_ids, "response": response,"question": question}

# Wrap the held-out rows in the dataset class used for evaluation
eval_dataset = QADataset(examples=df, tokenizer=tokenizer)

# Define a function to generate responses
def generate_responses(model, tokenizer, questions):
    """Generate one response per question with beam search.

    Each question is wrapped in the prompt ``"question: <question>"``,
    truncated to 512 tokens, and decoded with 4 beams up to 32 tokens.

    Args:
        model: Model exposing ``to``, ``eval`` and ``generate``. It is moved to
            CUDA when available (otherwise CPU) and put in eval mode; both
            changes are made in place on the passed model.
        tokenizer: Tokenizer exposing ``encode`` and ``decode``.
        questions: Iterable of question strings.

    Returns:
        list of decoded response strings, in the order of ``questions``.
    """
    generated_responses = []
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # The inputs below are moved to `device`; the model has to live on the same
    # device, otherwise generate() fails with a device mismatch on a GPU runtime.
    model.to(device)
    model.eval()

    for question in tqdm(questions, desc="Generating Predictions"):
        input_text = "question: {}".format(question)
        input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)
        output_ids = model.generate(input_ids=input_ids, max_length=32, num_beams=4, early_stopping=True)
        generated_response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        generated_responses.append(generated_response)

    return generated_responses

# Generate responses for evaluation.
# This model was fine-tuned on prompts that include the question type, so the
# type is appended to every question before generate_responses wraps it in
# "question: ...".
# NOTE(review): assumes the CSV has a "type" column, as the with-type training
# cell reads example["type"] - confirm.
typed_questions = ["{} type: {}".format(q, t) for q, t in zip(df['question'], df['type'])]
generated_responses = generate_responses(model, tokenizer, typed_questions)

# Load the ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Compute the ROUGE scores
questions = df['question'].tolist()
references = df['response'].tolist()

all_scores = []
for i, (question, reference, generated) in enumerate(zip(questions, references, generated_responses)):
    print("Question", i ,":",question)
    print("Response",i,":",reference)
    # RougeScorer.score expects (target, prediction); passing them the other way
    # round swaps precision and recall in the reported scores.
    rouge_scores = scorer.score(reference, generated)
    all_scores.append(rouge_scores)
    print("Generated Response: ",generated)
    print("ROUGE scores:", rouge_scores)
    print("-----------------------------------------")

# Print the mean F1 per metric over all evaluated rows (not only the last row)
if all_scores:
    average_f1 = {
        metric: sum(scores[metric].fmeasure for scores in all_scores) / len(all_scores)
        for metric in all_scores[0]
    }
    print("Average ROUGE F1:", average_f1)



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Generating Predictions:   0%|          | 0/20 [00:00<?, ?it/s]

Question 0 : Translate the following sentence to German:
This is why we are critical of the proposals put forward by the European Commission and supported by the Council to use the 'Barcelona Process' to undertake, in the framework of this process of association agreements with these countries and of the creation of a 'free trade zone', the liberalisation of services and of agriculture and, generally, the neoliberal guidelines laid down at the WTO Conference in Doha.

German:
Response 0 : Dies ist der Grund, warum wir die von der Europäischen Kommission vorgelegten und vom Rat unterstützten Vorschläge kritisch sehen, im Rahmen des "Barcelona-Prozesses" im Kontext der Assoziierungsabkommen mit diesen Ländern und der Schaffung einer "Freihandelszone" die Liberalisierung von Dienstleistungen und Landwirtschaft und im Allgemeinen, die neoliberalen Leitlinien, die auf der WTO-Konferenz in Doha festgelegt wurden, umzusetzen.

Step-by-step translation and justification:
1. "This is why we are