<h1>Final Assignment</h1>
<h2>Task: Text Generation</h2>
<h2>Submitted by: Mainuddin Alam Irteja</h2>

In [None]:
# Installing necessary libraries
!pip install transformers datasets torch

In [None]:
# Loading FLAN-T5 model

# Importing necessary modules
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Checkpoint identifier on the Hugging Face Hub shared by the model and tokenizer
modelName = "google/flan-t5-small"

# Fetch the pretrained sequence-to-sequence weights for that checkpoint
model = AutoModelForSeq2SeqLM.from_pretrained(modelName)

# Fetch the matching tokenizer so text is encoded the way the model expects
tokenizer = AutoTokenizer.from_pretrained(modelName)

In [None]:
# Choose the GPU when torch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"

# Relocate the model onto the chosen device
model = model.to(device)

# Report which device was selected (GPU or CPU)
print(device)

In [45]:
# Load the WikiText-103 dataset
from datasets import load_dataset
# Fixed seed so the random splits below are identical on every run
SPLIT_SEED = 42

wiki_Dataset = load_dataset("wikitext", "wikitext-103-v1", split="train")

# Split the dataset so that it could be used for training and evaluating
# (85% is kept for training, 15% is held out for evaluation)
split_Dataset = wiki_Dataset.train_test_split(test_size=0.15, seed=SPLIT_SEED)

# Keep only 1% of the training portion: the 99% "test" side of this second
# split is discarded, which shrinks the training set to a manageable size
train_Dataset = split_Dataset['train'].train_test_split(test_size=0.99, seed=SPLIT_SEED)['train']
eval_Dataset = split_Dataset['test']

In [35]:
# Preprocessing the dataset

def preprocessDataset(givenData):
  """
  Function to preprocess the dataset

  Tokenizes the 'text' column of a batch and attaches a `labels` entry that is
  a copy of the input ids with padding positions replaced by -100.

  @param givenData The dataset batch given to be preprocessed (its 'text' column is read)
  @returns model_inputs The preprocessed model inputs: the tokenizer outputs plus `labels`
  """
  # Extract the raw text from the data
  inputs = list(givenData['text'])

  # Tokenize the inputs, padding/truncating every row to exactly 512 tokens
  model_inputs = tokenizer(inputs, max_length=512, padding="max_length", truncation=True, return_tensors="pt")

  # The labels are an unshifted copy of the input ids; no shifting is done here.
  # NOTE(review): because labels equal the inputs, the model is trained to
  # reproduce its input rather than continue it — confirm this is the intended objective.
  labels = model_inputs['input_ids'].clone()

  # Adjust labels to ignore padding tokens
  # -100 is used so the loss function ignores padding tokens
  labels[labels == tokenizer.pad_token_id] = -100

  # Attach the labels to the model inputs
  model_inputs["labels"] = labels

  # The tensors are intentionally left on the CPU: this function runs inside
  # `Dataset.map`, whose output is stored by the dataset, and the Trainer moves
  # each batch to the training device itself.
  # Return the preprocessed model inputs as a plain dict
  return dict(model_inputs)

In [None]:
# Run the batched preprocessing over the training split first, then the
# evaluation split, binding the results to their respective names
tokenized_train_dataset, tokenized_eval_dataset = (
    dataset_split.map(preprocessDataset, batched=True)
    for dataset_split in (train_Dataset, eval_Dataset)
)

In [None]:
from transformers import Seq2SeqTrainingArguments

# Setting training parameters for text generation

# Newer transformers releases renamed the `evaluation_strategy` argument to
# `eval_strategy`. Look up which name the installed version declares so this
# cell works with either.
import dataclasses

_training_arg_names = {field.name for field in dataclasses.fields(Seq2SeqTrainingArguments)}
_eval_strategy_key = "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"

training_args = Seq2SeqTrainingArguments(
    output_dir='./results',               # Directory to save model checkpoints
    learning_rate=2e-5,                   # Learning rate
    per_device_train_batch_size=8,        # Batch size for training
    per_device_eval_batch_size=8,         # Batch size for evaluation
    weight_decay=0.01,                    # Regularization to prevent overfitting
    save_total_limit=3,                   # Only keep the last 3 checkpoints
    num_train_epochs=3,                   # Number of epochs to train the model
    predict_with_generate=True,           # Enable text generation during evaluation
    generation_max_length=128,            # Max length for generated sequences
    generation_num_beams=5,               # Improves text generation
    logging_dir="./logs",                 # Directory for storing training logs
    **{_eval_strategy_key: "epoch"},      # Evaluate the model at the end of each epoch
)

In [48]:
from transformers import Seq2SeqTrainer

# Initializing the trainer object for text generation

# Newer transformers releases take the tokenizer through `processing_class`
# instead of `tokenizer`. Inspect the installed signature and use whichever
# keyword it accepts so this cell works with either.
import inspect

_trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Seq2SeqTrainer(
    model=model,                            # The model to be trained
    args=training_args,                     # The training arguments adapted for text generation
    train_dataset=tokenized_train_dataset,  # Tokenized training dataset
    eval_dataset=tokenized_eval_dataset,    # Tokenized evaluation dataset
    **{_tokenizer_key: tokenizer},          # The tokenizer to handle input and output
)

In [None]:
# Training the model
# Runs fine-tuning with the trainer built in the previous cell. The call is the
# last expression of the cell, so the notebook displays whatever it returns.
trainer.train()

In [None]:
# Evaluating the model
# Runs evaluation through the trainer (which was given `tokenized_eval_dataset`)
# and keeps the returned metrics for inspection.
metrics = trainer.evaluate()

# Display the evaluation metrics
print(metrics)

In [None]:
# Creating the text generation function

def generateTexts(givenText, maxLength=128, numBeams=5):
  """
  Function to generate texts.

  Encodes the prompt(s) with the notebook's tokenizer, runs beam-search
  generation with the notebook's model, and decodes the result to strings.

  @param givenText The given text by the user (a string or a list of strings)
  @param maxLength Upper bound on the generated sequence length (defaults to the
                   value used for `generation_max_length` in the training arguments)
  @param numBeams Number of beams for beam search (defaults to the value used
                  for `generation_num_beams` in the training arguments)
  @return The generated texts, as a list with one decoded string per prompt
  """
  # Encode the prompt with the same 512-token truncation used in preprocessing,
  # and place the tensors on whatever device the model currently lives on
  encoded = tokenizer(
      givenText,
      return_tensors="pt",
      padding=True,
      truncation=True,
      max_length=512,
  ).to(model.device)

  # Switch to inference mode so dropout is disabled if the model was left in training mode
  model.eval()

  # Generate output token ids with beam search
  generated_ids = model.generate(**encoded, max_length=maxLength, num_beams=numBeams)

  # Turn the token ids back into text, dropping padding/end-of-sequence markers
  return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)


In [None]:
# Testing the text generation function

# Short two-speaker exchange used as the prompt
_sample_dialogue = """
Person A: I like sports, especially soccer.
Person B: I do not know much about it. Can you explain?
"""

# Feed the prompt to the generator and show whatever it returns
print(generateTexts(_sample_dialogue))