<h1>Final Assignment</h1>
<h2>Task: Text Generation</h2>
<h2>Submitted by: Mainuddin Alam Irteja</h2>

In [None]:
# Installing necessary libraries
!pip install transformers datasets torch

In [None]:
# Loading FLAN-T5 model

# Importing necessary modules
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Checkpoint identifier shared by the model and the tokenizer,
# so both are loaded from the same pretrained release
modelName = "google/flan-t5-small"

# Load the sequence-to-sequence model, then the tokenizer that matches it
model = AutoModelForSeq2SeqLM.from_pretrained(modelName)
tokenizer = AutoTokenizer.from_pretrained(modelName)

In [None]:
# Pick the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"

# Move the model onto the selected device
model = model.to(device)

# Report which device was selected (GPU or CPU)
print(device)

In [None]:
# Load the WikiText-103 dataset
from datasets import load_dataset
wiki_Dataset = load_dataset("wikitext", "wikitext-103-v1", split="train")

# Split the dataset so that it could be used for training and testing
# test_size = 0.2 which signifies 80 percent of data for training and 20 percent for testing
# A fixed seed keeps the shuffled split identical across runs, so results are reproducible
SPLIT_SEED = 42
split_Dataset = wiki_Dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_Dataset = split_Dataset['train']
test_Dataset = split_Dataset['test']

In [10]:
# Preprocessing the dataset

"""
Function to preprocess the dataset

@param givenData The dataset given to be preprocessed
@returns model_inputs The preprocessed model inputs
"""
def preprocessDataset(givenData):
  """
  Tokenize a batch of raw text and attach training labels to it.

  Relies on the module-level `tokenizer` and `device`.

  @param givenData Batch supporting givenData['text'], which is iterated to
                   collect the raw texts (presumably strings - confirm against caller)
  @returns model_inputs dict mapping every tokenizer output name, plus "labels",
                        to a tensor placed on `device`
  """
  # Extract the raw text from the data
  inputs = list(givenData['text'])

  # Tokenize the inputs: pad/truncate every example to 512 tokens and return PyTorch tensors
  model_inputs = tokenizer(inputs, max_length=512, padding="max_length", truncation=True, return_tensors="pt")

  # Labels start as an independent copy of the input ids (no shifting is done here).
  # clone() is required: torch tensors have no .copy(), and the in-place masking
  # below must not modify input_ids.
  # NOTE(review): presumably the seq2seq model derives its decoder inputs from
  # these labels itself - confirm against the model documentation.
  labels = model_inputs['input_ids'].clone()

  # Adjust labels to ignore padding tokens (-100 is used so the loss function ignores padding tokens)
  labels[labels == tokenizer.pad_token_id] = -100

  # Attach the labels to the model inputs
  model_inputs["labels"] = labels

  # Move the tokenized inputs and labels to the appropriate device (GPU/CPU)
  # and return them as a plain dict
  return {k: v.to(device) for k, v in model_inputs.items()}