In [1]:
#installing libraries

!pip install transformers datasets faiss-cpu sentence-transformers


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64

In [2]:
#importing necessary libraries
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from transformers import T5Tokenizer
import faiss
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset

from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm

In [3]:
#loading dataset
#I have used a popular dataset for QA in English
dataset = load_dataset("squad_v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [122]:

'''
This cell we will load our sentence embedder and using this embedder we will encode
our context and question and we will encode context to create our knowledge-base
to search context in our indexing based on similarity on our question. we will use
two different embedder for context and question.
'''

sentence_embedder = SentenceTransformer('all-MiniLM-L6-v2') #loading sentenceTransformer to use sentence piece embedding
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small') #loading tokenizer
context = [data['context'] for data in dataset['train']] #separating context
questions = [data['question'] for data in dataset['train']] #separating questions


context_embeddings = sentence_embedder.encode(context) #encoding our context text using our embedder

In [123]:
'''
I have used a popular indexing system FAISS, to indexing our RAG knowledge-base
and then save the index for using later use .
'''
index = faiss.IndexFlatL2(context_embeddings.shape[1])  # Using L2 distance for similarity
index.add(context_embeddings)

index_file = "faiss_index.bin"  # name our index file
faiss.write_index(index, index_file) #saving our index file

#print(f"Index saved to {index_file}")

In [124]:
def retrieve_relevant_contexts(question, top_k=5):

  '''
  This function will retrive matched context based on our question.
  This is to check the retrived documents based on question.
  This will be used in our inference pipe-line

  Inp:
    question : str, question from the user end
    top_k = int, number of doucuments this function will retrive based on question
  Out:
    relevant_docs:list, list of the defined number of context from fasiss index

  '''
  question_embedding = sentence_embedder.encode([question]) #embedding question using sentence_embedder


  _, indices = index.search(question_embedding, top_k) #searching our knowlege-base and retrive indices


  relevant_context = [context[i] for i in indices[0]] #using retrived indice we will retrive relevant contexts

  return relevant_context


In [127]:
#Test retrival capability
question = 'what is the capital of France?' #write a question here
retrieve_relevant_contexts(question, top_k=1) #modify the top_k number to any int


['Paris is located in northern central France. By road it is 450 kilometres (280 mi) south-east of London, 287 kilometres (178 mi) south of Calais, 305 kilometres (190 mi) south-west of Brussels, 774 kilometres (481 mi) north of Marseille, 385 kilometres (239 mi) north-east of Nantes, and 135 kilometres (84 mi) south-east of Rouen. Paris is located in the north-bending arc of the river Seine and includes two islands, the Île Saint-Louis and the larger Île de la Cité, which form the oldest part of the city. The river\'s mouth on the English Channel (La Manche) is about 233 mi (375 km) downstream of the city, established around 7600 BC. The city is spread widely on both banks of the river. Overall, the city is relatively flat, and the lowest point is 35 m (115 ft) above sea level. Paris has several prominent hills, the highest of which is Montmartre at 130 m (427 ft). Montmartre gained its name from the martyrdom of Saint Denis, first bishop of Paris, atop the Mons Martyrum, "Martyr\'s m

In [157]:
'''
For training our generator function we will randomly select some question, context and answers.
As our generator function will fetch contexts based on questions and using contexts and questions will
generate answers. I will extract data for train and validation of our generator model. We will use T5 LLM.
'''
train_dataset = dataset['train'].shuffle(seed=42).select(range(20000))
val_dataset = dataset['validation'].shuffle(seed=42).select(range(8000))

In [130]:
#loading our pretrianed tokenizer
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')

In [150]:
#we will fetch input and target data from our train dataset
'''
Here , we will do like insert direct tokenize text in the list instead text.
Tokenization in a sence , split text in tokens and convert tokens into ids

'''
input_texts = []
target_texts = []


for tr_data in train_dataset:
    input_text = f"question: {tr_data ['question']} context: {tr_data ['context']}"
    inputs = t5_tokenizer(input_text, max_length=512, padding='max_length', truncation=True, return_tensors="pt") #tokenize input text
    target_text = tr_data ['answers']['text']
    for tt in target_text:
        target_tokens = t5_tokenizer(tt, max_length=128, padding='max_length', truncation=True, return_tensors="pt")
        target_texts.append(target_tokens['input_ids'])

    input_texts.append(inputs['input_ids'])
    lmt = len(target_texts) if len(input_texts) > len(target_texts) else len(input_texts)

    input_texts = input_texts[:lmt]#just to ensure input_texts and target_texts will be same size
    target_texts = target_texts[:lmt]                                           #there could be some multipe answers for a question or some blank question for ansewers . so.


In [158]:

'''
Similar as train data , we extract some val data
'''
val_input_texts = []
val_target_texts = []

for vl_data in val_dataset:
    input_text = f"question: {vl_data['question']} context: {vl_data['context']}"
    val_inputs = t5_tokenizer(input_text, max_length=512, padding='max_length', truncation=True, return_tensors="pt")
    val_input_texts.append(val_inputs['input_ids'])
    target = vl_data['answers']['text']

    if len(target_text) > 1:
      target_text = target_text[0]

    for t in target:
        target_tokens = t5_tokenizer(t, max_length=128, padding='max_length', truncation=True, return_tensors="pt")
        val_target_texts.append(target_tokens['input_ids'])

    #val_lmt = len(val_target_texts)


    if len(val_input_texts) > len(val_target_texts):
      val_lmt = len(val_target_texts)
    else:
      val_lmt = len(val_input_texts)

    val_input_texts = val_input_texts[:val_lmt]
    val_target_texts = val_target_texts[:val_lmt]



In [115]:
#Random cell to check or debugg while coding
#Ignore this one
print(dataset['train']['question'][:2])

['When did Beyonce start becoming popular?', 'What areas did Beyonce compete in when she was growing up?']


In [159]:
print(len(val_input_texts), len(val_target_texts))


3918 3918


In [160]:
'''
In this cell , we will define our custom dataset for generator Model.
Then we will create our datasets and dataloader for training and validation.
'''
class GeneratorDataset(Dataset):
    def __init__(self, input_texts, target_texts):
        self.input_texts = input_texts
        self.target_texts = target_texts

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, idx):
        input_ids = torch.tensor(self.input_texts[idx]).squeeze(0)  # Tokenized input text
        target_ids = torch.tensor(self.target_texts[idx]).squeeze(0)  # Tokenized target text
        return {'input_ids': input_ids, 'labels': target_ids}

# Create Dataset and DataLoader for training
train_dataset = GeneratorDataset(input_texts, target_texts)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Create Dataset and DataLoader for validation (same process)
val_dataset = GeneratorDataset(val_input_texts, val_target_texts)
val_dataloader = DataLoader(val_dataset, batch_size=8)

In [161]:
'''
In this cell , we will load our pretrianed LLM to finetune as our generator model.
we will use t5 small model here.
we will set our hyper parameter here too.
'''
model = T5ForConditionalGeneration.from_pretrained('t5-small')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #set device to utilize gpu if available
model.to(device)


optimizer = AdamW(model.parameters(), lr=0.001)
num_epochs = 5
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)



In [162]:
'''
This will be our training and validation Loop
'''

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    total_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}/{num_epochs}"):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)


        outputs = model(input_ids=input_ids, labels=labels) #forward passing
        loss = outputs.loss


        loss.backward() #back propagation


        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) #clipping gradient to save from gradient exploading


        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        total_loss += loss.item()

    print(f"Epoch {epoch+1} Loss: {total_loss / len(train_dataloader)}")

    # Validation loop
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc="Validating"):
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)


            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs.loss

            val_loss += loss.item()

    print(f"Validation Loss after Epoch {epoch+1}: {val_loss / len(val_dataloader)}")

model.save_pretrained("fine_tuned_t5") #saving model

  input_ids = torch.tensor(self.input_texts[idx]).squeeze(0)  # Tokenized input text
  target_ids = torch.tensor(self.target_texts[idx]).squeeze(0)  # Tokenized target text
Training Epoch 1/5: 100%|██████████| 829/829 [08:54<00:00,  1.55it/s]


Epoch 1 Loss: 0.041383241625689196


Validating: 100%|██████████| 490/490 [00:56<00:00,  8.71it/s]


Validation Loss after Epoch 1: 0.02304669473253723


Training Epoch 2/5: 100%|██████████| 829/829 [09:01<00:00,  1.53it/s]


Epoch 2 Loss: 0.02051992318677223


Validating: 100%|██████████| 490/490 [00:56<00:00,  8.72it/s]


Validation Loss after Epoch 2: 0.023324647303242997


Training Epoch 3/5: 100%|██████████| 829/829 [09:00<00:00,  1.53it/s]


Epoch 3 Loss: 0.014787107507223028


Validating: 100%|██████████| 490/490 [00:56<00:00,  8.74it/s]


Validation Loss after Epoch 3: 0.023984756173176348


Training Epoch 4/5: 100%|██████████| 829/829 [09:00<00:00,  1.54it/s]


Epoch 4 Loss: 0.010404716710929632


Validating: 100%|██████████| 490/490 [00:56<00:00,  8.65it/s]


Validation Loss after Epoch 4: 0.023842226360848515


Training Epoch 5/5: 100%|██████████| 829/829 [09:07<00:00,  1.51it/s]


Epoch 5 Loss: 0.007697544219261609


Validating: 100%|██████████| 490/490 [00:56<00:00,  8.68it/s]


Validation Loss after Epoch 5: 0.025111733407152777


In [22]:
#if you have pretrained model and faiss index file . You can simply skip all the above code
#Just run this cell and then run bellow cells
#install libraries and import libraries cell should be run
#ELSE skipp this cell

index = faiss.read_index() # pass saved index path
model = T5ForConditionalGeneration.from_pretrained() # pass fine tunned model path
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small') # load pretrained tokenizer


question: What is the capital of France? context: Paris is located in northern central France. By road it is 450 kilometres (280 mi) south-east of London, 287 kilometres (178 mi) south of Calais, 305 kilometres (190 mi) south-west of Brussels, 774 kilometres (481 mi) north of Marseille, 385 kilometres (239 mi) north-east of Nantes, and 135 kilometres (84 mi) south-east of Rouen. Paris is located in the north-bending arc of the river Seine and includes two islands, the Île Saint-Louis and the larger Île de la Cité, which form the oldest part of the city. The river's mouth on the English Channel (La Manche) is about 233 mi (375 km) downstream of the city, established around 7600 BC. The city is spread widely on both banks of the river. Overall, the city is relatively flat, and the lowest point is 35 m (115 ft) above sea level. Paris has several prominent hills, the highest of which is Montmartre at 130 m (427 ft). Montmartre gained its name from the martyrdom of Saint Denis, first bishop

In [163]:
'''
Here we will create our generator function using our pretrained model

'''

# Set the model to evaluation mode
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()


def prepare_inference_data(question, top_k=5):
  '''
  This function will prepare our inference data.
  based on question it will retrive contexts from faiss index and then join the question and context for generation response.

  '''
  relevant_context = retrieve_relevant_contexts(question, top_k)  # Get relevant documents using your function
  context = " ".join(relevant_context)  # Join the relevant documents as context
  inf_data = f"question: {question} context: {context}"  # Combine question and context
  return inf_data


def generator(question, top_k=5):
  '''
  This will our main generator function .
  With all the predefined functions and model we will generate response to respond with user end
  '''

  inference_input = prepare_inference_data(question, top_k)

    # Tokenize the input text
  inputs = t5_tokenizer(inference_input, return_tensors="pt", max_length=512, padding="max_length", truncation=True)


  input_ids = inputs["input_ids"].to(device)


  with torch.no_grad():  # we do not need gradient calculation in inference
      outputs = model.generate(
          input_ids,
          max_length=512,  # Set the max length for the generated response
          num_beams=10,  # Use beam search for better response quality
          #early_stopping=True
        )

    # Decode the generated tokens to get the response
  decoded_output = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)

  return decoded_output




In [164]:
'''
This is the final cell to QA with our trained system
Hence we use t5 small and small dataset . Our system might not be very good but this is the
process to make an QA chatbot
'''
question = "when did messi become popular?"
response = generator(question, top_k=5)
print("Model's answer:", response)

Model's answer: 2010


In [165]:
# a simple chatbot interface here in google colab

def QA_chatbot():
    print("Chatbot: Hi! I am a QA Chatbot. Ask me anything, or type 'exit' to end the chat.\n")

    while True:
        question = input("You: ")

        if question.lower() == "exit":
            print("Chatbot: Goodbye! Have a great day!")
            break

        # Generate response using your model
        try:
            response = generator(question, top_k=5)  # Call your trained QA system
            print(f"Chatbot: {response}")
        except Exception as e:
            print(f"Chatbot: Sorry, I couldn't process your question. Error: {e}")




In [166]:
#run this cell to chat
QA_chatbot()

Chatbot: Hi! I am a QA Chatbot. Ask me anything, or type 'exit' to end the chat.

You: what is the capitla of France?
Chatbot: The Paris Region is France's leading region for economic activity, with a 2012 GDP of €624 billion (US$687 billion)
You: Okay. Can you tell me who is the best scientist ever?
Chatbot: John von Neumann
You: Tell me somethign about cricket
Chatbot: It is considered the spiritual home of the two sports in Australia.
You: what is a football?
Chatbot: Association football is played in accordance with a set of rules known as the Laws of the Game.
You: do you know who is the greatest singer ever?
Chatbot: Andy Williams, Johnny Mathis, Nana Mouskouri, Celine Dion, Julio Iglesias, Frank Sinatra, Barry Manilow, Engelbert Humperdinck, and Marc Anthony
You: Tell me about messi
Chatbot: Lionel Messi with 474 goals
You: exit
Chatbot: Goodbye! Have a great day!
