### Prompt Refinement Model


In [None]:
!pip install pytorch-lightning
!pip install tensorboard tensorboardX
!pip install datasets
!pip install transformers
!pip install nltk
!pip install rouge-score
!pip install evaluate
!pip install langdetect
!pip install bert_score

### Loading The Dataset

In [None]:

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    get_linear_schedule_with_warmup
)

from torch.optim import AdamW
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import nltk
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
import evaluate
import os
from tqdm.auto import tqdm
from langdetect import detect

# Downloading NLTK resources
nltk.download('punkt')

# Setting random seed for reproducibility
pl.seed_everything(42)

# Load the Refined Prompts dataset
# The dataset consists of poorly written or ambiguous prompts
# and their corresponding refined prompt for better responses
print("Loading dataset...")
dataset = load_dataset("yaswanth-iitkgp/Refined_Prompts")
print(f"Dataset loaded: {len(dataset['train'])} examples")

# Preview the first three samples. islice stops after three rows instead of
# iterating the entire training split only to skip everything past index 2.
from itertools import islice

print("\nDataset preview:")
for example_number, example in enumerate(islice(dataset['train'], 3), start=1):
    print(f"Example {example_number}:")
    print(f"Original: {example['Raw_Prompts']}")
    print(f"Refined: {example['Refined_Prompt']}")
    print("-" * 50)

# Function to check if text is in English
def is_english(text):
    """Return True when langdetect classifies ``text`` as English.

    Args:
        text: String to classify.

    Returns:
        bool: True if ``detect(text)`` returns ``'en'``. False for any other
        language code, or when langdetect cannot classify the text at all.
    """
    # Local import: the file's top-level import only brings in `detect`.
    from langdetect.lang_detect_exception import LangDetectException

    try:
        return detect(text) == 'en'
    except LangDetectException:
        # langdetect signals "could not detect" (e.g. text without usable
        # features) with this exception; treat that as "not English".
        # Catching only this type lets KeyboardInterrupt and real bugs surface.
        return False

# Create train/validation/test splits
all_data = pd.DataFrame(dataset['train'])

# langdetect is randomized unless its own seed is fixed (pl.seed_everything
# does not reach it). Pin it so the English filter, and therefore the splits
# below, select the same rows on every run.
from langdetect import DetectorFactory
DetectorFactory.seed = 0

# Filter to keep only English examples since the dataset contains foreign languages (e.g. chinese)
print("Filtering dataset to English-only examples...")
english_mask = all_data['Raw_Prompts'].apply(lambda prompt: is_english(str(prompt)))
english_data = all_data[english_mask]
print(f"English data: {len(english_data)} out of {len(all_data)} examples")

# Split the filtered data: 80% train, then the remaining 20% is halved
# into validation and test (10% each)
train_data, val_test_data = train_test_split(english_data, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(val_test_data, test_size=0.5, random_state=42)

print(f"Train size: {len(train_data)}, Validation size: {len(val_data)}, Test size: {len(test_data)}")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
INFO:lightning_fabric.utilities.seed:Seed set to 42


Loading dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/745 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/938k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/71.6k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/72.1k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2250 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/124 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/125 [00:00<?, ? examples/s]

Dataset loaded: 2250 examples

Dataset preview:
Example 1:
Original: how to do small talk?
let me share some examples plz
Refined: Refined_Prompt: Could you provide some practical examples on effectively engaging in small talk for various social or professional settings?
--------------------------------------------------
Example 2:
Original: Give me a business idea that utilizes ChatGPT.
more
Refined: Refined_Prompt: Propose a detailed business plan for a startup that leverages the capabilities of ChatGPT to offer innovative services or products. Outline the target market, potential revenue streams, and unique selling points of the idea.
--------------------------------------------------
Example 3:
Original: animeganv2 使い方

The response must be short.
Refined: Refined_Prompt: Please provide a brief guide on how to use AnimeGANv2, focusing on its basic functionality and requirements for implementation.
--------------------------------------------------
Filtering dataset to English-only 

In [None]:
# Define the dataset class for GPT-2
class GPT2PromptRefinementDataset(Dataset):
    """Causal-LM training examples built from (raw prompt, refined prompt) pairs.

    Each item is the tokenized text
    ``"Refine the following prompt in English: <raw>\\n\\nRefined prompt: <refined><eos>"``
    padded/truncated to ``max_length`` tokens.

    Args:
        data: pandas DataFrame with 'Raw_Prompts' and 'Refined_Prompt' columns.
        tokenizer: Callable tokenizer exposing ``eos_token``; called with
            ``padding='max_length'``, ``truncation=True`` and ``return_tensors='pt'``.
        max_length: Fixed sequence length of every returned example.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return tensors and the raw strings for row ``idx``.

        Returns:
            dict with 'input_ids', 'attention_mask', 'labels' (1-D tensors of
            length ``max_length``) plus the 'original_prompt' and
            'refined_prompt' strings.
        """
        row = self.data.iloc[idx]
        original_prompt = str(row['Raw_Prompts'])
        refined_prompt = str(row['Refined_Prompt'])

        # Format prompt for GPT-2 model: original prompt to refined prompt
        full_text = f"Refine the following prompt in English: {original_prompt}\n\nRefined prompt: {refined_prompt}{self.tokenizer.eos_token}"

        encoding = self.tokenizer(
            full_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Drop only the batch dimension added by return_tensors='pt'
        input_ids = encoding.input_ids.squeeze(0)
        attention_mask = encoding.attention_mask.squeeze(0)

        # Ignore padding in the loss. The mask must come from attention_mask,
        # not from the pad token id: the pad token is set to EOS elsewhere in
        # this file, so masking by id would also hide the real EOS appended
        # above and the model would never be trained to stop generating.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels,
            'original_prompt': original_prompt,
            'refined_prompt': refined_prompt
        }



In [None]:
# Creating the model for GPT-2
class PromptRefinementModelGPT2(pl.LightningModule):
    """LightningModule that fine-tunes GPT-2 to turn raw prompts into refined prompts.

    Args:
        model_name: Hugging Face checkpoint name passed to ``from_pretrained``.
        lr: Peak learning rate for AdamW.
        weight_decay: AdamW weight decay.
        train_steps: Total optimizer steps for the linear schedule. When 0,
            the trainer's own estimate is used instead.
        warmup_steps: Number of linear warmup steps.
    """

    def __init__(self, model_name="gpt2", lr=2e-5, weight_decay=0.01,
                 train_steps=0, warmup_steps=0):
        super().__init__()
        self.model = GPT2LMHeadModel.from_pretrained(model_name)
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)

        # GPT-2 ships without a pad token; reuse EOS for padding
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model.config.pad_token_id = self.model.config.eos_token_id

        # The training log in this notebook reports "0 Modules in train mode,
        # 164 Modules in eval mode": the loaded network stays in eval mode
        # during fit. Switch it to train mode explicitly so dropout is active
        # while fine-tuning. NOTE(review): recent Lightning versions appear to
        # preserve the module's mode instead of forcing train() - confirm.
        self.model.train()

        self.lr = lr
        self.weight_decay = weight_decay
        self.train_steps = train_steps
        self.warmup_steps = warmup_steps
        self.save_hyperparameters()

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped GPT-2 model; the output carries ``loss`` when labels are given."""
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

    def _shared_step(self, batch, stage):
        """Compute the LM loss for ``batch`` and log it as ``<stage>_loss``."""
        outputs = self(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            labels=batch['labels']
        )
        loss = outputs.loss
        # Batches also contain lists of strings, which makes Lightning's
        # batch-size inference ambiguous (see the warning in the run log);
        # pass the size explicitly.
        self.log(
            f'{stage}_loss',
            loss,
            prog_bar=True,
            logger=True,
            batch_size=batch['input_ids'].size(0)
        )
        return loss

    def training_step(self, batch, batch_idx):
        return self._shared_step(batch, 'train')

    def validation_step(self, batch, batch_idx):
        return self._shared_step(batch, 'val')

    def test_step(self, batch, batch_idx):
        # Logged as 'test_loss' so the test run actually reports a metric
        return self._shared_step(batch, 'test')

    def generate_text(self, input_text, max_length=150):
        """Sample a refined version of ``input_text``.

        Args:
            input_text: Raw prompt to refine.
            max_length: Maximum number of newly generated tokens.

        Returns:
            str: The generated continuation, stripped of surrounding whitespace.
            Sampling is enabled, so repeated calls may differ.
        """
        # Use the exact layout seen during training, including the
        # "Refined prompt:" cue, so the model continues with the refinement
        # instead of first having to reproduce the cue on its own.
        prompt = f"Refine the following prompt in English: {input_text}\n\nRefined prompt:"

        encoded = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        prompt_length = encoded["input_ids"].shape[1]

        # Generate with parameters to enhance output. attention_mask and
        # pad_token_id are passed explicitly because pad == eos, which
        # otherwise triggers the "attention mask ... not set" warnings.
        outputs = self.model.generate(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            max_new_tokens=max_length,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            no_repeat_ngram_size=2,
            num_return_sequences=1,
            pad_token_id=self.tokenizer.eos_token_id
        )

        # Decode only the newly generated tokens, i.e. the refined prompt
        generated_ids = outputs[0][prompt_length:]
        return self.tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    def configure_optimizers(self):
        """Return AdamW with a per-step linear warmup/decay schedule."""
        optimizer = AdamW(
            self.parameters(),
            lr=self.lr,
            weight_decay=self.weight_decay
        )

        # With num_training_steps=0 the linear schedule multiplies the LR by 0
        # at every step, so the default train_steps=0 would never train.
        # Fall back to the trainer's estimate of the total step count.
        total_steps = self.train_steps
        if not total_steps:
            total_steps = self.trainer.estimated_stepping_batches

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.warmup_steps,
            num_training_steps=total_steps
        )

        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                'interval': 'step'
            }
        }


In [None]:
# Function to train the model
def train_model(model_name="gpt2", batch_size=8, max_epochs=10, lr=2e-5):
    """Fine-tune GPT-2 on the module-level train/val/test DataFrames.

    Reads ``train_data``, ``val_data`` and ``test_data`` from module scope,
    trains with checkpointing and early stopping on ``val_loss``, runs the
    test loop, and saves the model and tokenizer to ``final_model_gpt2``.

    Args:
        model_name: Hugging Face checkpoint name.
        batch_size: Batch size for all three dataloaders.
        max_epochs: Upper bound on training epochs.
        lr: Peak learning rate.

    Returns:
        tuple: ``(model, tokenizer, test_loader)``.
    """
    # Initialize GPT2 tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)

    # GPT-2 has no pad token; reuse EOS
    tokenizer.pad_token = tokenizer.eos_token

    # Create datasets for training testing and validation
    train_dataset = GPT2PromptRefinementDataset(train_data, tokenizer)
    val_dataset = GPT2PromptRefinementDataset(val_data, tokenizer)
    test_dataset = GPT2PromptRefinementDataset(test_data, tokenizer)

    # Create dataloaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=2)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=2)

    # Schedule length: one optimizer step per batch, 10% of them as warmup
    train_steps = len(train_loader) * max_epochs
    warmup_steps = int(0.1 * train_steps)

    # Initialize model
    model = PromptRefinementModelGPT2(
        model_name=model_name,
        lr=lr,
        train_steps=train_steps,
        warmup_steps=warmup_steps
    )

    checkpoint_callback = ModelCheckpoint(
        dirpath='checkpoints',
        filename='{epoch}-{val_loss:.4f}',
        save_top_k=3,
        monitor='val_loss',
        mode='min'
    )

    early_stop_callback = EarlyStopping(
        monitor='val_loss',
        patience=3,
        mode='min'
    )

    # Set up logger to monitor training progress
    logger = TensorBoardLogger('logs', name='prompt_refinement_gpt2')

    trainer = pl.Trainer(
        max_epochs=max_epochs,
        callbacks=[checkpoint_callback, early_stop_callback],
        logger=logger,
        accelerator='auto',
        devices=1 if torch.cuda.is_available() else "auto",
        log_every_n_steps=10
    )

    # Train model
    print("Starting training...")
    trainer.fit(model, train_loader, val_loader)

    # Test model using the checkpoint with the lowest val_loss rather than
    # whatever weights the last epoch left behind. If no checkpoint was
    # written, fall back to the current weights (ckpt_path=None).
    # NOTE(review): Trainer.test is expected to restore the checkpoint into
    # `model` in place, so the model saved below is the best one - confirm
    # for the installed Lightning version.
    best_checkpoint = checkpoint_callback.best_model_path
    trainer.test(model, test_loader, ckpt_path=best_checkpoint or None)

    # Saving the model
    model_save_path = "final_model_gpt2"
    model.model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)

    return model, tokenizer, test_loader






In [None]:
# Evaluates the model
def evaluate_model(model, tokenizer, test_loader):
    """Generate refinements for the test set and score them against the references.

    Prints the metrics and a few examples, and writes every
    original/reference/prediction row to ``evaluation_results_gpt2.csv``.

    Args:
        model: Trained module exposing ``generate_text(prompt)``.
        tokenizer: Not used here; kept so existing callers keep working.
        test_loader: Iterable of batches whose 'original_prompt' and
            'refined_prompt' entries are parallel sequences of strings.

    Returns:
        dict: 'bleu', 'rouge1', 'rouge2', 'rougeL', 'bertscore', 'meteor' and
        'english_percentage' (share of predictions detected as English, 0-100).

    Raises:
        ValueError: If ``test_loader`` yields no examples.
    """
    print("Evaluating model...")
    model.eval()
    model.to('cuda' if torch.cuda.is_available() else 'cpu')

    bertscore = evaluate.load("bertscore")
    meteor = evaluate.load("meteor")
    rouge_metrics = ['rouge1', 'rouge2', 'rougeL']
    rouge_scorer_instance = rouge_scorer.RougeScorer(rouge_metrics, use_stemmer=True)

    originals = []
    references = []
    predictions = []

    # Process test data
    for batch in tqdm(test_loader):
        for original_prompt, refined_prompt in zip(batch['original_prompt'], batch['refined_prompt']):
            # Generate prediction
            predicted_prompt = model.generate_text(original_prompt)

            originals.append(original_prompt)
            references.append(refined_prompt)
            predictions.append(predicted_prompt)

    if not predictions:
        # Every average below divides by the number of examples
        raise ValueError("test_loader produced no examples to evaluate")

    # Language check, computed once so the printed count, the CSV column and
    # the returned percentage all agree. `english_outputs` was previously
    # referenced in the return value without ever being defined.
    english_flags = [is_english(prediction) for prediction in predictions]
    english_outputs = sum(english_flags)
    english_percentage = english_outputs / len(predictions) * 100
    print(f"English outputs: {english_outputs}/{len(predictions)} ({english_percentage:.2f}%)")

    # Calculate BLEU scores (one reference per prediction)
    references_tokenized = [[nltk.word_tokenize(ref)] for ref in references]
    predictions_tokenized = [nltk.word_tokenize(pred) for pred in predictions]
    bleu_score = corpus_bleu(references_tokenized, predictions_tokenized)

    # Calculate ROUGE scores: mean F-measure over all pairs
    rouge_scores = {metric: 0 for metric in rouge_metrics}
    for ref, pred in zip(references, predictions):
        scores = rouge_scorer_instance.score(ref, pred)
        for metric, score in scores.items():
            rouge_scores[metric] += score.fmeasure

    # Average ROUGE scores
    for metric in rouge_scores:
        rouge_scores[metric] /= len(references)

    # Calculate BERTScore
    bert_results = bertscore.compute(predictions=predictions, references=references, lang="en")
    bert_f1 = sum(bert_results['f1']) / len(bert_results['f1'])

    # Calculate METEOR
    meteor_score = meteor.compute(predictions=predictions, references=references)['meteor']

    # Print results
    print("\n===== Evaluation Results =====")
    print(f"BLEU: {bleu_score:.4f}")
    print(f"ROUGE-1: {rouge_scores['rouge1']:.4f}")
    print(f"ROUGE-2: {rouge_scores['rouge2']:.4f}")
    print(f"ROUGE-L: {rouge_scores['rougeL']:.4f}")
    print(f"BERTScore F1: {bert_f1:.4f}")
    print(f"METEOR: {meteor_score:.4f}")

    # Display example outputs
    print("\n===== Example Outputs =====")
    for original, reference, prediction in list(zip(originals, references, predictions))[:5]:
        print(f"Original: {original}")
        print(f"Reference: {reference}")
        print(f"Prediction: {prediction}")
        print("-" * 50)

    results_df = pd.DataFrame({
        'original': originals,
        'reference': references,
        'prediction': predictions,
        'is_english': english_flags
    })
    results_df.to_csv('evaluation_results_gpt2.csv', index=False)

    return {
        'bleu': bleu_score,
        'rouge1': rouge_scores['rouge1'],
        'rouge2': rouge_scores['rouge2'],
        'rougeL': rouge_scores['rougeL'],
        'bertscore': bert_f1,
        'meteor': meteor_score,
        'english_percentage': english_percentage
    }

In [None]:
# Main function to run everything
def main():
    """Fine-tune GPT-2 on the prompt pairs, then score it on the held-out test set."""
    print("Starting prompt refinement model training and evaluation with GPT-2")

    # Training configuration; "gpt2-medium" can be swapped in for better results
    training_options = {
        "model_name": "gpt2",
        "batch_size": 8,
        "max_epochs": 3,
        "lr": 3e-5,
    }

    # Train, then evaluate the returned model on the returned test loader
    trained_model, gpt2_tokenizer, held_out_loader = train_model(**training_options)
    results = evaluate_model(trained_model, gpt2_tokenizer, held_out_loader)

if __name__ == "__main__":
    main()

Starting prompt refinement model training and evaluation with GPT-2


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Starting training...


INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type            | Params | Mode
-------------------------------------------------
0 | model | GPT2LMHeadModel | 124 M  | eval
-------------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
497.759   Total estimated model params size (MB)
0         Modules in train mode
164       Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/utilities/data.py:79: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 8. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.11/dist-packages/pytorch_lightning/utilities/data.py:79: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 2. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

Evaluating model...


Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


  0%|          | 0/27 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask

English outputs: 210/210 (100.00%)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



===== Evaluation Results =====
BLEU: 0.0336
ROUGE-1: 0.2775
ROUGE-2: 0.0865
ROUGE-L: 0.1952
BERTScore F1: 0.8705
METEOR: 0.2978

===== Example Outputs =====
Original: tell me why
"tell me why" continue
continue the song "tell me why"
Reference: Refined_Prompt: Continue the lyrics from the song "Tell Me Why" starting from the phrase "tell me why". Please provide the next few lines of the song.
Prediction: Refined_Prompt: Please compose a song that captures the essence of the lyrics to the popular song, "Tell me Why." The song should be upbeat, capturing the emotion of a joyful and hopeful moment. Focus on maintaining the chorus that conveys the message that "I'm not alone" and "We can help each other." Additionally, ensure that the theme of "we can" is clearly articulated. Ensure that each line is clear, concise, and catchy. Additionally and specifically, write a verse that describes the significance of this moment and the importance of friendship. Please ensure the verse is well-struc

# Build Your First RAG System

1. Data Ingestion.
2. Indexing.
3. Retriever.
4. Response Synthesizer.
5. Querying.

## Install Required packages

Download the required packages by executing the below commands in either Anaconda Prompt (in Windows) or Terminal (in Linux or Mac OS)

pip install llama-index

## Environment Variables

It is recommended to store the API keys in a '.env' file, separate from the code.
Please follow the steps below.
1. Create a text file with the name '.env'
2. Enter your api key in this format OPENAI_API_KEY='sk-e8943u9ru4982............'
3. Save and close the file

Then, as shown below you can provide the path of the '.env' file to 'load_dotenv' method.
This will load any API keys stored in the '.env' file.

## Start

In [None]:
import os

In [None]:
%%capture
!pip install llama-index

In [None]:
!pip install python-dotenv



In [None]:
from dotenv import load_dotenv, find_dotenv

In [None]:
# Upload the .env file into the Colab session; it is loaded in a later cell
from google.colab import files
uploaded = files.upload()

Saving .env to .env


In [None]:
# Pull the variables defined in the uploaded .env file into the environment
import os
from dotenv import load_dotenv

load_dotenv('.env')

# Fetch the OpenAI key and stop early when it is missing or empty
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
if OPENAI_API_KEY is None or OPENAI_API_KEY == "":
    raise ValueError("OPENAI_API_KEY not found!")

# Confirmation message (added while testing)
print("API Key loaded successfully ")

API Key loaded successfully 


# Stage 1: Data Ingestion

## Data Loaders


In [None]:
from llama_index.core import SimpleDirectoryReader

In [None]:
# Read Photosynthesis.pdf; the result is a list of Document objects (one per PDF page here)
documents = SimpleDirectoryReader(input_files=['Photosynthesis.pdf']).load_data()

In [None]:
# Check the datatype and length of the loaded documents
type(documents)

list

In [None]:
# total number of pages read from the PDF
len(documents)

10

In [None]:
documents[0]

Document(id_='73196265-87af-4e6a-a743-cb97493f3eee', embedding=None, metadata={'page_label': '1', 'file_name': 'Photosynthesis.pdf', 'file_path': 'Photosynthesis.pdf', 'file_type': 'application/pdf', 'file_size': 663220, 'creation_date': '2025-04-26', 'last_modified_date': '2025-04-26'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='Photosynthesis \nPhotosynthesis is the process by which plants, some bacteria and some protistans use the energy \nfrom sunlight to produce glucose from carbon dioxide and water. This glucose can be converted into \npyruvate which releases adenosine triphosphate (ATP) by cellular respiration. Oxygen is also f

In [None]:
# Retrieve the first document, which is the first page in the PDF
documents[0]

Document(id_='73196265-87af-4e6a-a743-cb97493f3eee', embedding=None, metadata={'page_label': '1', 'file_name': 'Photosynthesis.pdf', 'file_path': 'Photosynthesis.pdf', 'file_type': 'application/pdf', 'file_size': 663220, 'creation_date': '2025-04-26', 'last_modified_date': '2025-04-26'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='Photosynthesis \nPhotosynthesis is the process by which plants, some bacteria and some protistans use the energy \nfrom sunlight to produce glucose from carbon dioxide and water. This glucose can be converted into \npyruvate which releases adenosine triphosphate (ATP) by cellular respiration. Oxygen is also f

We can also access specific attributes of the document, such as its ID and metadata:

In [None]:
# Get the ID of the first document
documents[0].id_

'73196265-87af-4e6a-a743-cb97493f3eee'

In [None]:
# doc_id returns the same identifier as id_ for this document
documents[0].doc_id

'73196265-87af-4e6a-a743-cb97493f3eee'

In [None]:
# Get the metadata of the first document
documents[0].metadata

{'page_label': '1',
 'file_name': 'Photosynthesis.pdf',
 'file_path': 'Photosynthesis.pdf',
 'file_type': 'application/pdf',
 'file_size': 663220,
 'creation_date': '2025-04-26',
 'last_modified_date': '2025-04-26'}

In [None]:
# Get the text content of the first document
print(documents[0].text)

Photosynthesis 
Photosynthesis is the process by which plants, some bacteria and some protistans use the energy 
from sunlight to produce glucose from carbon dioxide and water. This glucose can be converted into 
pyruvate which releases adenosine triphosphate (ATP) by cellular respiration. Oxygen is also formed. 
Photosynthesis may be summarised by the word equation: 
carbon dioxide + water 
 
glucose + oxygen 
The conversion of usable sunlight energy into chemical energy is associated with the action of the 
green pigment chlorophyll. 
Chlorophyll is a complex molecule. Several modifications of chlorophyll occur among plants and other 
photosynthetic organisms. All photosynthetic organisms have chlorophyll a. Accessory pigments 
absorb energy that chlorophyll a does not absorb. Accessory pigments include chlorophyll b (also c, 
d, and e in algae and protistans), xanthophylls, and carotenoids (such as beta-carotene). Chlorophyll 
a absorbs its energy from the violet-blue and reddish or

## Embedding Model

Next, we need to prepare our document for embedding and interaction with a large language model. We will use the OpenAI API for this purpose.

In [None]:
# Embedding Model
from llama_index.embeddings.openai import OpenAIEmbedding

In [None]:
# Initialize the embedding model
embed_model = OpenAIEmbedding(model="text-embedding-3-large")

## LLM

Similarly, let's set up our large language model (LLM):

In [None]:
# LLM
from llama_index.llms.openai import OpenAI

In [None]:
# Initialize the large language model
llm = OpenAI(model= "gpt-4o")

# Stage 2: Indexing

In [None]:
# Indexing
from llama_index.core import VectorStoreIndex

Here, we use the `VectorStoreIndex` class to create an index from the loaded documents. We pass the documents and the embedding model to the `from_documents` method; the `llm` argument is commented out, so only the embedding model is used to build the index.

In [None]:
# Create an index from the documents using the embedding model
# (the llm argument is left commented out, so no LLM is passed here)
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model) #, llm=llm)

# Stage 3: Retrieval

In [None]:
# Setting up the Index as Retriever
retriever = index.as_retriever()

In [None]:
# Retrieve the nodes that best match the query "What is self attention?"
# NOTE(review): the indexed PDF is about photosynthesis, so this query looks
# like a leftover from another tutorial - confirm the intended question
retrieved_nodes = retriever.retrieve("What is self attention?")

In [None]:
# Get the metadata of the first retrieved node
retrieved_nodes[0].metadata

{'page_label': '8',
 'file_name': 'Photosynthesis.pdf',
 'file_path': 'Photosynthesis.pdf',
 'file_type': 'application/pdf',
 'file_size': 663220,
 'creation_date': '2025-04-26',
 'last_modified_date': '2025-04-26'}

In [None]:
# Access the ID of the first retrieved node
retrieved_nodes[0].id_

'fcab3c4f-1e82-4f0f-a665-aa0e70ea80b1'

In [None]:
# Access the node_id of the first retrieved node
retrieved_nodes[0].node_id

'fcab3c4f-1e82-4f0f-a665-aa0e70ea80b1'

In [None]:
# Access the full node object of the first retrieved node
retrieved_nodes[0].node

TextNode(id_='fcab3c4f-1e82-4f0f-a665-aa0e70ea80b1', embedding=None, metadata={'page_label': '8', 'file_name': 'Photosynthesis.pdf', 'file_path': 'Photosynthesis.pdf', 'file_type': 'application/pdf', 'file_size': 663220, 'creation_date': '2025-04-26', 'last_modified_date': '2025-04-26'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='4876f8f8-e3a4-43b5-942f-b5d27478819d', node_type='4', metadata={'page_label': '8', 'file_name': 'Photosynthesis.pdf', 'file_path': 'Photosynthesis.pdf', 'file_type': 'application/pdf', 'file_size': 663220, 'creation_date': '2025-04-26', 'last_modified_date': '2025-04-26'}, hash='269206bde8e101b5481e3de3718504d1cefbc77869899d9c81081765aeaeeeb1')}, metadata_template='{key}: {val

In [None]:
# Access the text content of the first retrieved node
print(retrieved_nodes[0].text)

Cyclic phosphorylation 
The net effect of non-cyclic phosphorylation is to pass electrons from water to NADP. Energy 
released enables the production of ATP. But much more ATP is needed to drive the light -independent 
reactions. 
This extra energy is obtained from cyclic phosphorylation. This involves only Photosystem I which 
generates excited electrons. These are transferred to the electron transport chain between PSII and 
PSI, rather than to NADP+ and so no NADPH is formed. The cycle is completed by electrons being 
transported back to PSI by the electron transport system. 
The light-independent reactions 
In the Light-Independent Process (the Dark reaction) carbon dioxide from the atmosphere (or water 
for aquatic/marine organisms) is captured and modified by the addition of hydrogen to form 
carbohydrates. The incorporation of carbon dioxide into organic compounds is known as  carbon 
fixation. The energy for this comes from the first phase of the photosynthetic process. Living 

In [None]:
# Get the metadata of the second retrieved node
retrieved_nodes[1].metadata

{'page_label': '4',
 'file_name': 'Photosynthesis.pdf',
 'file_path': 'Photosynthesis.pdf',
 'file_type': 'application/pdf',
 'file_size': 663220,
 'creation_date': '2025-04-26',
 'last_modified_date': '2025-04-26'}

In [None]:
# Print the text content of the second retrieved node
print(retrieved_nodes[1].text)

The light-dependent reactions 
When light energy is absorbed by a chlorophyll molecule its electrons gain energy and move to higher 
energy levels in the molecule (photoexcitation). Sufficient energy ionises the molecule, with the 
electron being 'freed' leaving a positively charged chlorophyll ion. This is called photoionisation.  
In whole chloroplasts each chlorophyll molecule is associated with an electron acceptor and 
an electron donor. These three molecules make up the core of a photosystem. Two electrons from 
a photoionised chlorophyll molecule are transferred to the electron acceptor. The positively charged 
chlorophyll ion then takes a pair of electrons from a neighbouring electron donor such as water.  
 
An electron transfer system (a series of chemical reactions) carries the two electrons to and fro across 
the thylakoid membrane. The energy to drive these processes comes from two photosystems:  
• Photosystem II (PSII) (P680) 
• Photosystem I (PSI) (P700) 
It may seem co

# Stage 4: Response Synthesis


In [None]:
from llama_index.core import get_response_synthesizer

In [None]:
# Initialize the response synthesizer with the LLM
response_synthesizer = get_response_synthesizer(llm=llm)

## Stage 5: Query Engine

In [None]:
# Create a query engine using the index, LLM, and response synthesizer
query_engine = index.as_query_engine(llm=llm, response_synthesizer=response_synthesizer)

In [None]:
# Query the LLM using the query engine
response = query_engine.query("i have a headache, a sore throat and a high temperature, give me a diagnosis")

In this command, we query the LLM with a description of symptoms and a request for a diagnosis, and store the result in the `response` variable.

To view the response generated by the LLM, we can access the `response` attribute:


In [None]:
# View the response from the LLM
response.response

"I'm sorry to hear that you're not feeling well. However, I can't provide a medical diagnosis. It's best to consult a healthcare professional who can evaluate your symptoms and provide appropriate advice or treatment."

This returns the synthesized answer to our query.

We can further analyze the response by checking its length and inspecting the source nodes used to generate it:


The next two commands provide the length of the response and the number of source nodes, respectively.

In [None]:
# Check the length of the response
len(response.response) # number of characters in the response

216

In [None]:
# Check the number of source nodes
len(response.source_nodes)  # list of 2 nodes

2

In [None]:
# Access the ID of the first source node
response.source_nodes[0].id_

'dd2a4998-3dc8-4c34-85f3-3e3eadf4d050'

In [None]:
# Access the metadata of the first source node
response.source_nodes[0].metadata

{'page_label': '5',
 'file_name': 'Photosynthesis.pdf',
 'file_path': 'Photosynthesis.pdf',
 'file_type': 'application/pdf',
 'file_size': 663220,
 'creation_date': '2025-04-26',
 'last_modified_date': '2025-04-26'}

In [None]:
response.source_nodes[1].id_

'07a969e3-b34f-41db-af64-446691657206'

In [None]:
response.source_nodes[1].metadata

{'page_label': '10',
 'file_name': 'Photosynthesis.pdf',
 'file_path': 'Photosynthesis.pdf',
 'file_type': 'application/pdf',
 'file_size': 663220,
 'creation_date': '2025-04-26',
 'last_modified_date': '2025-04-26'}

In [None]:
print(query_engine.query("Why are positional encodings important in transformer? answer in one very short sentence").response)

Positional encodings are important in transformers to provide information about the order of the input sequence.


In [None]:
print(query_engine.query("What are Encoder and Decoder blocks in transformer?").response)

The context does not provide information about Encoder and Decoder blocks in transformers.


In [None]:
query = "If I want to generate document embeddings, then which type of Transformer Architecture I must choose?"
print(query_engine.query(query).response)

The context does not provide information about generating document embeddings or the type of Transformer Architecture to choose for that purpose.


In [None]:
# Same question as the previous cell, but this time the candidate
# architectures are listed explicitly in the query text.
# NOTE(review): "Encoder-Decorder" in the query looks like a typo for
# "Encoder-Decoder" -- left unchanged here because it is part of the query sent.
query = """If I want to generate document embeddings,
then which type of Transformer Architecture I must choose among Encoders, Decoders or Encoder-Decorder?"""

print(query_engine.query(query).response)

To generate document embeddings, you should choose the Encoder architecture.


In [None]:
!pip install gradio

In [None]:
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and model we trained earlier using HuggingFace AutoClasses.
# NOTE(review): "final_model_gpt2" is presumably the location the fine-tuned
# model was saved to earlier in this notebook -- confirm it exists before
# running this cell.
tokenizer = AutoTokenizer.from_pretrained("final_model_gpt2")
# Using CausalLM instead of Seq2Seq because we're working with GPT-2
refinement_model = AutoModelForCausalLM.from_pretrained("final_model_gpt2")
# The model is only used for generation from here on, so put it in evaluation mode
refinement_model.eval()
# Run on the GPU when one is available, otherwise fall back to the CPU
# (relies on `torch` having been imported earlier in the notebook)
refinement_model.to('cuda' if torch.cuda.is_available() else 'cpu')

# This function takes a prompt and improves it using the trained model
def refine_prompt(prompt):
    """Rewrite ``prompt`` with the fine-tuned GPT-2 refinement model.

    The prompt is wrapped in a fixed instruction and a continuation is sampled
    from ``refinement_model``. Only the newly generated text (not the echoed
    instruction and prompt) is post-processed: it is cut at the last
    sentence-ending punctuation mark, and anything up to and including a
    "Refined prompt:" marker is dropped.

    Args:
        prompt: The raw user prompt.

    Returns:
        The refined prompt as a string. An empty or whitespace-only ``prompt``
        (or ``None``) yields "" without calling the model. If the model
        produces nothing usable, the original prompt (stripped) is returned so
        the caller never ends up with less than it started with.
    """
    # Nothing to refine: skip the model call rather than sampling from the
    # bare instruction.
    if prompt is None or not prompt.strip():
        return ""

    # Set up the input format that GPT-2 expects
    formatted_prompt = f"Refine the following prompt concisely in English,stay true to the original meaning: {prompt}"

    encoded = tokenizer(formatted_prompt, return_tensors="pt").to(refinement_model.device)
    input_ids = encoded["input_ids"]

    # Generate a refined version with some controlled randomness
    outputs = refinement_model.generate(
        input_ids=input_ids,
        # Pass the mask explicitly so generate() does not have to guess it.
        attention_mask=encoded["attention_mask"],
        # Budget applies to the continuation only; max_length would also count
        # the instruction and user prompt, starving long prompts of output.
        max_new_tokens=100,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        no_repeat_ngram_size=2,
        num_return_sequences=1
    )

    # A causal LM returns the input tokens followed by the new ones; decode
    # only the continuation so the instruction text is not echoed back.
    generated_ids = outputs[0][input_ids.shape[-1]:]
    refined_part = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # Try to cut the text cleanly at the last sentence-ending punctuation mark
    last_punctuation_index = max(refined_part.rfind(mark) for mark in ".!?")
    if last_punctuation_index != -1:
        refined_part = refined_part[:last_punctuation_index + 1]

    # Always drop a leading "Refined prompt:" marker and surrounding
    # whitespace, whether or not a punctuation mark was found.
    refined_part = refined_part.split("Refined prompt:")[-1].strip()

    return refined_part or prompt.strip()

# Forward the prompt to the RAG query engine and hand back its answer
def predict(prompt):
    """Run ``prompt`` through ``query_engine`` and return the response text."""
    return query_engine.query(prompt).response

# Set up the Gradio user interface using Blocks
# Layout: one row holding the prompt box and the two buttons; the output box
# is declared after the row, outside of it.
with gr.Blocks() as iface:
    with gr.Row():  # Put elements side by side in a row
        prompt_box = gr.Textbox(lines=2, placeholder="Enter your prompt here...", label="Prompt")
        refine_button = gr.Button("Refine Prompt")
        submit_button = gr.Button("Submit")  # Button to submit the prompt to the chatbot
    output_box = gr.Textbox(label="Output")

    # When the "Refine Prompt" button is clicked, this function is executed
    def refine_prompt_handler(prompt):
        """Refine the given prompt text and return it wrapped in a Gradio update."""
        refined_prompt = refine_prompt(prompt)
        return gr.update(value=refined_prompt)  # Update the prompt box with the refined version

    # Connect the refine button to the handler
    # The prompt box is both input and output, so the refined text overwrites
    # what the user typed and is what a later "Submit" click will send.
    refine_button.click(
        refine_prompt_handler,
        inputs=prompt_box,  # Take the user's input
        outputs=prompt_box  # Show the refined prompt in the same box
    )

    # Connect the submit button to the predict function
    submit_button.click(
        predict,  # Call the predict function to get a response
        inputs=prompt_box,  # Use whatever is currently in the prompt box
        outputs=output_box  # Show the response in the output box
    )

# Launch the Gradio app
iface.launch()

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ab0339cdd70be34a75.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


