In [3]:
!pip install rank_bm25


Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [4]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.20.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.11-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.7.2 (from gradio)
  Downloading gradio_client-1.7.2-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3

In [1]:
import json
import requests
import pandas as pd
from rank_bm25 import BM25Okapi
import nltk
from nltk.tokenize import word_tokenize
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
import gradio as gr

# Download the NLTK tokenizer data needed by word_tokenize.
# Recent NLTK releases load the Punkt model from 'punkt_tab' while older ones
# use 'punkt'; fetching both keeps the notebook runnable on a fresh kernel
# regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

# Download the SQuAD v2.0 training set as JSON
SQUAD_URL = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(SQUAD_URL, timeout=60)
# Surface HTTP failures here as an HTTPError rather than as a confusing
# JSON decode error on an error page.
response.raise_for_status()
squad_data = response.json()

# Flatten the nested SQuAD JSON (articles -> paragraphs -> questions).
# `contexts` holds one entry per paragraph; `qa_pairs` holds one
# [context, question, answer] row per question.
paragraphs = [
    para
    for entry in squad_data["data"]
    for para in entry["paragraphs"]
]
contexts = [para["context"] for para in paragraphs]
qa_pairs = [
    [
        para["context"],
        item["question"],
        # Keep only the first listed answer; questions with an empty
        # answer list get a placeholder string instead.
        item["answers"][0]["text"] if item["answers"] else "No Answer",
    ]
    for para in paragraphs
    for item in para["qas"]
]

# Tabulate the rows and preview the first few
df = pd.DataFrame(data=qa_pairs, columns=["context", "question", "answer"])
print(df.head())

# Persist the table (without the index column) for later use
df.to_csv("qa_dataset.csv", index=False)

# Build the BM25 index: every context is lower-cased, then split into word tokens
tokenized_corpus = list(map(word_tokenize, map(str.lower, contexts)))
bm25 = BM25Okapi(tokenized_corpus)

def retrieve_answer(question):
    """Return the indexed context that scores highest under BM25 for `question`.

    The question is lower-cased and word-tokenized, scored against every
    context in the BM25 index, and the text of the top-scoring context is
    returned. Note that, despite the name, the return value is a context
    passage rather than an answer.
    """
    query_tokens = word_tokenize(question.lower())
    top_index = bm25.get_scores(query_tokens).argmax()
    return contexts[top_index]

# Load the pretrained T5 tokenizer and seq2seq model from the same checkpoint
T5_CHECKPOINT = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(T5_CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(T5_CHECKPOINT)

def generate_answer(question, context, max_input_tokens=512):
    """Generate an answer to `question` from `context` with the T5 model.

    Args:
        question: The question text.
        context: The passage the model should answer from.
        max_input_tokens: Upper bound on the encoded prompt length. Longer
            prompts are truncated so that a long retrieved passage cannot
            produce an over-length input. The default of 512 is intended to
            match the T5 tokenizer's usual limit.

    Returns:
        The decoded model output as a string, with special tokens removed.
    """
    input_text = f"question: {question} context: {context}"
    encoded = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    )
    # Keep the inputs on the same device as the model so this keeps working
    # if the model is later moved to a GPU.
    input_ids = encoded["input_ids"].to(model.device)
    attention_mask = encoded["attention_mask"].to(model.device)
    output_ids = model.generate(input_ids, attention_mask=attention_mask)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Gradio UI
def qa_interface(question):
    """Answer `question` end to end: retrieve a context, then generate from it.

    Args:
        question: The user's question as typed into the UI.

    Returns:
        A string containing the retrieved context and the generated answer,
        or a short prompt message when the question is empty.
    """
    # An empty query gives every document a BM25 score of zero, so retrieval
    # would silently fall back to the first context; ask for input instead.
    if not question or not question.strip():
        return "Please enter a question."
    retrieved_context = retrieve_answer(question)
    generated_answer = generate_answer(question, retrieved_context)
    return f"Context: {retrieved_context}\n\nAnswer: {generated_answer}"

# Wrap the QA function in a text-in / text-out Gradio app and start serving it
iface = gr.Interface(
    fn=qa_interface,
    inputs="text",
    outputs="text",
    title="AI Question Answering System",
)
iface.launch()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                             context  \
0  Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...   
1  Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...   
2  Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...   
3  Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...   
4  Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...   

                                            question               answer  
0           When did Beyonce start becoming popular?    in the late 1990s  
1  What areas did Beyonce compete in when she was...  singing and dancing  
2  When did Beyonce leave Destiny's Child and bec...                 2003  
3      In what city and state did Beyonce  grow up?        Houston, Texas  
4         In which decade did Beyonce become famous?           late 1990s  


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://9570b851a424d9ba5a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


