In [1]:
# Installing necessary libraries

!pip install sentence-transformers pandas googletrans==4.0.0-rc1 datasets transformers gradio


Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting gradio
  Downloading gradio-5.7.1-py3-none-any.whl.metadata (16 kB)
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2024.12.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting httpcore==0

In [2]:
# Importing required libraries

import pandas as pd

from sentence_transformers import SentenceTransformer, util

from googletrans import Translator

from transformers import pipeline

import gradio as gr


In [3]:
# Initializing models and translator

translator = Translator()  # Initializing the Google Translator to translate text

english_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")  # Initializing English question-answering pipeline using distilBERT

multilingual_pipeline = pipeline("question-answering", model="xlm-roberta-base")  # Initializing Multilingual question-answering pipeline using XLM-RoBERTa

sentence_model = SentenceTransformer("all-MiniLM-L6-v2")  # Initializing the sentence transformer model to generate embeddings




config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [4]:
# Defining dataset with questions and contexts

data = [

    {"context": "Artificial Intelligence (AI) refers to the simulation of human intelligence in machines.",

     "question": "What does AI refer to?"},

    {"context": "The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France.",

     "question": "Where is the Eiffel Tower located?"},

    {"context": "The Mona Lisa is a portrait painting by the Italian artist Leonardo da Vinci.",

     "question": "Who painted the Mona Lisa?"},

    {"context": "The Great Wall of China is a series of fortifications made of various materials.",

     "question": "What is the Great Wall of China made of?"},

    {"context": "The speed of light in a vacuum is approximately 299,792 kilometers per second.",

     "question": "What is the speed of light?"}

]



# Translating dataset to Bangla

for row in data:

    row['context_bn'] = translator.translate(row['context'], src='en', dest='bn').text  # Translating context to Bangla

    row['question_bn'] = translator.translate(row['question'], src='en', dest='bn').text  # Translating question to Bangla




In [5]:
# Defining a function to retrieve the most relevant context for a given question

def retrieve_context(question, documents):

    question_embedding = sentence_model.encode(question, convert_to_tensor=True)  # Encoding the question to get its embedding

    document_embeddings = sentence_model.encode(documents, convert_to_tensor=True)  # Encoding the documents (contexts) to get their embeddings

    scores = util.pytorch_cos_sim(question_embedding, document_embeddings)  # Calculating the cosine similarity between the question and the documents

    best_doc_idx = scores.argmax().item()  # Finding the index of the most relevant document

    return documents[best_doc_idx]  # Returning the most relevant document (context)




In [6]:
# Defining a function to generate answers in both English and Bangla

def generate_answers(question, idx):

    context_en = retrieve_context(question, [data[idx]['context']])  # Retrieving the English context

    context_bn = retrieve_context(question, [data[idx]['context_bn']])  # Retrieving the Bangla context



    try:

        en_result = english_pipeline({'context': context_en, 'question': question})  # Generating answer in English using the English pipeline

        en_answer = en_result['answer']  # Extracting the English answer

    except Exception as e:

        en_answer = f"Error: {e}"  # Handling any errors while generating the English answer



    try:

        bn_question = translator.translate(question, src='en', dest='bn').text  # Translating the question to Bangla

        bn_result = multilingual_pipeline({'context': context_bn, 'question': bn_question})  # Generating answer in Bangla using the multilingual pipeline

        bn_answer = bn_result['answer']  # Extracting the Bangla answer

    except Exception as e:

        bn_answer = f"Error: {e}"  # Handling any errors while generating the Bangla answer



    # Returning the answers along with the context in both English and Bangla

    return f"English Context:\n{context_en}\n\nEnglish Answer:\n{en_answer}\n\nBangla Context:\n{context_bn}\n\nBangla Answer:\n{bn_answer}"




In [7]:
# Defining Gradio Interface function

def qa_interface(question, idx):

    idx = int(idx) - 1  # Adjusting the index to be 0-based as Python uses 0-based indexing

    return generate_answers(question, idx)  # Returning the generated answers by calling the generate_answers function




In [8]:
# Defining Gradio input and output components

questions = [f"{i+1}: {row['question']}" for i, row in enumerate(data)]  # Creating a list of questions for the interface




In [9]:
# Creating Gradio interface with input text box for question and number input for dataset index

interface = gr.Interface(

    fn=qa_interface,  # Function that will be called on user input

    inputs=[gr.Textbox(label="Question"), gr.Number(label="Dataset Index (1-based)")],  # Inputs for question and dataset index

    outputs="text",  # Output type: text will be shown in the output area

    title="Question Answering System",  # Title of the app

    description="Provide a question and the dataset index to retrieve the relevant context and generate answers."  # Description of the app functionality

)






IMPORTANT: You are using gradio version 4.19.1, however version 4.44.1 is available, please upgrade.
--------


In [11]:
# Launching the Gradio app

interface.launch()  # Launching the interface to be accessible in the browser

Rerunning server... use `close()` to stop if you need to change `launch()` parameters.
----
Kaggle notebooks require sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Running on public URL: https://31858fa099ba45d10a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


