In [None]:
# Installing necessary libraries
!pip install sentence-transformers pandas googletrans==4.0.0-rc1 datasets transformers



In [None]:
# Importing required libraries
import pandas as pd
from sentence_transformers import SentenceTransformer, util  # For semantic similarity and retrieval
from googletrans import Translator  # For translating between English and Bangla
from transformers import pipeline  # For performing question-answering tasks

In [None]:
# Initializing translator, QA pipelines, and SentenceTransformer model
translator = Translator()  # Translator for handling language translation tasks
english_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")  # English QA model
# The bare "xlm-roberta-base" checkpoint ships without a trained question-answering head:
# transformers initializes qa_outputs.weight / qa_outputs.bias randomly and warns that the
# model must be trained first, so its predicted spans are meaningless. Load a checkpoint that
# was fine-tuned for extractive QA instead.
# NOTE(review): this checkpoint was fine-tuned on English SQuAD 2.0 data; answer quality on
# Bangla relies on XLM-R's cross-lingual transfer and should be spot-checked.
multilingual_pipeline = pipeline("question-answering", model="deepset/xlm-roberta-base-squad2")  # Multilingual QA model
# NOTE(review): all-MiniLM-L6-v2 is presumably English-centric; confirm it is adequate before
# using it to rank Bangla documents against each other.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")  # Sentence embedding model for context retrieval

Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
### PART 1: Bigger Dataset Creation ###

# Creating a larger English dataset for testing
def create_large_combined_dataset(output_csv="large_combined_dataset.csv"):
    """
    Build the small English context/question dataset and write it to a CSV file.

    The resulting file has two columns, "context" and "question", and one row per
    entry of the hard-coded list below. The file is written without the index column.

    :param output_csv: Destination path of the CSV file. Defaults to
        "large_combined_dataset.csv", the file name this function always used before
        the path became configurable.
    :return: None
    """
    # Each entry pairs a short passage with a question that the passage answers
    data = [
        {
            "context": "Photosynthesis is the process by which green plants and some other organisms use sunlight to synthesize foods from carbon dioxide and water.",
            "question": "What is photosynthesis?"
        },
        {
            "context": "The Eiffel Tower is one of the most iconic structures in the world, located in Paris, France.",
            "question": "Where is the Eiffel Tower located?"
        },
        {
            "context": "Blockchain is a decentralized digital ledger used for recording transactions securely and transparently.",
            "question": "What is blockchain?"
        },
        {
            "context": "Leonardo da Vinci was a polymath of the Renaissance period known for his works such as the Mona Lisa and The Last Supper.",
            "question": "Who was Leonardo da Vinci?"
        },
        {
            "context": "Climate change refers to long-term shifts in temperatures and weather patterns, primarily due to human activities.",
            "question": "What is climate change?"
        },
        {
            "context": "The Amazon rainforest is the largest tropical rainforest in the world, home to diverse flora and fauna.",
            "question": "What is the Amazon rainforest known for?"
        },
        {
            "context": "Python is a versatile programming language that supports multiple programming paradigms.",
            "question": "What type of programming language is Python?"
        },
        {
            "context": "The Moon orbits the Earth and is its only natural satellite, influencing tides and nighttime illumination.",
            "question": "What is the Moon's relationship with Earth?"
        }
    ]
    # Creating a DataFrame and saving it to a CSV file
    df = pd.DataFrame(data)
    df.to_csv(output_csv, index=False)
    # Report the path that was actually written so the message cannot drift from the file name
    print(f"Large English dataset created and saved to {output_csv}")

In [None]:
# Build the English dataset and write it to large_combined_dataset.csv,
# the file that the translation step reads as its input
create_large_combined_dataset()

Large English dataset created and saved to large_combined_dataset.csv


In [None]:
### PART 2: Translating Dataset to Bangla ###

# Translating the English dataset to Bangla
def translate_to_bangla(input_csv, output_csv):
    """
    Add Bangla translations of the context and question columns and save the result.

    Reads the CSV at input_csv, translates every value of the "context" and
    "question" columns from English to Bangla with the module-level translator,
    stores the translations in "context_bn" and "question_bn", prints the first
    rows, and writes the full table (without the index) to output_csv.

    :param input_csv: Path of the English CSV with "context" and "question" columns
    :param output_csv: Path the translated CSV is written to
    :return: None
    """
    def _en_to_bn(text):
        # One translator call per cell value
        return translator.translate(text, src='en', dest='bn').text

    frame = pd.read_csv(input_csv)

    # Context is translated first, then the question, one column at a time
    for english_col, bangla_col in (('context', 'context_bn'), ('question', 'question_bn')):
        frame[bangla_col] = frame[english_col].apply(_en_to_bn)

    # Preview of the first rows so the translations can be checked by eye
    print(frame.head())

    frame.to_csv(output_csv, index=False)
    print(f"Translated Bangla dataset saved to {output_csv}")

In [None]:
# Translate the English CSV into Bangla: adds the context_bn / question_bn columns and
# writes translated_large_combined_dataset.csv. translator.translate is invoked once per
# context and once per question, so this cell needs network access.
translate_to_bangla("large_combined_dataset.csv", "translated_large_combined_dataset.csv")

                                             context  \
0  Photosynthesis is the process by which green p...   
1  The Eiffel Tower is one of the most iconic str...   
2  Blockchain is a decentralized digital ledger u...   
3  Leonardo da Vinci was a polymath of the Renais...   
4  Climate change refers to long-term shifts in t...   

                             question  \
0             What is photosynthesis?   
1  Where is the Eiffel Tower located?   
2                 What is blockchain?   
3          Who was Leonardo da Vinci?   
4             What is climate change?   

                                          context_bn  \
0  সালোকসংশ্লেষণ হ'ল প্রক্রিয়া যার মাধ্যমে সবুজ ...   
1  আইফেল টাওয়ারটি ফ্রান্সের প্যারিসে অবস্থিত বিশ...   
2  ব্লকচেইন হ'ল একটি বিকেন্দ্রীভূত ডিজিটাল লেজার ...   
3  লিওনার্দো দা ভিঞ্চি ছিলেন মোনা লিসা এবং দ্য লা...   
4  জলবায়ু পরিবর্তন মূলত মানুষের ক্রিয়াকলাপের কা...   

                       question_bn  
0                সালোকসংশ্লেষণ কী?  
1  আই

In [None]:
### PART 3: Simplified Retrieval (RAG) ###

# Retrieving the most relevant context using semantic similarity
def retrieve_context(question, documents):
    """
    Retrieve the most relevant context from documents using sentence embeddings.

    The question and every document are embedded with the module-level sentence_model;
    the document whose embedding has the highest cosine similarity to the question is
    returned.

    :param question: User's question
    :param documents: Candidate documents (any iterable of strings; it is materialized
        into a list so the winning position can be looked up by index)
    :return: Most relevant context
    :raises ValueError: If documents contains no candidates
    """
    # Materialize once: generators would otherwise be exhausted by encode(), and
    # positional lookup below must not depend on the container's own indexing rules
    documents = list(documents)
    if not documents:
        # Fail with a clear message instead of an opaque error from encode()/argmax()
        raise ValueError("documents must contain at least one candidate context")

    # Encoding the question and documents as sentence embeddings
    question_embedding = sentence_model.encode(question, convert_to_tensor=True)
    document_embeddings = sentence_model.encode(documents, convert_to_tensor=True)

    # Calculating similarity scores between the question and every document
    scores = util.pytorch_cos_sim(question_embedding, document_embeddings)
    best_doc_idx = scores.argmax().item()  # Getting the index of the best matching document

    return documents[best_doc_idx]

In [None]:
### PART 4: Answer Ranking (length heuristic standing in for Direct Preference Optimization, DPO) ###

# Ranking answers based on a simple heuristic
def rank_answers_with_dpo(answers):
    """
    Choose one answer from the candidates, scoring each by its length.

    Despite the name, nothing is optimized or learned here: the score of a
    candidate is simply len(candidate).
    :param answers: List of answers
    :return: The longest answer; when several share the maximum length, the earliest one
    """
    # max() keeps the first maximal element, so ties resolve to the earliest candidate
    return max(answers, key=lambda candidate: len(candidate))

In [None]:
### PART 5: Processing and Printing English and Bangla Answers Separately ###

# Processing the dataset and printing English and Bangla answers separately
def _print_qa_table(df, qa_pipeline, context_col, question_col, language):
    """
    Run one QA pass over df and print a header plus one table row per dataset row.

    :param df: DataFrame holding the context and question columns
    :param qa_pipeline: Question-answering pipeline called with a context/question dict
    :param context_col: Name of the column containing the passage
    :param question_col: Name of the column containing the question (same language as the passage)
    :param language: Label used in the column headers, e.g. "English"
    :return: None
    """
    print("{:<5} {:<80} {:<50}".format("No.", f"{language} Context", f"{language} Answer"))
    print("=" * 140)

    # Number rows by position so the "No." column does not depend on the DataFrame index
    for number, (_, row) in enumerate(df.iterrows(), start=1):
        # Retrieval uses the question in the same language as the candidate context.
        # Only the row's own context is offered, so retrieval always returns that context.
        retrieved_context = retrieve_context(row[question_col], [row[context_col]])

        # Best effort: a failing row is reported in the table instead of aborting the pass
        try:
            result = qa_pipeline({'context': retrieved_context, 'question': row[question_col]})
            answer = result['answer']
        except Exception as e:
            answer = f"Error: {e}"

        print("{:<5} {:<80} {:<50}".format(number, row[context_col], answer))


def process_and_print_answers(input_csv):
    """
    Answer every question in the translated dataset and print the results per language.

    Reads input_csv and prints an "English QA Results" table (columns "context" /
    "question", answered by english_pipeline) followed by a "Bangla QA Results" table
    (columns "context_bn" / "question_bn", answered by multilingual_pipeline).

    :param input_csv: Path of the CSV containing the English and Bangla columns
    :return: None
    """
    # Reading the dataset
    df = pd.read_csv(input_csv)

    print("English QA Results")
    _print_qa_table(df, english_pipeline, 'context', 'question', "English")

    print("\n\nBangla QA Results")
    # The Bangla pass uses question_bn for retrieval as well as for answering; previously
    # retrieval compared the English question against the Bangla context.
    _print_qa_table(df, multilingual_pipeline, 'context_bn', 'question_bn', "Bangla")

In [None]:
# Run both QA passes over the translated CSV and print the English results table
# followed by the Bangla results table
process_and_print_answers("translated_large_combined_dataset.csv")

English QA Results
No.   English Context                                                                  English Answer                                    
1     Photosynthesis is the process by which green plants and some other organisms use sunlight to synthesize foods from carbon dioxide and water. the process by which green plants and some other organisms use sunlight




2     The Eiffel Tower is one of the most iconic structures in the world, located in Paris, France. Paris, France                                     
3     Blockchain is a decentralized digital ledger used for recording transactions securely and transparently. a decentralized digital ledger                    
4     Leonardo da Vinci was a polymath of the Renaissance period known for his works such as the Mona Lisa and The Last Supper. a polymath                                        
5     Climate change refers to long-term shifts in temperatures and weather patterns, primarily due to human activities. long-term shifts in temperatures and weather patterns
6     The Amazon rainforest is the largest tropical rainforest in the world, home to diverse flora and fauna. diverse flora and fauna                           
7     Python is a versatile programming language that supports multiple programming paradigms. versatile                                         
8     The Moon orbits the 