# Installing the necessary Modules

In [None]:
!pip install datasets

# Importing Necessary Libraries

In [55]:
import re
import requests
from datasets import load_dataset
from typing import List, Tuple, Dict
from dotenv import load_dotenv
import os
import json
# Read variables from a local .env file (if any) into the process environment.
load_dotenv()

# API key sent as the Bearer token by api_calling; a missing variable raises KeyError here.
DEEPSEEK_API = os.environ['DEEPSEEK_API']

# Step 1. Loading the dataset

In [2]:
# Load every split of the Bangla MMLU benchmark.
dataset = load_dataset("hishab/bangla-mmlu")

# Bind each split to its own module-level name.
validation, test, dev = (
    dataset[split_name] for split_name in ("validation", "test", "dev")
)

# Step 2. Working with a single Set (i.e., Validation)

## Formatting The Question

In [3]:
def format_question(sample: Dict) -> Dict:
    """
    Build the LLM prompt for one multiple-choice sample.

    The first four entries of ``sample['choices']`` are labelled "A" to "D" and
    embedded, together with the question, in an English instruction, e.g.
        Question in Bangla: <question>
        Options in Bangla: {'A': ..., 'B': ..., 'C': ..., 'D': ...}

    Inputs:
    sample = dict with the keys 'id', 'question', 'choices' and 'answer'

    Output:
    New dict with 'id', 'question', 'choices' and 'answer' copied from the
    input plus 'formated_question' (str), the prompt text.
    """
    sample_id = sample['id']
    question_text = sample['question']
    option_list = sample['choices']
    correct_letter = sample['answer']

    # Positions 0-3 are indexed explicitly, so a shorter list raises IndexError.
    labelled_options = {
        letter: option_list[position]
        for position, letter in enumerate("ABCD")
    }

    prompt = f"""Below there is a Bangla multiple choice question with 4 option, you need to find the correct option for the question.
    Question in Bangla: {question_text}
    Options in Bangla: {labelled_options!s}"""

    return {
        'id': sample_id,
        'question': question_text,
        'choices': option_list,
        'answer': correct_letter,
        'formated_question': prompt,
    }

In [7]:
## checking that the function works on one sample

format_question(validation[6])

{'id': '32f661ee-5fde-4209-93cb-0f7fe37a6f75-71577',
 'question': '"Am I wanted by you?" বাক্যের Active Voice হচ্ছে-',
 'choices': ['Are you wanted me?',
  'Did you wanted me?',
  'Do you want me?',
  'Does you want me?'],
 'answer': 'C',
 'formated_question': 'Below there is a Bangla multiple choice question with 4 option, you need to find the correct option for the question.\n    Question in Bangla: "Am I wanted by you?" বাক্যের Active Voice হচ্ছে-\n    Options in Bangla: {\'A\': \'Are you wanted me?\', \'B\': \'Did you wanted me?\', \'C\': \'Do you want me?\', \'D\': \'Does you want me?\'}'}

In [5]:
len(validation)

72944

In [42]:
validation[7]

{'id': 'e7a15d85-d42f-4a26-ace7-cea8809de0b5-121871',
 'question': 'কোন জীবাণুর কারণে আলুর বিলম্বিত ধ্বসা রােগ হয়?',
 'choices': ['Puccinia', 'Pythium', 'Phytophthora', 'Penicillium'],
 'answer': 'C'}

## Building the API pipeline

In [43]:
def api_calling(sample, timeout=600):
    """
    Send one formatted question to the Hyperbolic chat-completions endpoint.

    Inputs:
    sample  = dict with a 'formated_question' key (as produced by format_question)
    timeout = seconds to wait for the server, passed straight to requests.post.
              Without it a stalled connection would block forever. The default
              is generous because the reasoning model can answer slowly.

    Output:
    The response body decoded from JSON. The HTTP status is not checked here,
    so an error payload is returned as-is and the caller must inspect it.

    Raises:
    requests.exceptions.Timeout when the server does not answer in time;
    response.json() raises if the body is not valid JSON.
    """
    formated_question = sample['formated_question']
    url = "https://api.hyperbolic.xyz/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {DEEPSEEK_API}"
    }
    system_instruction = """You are an inteligent assistant which will help the user to answer multiple choice question. The multiple choice question is in bangla and your final answer will always be in bangla."""

    data = {
        "messages": [
            {
                "role": "system",
                "content": system_instruction
            },
            {
                "role": "user",
                "content": formated_question
            }
        ],
        "model": "deepseek-ai/DeepSeek-R1",
        "temperature": 0.1,
        "top_p": 0.2
    }

    response = requests.post(url, headers=headers, json=data, timeout=timeout)
    return response.json()

In [44]:
# testing api calling

# Build the prompt for a single validation sample (index 7).
formatted_sample = format_question(validation[7])

# Live request to the API; the returned JSON is kept for the cells below.
response_json  = api_calling(formatted_sample)

In [45]:
response_json

{'id': 'chatcmpl-t7unX7BycuApvA6D5AUrkY',
 'object': 'chat.completion',
 'created': 1740189667,
 'model': 'deepseek-ai/DeepSeek-R1',
 'choices': [{'index': 0,
   'message': {'role': 'assistant',
    'content': "<think>Okay, let's see. The question is asking which pathogen causes the late blight disease in potatoes. The options are Puccinia, Pythium, Phytophthora, and Penicillium.\n\nFirst, I remember that late blight is a famous plant disease. I think it's the one that caused the Irish potato famine. The name Phytophthora infestans comes to mind. So Phytophthora would be option C. Let me check the other options to be sure. Puccinia is a genus of rust fungi, maybe causes wheat rust. Pythium is associated with damping-off in seedlings. Penicillium is the mold used for antibiotics, like Penicillin. So yeah, the correct answer should be C.\n</think>\n\nThe correct answer is C: Phytophthora. আলুর বিলম্বিত ধ্বসা রোগের জন্য দায়ী জীবাণু হলো Phytophthora infestans, যা Phytophthora গণের অন্তর্ভ

In [None]:
# NOTE(review): looks like an error payload pasted from a failed API call — confirm.
# It has no 'choices' key, which is the condition main() checks before parsing a response.
{'error_code': 124, 'message': "Model requires a role of 'pro' or higher"}

In [62]:
def log_fail(sample_no:int, set_name:str, sample_id:str):
    """
    Record one failed sample in "failed_sample.csv" and report it on stdout.

    Inputs:
    sample_no = position of the sample within its split
    set_name  = name of the split the sample belongs to
    sample_id = the sample's 'id' value

    The CSV lives in the current working directory. The column header is
    written first whenever the file is missing or empty; values are written
    without CSV quoting.
    """
    csv_path = "failed_sample.csv"

    # A missing or zero-byte file still needs its header row.
    header_needed = not (os.path.exists(csv_path) and os.path.getsize(csv_path) != 0)

    # Append mode creates the file when it does not exist yet.
    with open(csv_path, 'a') as log_file:
        if header_needed:
            log_file.write("sample_no,set_name,sample_id\n")
        log_file.write(f"{sample_no!s},{set_name},{sample_id}\n")

    print(f"Failed to get response for sameple no = {sample_no} in {set_name} set")
    

In [12]:
def split_think_reason(respons_string:str):
    """
    Split a model response into its reasoning block and its final answer.

    Inputs:
    respons_string = raw message content expected to contain <think>...</think>

    Output:
    dict with
      'think' = the first <think>...</think> block, tags included
      'ans'   = the response with every <think> block removed, stripped of
                surrounding whitespace

    Raises:
    ValueError if the response contains no complete <think>...</think> block.
    """
    # DOTALL lets the reasoning span several lines; the lazy quantifier stops
    # at the first closing tag.
    think_pattern = r'<think>(.*?)</think>'

    think_match = re.search(think_pattern, respons_string, re.DOTALL)
    if think_match is None:
        # Fail with a clear message instead of AttributeError on None.
        raise ValueError("response does not contain a <think>...</think> block")

    # group(0) keeps the surrounding tags, matching the stored 'reason' format.
    think_content = think_match.group(0)

    # Remove the <think> tag and its content to leave only the final answer
    ans_content = re.sub(think_pattern, '', respons_string, flags=re.DOTALL).strip()

    return {
        'think': think_content,
        'ans': ans_content
    }

In [28]:
def log_token_count(usage:Dict,sample_no:int,set_name:str):
    """
    Append the token usage of one API response to "token_counter.csv".

    Inputs:
    usage     = response dict whose 'usage' entry holds 'prompt_tokens',
                'completion_tokens' and 'total_tokens'
    sample_no = position of the sample within its split
    set_name  = name of the split the sample belongs to

    The CSV lives in the current working directory. The column header is
    written first whenever the file is missing or empty. The total is also
    printed to stdout.
    """
    counts = usage['usage']
    input_token, output_token, total_token = (
        counts[field]
        for field in ('prompt_tokens', 'completion_tokens', 'total_tokens')
    )

    csv_path = 'token_counter.csv'

    # A missing or zero-byte file still needs its header row.
    header_needed = not (os.path.exists(csv_path) and os.path.getsize(csv_path) != 0)

    # Append mode creates the file when it does not exist yet.
    with open(csv_path, 'a') as counter_file:
        if header_needed:
            counter_file.write("sample_no,set_name,input_token,output_token,total_token\n")
        counter_file.write(f"{sample_no},{set_name},{input_token},{output_token},{total_token}\n")

    print(f"Total token for sample {sample_no} in {set_name} = {total_token}")

In [50]:
def create_sample_json(think_response:Dict, formated_question:Dict):
    """
    Merge a parsed model response with its source sample into one record.

    Inputs:
    think_response    = dict with 'think' and 'ans' (see split_think_reason)
    formated_question = dict with 'id', 'question', 'choices', 'answer' and
                        'formated_question' (see format_question)

    Output:
    dict with the keys 'id', 'question', 'options', 'answer',
    'formated_question', 'reason' and 'gen_ans'.
    """
    reasoning, generated_answer = (
        think_response[key] for key in ('think', 'ans')
    )

    question_text, option_list, expected_answer, prompt_text, sample_id = (
        formated_question[key]
        for key in ('question', 'choices', 'answer', 'formated_question', 'id')
    )

    record = {}
    record["id"] = sample_id
    record["question"] = question_text
    record["options"] = option_list
    record["answer"] = expected_answer
    record["formated_question"] = prompt_text
    record["reason"] = reasoning
    record["gen_ans"] = generated_answer
    return record

In [51]:
# Separate the model's <think> block from its final answer text.
split_think= split_think_reason(response_json['choices'][0]['message']['content'])

# Combine the parsed response with the formatted sample into one output record.
create_sample_json(split_think,formatted_sample)

{'id': 'e7a15d85-d42f-4a26-ace7-cea8809de0b5-121871',
 'question': 'কোন জীবাণুর কারণে আলুর বিলম্বিত ধ্বসা রােগ হয়?',
 'options': ['Puccinia', 'Pythium', 'Phytophthora', 'Penicillium'],
 'answer': 'C',
 'formated_question': "Below there is a Bangla multiple choice question with 4 option, you need to find the correct option for the question.\n    Question in Bangla: কোন জীবাণুর কারণে আলুর বিলম্বিত ধ্বসা রােগ হয়?\n    Options in Bangla: {'A': 'Puccinia', 'B': 'Pythium', 'C': 'Phytophthora', 'D': 'Penicillium'}",
 'reason': "<think>Okay, let's see. The question is asking which pathogen causes the late blight disease in potatoes. The options are Puccinia, Pythium, Phytophthora, and Penicillium.\n\nFirst, I remember that late blight is a famous plant disease. I think it's the one that caused the Irish potato famine. The name Phytophthora infestans comes to mind. So Phytophthora would be option C. Let me check the other options to be sure. Puccinia is a genus of rust fungi, maybe causes whe

In [103]:
def save_new_data(data_list:List):
    """
    Write the collected records to "reasoning_bangla_r1_synthetic_data.json".

    Inputs:
    data_list = list of JSON-serialisable records; the whole file is
                overwritten with this list on every call

    The data is serialised before the file is opened, so a record that cannot
    be serialised leaves the previously saved file untouched instead of
    truncating it. The file is written as UTF-8 because ensure_ascii=False
    emits the Bangla text as-is, which a non-UTF-8 default locale encoding
    cannot represent.

    Serialisation and write errors are reported on stdout and not raised.
    """
    file_name = "reasoning_bangla_r1_synthetic_data.json"

    try:
        payload = json.dumps(data_list, indent=4, ensure_ascii=False)
    except (TypeError, ValueError) as exc:
        print(f"Error while saving to json file: {exc}")
        return

    with open(file_name, 'w', encoding='utf-8') as file:
        try:
            file.write(payload)
        except OSError as exc:
            print(f"Error while saving to json file: {exc}")

In [104]:
## building the main pipeline

def main(dataset, set_name, limit=2):
    """
    Run the format -> API call -> parse -> save pipeline over one split.

    Inputs:
    dataset  = mapping from split name to a split object exposing .to_list()
    set_name = key of the split to process (e.g. 'test')
    limit    = number of samples to take from the start of the split; defaults
               to 2 (the previously hard-coded value), pass None for all

    Output:
    List of the records built by create_sample_json for every sample whose
    response could be parsed. Samples that fail are recorded through log_fail
    and skipped.
    """
    data_from_a_specific_set = dataset[set_name].to_list()  # converting to list object
    all_new_data_list = []

    for sample_no, sample in enumerate(data_from_a_specific_set[:limit]):
        # first format the question
        format_question_sample = format_question(sample)

        try:
            # now calling the api
            response_json = api_calling(format_question_sample)

            # an error payload carries no 'choices' list
            if 'choices' in response_json and isinstance(response_json['choices'], list):
                # splitting the think and ans from the generated ans
                split_think = split_think_reason(response_json['choices'][0]['message']['content'])
                new_data_sample = create_sample_json(split_think, format_question_sample)
                all_new_data_list.append(new_data_sample)

                # saving after every sample so an interrupted run keeps its progress
                save_new_data(all_new_data_list)
                # logging the token count
                log_token_count(response_json, sample_no, set_name)
            else:
                # didn't get the right response, need to log
                log_fail(sample_no, set_name, format_question_sample['id'])

        except Exception as exc:
            # Exception rather than a bare except, so KeyboardInterrupt and
            # SystemExit can still stop a long run.
            print(f"Error while processing sample no = {sample_no} in {set_name} set: {exc!r}")
            log_fail(sample_no, set_name, format_question_sample['id'])

    return all_new_data_list

In [106]:
datalist = main(dataset,'test')

Total token for sample 0 in test = 727
Total token for sample 1 in test = 2507


In [98]:
len(datalist)

2

In [101]:
# Show the first two raw rows of the test split.
for raw_sample in dataset['test'].to_list()[:2]:
    print(raw_sample)

{'id': '77fe50bf-3147-4b3e-a0e4-8cc1b4eaeb5c-78813', 'question': '“Run away children” said their mother. বাক্যটির indirect speech হবে-', 'choices': ['Addressing the children, their mother said to them to run away.', 'Addressing the children, their mother urged them to run away.', 'Addressing the children, their mother told them to run away.', 'Address the children, their mother asked them to run away.'], 'answer': 'C'}
{'id': '2ce4a431-04cc-4e19-bf0e-eb7a38ceda54-86481', 'question': 'CuSO4 দ্রবণে 5000 mA বিদ্যুৎ 1.0 ঘন্টা চালনা করলে ক্যাথোডে কি পরিমাণ Cu জমা হবে?', 'choices': ['5.8222g', '5.5222 g', '5.7222 g', '5.9222 g'], 'answer': 'D'}


In [102]:
save_new_data(datalist)