In [1]:
import json
import os
from typing import List, Optional

from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.output_parsers import PydanticOutputParser
import re

# Load your Groq API Key from environment variables
groq_api_key = os.environ.get("GROQ_API_KEY")  # IMPORTANT: Set this in your environment
if not groq_api_key:
    raise ValueError("Groq API key not found.  Please set the GROQ_API_KEY environment variable.")

# Initialize Groq model
model = ChatGroq(model_name="qwen-2.5-32b", groq_api_key=groq_api_key)


# Define the Pydantic model for structured output
class TranslationResult(BaseModel):
    bangla_translation: str = Field(description="The Bangla translation of the input text. Do not translate any text enclosed in <<...>> or any line starting with ####. Also, do NOT translate any numbers; keep them in their original English numeral form. Mathematical expressions/answers must remain in their original form.")


# Create a PydanticOutputParser
output_parser = PydanticOutputParser(pydantic_object=TranslationResult)

# Create the prompt
template = """You are a highly precise and professional language translator assistant. Your ONLY task is to translate English language text to Bangla language. You are given ONE piece of text as input, and you MUST return ONLY the Bangla translation of that text in the following JSON format:

{{
    "bangla_translation": "The Bangla translation"
}}

Do not add any introductory or concluding phrases, questions, or conversational elements. Do not translate any text enclosed in <<...>> or any line starting with ####. Also, do NOT translate any numbers; keep them in their original English numeral form. Mathematical expressions/answers must remain in their original form.

Here is the text to translate:
{text}

{format_instructions}"""

prompt = ChatPromptTemplate.from_template(template)

# Get format instructions from the output parser
format_instructions = output_parser.get_format_instructions()

# Chain
chain = prompt | model | output_parser


def translate_text(text):
    """Translates English text to Bangla using structured output."""
    try:
        result = chain.invoke({"text": text, "format_instructions": format_instructions})
        return result.bangla_translation  # Access the translated text from the object
    except Exception as e:
        print(f"Translation error: {e}")
        return None

def process_gsm8k(file_path):
    """
    Loads the GSM8k JSON file, translates questions and answers,
    and saves the translated data back to a new JSON file.
    """
    with open(file_path, 'r', encoding='utf-8') as f:  # Specify encoding!
        data = json.load(f)

    translated_data = []
    for item in data:
        question_en = item["question"]
        answer_en = item["answer"]

        # Translate Question
        question_bn = translate_text(question_en)

        # Translate Answer (preserving calculations)
        def replace_with_original(match):
            return match.group(0)  # Return the original matched text

        # Split the answer into translatable and non-translatable parts.
        parts = re.split(r'(<<.*?>>|####.*)', answer_en)  # Split by <<...>> and ####...
        translated_parts = []

        for i, part in enumerate(parts):
            if re.match(r'(<<.*?>>|####.*)', part):  # Check if it's a calculation part
                translated_parts.append(part)  # Keep the calculation part as is
            else:
                translated_part = translate_text(part)
                translated_parts.append(translated_part if translated_part else part)  # Translate or keep original if translation fails

        answer_bn = "".join(translated_parts)  # Reassemble the translated answer.

        translated_data_item = {
            "question_en": question_en,
            "answer_en": answer_en,
            "question_bn": question_bn,
            "answer_bn": answer_bn
        }
        translated_data.append(translated_data_item)


    # Save the translated data to a new JSON file
    output_file_path = "gsm8k_bn-structured.json"
    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        json.dump(translated_data, outfile, indent=4, ensure_ascii=False)  # ensure_ascii=False for Bangla characters

    print(f"Translation complete. Translated data saved to {output_file_path}")


# Example usage:
file_path = "gsm8k.json"  # Replace with the actual path to your file
process_gsm8k(file_path)


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


AttributeError: type object 'TranslationResult' has no attribute 'model_json_schema'