<a href="https://colab.research.google.com/github/Karthikpasupuleti11/Auto-Correction_of_Telugu_Words/blob/main/Mixed_Telugu_to_Telugu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install OpenAI

Collecting OpenAI
  Downloading openai-1.93.0-py3-none-any.whl.metadata (29 kB)
Collecting jiter<1,>=0.4.0 (from OpenAI)
  Downloading jiter-0.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Downloading openai-1.93.0-py3-none-any.whl (755 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m755.0/755.0 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jiter-0.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (352 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m352.2/352.2 kB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jiter, OpenAI
Successfully installed OpenAI-1.93.0 jiter-0.10.0


In [3]:
import pandas as pd
import re
import csv
import os
import time
from openai import OpenAI

# --- NVIDIA Nemotron Setup ---
# The API key is a secret and must not live in the notebook source. It is read
# from the NVIDIA_API_KEY environment variable; set it in the runtime before
# running this cell.
# NOTE(review): a key was previously hardcoded on this line and is therefore in
# the repository history -- revoke/rotate that key.
API_KEY = os.environ.get("NVIDIA_API_KEY", "").strip()
if not API_KEY:
    raise RuntimeError(
        "NVIDIA_API_KEY is not set. Export it in the environment before running "
        "this notebook; do not paste the key into the source."
    )

BASE_URL = "https://integrate.api.nvidia.com/v1"
MODEL_NAME = "mistralai/mistral-nemotron"
SLEEP_INTERVAL = 0.01  # seconds to pause after each successful API call

# OpenAI-compatible client pointed at the NVIDIA endpoint.
client = OpenAI(
    base_url=BASE_URL,
    api_key=API_KEY
)

# --- Run configuration ---
# START_INDEX is inclusive and END_INDEX exclusive; both are 0-based row
# positions consumed by the processing loop.
START_INDEX, END_INDEX = 200, 20000
INPUT_FILE = "SHOP_DATA.csv"
OUTPUT_FILE = "corrected_telugu_shop_data.csv"
LOG_FILE = "corrected_shop_mismatches_log.csv"

# --- Read the input and verify its schema ---
# Three English source columns followed by their three Telugu counterparts.
required_columns = [
    'shop_ward_name', 'shop_street', 'shop_land_mark',
    'telugu_1', 'telugu_2', 'telugu_3',
]
df = pd.read_csv(INPUT_FILE, encoding='utf-8-sig')
if not set(required_columns).issubset(df.columns):
    raise ValueError(f"Input file must contain these columns: {required_columns}")

def contains_english_letters(text):
    """Return True if the string form of ``text`` contains any ASCII letter (a-z or A-Z)."""
    # The value is stringified first, so non-string inputs are accepted.
    return re.search(r'[a-zA-Z]', str(text)) is not None

# --- Correction Function ---
def correct_telugu(english, telugu):
    telugu = str(telugu).strip()
    english = str(english).strip()

    if telugu and not contains_english_letters(telugu):
        return telugu, None

    try:
        prompt = f"""
You are an AI-based transliteration system designed for high-accuracy conversion of personal names from English to Telugu script, with precise phonetic fidelity.

Your task is to perform strict transliteration only, ensuring that the pronunciation is preserved exactly as intended. This is not a translation task.

Input Name (English): {english}
Incorrect Telugu Version: {telugu}

Follow these directives:

1. Transliterate all parts of the name phonetically into Telugu script.
2. If an initial (e.g., "S", "K") appears alone or at the beginning of a name, transliterate it properly:
   - S → ఎస్
   - K → కే
   - M → ఎం
   - B → బీ
   - D → డి
   - C → సీ
   - V → వీ
3. Maintain the original structure and spacing of the name.
4. Do not retain English letters.
5. Output ONLY the final corrected Telugu name. No English, no explanations, no punctuation.

Corrected Telugu Name:
""".strip()

        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.2,
            top_p=0.5,
            max_tokens=60,
            stream=False
        )
        corrected = response.choices[0].message.content.strip()
        time.sleep(SLEEP_INTERVAL)

        if corrected != telugu:
            return corrected, {
                "English": english,
                "Before Correction": telugu,
                "After Correction": corrected
            }
        return corrected, None

    except Exception as e:
        print(f"❌ Error correcting '{english}': {e}")
        return telugu, None

# --- Processing ---
# Walk the configured row range, correct each (English, Telugu) column pair and
# collect both the updated rows and a log entry for every changed value.
corrections = []
corrected_rows = []

for i in range(START_INDEX, min(END_INDEX, len(df))):
    row = df.iloc[i]
    new_row = row.copy()  # keep the source frame untouched

    for eng_col, tel_col in zip(
        ['shop_ward_name', 'shop_street', 'shop_land_mark'],
        ['telugu_1', 'telugu_2', 'telugu_3']
    ):
        corrected_telugu, correction = correct_telugu(row[eng_col], row[tel_col])
        new_row[tel_col] = corrected_telugu
        if correction:
            correction["Column"] = tel_col
            corrections.append(correction)
            print(f"✔ Row {i+1} [{tel_col}] {correction['Before Correction']} → {correction['After Correction']}")

    corrected_rows.append(new_row)

# --- Append Output CSV ---
# A header is only needed when the file does not exist yet or is zero bytes.
output_exists = os.path.exists(OUTPUT_FILE) and os.path.getsize(OUTPUT_FILE) > 0
if corrected_rows:
    pd.DataFrame(corrected_rows).to_csv(
        OUTPUT_FILE,
        mode='a',
        index=False,
        header=not output_exists,
        encoding='utf-8-sig'
    )
    print(f"✅ Appended {len(corrected_rows)} rows to '{OUTPUT_FILE}'")
else:
    # An empty range would otherwise create a column-less file, and later
    # appends would then skip the header because the file already exists.
    print(f"⚠ No rows in range [{START_INDEX}, {END_INDEX}); '{OUTPUT_FILE}' left untouched.")

# --- Append Log CSV ---
if corrections:
    log_exists = os.path.exists(LOG_FILE) and os.path.getsize(LOG_FILE) > 0
    log_df = pd.DataFrame(corrections)
    log_df.to_csv(
        LOG_FILE,
        mode='a',
        index=False,
        header=not log_exists,
        encoding='utf-8-sig'
    )
    print(f"📝 Appended {len(corrections)} corrections to '{LOG_FILE}'")
else:
    print("✅ No corrections were needed.")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
✔ Row 15940 [telugu_3] nan → ఎస్‌టి కాలనీ
✔ Row 15942 [telugu_1] nan → నన్
✔ Row 15942 [telugu_2] nan → మెయిన్ రోడ్
✔ Row 15942 [telugu_3] nan → నన్
✔ Row 15944 [telugu_2] nan → ఇంద్రా కాలనీ
✔ Row 15944 [telugu_3] nan → ధర్మపురం సచివలయం
✔ Row 15945 [telugu_1] nan → నన్
✔ Row 15945 [telugu_3] nan → నన్
✔ Row 15946 [telugu_2] nan → మహాదేవపల్లి
✔ Row 15946 [telugu_3] nan → నీర్ సచివలయం
✔ Row 15948 [telugu_2] nan → రేగను గుదేం
✔ Row 15948 [telugu_3] nan → నీర్ ఎన్‌హే 16
✔ Row 15950 [telugu_2] nan → మునిసిబ్ స్ట్రీట్
✔ Row 15950 [telugu_3] nan → నీర్ శివలయం
✔ Row 15952 [telugu_2] nan → హనుమాన్ టెంపుల్ రోడ్
✔ Row 15952 [telugu_3] nan → అంజనేయస్వామి టెంపుల్
✔ Row 15954 [telugu_2] nan → బీసీ కాలనీ
✔ Row 15954 [telugu_3] nan → నీర్ ఉరు కొండ
✔ Row 15956 [telugu_2] nan → గుడం వీడి
✔ Row 15956 [telugu_3] nan → గుండం వీడి
✔ Row 15958 [telugu_2] nan → మెయిన్ రోడ్
✔ Row 15958 [telugu_3] nan → మెయిన్ రోడ్ సోమవారం
✔ Row 15960 [telugu_2] n