In [1]:
!huggingface-cli login --token {hf_token}

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `hf`CLI if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `llm_finetuning` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `llm_finetuning`


In [3]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load model and tokenizer
model_name = "ai4bharat/Cadence"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

id2label = model.config.id2label

# text = "यह एक वाक्य है इसका क्या मतलब है"
text = "No newspaper is completely unbiased in my expert opinion."

def punctuator_cadence(text):
    # Tokenize input and prepare for model
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    input_ids = inputs['input_ids'][0] # Get input_ids for the first (and only) sentence

    with torch.no_grad():
        outputs = model(**inputs)
        predictions_for_sentence = torch.argmax(outputs.logits, dim=-1)[0]


    result_tokens_and_punctuation = []
    all_token_strings = tokenizer.convert_ids_to_tokens(input_ids.tolist()) # Get all token strings

    for i, token_id_value in enumerate(input_ids.tolist()):
        # Process only non-padding tokens based on the attention mask
        if inputs['attention_mask'][0][i] == 0:
            continue

        current_token_string = all_token_strings[i]

        is_special_token = token_id_value in tokenizer.all_special_ids

        if not is_special_token:
            result_tokens_and_punctuation.append(current_token_string)

        predicted_punctuation_id = predictions_for_sentence[i].item()
        punctuation_character = id2label[predicted_punctuation_id]

        if punctuation_character != "O" and not is_special_token:
            result_tokens_and_punctuation.append(punctuation_character)

    punctuated_text = tokenizer.convert_tokens_to_string(result_tokens_and_punctuation)
    return punctuated_text

print(f"Original Text: {text}")
print(f"Punctuated Text: {punctuator_cadence(text)}")

Original Text: No newspaper is completely unbiased in my expert opinion.
Punctuated Text: No newspaper is completely unbiased in my expert opinion.।


In [4]:
from datasets import load_dataset
dataset_name = "thenlpresearcher/test_data_human_validated_eng_mar"
dataset = load_dataset(dataset_name)
# Collect all input sentences
src_texts = list(dataset["test"]["sent_written"])

predicted_sentences = []
for t in src_texts:
    predicted_sentences.append(punctuator_cadence(t)[:-1])

In [5]:
predicted_sentences[:3]

['Chanting the choir raised the volume as the celebrant intoned the prayer.',
 'A six-month-old calf was submitted for examination, showing lameness in all four legs which had been present since soon after birth.',
 'Planning authorities should provide alternative locations for small businesses, which are or would be offensive in a residential area.']

In [6]:
import pandas as pd

# Create a brand-new empty dataframe
df = pd.DataFrame()
# Add model predictions
df["prediction"] = predicted_sentences

# Add source fields from HF dataset
df["src"] = dataset["test"]["sent_written"]
df["gt"]   = dataset["test"]["sent_meant"]

# Save the file
output_file = "approach1_eng_to_eng_cadence_outputs_punct_restor_data.csv"
df.to_csv(output_file, index=False)

print("Saved:", output_file)

Saved: approach1_eng_to_eng_cadence_outputs_punct_restor_data.csv


In [7]:
original_model_name = "ai4bharat/indictrans2-en-indic-dist-200M"
model_name = original_model_name

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from IndicTransToolkit.processor import IndicProcessor
from tqdm import tqdm
# recommended to run this on a gpu with flash_attn installed
# don't set attn_implemetation if you don't have flash_attn
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

src_lang, tgt_lang = "eng_Latn", "mar_Deva"
tokenizer_name =  "ai4bharat/indictrans2-en-indic-dist-200M"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)

model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name, 
    trust_remote_code=True, 
    torch_dtype=torch.float16, # performance might slightly vary for bfloat16
    attn_implementation="flash_attention_2"
).to(DEVICE)

ip = IndicProcessor(inference=True)

input_sentences = predicted_sentences

import torch

def batch_translate(
    input_sentences,
    src_lang,
    tgt_lang,
    model,
    tokenizer,
    ip,
    device="cuda",
    batch_size=16,
):
    all_translations = []

    for i in tqdm(range(0, len(input_sentences), batch_size)):
        batch = input_sentences[i : i + batch_size]

        # Preprocess (handles entity mapping etc.)
        batch = ip.preprocess_batch(
            batch,
            src_lang=src_lang,
            tgt_lang=tgt_lang
        )

        # Tokenize on device
        inputs = tokenizer(
            batch,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(device)

        # Generate translations
        with torch.no_grad():
            generated = model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )

        # Decode
        decoded = tokenizer.batch_decode(
            generated,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )

        # Postprocess (restore entities)
        decoded = ip.postprocess_batch(decoded, lang=tgt_lang)

        all_translations.extend(decoded)

    return all_translations

In [8]:
translations = batch_translate(
    input_sentences,
    src_lang="eng_Latn",
    tgt_lang="mar_Deva",
    model=model,
    tokenizer=tokenizer,
    ip=ip,
    device=DEVICE,
    batch_size=32
)

for input_sentence, translation in zip(input_sentences[:10], translations[:10]):
    print(f"{src_lang}: {input_sentence}")
    print(f"{tgt_lang}: {translation}")

100%|█████████████████████████████████████████| 2/2 [00:02<00:00,  1.12s/it]

eng_Latn: Chanting the choir raised the volume as the celebrant intoned the prayer.
mar_Deva: उत्सवी गायकांनी प्रार्थनेचा उच्चार केल्याने गायकवृंदाचा जप केल्याने आवाज वाढला.
eng_Latn: A six-month-old calf was submitted for examination, showing lameness in all four legs which had been present since soon after birth.
mar_Deva: जन्मानंतर लगेचच अस्तित्वात असलेल्या चारही पायांमध्ये लंगडेपणा दर्शविणाऱ्या सहा महिन्यांच्या वासराला तपासणीसाठी सादर करण्यात आले.
eng_Latn: Planning authorities should provide alternative locations for small businesses, which are or would be offensive in a residential area.
mar_Deva: नियोजन अधिकाऱ्यांनी छोट्या व्यवसायांसाठी पर्यायी ठिकाणे पुरवली पाहिजेत, जी निवासी भागात आक्षेपार्ह असतील किंवा असतील.
eng_Latn: As the machine develops the forms we use to record data from past projects will be amended.
mar_Deva: जसजसे मशीन विकसित होईल तसतसे आम्ही मागील प्रकल्पांमधील डेटा रेकॉर्ड करण्यासाठी वापरत असलेले फॉर्म सुधारित केले जातील.
eng_Latn: As mentioned, first impressions




In [9]:
import re
def remove_prefix(translations, prefix):
    ans = []
    for t in translations:
        t = t.strip()
        if t.startswith(prefix):
            t = t[len(prefix):]
        t = re.sub(r'\.+', '.', t)
        ans.append(t)
    return ans

translations_mar = remove_prefix(translations, "mar_Deva eng_Latn ")
print(translations_mar[0])

उत्सवी गायकांनी प्रार्थनेचा उच्चार केल्याने गायकवृंदाचा जप केल्याने आवाज वाढला.


In [10]:
import pandas as pd
from evaluate import load

# -------------------- SAVE OUTPUTS --------------------
results_df = pd.DataFrame({
    "src": dataset["test"]["sent_written"],
    "prediction_mar": translations_mar,
    "gt_mar": dataset["test"]["gt"],
})

import os
os.makedirs(dataset_name, exist_ok=True)
results_df.to_csv(f"cadence_outputs.csv", index=False)
print(f"✔ Saved predictions to cadence_outputs.csv")

✔ Saved predictions to cadence_outputs.csv


In [12]:
# -------------------- METRICS --------------------
bleu = load("sacrebleu")
chrf = load("chrf")

def compute_scores(preds, ref1):
    """
    Compute BLEU and chrF++ scores using all three references for each sentence.
    """
    references = [[r1] for r1 in ref1]  # sacrebleu format
    bleu_score = bleu.compute(predictions=preds, references=references)["score"]
    chrf_score = chrf.compute(predictions=preds, references=references)["score"]
    return bleu_score, chrf_score

bleu_score_mar, chrf_score_mar = compute_scores(translations_mar, dataset["test"]["gt"])

In [13]:
print('Metrics -- Marathi')
print(f"BLEU: {bleu_score_mar}, CHRF++: {chrf_score_mar}")

Metrics -- Marathi
BLEU: 23.43559271086952, CHRF++: 60.48614078383402
