In [None]:
%%capture
!git clone https://github.com/AI4Bharat/IndicTrans2.git

In [None]:
%%capture
%cd /content/IndicTrans2/huggingface_interface

In [None]:
%%capture
!python3 -m pip install nltk sacremoses pandas regex mock transformers>=4.33.2 mosestokenizer
!python3 -c "import nltk; nltk.download('punkt')"
!python3 -m pip install bitsandbytes scipy accelerate datasets
!python3 -m pip install sentencepiece

!git clone https://github.com/VarunGumma/IndicTransTokenizer
%cd IndicTransTokenizer
!python3 -m pip install --editable ./
%cd ..

In [None]:
import torch
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig
from IndicTransTokenizer import IndicProcessor, IndicTransTokenizer

BATCH_SIZE = 4
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
quantization = None

In [None]:
def initialize_model_and_tokenizer(ckpt_dir, direction, quantization):
    if quantization == "4-bit":
        qconfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8-bit":
        qconfig = BitsAndBytesConfig(
            load_in_8bit=True,
            bnb_8bit_use_double_quant=True,
            bnb_8bit_compute_dtype=torch.bfloat16,
        )
    else:
        qconfig = None

    tokenizer = IndicTransTokenizer(direction=direction)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig,
    )

    if qconfig == None:
        model = model.to(DEVICE)
        if DEVICE == "cuda":
            model.half()

    model.eval()

    return tokenizer, model


def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip):
    translations = []
    for i in range(0, len(input_sentences), BATCH_SIZE):
        batch = input_sentences[i : i + BATCH_SIZE]

        # Preprocess the batch and extract entity mappings
        batch = ip.preprocess_batch(batch, src_lang=src_lang, tgt_lang=tgt_lang)

        # Tokenize the batch and generate input encodings
        inputs = tokenizer(
            batch,
            src=True,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)

        # Generate translations using the model
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )

        # Decode the generated tokens into text
        generated_tokens = tokenizer.batch_decode(generated_tokens.detach().cpu().tolist(), src=False)

        # Postprocess the translations, including entity replacement
        translations += ip.postprocess_batch(generated_tokens, lang=tgt_lang)

        del inputs
        torch.cuda.empty_cache()

    return translations

**ENGLISH TO HINDI**

In [None]:
def read_sentences_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        sentences = file.readlines()
    # Remove leading and trailing whitespaces, and filter out empty lines
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    return sentences

# Define the path to your input text file containing sentences
input_file_path = "/content/test.en_1000.txt"
# Define the path for the output file
output_file_path = "indicTRANS_eng_to_hin.txt"

# Read sentences from the input file
en_sents = read_sentences_from_file(input_file_path)

# Rest of your code remains unchanged from here
en_indic_ckpt_dir = "ai4bharat/indictrans2-en-indic-dist-200M"
en_indic_tokenizer, en_indic_model = initialize_model_and_tokenizer(en_indic_ckpt_dir, "en-indic", quantization)

ip = IndicProcessor(inference=True)

src_lang, tgt_lang = "eng_Latn", "hin_Deva"
hi_translations = batch_translate(en_sents, src_lang, tgt_lang, en_indic_model, en_indic_tokenizer, ip)

# Save translations to the output file
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    for input_sentence, translation in zip(en_sents, hi_translations):
        output_file.write(f"{translation}\n")
    print(f"Translations saved to {output_file_path}")

# Flush the models to free the GPU memory
del en_indic_tokenizer, en_indic_model


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

configuration_indictrans.py:   0%|          | 0.00/14.1k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-en-indic-dist-200M:
- configuration_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_indictrans.py:   0%|          | 0.00/61.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-en-indic-dist-200M:
- modeling_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.10G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

Translations saved to indicTRANS_eng_to_hin.txt


**HINDI TO ENGLISH**

In [None]:
def read_sentences_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        sentences = file.readlines()
    # Remove leading and trailing whitespaces, and filter out empty lines
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    return sentences

# Define the path to your input text file containing sentences
input_file_path = "/content/test.hi_1000.txt"
# Define the path for the output file
output_file_path = "indicTRANS_hin_to_eng.txt"

# Read sentences from the input file
hi_sents = read_sentences_from_file(input_file_path)

# Rest of your code remains unchanged from here
hi_indic_ckpt_dir = "ai4bharat/indictrans2-indic-en-dist-200M"
hi_indic_tokenizer, hi_indic_model = initialize_model_and_tokenizer(hi_indic_ckpt_dir, "indic-en", quantization)

ip = IndicProcessor(inference=True)

src_lang, tgt_lang = "hin_Deva","eng_Latn"
en_translations = batch_translate(hi_sents, src_lang, tgt_lang, hi_indic_model, hi_indic_tokenizer, ip)

# Save translations to the output file
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    for input_sentence, translation in zip(hi_sents, en_translations):
        output_file.write(f"{translation}\n")
    print(f"Translations saved to {output_file_path}")

# Flush the models to free the GPU memory
del hi_indic_tokenizer, hi_indic_model


config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

configuration_indictrans.py:   0%|          | 0.00/14.1k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-indic-en-dist-200M:
- configuration_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_indictrans.py:   0%|          | 0.00/61.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-indic-en-dist-200M:
- modeling_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin:   0%|          | 0.00/914M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

Translations saved to indicTRANS_hin_to_eng.txt


**HINDI TO MARATHI**

In [None]:
def read_sentences_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        sentences = file.readlines()
    # Remove leading and trailing whitespaces, and filter out empty lines
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    return sentences

# Define the path to your input text file containing sentences
input_file_path = "/content/test.hi_1000.txt"
# Define the path for the output file
output_file_path = "indicTRANS_hin_to_mar.txt"

# Read sentences from the input file
hi_sents = read_sentences_from_file(input_file_path)

# Rest of your code remains unchanged from here
hi_indic_ckpt_dir = "ai4bharat/indictrans2-indic-indic-dist-320M"
hi_indic_tokenizer, hi_indic_model = initialize_model_and_tokenizer(hi_indic_ckpt_dir, "indic-indic", quantization)

ip = IndicProcessor(inference=True)

src_lang, tgt_lang = "hin_Deva","mar_Deva"
mr_translations = batch_translate(hi_sents, src_lang, tgt_lang, hi_indic_model, hi_indic_tokenizer, ip)

# Save translations to the output file
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    for input_sentence, translation in zip(hi_sents, mr_translations):
        output_file.write(f"{translation}\n")
    print(f"Translations saved to {output_file_path}")

# Flush the models to free the GPU memory
del hi_indic_tokenizer, hi_indic_model


config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

configuration_indictrans.py:   0%|          | 0.00/14.1k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-indic-indic-dist-320M:
- configuration_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_indictrans.py:   0%|          | 0.00/61.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-indic-indic-dist-320M:
- modeling_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin:   0%|          | 0.00/1.28G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

Translations saved to indicTRANS_hin_to_mar.txt


**MARATHI TO HINDI**

In [None]:
def read_sentences_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        sentences = file.readlines()
    # Remove leading and trailing whitespaces, and filter out empty lines
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    return sentences

# Define the path to your input text file containing sentences
input_file_path = "/content/test.mr_1000.txt"
# Define the path for the output file
output_file_path = "indicTRANS_mar_to_hin.txt"

# Read sentences from the input file
mr_sents = read_sentences_from_file(input_file_path)

# Rest of your code remains unchanged from here
mr_indic_ckpt_dir = "ai4bharat/indictrans2-indic-indic-dist-320M"
mr_indic_tokenizer, mr_indic_model = initialize_model_and_tokenizer(mr_indic_ckpt_dir, "indic-indic", quantization)

ip = IndicProcessor(inference=True)

src_lang, tgt_lang = "mar_Deva","hin_Deva"
hi_translations = batch_translate(mr_sents, src_lang, tgt_lang, mr_indic_model, mr_indic_tokenizer, ip)

# Save translations to the output file
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    for input_sentence, translation in zip(mr_sents, hi_translations):
        output_file.write(f"{translation}\n")
    print(f"Translations saved to {output_file_path}")

# Flush the models to free the GPU memory
del mr_indic_tokenizer, mr_indic_model


Translations saved to indicTRANS_mar_to_hin.txt


**QUESTION 2:- BLEU AND ROGUE SCORES OF ALL TRANSLATIONS BY indicTRANS**

In [2]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m909.2 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-2.8.2 sacrebleu-2.4.2


In [3]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=516dc868f32816e18b1faa30f8670a3ba639e28a97da4ff3ebfe6a3c6e8f4a96
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [9]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [10]:
from rouge import Rouge
from nltk.translate.bleu_score import corpus_bleu

def load_sentences(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        sentences = [line.strip() for line in f if line.strip()]
    return sentences

def calculate_scores(reference_file, translated_file):
    # Load sentences from files
    reference_sentences = load_sentences(reference_file)
    translated_sentences = load_sentences(translated_file)

    # Initialize ROUGE
    rouge = Rouge()

    # Calculate ROUGE scores
    rouge_scores = rouge.get_scores(translated_sentences, reference_sentences, avg=True)

    # Calculate BLEU score
    bleu_score = corpus_bleu([[ref] for ref in reference_sentences], translated_sentences)

    return rouge_scores, bleu_score

def print_scores(rouge_scores, bleu_score):
    print("ROUGE Scores:")
    for metric, values in rouge_scores.items():
        print(f"{metric}:")
        for key, value in values.items():
            print(f"{key}: {value}")
    print("BLEU Score:", bleu_score)

**FOR ENGLISH TO HINDI**

In [11]:
# Example usage:
input_file = "test.hi_1000.txt"
output_file = "indicTRANS_eng_to_hin.txt"

# Calculate scores
rouge_scores, bleu_score = calculate_scores(input_file, output_file)

# Print scores
print_scores(rouge_scores, bleu_score)

ROUGE Scores:
rouge-1:
r: 0.6270386143636398
p: 0.6344910701438935
f: 0.627473479134096
rouge-2:
r: 0.39380706318151115
p: 0.399636179290827
f: 0.39458509897405
rouge-l:
r: 0.5888695393414554
p: 0.5962371067409259
f: 0.5894927752055814
BLEU Score: 0.6995483147835915


**FOR HINDI TO ENGLISH**

In [12]:
# Example usage:
input_file = "test.en_1000.txt"
output_file = "indicTRANS_hin_to_eng.txt"

# Calculate scores
rouge_scores, bleu_score = calculate_scores(input_file, output_file)

# Print scores
print_scores(rouge_scores, bleu_score)

ROUGE Scores:
rouge-1:
r: 0.6667621128652064
p: 0.662975116834313
f: 0.6612807966510005
rouge-2:
r: 0.4576863711711367
p: 0.451573672589976
f: 0.4517390846536463
rouge-l:
r: 0.6336902759953175
p: 0.6295916692638548
f: 0.6282762248654941
BLEU Score: 0.7516008909688211


**FOR HINDI TO MARATHI**

In [14]:
# Example usage:
input_file = "test.mr_1000.txt"
output_file = "indicTRANS_hin_to_mar.txt"

# Calculate scores
rouge_scores, bleu_score = calculate_scores(input_file, output_file)

# Print scores
print_scores(rouge_scores, bleu_score)

ROUGE Scores:
rouge-1:
r: 0.4235107105756269
p: 0.4199960895720604
f: 0.4178299267877743
rouge-2:
r: 0.19286763302103752
p: 0.18900911053580094
f: 0.1891159785980255
rouge-l:
r: 0.39528685803641705
p: 0.3918145260136416
f: 0.3898598573554864
BLEU Score: 0.6091614587017943


**FOR MARATHI TO HINDI**

In [15]:
# Example usage:
input_file = "test.hi_1000.txt"
output_file = "indicTRANS_mar_to_hin.txt"

# Calculate scores
rouge_scores, bleu_score = calculate_scores(input_file, output_file)

# Print scores
print_scores(rouge_scores, bleu_score)

ROUGE Scores:
rouge-1:
r: 0.5315656827163657
p: 0.5445237554258757
f: 0.5332282783011526
rouge-2:
r: 0.28752005153928917
p: 0.2949085828895486
f: 0.2884293405003241
rouge-l:
r: 0.4930805188256202
p: 0.5052356285819584
f: 0.4947890659945677
BLEU Score: 0.6139302618015272
