In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers datasets peft accelerate bitsandbytes trl safetensors torch --no-cache

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.10.0-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m127.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl
  Downloading trl-0.8.5-py3-none-any.whl (245 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.1/245.1 kB[0m [31m46.

In [3]:
import torch
import pandas as pd
from functools import partial
from datasets import Dataset
from transformers import TrainingArguments
import time
import transformers
from sklearn.model_selection import train_test_split
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

column_names = ['H', 'RH', 'E']
# base_path = '/content/drive/My Drive/Colab Notebooks/CSCI 564 NLP'
# df = pd.read_csv(f'{base_path}/hindi_data/romanized_hindi_english_paper.csv', names=column_names,nrows=8000)

path = '/content/drive/MyDrive/Project_544/data/hindi_data/new_romanized_hindi_english_paper_19k.csv'

df = pd.read_csv(path, names=column_names, nrows=7000)

In [4]:
# Hugging Face model name
#model_name = "microsoft/phi-2" # not the larger version - need to look into this
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
use_flash_attention = False


# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # quantization_config=bnb_config,
    use_cache=False,
    use_flash_attention_2=use_flash_attention,
    device_map="auto",
    torch_dtype=torch.float16
)

model.config.pretraining_tp = 1

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

In [5]:
# LoRA config based on QLoRA paper
peft_config = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.1,
    r=16,
    bias="none",
    task_type="CAUSAL_LM",
)
# Prepare model for training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

In [6]:
# for RH to E
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(val_df, test_size=0.5, random_state=42)

In [7]:
# for H to E
train_df2, val_df2 = train_df.copy(), val_df.copy()
val_df2, test_df2 = val_df.copy(), test_df.copy()

In [8]:
# for RH+H to E
train_df3, val_df3 = train_df.copy(), val_df.copy()
val_df3, test_df3 = val_df.copy(), test_df.copy()

In [9]:
results = {'Languages': [],
           'Avg Bert' : []}
           #'Corpus BLEU' : []}

# **Hindi to English**

In [10]:
train_df, val_df = train_df, val_df
val_df, test_df = val_df, test_df

In [11]:
def create_json_record(row, prefix):
    return {
        "input_ids": f"{prefix}_{row.name}",
        "text": f"Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi: {row['H']} \nEnglish: {row['E']}"
    }
def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenizing a batch
    """
    print(batch)
    return tokenizer(
        batch["text"],
        max_length=max_length,
        truncation=True,
    )


_preprocessing_function = partial(preprocess_batch, max_length=1024, tokenizer=tokenizer)
# Convert train_df and val_df into JSON format
# train_dataset = [create_json_record(row, "train") for _, row in train_df.iterrows()]
# val_dataset = [create_json_record(row, "val") for _, row in val_df.iterrows()]


def create_prompt(row):
    return f"Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: {row['H']} \nEnglish: {row['E']}"

# Apply the function to each row and create a new 'text' column
train_df['text'] = train_df.apply(create_prompt, axis=1)
val_df['text'] = val_df.apply(create_prompt, axis=1)

columns_to_drop = ['H', 'RH','E']
train_df.drop(columns=columns_to_drop, inplace=True)
val_df.drop(columns=columns_to_drop, inplace=True)

train_dataset = Dataset.from_pandas(train_df)
val_dataset= Dataset.from_pandas(val_df)


In [12]:
train_dataset = train_dataset.map(
        _preprocessing_function,
        batched=True,
    )

Map:   0%|          | 0/5600 [00:00<?, ? examples/s]

{'text': ["Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: वह बच्चे की तरह सो रहा है। \nEnglish: He's sleeping like a baby.", 'Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: यह एक फूल है \nEnglish: This is a flower', 'Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: एक और चुनौती राहदारी शुल्क कम करने के लिए राजनैतिक दबाव के रूप में है . \nEnglish: Another challenge is the political pressure being exerted for waiving the user charges .', 'Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: तुम दो तीन दिनों में फिरसे ठीक हो जाओगे। \nEnglish: You will be all right again in a couple of days.', 'Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi

In [13]:
val_dataset = val_dataset.map(
        _preprocessing_function,
    batched=True,
    )

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

{'text': ['Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: लेकिन पोर्ट ब्लेयर में एक गैर सरकारी समिति है जिसमें सरकारी अधिकारियों के अतिरिक्त उपभोक्ता तथा व्यापारियों के प्रतिनिधि संयुक्त रूप से आवश्यक वस्तुओं के मूल्य निर्धारित करते हैं . \nEnglish: However , in Port Blair there is a non-official body consisting of government officials and representatives of consumers and traders , who jointly fix the prices of all essential commodities .', 'Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: यह अनुमान लगाया गया था कि सन् 1951 में 25,980 मिल करघे थे और 1,81,278 हथकरघे आर्ट सिल्क रेशे के निर्माण में लगे हुए थे . \nEnglish: It was estimated that in 1951 , there were some 25,980 mill looms and 181,278 handlooms engaged in the manufacture of art silk fibres .', 'Translate the following sentences from Hindi to English. The output should be in English a

In [14]:
base_path = "/content/drive/My Drive/CSCI544ProjOutput/"
output_dir = f'{base_path}/peft-dialogue-summary-training-{str(int(time.time()))}'
peft_training_args = TrainingArguments(
    output_dir = output_dir,
    warmup_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    max_steps=50,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_steps=25,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=25,
    evaluation_strategy="steps",
    eval_steps=25,
    do_eval=True,
    gradient_checkpointing=False,
    report_to="none",
    overwrite_output_dir = 'True',
    group_by_length=True,
)

model.config.use_cache = False

peft_trainer = transformers.Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=peft_training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [15]:
print(type(train_dataset['text'][0]))

<class 'str'>


In [16]:
peft_trainer.train()

Step,Training Loss,Validation Loss
25,1.5766,1.285784
50,0.931,1.213891


TrainOutput(global_step=50, training_loss=1.2538189315795898, metrics={'train_runtime': 272.7523, 'train_samples_per_second': 0.733, 'train_steps_per_second': 0.183, 'total_flos': 140373858852864.0, 'train_loss': 1.2538189315795898, 'epoch': 0.04})

In [17]:
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu

# Assume test_data is a list of tuples: (source_sentence, reference_translation)

sources = test_df['H'].to_list()
references = test_df['E'].to_list()

#sources = [sources[0]]
#references = [references[0]]

s = []
r = []

for i in range(len(sources)):

    inputs = tokenizer(f'''Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi: {sources[i]} \nEnglish:''', return_tensors="pt",)

    with torch.no_grad():
        outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=20)
        text = tokenizer.batch_decode(outputs)[0]
        #print(text)

    #print(text)
    t = text.split("\nEnglish: ")
    t = t[1]. split("\n")[0]

    """
    print("t")
    print(t)
    print(t.split())
    print(references[i].split())
    print("---")
    """

    s.append(t.split())
    # translations.append(reference.split())
    r.append(references[i].split())  # Note: reference must be a list of lists for corpus_bleu

In [18]:
print(s)
print(r)

[['The', 'king', 'was', 'angry', 'with', 'him.'], ['The', 'religious', 'beliefs', 'of', 'the', 'Jains,', 'Buddhists,', 'Shaivas,', 'and', 'Shaktas'], ['The', 'other', 'things', 'too', ',', 'this', 'place', 'is', 'a', 'symbol', 'of', 'the', 'unity', 'of', 'the', 'country', '.'], ['If', 'we', 'buy', 'a', 'book', 'at', 'a', 'discount,', 'we', 'should', 'not', 'give', 'it', 'to', 'someone', 'who', 'is', 'not'], ['I', 'am', 'not', 'interested', 'in', 'that.'], ['I', 'am', 'hungry.'], ['I', 'was', 'tired', 'and', 'hungry,', 'I', 'was', 'in', 'a', 'car', 'with', 'a', 'family,', 'eating', 'at', 'a'], ['The', 'Prime', "Minister's", 'Office'], ['His', 'hand', 'is', 'long.'], ['You', 'said', 'that', 'I', 'was', 'your', 'lover.'], ['In', 'India', ',', 'where', 'the', 'age', 'of', 'the', 'sun', 'is', 'at', 'its', 'lowest', ',', 'the', 'age', 'of', 'the', 'sun', 'is'], ['He', 'read', 'books', 'in', 'foreign', 'countries.'], ["I'm", 'going', 'to', 'your', 'place', 'and', 'time.'], ['The', 'sun', 'and

In [19]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [20]:
# !pip install evaluate

In [21]:
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: bert_score
Successfully installed bert_score-0.3.13


In [22]:
from datasets import load_metric
bertscore_metric = load_metric('bertscore')

  bertscore_metric = load_metric('bertscore')
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.92k [00:00<?, ?B/s]

In [23]:
hypothesis = [' '.join(i) for i in s]
reference = [' '.join(i) for i in r]

print(len(hypothesis))
print(len(reference))
filtered_data = [(h, r) for h, r in zip(hypothesis, reference) if h and r]

# If you want to separate the filtered data into hypothesis and reference lists again
filtered_hypothesis = [pair[0] for pair in filtered_data]
filtered_reference = [pair[1] for pair in filtered_data]
print(len(filtered_hypothesis))
print(len(filtered_reference))
bert_scores = bertscore_metric.compute(predictions=filtered_hypothesis, references=filtered_reference, lang="en")
bert_results = [round(v, 4) for v in bert_scores["f1"]]
avg = sum(bert_results)/len(bert_results)
avg


700
700
693
693


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0.8852506493506487

In [24]:
avg

0.8852506493506487

In [25]:
results['Languages'].append('Hindi -> English')
results['Avg Bert'].append(avg)
print(avg)

0.8852506493506487


#**Romanized Hindi to English**

In [26]:
train_df, val_df = train_df2, val_df2
val_df, test_df = val_df2, test_df2

In [27]:
def create_json_record(row, prefix):
    return {
        "input_ids": f"{prefix}_{row.name}",
        "text": f"Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi Transliterated: {row['RH']} \nEnglish: {row['E']}"
    }
def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenizing a batch
    """
    print(batch)
    return tokenizer(
        batch["text"],
        max_length=max_length,
        truncation=True,
    )


_preprocessing_function = partial(preprocess_batch, max_length=1024, tokenizer=tokenizer)
# Convert train_df and val_df into JSON format
# train_dataset = [create_json_record(row, "train") for _, row in train_df.iterrows()]
# val_dataset = [create_json_record(row, "val") for _, row in val_df.iterrows()]


def create_prompt(row):
    return f"Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi Transliterated: {row['RH']} ]\nEnglish: {row['E']}"

# Apply the function to each row and create a new 'text' column
train_df['text'] = train_df.apply(create_prompt, axis=1)
val_df['text'] = val_df.apply(create_prompt, axis=1)

columns_to_drop = ['H', 'RH','E']
train_df.drop(columns=columns_to_drop, inplace=True)
val_df.drop(columns=columns_to_drop, inplace=True)

train_dataset = Dataset.from_pandas(train_df)
val_dataset= Dataset.from_pandas(val_df)


In [28]:
train_dataset = train_dataset.map(
        _preprocessing_function,
        batched=True,
    )

Map:   0%|          | 0/5600 [00:00<?, ? examples/s]

{'text': ["Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi Transliterated: wah bachche key tarah soo rahaa hai. ]\nEnglish: He's sleeping like a baby.", 'Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi Transliterated: yeah ack fool hai ]\nEnglish: This is a flower', 'Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi Transliterated: ack our chunauti rahdari shulk cum karane key lie rajnaitik dabaav key roop main hai . ]\nEnglish: Another challenge is the political pressure being exerted for waiving the user charges .', 'Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi Transliterated: tum doe teen dino main firse theek how jaaoge. ]\nEnglish: You will be all right again in a couple of days.', 'Translate the foll

In [29]:
val_dataset = val_dataset.map(
        _preprocessing_function,
    batched=True,
    )

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

{'text': ['Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi Transliterated: lekin port blare main ack gair sarkari samiti hai jisamen sarkari adhikaariyon key atirikt upabhokta tathaa vyapariyon key pratinidhi sanyukt roop say aavashyak vastuon key moolya nirdhaarit karate hain . ]\nEnglish: However , in Port Blair there is a non-official body consisting of government officials and representatives of consumers and traders , who jointly fix the prices of all essential commodities .', 'Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi Transliterated: yeah anuman lagaaya gaya tha kii sunn 1951 main 25,980 mill karghe they our 1,81,278 hathkarghe art silk reshe key nirmaan main lagey hue they . ]\nEnglish: It was estimated that in 1951 , there were some 25,980 mill looms and 181,278 handlooms engaged in the manufacture of art silk fibres .', 'Translate 

In [30]:
base_path = "/content/drive/My Drive/CSCI544ProjOutput/"
output_dir = f'{base_path}/peft-dialogue-summary-training-{str(int(time.time()))}'
peft_training_args = TrainingArguments(
    output_dir = output_dir,
    warmup_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    max_steps=50,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_steps=25,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=25,
    evaluation_strategy="steps",
    eval_steps=25,
    do_eval=True,
    gradient_checkpointing=False,
    report_to="none",
    overwrite_output_dir = 'True',
    group_by_length=True,
)

model.config.use_cache = False

peft_trainer = transformers.Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=peft_training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [31]:
print(type(train_dataset['text'][0]))

<class 'str'>


In [32]:
peft_trainer.train()

Step,Training Loss,Validation Loss
25,2.7318,2.338466
50,1.4592,2.309554


TrainOutput(global_step=50, training_loss=2.095540237426758, metrics={'train_runtime': 175.0041, 'train_samples_per_second': 1.143, 'train_steps_per_second': 0.286, 'total_flos': 99573001801728.0, 'train_loss': 2.095540237426758, 'epoch': 0.04})

In [33]:
# model = AutoModelForCausalLM.from_pretrained('/content/drive/My Drive/peft-dialogue-summary-training-1712294967/checkpoint-375')
# tokenizer = AutoTokenizer.from_pretrained('/content/drive/My Drive/CSCI544ProjOutput/peft-dialogue-summary-training-1712294967/checkpoint-375',trust_remote_code=True)

In [34]:
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu

# Assume test_data is a list of tuples: (source_sentence, reference_translation)

sources = test_df['RH'].to_list()
references = test_df['E'].to_list()

#sources = [sources[0]]
#references = [references[0]]

s = []
r = []

for i in range(len(sources)):

    inputs = tokenizer(f'''Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi Transliterated: {sources[i]} \nEnglish:''', return_tensors="pt",)

    with torch.no_grad():
        outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=20)
        text = tokenizer.batch_decode(outputs)[0]
        #print(text)

    #print(text)
    t = text.split("\nEnglish: ")
    t = t[1]. split("\n")[0]

    """
    print("t")
    print(t)
    print(t.split())
    print(references[i].split())
    print("---")
    """

    s.append(t.split())
    # translations.append(reference.split())
    r.append(references[i].split())  # Note: reference must be a list of lists for corpus_bleu

In [35]:
print(s)
print(r)

[['The', 'king', 'was', 'deprived', 'of', 'his', 'power.'], ['RELIGIOUS', 'CONDITION', 'The', 'major', 'religions', 'practised', 'in', 'KarnataKa', 'during', 'the', 'first', 'half', 'of', 'the', 'twelfth', 'century', 'were', ':', 'Jainisru', ',', 'Yaishnavism', ',', 'Shaivism', 'and', 'Buddhism', '.'], ['This', 'applies', 'inter', 'alia', 'to', 'the', 'privileges', 'and', 'immunities', 'of', 'legislatures', ',', 'disqualification', 'of', 'members', ',', 'relation', 'between', 'the', 'two', 'Houses', ',', 'legislative', 'procedure', ',', 'origin', 'of', 'money', 'bills', ',', 'etc', '.'], ['if', 'you', 'made', 'literature', 'affordable', 'and', 'available', 'to', 'them.'], ['I', "don't", 'agree', 'with', 'you.'], ['I', 'run', 'every', 'day.'], ['Tipper', 'and', 'I', 'were', 'driving', 'ourselves,', "Shoney's,", 'low-cost', 'family', 'restaurant', 'chain,'], ['Prime', "minister's", 'office'], ['She', 'has', 'long', 'arms', 'and', 'legs.'], ['Take', 'back', 'what', 'you', 'said', 'about',

In [36]:
hypothesis = [' '.join(i) for i in s]
reference = [' '.join(i) for i in r]

print(len(hypothesis))
print(len(reference))
filtered_data = [(h, r) for h, r in zip(hypothesis, reference) if h and r]

# If you want to separate the filtered data into hypothesis and reference lists again
filtered_hypothesis = [pair[0] for pair in filtered_data]
filtered_reference = [pair[1] for pair in filtered_data]
print(len(filtered_hypothesis))
print(len(filtered_reference))
bert_scores = bertscore_metric.compute(predictions=filtered_hypothesis, references=filtered_reference, lang="en")
bert_results = [round(v, 4) for v in bert_scores["f1"]]
avg = sum(bert_results)/len(bert_results)
avg

700
700
700
700


0.8696738571428569

In [37]:
results['Languages'].append('Romanized Hindi -> English')
results['Avg Bert'].append(avg)
print(avg)

0.8696738571428569


# **RH + H to English**

In [38]:
train_df, val_df = train_df3, val_df3
val_df, test_df = val_df3, test_df3

In [39]:
def create_json_record(row, prefix):
    return {
        "input_ids": f"{prefix}_{row.name}",
        "text": f"Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi Transliterated: {row['H']} {row['RH']} \nEnglish: {row['E']}"
    }
def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenizing a batch
    """
    print(batch)
    return tokenizer(
        batch["text"],
        max_length=max_length,
        truncation=True,
    )


_preprocessing_function = partial(preprocess_batch, max_length=1024, tokenizer=tokenizer)
# Convert train_df and val_df into JSON format
# train_dataset = [create_json_record(row, "train") for _, row in train_df.iterrows()]
# val_dataset = [create_json_record(row, "val") for _, row in val_df.iterrows()]


def create_prompt(row):
    return f"Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi: {row['H']} {row['RH']} \nEnglish: {row['E']}"

# Apply the function to each row and create a new 'text' column
train_df['text'] = train_df.apply(create_prompt, axis=1)
val_df['text'] = val_df.apply(create_prompt, axis=1)

columns_to_drop = ['H', 'RH','E']
train_df.drop(columns=columns_to_drop, inplace=True)
val_df.drop(columns=columns_to_drop, inplace=True)

train_dataset = Dataset.from_pandas(train_df)
val_dataset= Dataset.from_pandas(val_df)


In [None]:
train_dataset = train_dataset.map(
        _preprocessing_function,
        batched=True,
    )

Map:   0%|          | 0/5600 [00:00<?, ? examples/s]

{'text': ["Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi: वह बच्चे की तरह सो रहा है। wah bachche key tarah soo rahaa hai. \nEnglish: He's sleeping like a baby.", 'Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi: यह एक फूल है yeah ack fool hai \nEnglish: This is a flower', 'Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi: एक और चुनौती राहदारी शुल्क कम करने के लिए राजनैतिक दबाव के रूप में है . ack our chunauti rahdari shulk cum karane key lie rajnaitik dabaav key roop main hai . \nEnglish: Another challenge is the political pressure being exerted for waiving the user charges .', 'Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi: तुम दो तीन दिनों में फिरसे ठीक हो जाओगे। tum doe teen dino main firse theek 

In [None]:
val_dataset = val_dataset.map(
        _preprocessing_function,
    batched=True,
    )

In [None]:
base_path = "/content/drive/My Drive/CSCI544ProjOutput/"
output_dir = f'{base_path}/peft-dialogue-summary-training-{str(int(time.time()))}'
peft_training_args = TrainingArguments(
    output_dir = output_dir,
    warmup_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    max_steps=50,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_steps=25,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=25,
    evaluation_strategy="steps",
    eval_steps=25,
    do_eval=True,
    gradient_checkpointing=False,
    report_to="none",
    overwrite_output_dir = 'True',
    group_by_length=True,
)

model.config.use_cache = False

peft_trainer = transformers.Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=peft_training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

In [None]:
print(type(train_dataset['text'][0]))

In [None]:
peft_trainer.train()

In [None]:
# model = AutoModelForCausalLM.from_pretrained('/content/drive/My Drive/peft-dialogue-summary-training-1712294967/checkpoint-375')
# tokenizer = AutoTokenizer.from_pretrained('/content/drive/My Drive/CSCI544ProjOutput/peft-dialogue-summary-training-1712294967/checkpoint-375',trust_remote_code=True)

In [None]:
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu

# Assume test_data is a list of tuples: (source_sentence, reference_translation)

rh = test_df['RH'].to_list()
h = test_df['H'].to_list()
references = test_df['E'].to_list()

#sources = [sources[0]]
#references = [references[0]]

s = []
r = []

for i in range(len(rh)):

    inputs = tokenizer(f'''Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi: {h[i]} {rh[i]} \nEnglish: ''', return_tensors="pt",)

    with torch.no_grad():
        outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=20)
        text = tokenizer.batch_decode(outputs)[0]
        #print(text)

    #print(text)
    t = text.split("\nEnglish: ")
    t = t[1]. split("\n")[0]

    """
    print("t")
    print(t)
    print(t.split())
    print(references[i].split())
    print("---")
    """

    s.append(t.split())
    # translations.append(reference.split())
    r.append(references[i].split())  # Note: reference must be a list of lists for corpus_bleu

In [None]:
hypothesis = [' '.join(i) for i in s]
reference = [' '.join(i) for i in r]

print(len(hypothesis))
print(len(reference))
filtered_data = [(h, r) for h, r in zip(hypothesis, reference) if h and r]

# If you want to separate the filtered data into hypothesis and reference lists again
filtered_hypothesis = [pair[0] for pair in filtered_data]
filtered_reference = [pair[1] for pair in filtered_data]
print(len(filtered_hypothesis))
print(len(filtered_reference))
bert_scores = bertscore_metric.compute(predictions=filtered_hypothesis, references=filtered_reference, lang="en")
bert_results = [round(v, 4) for v in bert_scores["f1"]]
avg = sum(bert_results)/len(bert_results)
avg

In [None]:
results['Languages'].append('Hindi + Romanized Hindi -> English')
results['Avg Bert'].append(avg)
print(avg)

# **Results**

In [None]:
results_df = pd.DataFrame(results)
results_df