In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers datasets peft accelerate bitsandbytes trl safetensors torch --no-cache

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.10.0-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m214.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m193.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl
  Downloading trl-0.8.5-py3-none-any.whl (245 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.1/245.1 kB[0m [31m18

In [3]:
import torch
import pandas as pd
from functools import partial
from datasets import Dataset
from transformers import TrainingArguments
import time
import transformers
from sklearn.model_selection import train_test_split
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

column_names = ['H', 'RH', 'E']
# base_path = '/content/drive/My Drive/Colab Notebooks/CSCI 564 NLP'
# df = pd.read_csv(f'{base_path}/hindi_data/romanized_hindi_english_paper.csv', names=column_names,nrows=8000)

path = '/content/drive/MyDrive/Project_544/data/hindi_data/new_romanized_hindi_english_paper_19k.csv'

df = pd.read_csv(path, names=column_names, nrows=5000)

In [4]:
# Hugging Face model name
#model_name = "microsoft/phi-2" # not the larger version - need to look into this
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
use_flash_attention = False


# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # quantization_config=bnb_config,
    use_cache=False,
    use_flash_attention_2=use_flash_attention,
    device_map="auto",
    torch_dtype=torch.float16
)

model.config.pretraining_tp = 1

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

In [5]:
# LoRA config based on QLoRA paper
peft_config = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.1,
    r=16,
    bias="none",
    task_type="CAUSAL_LM",
)
# Prepare model for training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

In [6]:
# for RH to E
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(val_df, test_size=0.5, random_state=42)

In [7]:
# for H to E
train_df2, val_df2 = train_df.copy(), val_df.copy()
val_df2, test_df2 = val_df.copy(), test_df.copy()

In [8]:
# for RH+H to E
train_df3, val_df3 = train_df.copy(), val_df.copy()
val_df3, test_df3 = val_df.copy(), test_df.copy()

In [9]:
results = {'Languages': [],
           'Avg Meteor' : []}
           #'Corpus BLEU' : []}

# **Hindi to English**

In [10]:
train_df, val_df = train_df, val_df
val_df, test_df = val_df, test_df

In [11]:
def create_json_record(row, prefix):
    return {
        "input_ids": f"{prefix}_{row.name}",
        "text": f"Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi: {row['H']} \nEnglish: {row['E']}"
    }
def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenizing a batch
    """
    print(batch)
    return tokenizer(
        batch["text"],
        max_length=max_length,
        truncation=True,
    )


_preprocessing_function = partial(preprocess_batch, max_length=1024, tokenizer=tokenizer)
# Convert train_df and val_df into JSON format
# train_dataset = [create_json_record(row, "train") for _, row in train_df.iterrows()]
# val_dataset = [create_json_record(row, "val") for _, row in val_df.iterrows()]


def create_prompt(row):
    return f"Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: {row['H']} \nEnglish: {row['E']}"

# Apply the function to each row and create a new 'text' column
train_df['text'] = train_df.apply(create_prompt, axis=1)
val_df['text'] = val_df.apply(create_prompt, axis=1)

columns_to_drop = ['H', 'RH','E']
train_df.drop(columns=columns_to_drop, inplace=True)
val_df.drop(columns=columns_to_drop, inplace=True)

train_dataset = Dataset.from_pandas(train_df)
val_dataset= Dataset.from_pandas(val_df)


In [12]:
train_dataset = train_dataset.map(
        _preprocessing_function,
        batched=True,
    )

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

{'text': ['Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: बहुत कम लोग ऐसे हैं जो ये जानते हैं कि स्वयंभू प्रैस ने अमिताभ बच्चन पर प्रतिबंध लगा दिया था। \nEnglish: It is known to very few people,a restriction was brought by Swayambhu Press on him', 'Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: उसने सब कुछ अपने-आप किया। \nEnglish: She did it all by herself.', "Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: मैं दस बजे वापस आऊँगा। \nEnglish: I'll be back at ten.", 'Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: उसने किताब को फाड़ डाला। \nEnglish: He tore the book apart.', 'Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: ये सपने हि

In [13]:
val_dataset = val_dataset.map(
        _preprocessing_function,
    batched=True,
    )

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

{'text': ['Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: बस वही नहीं, \nEnglish: Not only that,', 'Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: बल्लेबाजों की भूमिका रन बनने के साथ और ओवर पूरे होने के साथ बदलती रहती है। \nEnglish: Batsman changes its side while making runs or the over is completed.', 'Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: अगर तुममें थोड़ा और सब्र होता तो तुम कामयाब हो जाते। \nEnglish: With a little more patience, you would have succeeded.', 'Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: जंग हमारे हित में जा रहा है। \nEnglish: The war is going in our favor.', 'Translate the following sentences from Hindi to English. The output should be in English and no other language.\n

In [14]:
base_path = "/content/drive/My Drive/CSCI544ProjOutput/"
output_dir = f'{base_path}/peft-dialogue-summary-training-{str(int(time.time()))}'
peft_training_args = TrainingArguments(
    output_dir = output_dir,
    warmup_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    max_steps=50,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_steps=25,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=25,
    evaluation_strategy="steps",
    eval_steps=25,
    do_eval=True,
    gradient_checkpointing=False,
    report_to="none",
    overwrite_output_dir = 'True',
    group_by_length=True,
)

model.config.use_cache = False

peft_trainer = transformers.Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=peft_training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [15]:
print(type(train_dataset['text'][0]))

<class 'str'>


In [16]:
peft_trainer.train()

Step,Training Loss,Validation Loss
25,1.506,1.21426
50,0.8471,1.124664


TrainOutput(global_step=50, training_loss=1.1765452194213868, metrics={'train_runtime': 180.2335, 'train_samples_per_second': 1.11, 'train_steps_per_second': 0.277, 'total_flos': 130831472099328.0, 'train_loss': 1.1765452194213868, 'epoch': 0.05})

In [17]:
# from nltk.translate.bleu_score import corpus_bleu, sentence_bleu

# Assume test_data is a list of tuples: (source_sentence, reference_translation)

sources = test_df['H'].to_list()
references = test_df['E'].to_list()

#sources = [sources[0]]
#references = [references[0]]

s = []
r = []

for i in range(len(sources)):

    inputs = tokenizer(f'''Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi: {sources[i]} \nEnglish:''', return_tensors="pt",)

    with torch.no_grad():
        outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=20)
        text = tokenizer.batch_decode(outputs)[0]
        #print(text)

    #print(text)
    t = text.split("\nEnglish: ")
    t = t[1]. split("\n")[0]

    """
    print("t")
    print(t)
    print(t.split())
    print(references[i].split())
    print("---")
    """

    s.append(t.split())
    # translations.append(reference.split())
    r.append(references[i].split())  # Note: reference must be a list of lists for corpus_bleu

In [18]:
print(s)
print(r)

[['The', 'reason', 'for', 'this', 'is', 'that', 'the', 'remi', 'is', 'so', 'different', 'from', 'the', 'rest.'], ['He', 'was', 'a', 'very', 'powerful', 'ruler.'], ['If', 'you', 'want', 'to', 'make', 'a', 'lot', 'of', 'noise,', 'you', 'need', 'to', 'add', 'more', 'gasoline.'], ['I', 'am', 'tired', 'of', 'working.'], ['To', 'find', 'Hindi', 'words,', 'you', 'need', 'to', 'download', 'Hindi-English', 'dictionary.'], ['We', 'have', 'worked', 'in', 'India', 'for', 'a', 'long', 'time.'], ['She', 'has', 'no', 'place', 'of', 'her', 'own.'], ['Yes,', "that's", 'true.'], ['We', 'will', 'first', 'see', 'the', 'sunrise.'], ['The', 'best-selling', 'book', 'of', 'India', '(ISBN', '81-88086-'], ['There', 'are', 'many', 'students', 'in', 'this', 'school.'], ['This', 'color', 'is', 'what?'], ['Thank', 'you', 'for', 'your', 'kindness.'], ['I', "don't", 'speak', 'English.'], ['We', 'are', 'living', 'in', 'the', 'city.'], ['Is', 'it', 'true?'], ['The', 'village', 'of', 'Ravidas', 'Nagar'], ['The', 'last',

In [19]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [20]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0


In [21]:
from evaluate import load
meteor_metric = load('meteor')

Downloading builder script:   0%|          | 0.00/6.93k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [22]:
hypothesis = [' '.join(i) for i in s]
reference = [' '.join(i) for i in r]
print(len(hypothesis))
print(len(reference))
filtered_data = [(h, r) for h, r in zip(hypothesis, reference) if h and r]

# If you want to separate the filtered data into hypothesis and reference lists again
filtered_hypothesis = [pair[0] for pair in filtered_data]
filtered_reference = [pair[1] for pair in filtered_data]
print(len(filtered_hypothesis))
print(len(filtered_reference))
# bleurt_results = bleurt_metric.compute(predictions=hypothesis, references=reference)
meteor_scores = meteor_metric.compute(predictions=filtered_hypothesis, references=filtered_reference)
# meteor_results = [round(v, 4) for v in meteor_scores["meteor"]]
# avg = sum(meteor_results)/len(meteor_results)
avg= meteor_scores["meteor"]

500
500
498
498


In [23]:
avg

0.3065586821628368

In [24]:
results['Languages'].append('Hindi -> English')
results['Avg Meteor'].append(avg)
print(avg)

0.3065586821628368


#**Romanized Hindi to English**

In [25]:
train_df, val_df = train_df2, val_df2
val_df, test_df = val_df2, test_df2

In [26]:
def create_json_record(row, prefix):
    return {
        "input_ids": f"{prefix}_{row.name}",
        "text": f"Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi Transliterated: {row['RH']} \nEnglish: {row['E']}"
    }
def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenizing a batch
    """
    print(batch)
    return tokenizer(
        batch["text"],
        max_length=max_length,
        truncation=True,
    )


_preprocessing_function = partial(preprocess_batch, max_length=1024, tokenizer=tokenizer)
# Convert train_df and val_df into JSON format
# train_dataset = [create_json_record(row, "train") for _, row in train_df.iterrows()]
# val_dataset = [create_json_record(row, "val") for _, row in val_df.iterrows()]


def create_prompt(row):
    return f"Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: {row['RH']} ]\nEnglish: {row['E']}"

# Apply the function to each row and create a new 'text' column
train_df['text'] = train_df.apply(create_prompt, axis=1)
val_df['text'] = val_df.apply(create_prompt, axis=1)

columns_to_drop = ['H', 'RH','E']
train_df.drop(columns=columns_to_drop, inplace=True)
val_df.drop(columns=columns_to_drop, inplace=True)

train_dataset = Dataset.from_pandas(train_df)
val_dataset= Dataset.from_pandas(val_df)


In [27]:
train_dataset = train_dataset.map(
        _preprocessing_function,
        batched=True,
    )

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

{'text': ['Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: bahut cum log aise hain zoo yeye jaanate hain kii swayambhu press nay amitabh bachchan para pratibandh lagaa diya tha. ]\nEnglish: It is known to very few people,a restriction was brought by Swayambhu Press on him', 'Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: usane sub kuchha apane-aap kiya. ]\nEnglish: She did it all by herself.', "Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: main dasa baje vaapas aaoonga. ]\nEnglish: I'll be back at ten.", 'Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: usane kitaab quo faad daalaa. ]\nEnglish: He tore the book apart.', 'Translate the following sentences from Hindi to English. The output should be in 

In [28]:
val_dataset = val_dataset.map(
        _preprocessing_function,
    batched=True,
    )

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

{'text': ['Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: bus vahee nahin, ]\nEnglish: Not only that,', 'Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: ballebaajon key bhoomika rann banane key saath our over poore honey key saath badalati rahati hai. ]\nEnglish: Batsman changes its side while making runs or the over is completed.', 'Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: agar tumamen thoda our sabr hota too tum kamyab how jaate. ]\nEnglish: With a little more patience, you would have succeeded.', 'Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: jung hamare hita main jaa rahaa hai. ]\nEnglish: The war is going in our favor.', 'Translate the following sentences from Hindi to English. The output

In [29]:
base_path = "/content/drive/My Drive/CSCI544ProjOutput/"
output_dir = f'{base_path}/peft-dialogue-summary-training-{str(int(time.time()))}'
peft_training_args = TrainingArguments(
    output_dir = output_dir,
    warmup_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    max_steps=50,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_steps=25,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=25,
    evaluation_strategy="steps",
    eval_steps=25,
    do_eval=True,
    gradient_checkpointing=False,
    report_to="none",
    overwrite_output_dir = 'True',
    group_by_length=True,
)

model.config.use_cache = False

peft_trainer = transformers.Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=peft_training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [30]:
print(type(train_dataset['text'][0]))

<class 'str'>


In [31]:
peft_trainer.train()

Step,Training Loss,Validation Loss
25,2.6451,2.250204
50,1.4325,2.227323


TrainOutput(global_step=50, training_loss=2.0388362884521483, metrics={'train_runtime': 125.1045, 'train_samples_per_second': 1.599, 'train_steps_per_second': 0.4, 'total_flos': 90198571008000.0, 'train_loss': 2.0388362884521483, 'epoch': 0.05})

In [None]:
# model = AutoModelForCausalLM.from_pretrained('/content/drive/My Drive/peft-dialogue-summary-training-1712294967/checkpoint-375')
# tokenizer = AutoTokenizer.from_pretrained('/content/drive/My Drive/CSCI544ProjOutput/peft-dialogue-summary-training-1712294967/checkpoint-375',trust_remote_code=True)

In [32]:
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu

# Assume test_data is a list of tuples: (source_sentence, reference_translation)

sources = test_df['RH'].to_list()
references = test_df['E'].to_list()

#sources = [sources[0]]
#references = [references[0]]

s = []
r = []

for i in range(len(sources)):

    inputs = tokenizer(f'''Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi: {sources[i]} \nEnglish:''', return_tensors="pt",)

    with torch.no_grad():
        outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=20)
        text = tokenizer.batch_decode(outputs)[0]
        #print(text)

    #print(text)
    t = text.split("\nEnglish: ")
    t = t[1]. split("\n")[0]

    """
    print("t")
    print(t)
    print(t.split())
    print(references[i].split())
    print("---")
    """

    s.append(t.split())
    # translations.append(reference.split())
    r.append(references[i].split())  # Note: reference must be a list of lists for corpus_bleu

In [33]:
print(s)
print(r)

[['He', 'has', 'a', 'lot', 'of', 'money.'], ['He', 'is', 'a', 'very', 'tall', 'man.', ']'], ['If', 'you', 'want', 'coffee,', 'you', 'should', 'buy', 'it.'], ['I', 'have', 'a', 'rose', 'in', 'my', 'hand.', ']'], ['I', 'have', 'a', 'lot', 'of', 'work', 'to', 'do.'], ['He', 'worked', 'for', 'a', 'long', 'time', 'for', 'the', 'country.', ']'], ['There', 'is', 'no', 'job', 'for', 'me.'], ["I'm", 'not', 'sure', 'if', "I'll", 'be', 'able', 'to', 'do', 'it.', ']'], ['He', 'had', 'been', 'there', 'for', 'a', 'long', 'time.'], ['The', 'book', 'is', 'a', 'must', 'for', 'all', 'those', 'who', 'want', 'to', 'know', 'about', 'the', 'history', 'of', 'India.', ']'], ['They', 'are', 'very', 'studious.'], ["What's", 'the', 'matter?'], ['It', 'is', 'a', 'good', 'idea.', ']'], ['She', 'is', 'not', 'a', 'good', 'cook.', ']'], ['I', 'am', 'a', 'bird.'], ['What', 'is', 'the', 'main', 'thing?'], ['the', 'village', 'is', 'in', 'the', 'city'], ['The', 'Indian', 'National', 'Congress', 'and', 'the', 'Indian', 'N

In [34]:
hypothesis = [' '.join(i) for i in s]
reference = [' '.join(i) for i in r]
print(len(hypothesis))
print(len(reference))
filtered_data = [(h, r) for h, r in zip(hypothesis, reference) if h and r]

# If you want to separate the filtered data into hypothesis and reference lists again
filtered_hypothesis = [pair[0] for pair in filtered_data]
filtered_reference = [pair[1] for pair in filtered_data]
print(len(filtered_hypothesis))
print(len(filtered_reference))
# bleurt_results = bleurt_metric.compute(predictions=hypothesis, references=reference)
meteor_scores = meteor_metric.compute(predictions=filtered_hypothesis, references=filtered_reference)
# meteor_results = [round(v, 4) for v in meteor_scores["meteor"]]
# avg = sum(meteor_results)/len(meteor_results)
avg= meteor_scores["meteor"]

500
500
500
500


In [35]:
results['Languages'].append('Romanized Hindi -> English')
results['Avg Meteor'].append(avg)
print(avg)

0.17599526403851823


# **RH + H to English**

In [36]:
train_df, val_df = train_df3, val_df3
val_df, test_df = val_df3, test_df3

In [37]:
def create_json_record(row, prefix):
    return {
        "input_ids": f"{prefix}_{row.name}",
        "text": f"Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi: {row['H']} \nHindi Transliterated:{row['RH']} \nEnglish: {row['E']}"
    }
def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenizing a batch
    """
    print(batch)
    return tokenizer(
        batch["text"],
        max_length=max_length,
        truncation=True,
    )


_preprocessing_function = partial(preprocess_batch, max_length=1024, tokenizer=tokenizer)
# Convert train_df and val_df into JSON format
# train_dataset = [create_json_record(row, "train") for _, row in train_df.iterrows()]
# val_dataset = [create_json_record(row, "val") for _, row in val_df.iterrows()]


def create_prompt(row):
    return f"Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi: {row['H']} \nHindi Transliterated:{row['RH']} \nEnglish: {row['E']}"

# Apply the function to each row and create a new 'text' column
train_df['text'] = train_df.apply(create_prompt, axis=1)
val_df['text'] = val_df.apply(create_prompt, axis=1)

columns_to_drop = ['H', 'RH','E']
train_df.drop(columns=columns_to_drop, inplace=True)
val_df.drop(columns=columns_to_drop, inplace=True)

train_dataset = Dataset.from_pandas(train_df)
val_dataset= Dataset.from_pandas(val_df)


In [38]:
train_dataset = train_dataset.map(
        _preprocessing_function,
        batched=True,
    )

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

{'text': ['Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi: बहुत कम लोग ऐसे हैं जो ये जानते हैं कि स्वयंभू प्रैस ने अमिताभ बच्चन पर प्रतिबंध लगा दिया था। \nHindi Transliterated:bahut cum log aise hain zoo yeye jaanate hain kii swayambhu press nay amitabh bachchan para pratibandh lagaa diya tha. \nEnglish: It is known to very few people,a restriction was brought by Swayambhu Press on him', 'Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi: उसने सब कुछ अपने-आप किया। \nHindi Transliterated:usane sub kuchha apane-aap kiya. \nEnglish: She did it all by herself.', "Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi: मैं दस बजे वापस आऊँगा। \nHindi Transliterated:main dasa baje vaapas aaoonga. \nEnglish: I'll be back at ten.", 'Translate the following sentences from Hindi to English. T

In [39]:
val_dataset = val_dataset.map(
        _preprocessing_function,
    batched=True,
    )

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

{'text': ['Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi: बस वही नहीं, \nHindi Transliterated:bus vahee nahin, \nEnglish: Not only that,', 'Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi: बल्लेबाजों की भूमिका रन बनने के साथ और ओवर पूरे होने के साथ बदलती रहती है। \nHindi Transliterated:ballebaajon key bhoomika rann banane key saath our over poore honey key saath badalati rahati hai. \nEnglish: Batsman changes its side while making runs or the over is completed.', 'Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi: अगर तुममें थोड़ा और सब्र होता तो तुम कामयाब हो जाते। \nHindi Transliterated:agar tumamen thoda our sabr hota too tum kamyab how jaate. \nEnglish: With a little more patience, you would have succeeded.', 'Translate the following sentences from Hindi to English. The

In [40]:
base_path = "/content/drive/My Drive/CSCI544ProjOutput/"
output_dir = f'{base_path}/peft-dialogue-summary-training-{str(int(time.time()))}'
peft_training_args = TrainingArguments(
    output_dir = output_dir,
    warmup_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    max_steps=50,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_steps=25,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=25,
    evaluation_strategy="steps",
    eval_steps=25,
    do_eval=True,
    gradient_checkpointing=False,
    report_to="none",
    overwrite_output_dir = 'True',
    group_by_length=True,
)

model.config.use_cache = False

peft_trainer = transformers.Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=peft_training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [41]:
print(type(train_dataset['text'][0]))

<class 'str'>


In [42]:
peft_trainer.train()

Step,Training Loss,Validation Loss
25,1.3208,1.17974
50,0.8201,1.160453


TrainOutput(global_step=50, training_loss=1.0704654693603515, metrics={'train_runtime': 245.7927, 'train_samples_per_second': 0.814, 'train_steps_per_second': 0.203, 'total_flos': 172876447371264.0, 'train_loss': 1.0704654693603515, 'epoch': 0.05})

In [None]:
# model = AutoModelForCausalLM.from_pretrained('/content/drive/My Drive/peft-dialogue-summary-training-1712294967/checkpoint-375')
# tokenizer = AutoTokenizer.from_pretrained('/content/drive/My Drive/CSCI544ProjOutput/peft-dialogue-summary-training-1712294967/checkpoint-375',trust_remote_code=True)

In [43]:
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu

# Assume test_data is a list of tuples: (source_sentence, reference_translation)

rh = test_df['RH'].to_list()
h = test_df['H'].to_list()
references = test_df['E'].to_list()

#sources = [sources[0]]
#references = [references[0]]

s = []
r = []

for i in range(len(rh)):

    inputs = tokenizer(f'''Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi: {h[i]} \nHindi Transliterated:{rh[i]} \nEnglish:''', return_tensors="pt",)

    with torch.no_grad():
        outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=20)
        text = tokenizer.batch_decode(outputs)[0]
        #print(text)

    #print(text)
    t = text.split("\nEnglish: ")
    t = t[1]. split("\n")[0]

    """
    print("t")
    print(t)
    print(t.split())
    print(references[i].split())
    print("---")
    """

    s.append(t.split())
    # translations.append(reference.split())
    r.append(references[i].split())  # Note: reference must be a list of lists for corpus_bleu

In [44]:
hypothesis = [' '.join(i) for i in s]
reference = [' '.join(i) for i in r]
print(len(hypothesis))
print(len(reference))
filtered_data = [(h, r) for h, r in zip(hypothesis, reference) if h and r]

# If you want to separate the filtered data into hypothesis and reference lists again
filtered_hypothesis = [pair[0] for pair in filtered_data]
filtered_reference = [pair[1] for pair in filtered_data]
print(len(filtered_hypothesis))
print(len(filtered_reference))
# bleurt_results = bleurt_metric.compute(predictions=hypothesis, references=reference)
meteor_scores = meteor_metric.compute(predictions=filtered_hypothesis, references=filtered_reference)
# meteor_results = [round(v, 4) for v in meteor_scores["meteor"]]
# avg = sum(meteor_results)/len(meteor_results)
avg= meteor_scores["meteor"]

500
500
500
500


In [45]:
results['Languages'].append('Hindi + Romanized Hindi -> English')
results['Avg Meteor'].append(avg)
print(avg)

0.314668264334298


# **Results**

In [46]:
results_df = pd.DataFrame(results)
results_df


{'Languages': ['Hindi -> English',
  'Romanized Hindi -> English',
  'Hindi + Romanized Hindi -> English'],
 'Avg Meteor': [0.3065586821628368, 0.17599526403851823, 0.314668264334298]}