In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers datasets peft accelerate bitsandbytes trl safetensors torch --no-cache

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.10.0-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.29.1-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.3/297.3 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.43.0-py3-none-manylinux_2_24_x86_64.whl (102.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.2/102.2 MB[0m [31m75.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl
  Downloading trl-0.8.1-py3-none-any.whl (225 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.0/225.0 kB[0m [31m208.0

In [46]:
import torch
import pandas as pd
from functools import partial
from datasets import Dataset
from transformers import TrainingArguments
import time
import transformers
from sklearn.model_selection import train_test_split
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

column_names = ['H', 'RH', 'E']
# base_path = '/content/drive/My Drive/Colab Notebooks/CSCI 564 NLP'
# df = pd.read_csv(f'{base_path}/hindi_data/romanized_hindi_english_paper.csv', names=column_names,nrows=8000)

path = '/content/drive/MyDrive/Project_544/data/hindi_data/new_romanized_hindi_english_paper_19k.csv'

df = pd.read_csv(path, names=column_names, nrows=5000)

In [48]:
# Hugging Face model name
#model_name = "microsoft/phi-2" # not the larger version - need to look into this
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
use_flash_attention = False


# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # quantization_config=bnb_config,
    use_cache=False,
    use_flash_attention_2=use_flash_attention,
    device_map="auto",
    torch_dtype=torch.float16
)

model.config.pretraining_tp = 1

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [49]:
# LoRA config based on QLoRA paper
peft_config = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.1,
    r=16,
    bias="none",
    task_type="CAUSAL_LM",
)
# Prepare model for training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

In [50]:
# for RH to E
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(val_df, test_size=0.5, random_state=42)

In [51]:
# for H to E
train_df2, val_df2 = train_df.copy(), val_df.copy()
val_df2, test_df2 = val_df.copy(), test_df.copy()

In [52]:
# for RH+H to E
train_df3, val_df3 = train_df.copy(), val_df.copy()
val_df3, test_df3 = val_df.copy(), test_df.copy()

In [53]:
results = {'Languages': [],
           'Avg Sentence BLEU' : []}
           #'Corpus BLEU' : []}

# **Hindi to English**

In [54]:
train_df, val_df = train_df, val_df
val_df, test_df = val_df, test_df

In [55]:
def create_json_record(row, prefix):
    return {
        "input_ids": f"{prefix}_{row.name}",
        "text": f"Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi: {row['H']} \nEnglish: {row['E']}"
    }
def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenizing a batch
    """
    print(batch)
    return tokenizer(
        batch["text"],
        max_length=max_length,
        truncation=True,
    )


_preprocessing_function = partial(preprocess_batch, max_length=1024, tokenizer=tokenizer)
# Convert train_df and val_df into JSON format
# train_dataset = [create_json_record(row, "train") for _, row in train_df.iterrows()]
# val_dataset = [create_json_record(row, "val") for _, row in val_df.iterrows()]


def create_prompt(row):
    return f"Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: {row['H']} \nEnglish: {row['E']}"

# Apply the function to each row and create a new 'text' column
train_df['text'] = train_df.apply(create_prompt, axis=1)
val_df['text'] = val_df.apply(create_prompt, axis=1)

columns_to_drop = ['H', 'RH','E']
train_df.drop(columns=columns_to_drop, inplace=True)
val_df.drop(columns=columns_to_drop, inplace=True)

train_dataset = Dataset.from_pandas(train_df)
val_dataset= Dataset.from_pandas(val_df)


In [56]:
train_dataset = train_dataset.map(
        _preprocessing_function,
        batched=True,
    )

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

{'text': ['Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: बहुत कम लोग ऐसे हैं जो ये जानते हैं कि स्वयंभू प्रैस ने अमिताभ बच्चन पर प्रतिबंध लगा दिया था। \nEnglish: It is known to very few people,a restriction was brought by Swayambhu Press on him', 'Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: उसने सब कुछ अपने-आप किया। \nEnglish: She did it all by herself.', "Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: मैं दस बजे वापस आऊँगा। \nEnglish: I'll be back at ten.", 'Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: उसने किताब को फाड़ डाला। \nEnglish: He tore the book apart.', 'Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: ये सपने हि

In [57]:
val_dataset = val_dataset.map(
        _preprocessing_function,
    batched=True,
    )

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

{'text': ['Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: बस वही नहीं, \nEnglish: Not only that,', 'Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: बल्लेबाजों की भूमिका रन बनने के साथ और ओवर पूरे होने के साथ बदलती रहती है। \nEnglish: Batsman changes its side while making runs or the over is completed.', 'Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: अगर तुममें थोड़ा और सब्र होता तो तुम कामयाब हो जाते। \nEnglish: With a little more patience, you would have succeeded.', 'Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi: जंग हमारे हित में जा रहा है। \nEnglish: The war is going in our favor.', 'Translate the following sentences from Hindi to English. The output should be in English and no other language.\n

In [58]:
base_path = "/content/drive/My Drive/CSCI544ProjOutput/"
output_dir = f'{base_path}/peft-dialogue-summary-training-{str(int(time.time()))}'
peft_training_args = TrainingArguments(
    output_dir = output_dir,
    warmup_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    max_steps=50,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_steps=25,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=25,
    evaluation_strategy="steps",
    eval_steps=25,
    do_eval=True,
    gradient_checkpointing=False,
    report_to="none",
    overwrite_output_dir = 'True',
    group_by_length=True,
)

model.config.use_cache = False

peft_trainer = transformers.Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=peft_training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [59]:
print(type(train_dataset['text'][0]))

<class 'str'>


In [60]:
peft_trainer.train()

Step,Training Loss,Validation Loss
25,1.5076,1.215557
50,0.8578,1.129117


TrainOutput(global_step=50, training_loss=1.1827201843261719, metrics={'train_runtime': 195.5185, 'train_samples_per_second': 1.023, 'train_steps_per_second': 0.256, 'total_flos': 130831472099328.0, 'train_loss': 1.1827201843261719, 'epoch': 0.05})

In [61]:
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu

# Assume test_data is a list of tuples: (source_sentence, reference_translation)

sources = test_df['H'].to_list()
references = test_df['E'].to_list()

#sources = [sources[0]]
#references = [references[0]]

s = []
r = []

for i in range(len(sources)):

    inputs = tokenizer(f'''Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi: {sources[i]} \nEnglish:''', return_tensors="pt",)

    with torch.no_grad():
        outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=20)
        text = tokenizer.batch_decode(outputs)[0]
        #print(text)

    #print(text)
    t = text.split("\nEnglish: ")
    t = t[1]. split("\n")[0]

    """
    print("t")
    print(t)
    print(t.split())
    print(references[i].split())
    print("---")
    """

    s.append(t.split())
    # translations.append(reference.split())
    r.append(references[i].split())  # Note: reference must be a list of lists for corpus_bleu

In [62]:
print(s)
print(r)

[['The', 'reason', 'for', 'this', 'is', 'that', 'the', 'remi', 'is', 'so', 'old.'], ['He', 'was', 'a', 'very', 'powerful', 'ruler.'], ['If', "it's", 'too', 'much,', 'then', 'a', 'little', 'bit', 'of', 'water.'], ['I', 'am', 'tired', 'of', 'work.'], ['To', 'find', 'Hindi', 'words,', 'you', 'need', 'to', 'download', 'Hindi-English', 'dictionary.'], ['We', 'have', 'worked', 'in', 'India.'], ['There', 'is', 'no', 'such', 'thing', 'as', 'a', 'house', 'in', 'her.'], ['Yes,', 'I', 'am', 'very', 'happy.'], ['We', 'will', 'go', 'first', 'and', 'then', 'we', 'will', 'come', 'back.'], ['The', 'best-selling', 'book', 'of', 'India', '(ISBN', '81-88086-'], ['There', 'are', 'a', 'lot', 'of', 'students', 'in', 'this', 'school.'], ['This', 'color', 'is', 'what?'], ['Thank', 'you', 'for', 'your', 'kindness.'], ['I', "don't", 'speak', 'English.'], ['We', 'are', 'living', 'in', 'the', 'city.'], ['Is', 'it', 'true?'], ['The', 'city', 'of', 'Ravi', 'Das', 'Nagar'], ['The', 'last', 'of', 'the', 'three', 'was

In [63]:
scores = []

avg = 0

for i in s:
  # score = sentence_bleu(r, i, weights=[0.001,0.001,0,0])
  # score = sentence_bleu(r, i, weights=[0.25,0.005,0,0]) # unigram, bigram, trigram, quadrigram
  score = sentence_bleu(r, i, weights=[0.33,0.33,0.33,0])  # unigram, bigram, trigram, quadrigram - weights=[0.25,0.15,0,0]
  scores.append(score)
  avg += score

avg = avg/len(s)
#print(scores)
print("Average BLEU score: ", avg)

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Average BLEU score:  0.18557990626508364


In [None]:
cb = corpus_bleu(r, s, weights=[1,0,0,0]) #0.0001m
print(cb)

In [65]:
results['Languages'].append('Hindi -> English')
results['Avg Sentence BLEU'].append(avg)
# results['Corpus BLEU'].append(cb)

In [None]:
# peft_trainer.save_model('/content/drive/My Drive/CSCI544ProjOutput/saved_model')
# tokenizer.save_pretrained('/content/drive/My Drive/CSCI544ProjOutput/saved_tokenizer')

#**Romanized Hindi to English**

In [66]:
train_df, val_df = train_df2, val_df2
val_df, test_df = val_df2, test_df2

In [67]:
def create_json_record(row, prefix):
    return {
        "input_ids": f"{prefix}_{row.name}",
        "text": f"Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi Transliterated: {row['RH']} \nEnglish: {row['E']}"
    }
def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenizing a batch
    """
    print(batch)
    return tokenizer(
        batch["text"],
        max_length=max_length,
        truncation=True,
    )


_preprocessing_function = partial(preprocess_batch, max_length=1024, tokenizer=tokenizer)
# Convert train_df and val_df into JSON format
# train_dataset = [create_json_record(row, "train") for _, row in train_df.iterrows()]
# val_dataset = [create_json_record(row, "val") for _, row in val_df.iterrows()]


def create_prompt(row):
    return f"Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi Transliterated: {row['RH']} ]\nEnglish: {row['E']}"

# Apply the function to each row and create a new 'text' column
train_df['text'] = train_df.apply(create_prompt, axis=1)
val_df['text'] = val_df.apply(create_prompt, axis=1)

columns_to_drop = ['H', 'RH','E']
train_df.drop(columns=columns_to_drop, inplace=True)
val_df.drop(columns=columns_to_drop, inplace=True)

train_dataset = Dataset.from_pandas(train_df)
val_dataset= Dataset.from_pandas(val_df)


In [68]:
train_dataset = train_dataset.map(
        _preprocessing_function,
        batched=True,
    )

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

{'text': ['Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi Transliterated: bahut cum log aise hain zoo yeye jaanate hain kii swayambhu press nay amitabh bachchan para pratibandh lagaa diya tha. ]\nEnglish: It is known to very few people,a restriction was brought by Swayambhu Press on him', 'Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi Transliterated: usane sub kuchha apane-aap kiya. ]\nEnglish: She did it all by herself.', "Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi Transliterated: main dasa baje vaapas aaoonga. ]\nEnglish: I'll be back at ten.", 'Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi Transliterated: usane kitaab quo faad daalaa. ]\nEnglish: He tore the book apart.', 'Translate the followi

In [69]:
val_dataset = val_dataset.map(
        _preprocessing_function,
    batched=True,
    )

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

{'text': ['Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi Transliterated: bus vahee nahin, ]\nEnglish: Not only that,', 'Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi Transliterated: ballebaajon key bhoomika rann banane key saath our over poore honey key saath badalati rahati hai. ]\nEnglish: Batsman changes its side while making runs or the over is completed.', 'Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi Transliterated: agar tumamen thoda our sabr hota too tum kamyab how jaate. ]\nEnglish: With a little more patience, you would have succeeded.', 'Translate the following sentences from Hindi to English. The output should be in English and no other language.\nHindi Transliterated: jung hamare hita main jaa rahaa hai. ]\nEnglish: The war is going in our favor.', 'Transla

In [70]:
base_path = "/content/drive/My Drive/CSCI544ProjOutput/"
output_dir = f'{base_path}/peft-dialogue-summary-training-{str(int(time.time()))}'
peft_training_args = TrainingArguments(
    output_dir = output_dir,
    warmup_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    max_steps=50,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_steps=25,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=25,
    evaluation_strategy="steps",
    eval_steps=25,
    do_eval=True,
    gradient_checkpointing=False,
    report_to="none",
    overwrite_output_dir = 'True',
    group_by_length=True,
)

model.config.use_cache = False

peft_trainer = transformers.Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=peft_training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
print(type(train_dataset['text'][0]))

In [71]:
peft_trainer.train()

Step,Training Loss,Validation Loss
25,2.5919,2.160876
50,1.3456,2.150801


TrainOutput(global_step=50, training_loss=1.968774642944336, metrics={'train_runtime': 130.0259, 'train_samples_per_second': 1.538, 'train_steps_per_second': 0.385, 'total_flos': 93930925670400.0, 'train_loss': 1.968774642944336, 'epoch': 0.05})

In [None]:
# model = AutoModelForCausalLM.from_pretrained('/content/drive/My Drive/peft-dialogue-summary-training-1712294967/checkpoint-375')
# tokenizer = AutoTokenizer.from_pretrained('/content/drive/My Drive/CSCI544ProjOutput/peft-dialogue-summary-training-1712294967/checkpoint-375',trust_remote_code=True)

In [72]:
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu

# Assume test_data is a list of tuples: (source_sentence, reference_translation)

sources = test_df['RH'].to_list()
references = test_df['E'].to_list()

#sources = [sources[0]]
#references = [references[0]]

s = []
r = []

for i in range(len(sources)):

    inputs = tokenizer(f'''Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi Transliterated: {sources[i]} \nEnglish:''', return_tensors="pt",)

    with torch.no_grad():
        outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=20)
        text = tokenizer.batch_decode(outputs)[0]
        #print(text)

    #print(text)
    t = text.split("\nEnglish: ")
    t = t[1]. split("\n")[0]

    """
    print("t")
    print(t)
    print(t.split())
    print(references[i].split())
    print("---")
    """

    s.append(t.split())
    # translations.append(reference.split())
    r.append(references[i].split())  # Note: reference must be a list of lists for corpus_bleu

In [73]:
print(s)
print(r)

[["It's", 'a', 'big', 'car.'], ['He', 'was', 'a', 'very', 'tall', 'man.'], ['If', 'you', 'have', 'coffee,', 'you', 'should', 'drink', 'water', 'too.'], ['I', 'have', 'a', 'rose', 'in', 'my', 'garden.'], ['I', "don't", 'know', 'how', 'to', 'use', 'Firefox.'], ['I', 'worked', 'for', 'my', 'country.'], ['There', 'is', 'no', 'job', 'for', 'me.'], ["I'm", 'not', 'very', 'good', 'at', 'it.'], ['I', 'was', 'just', 'about', 'to', 'leave.'], ['The', 'book', 'is', 'a', 'must-have', 'for', 'every', 'woman', 'who', 'wants', 'to', 'know', 'more', 'about', 'the', 'Indian', 'Constitution.'], ['There', 'are', 'many', 'students', 'in', 'this', 'school.', ']'], ['Do', 'you', 'know', 'what', 'time', 'it', 'is?'], ["It's", 'a', 'good', 'idea.'], ['I', "don't", 'know', 'how', 'to', 'cook.', ']'], ['We', 'live', 'in', 'a', 'house.'], ['what', 'is', 'the', 'main', 'thing?'], ['the', 'village', 'is', 'in', 'the', 'city'], ['The', 'Indian', 'National', 'Congress', 'was', 'the', 'main', 'opposition', 'party', '

In [74]:
scores = []

avg = 0

for i in s:
  # score = sentence_bleu(r, i, weights=[0.001,0.001,0,0])
  score = sentence_bleu(r, i, weights=[0.33,0.33,0.33,0]) # unigram, bigram, trigram, quadrigram
  scores.append(score)
  avg += score

avg = avg/len(s)
# print(scores)
print("Average BLEU score: ", avg)

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Average BLEU score:  0.224853130747326


In [None]:
score = corpus_bleu(r, s, weights=[0.25,0.0001,0,0])
print("Corpus BLEU score: ", score)

In [75]:
results

{'Languages': ['Hindi -> English'], 'Avg Sentence BLEU': [0.18557990626508364]}

In [76]:
results['Languages'].append('Romanized Hindi -> English')
results['Avg Sentence BLEU'].append(avg)
# results['Corpus BLEU'].append(score)

In [None]:
# peft_trainer.save_model('/content/drive/My Drive/CSCI544ProjOutput/saved_model')
# tokenizer.save_pretrained('/content/drive/My Drive/CSCI544ProjOutput/saved_tokenizer')

# **RH + H to English**

In [77]:
train_df, val_df = train_df3, val_df3
val_df, test_df = val_df3, test_df3

In [78]:
def create_json_record(row, prefix):
    return {
        "input_ids": f"{prefix}_{row.name}",
        "text": f"Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi Transliterated: {row['H']} {row['RH']} \nEnglish: {row['E']}"
    }
def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenizing a batch
    """
    print(batch)
    return tokenizer(
        batch["text"],
        max_length=max_length,
        truncation=True,
    )


_preprocessing_function = partial(preprocess_batch, max_length=1024, tokenizer=tokenizer)
# Convert train_df and val_df into JSON format
# train_dataset = [create_json_record(row, "train") for _, row in train_df.iterrows()]
# val_dataset = [create_json_record(row, "val") for _, row in val_df.iterrows()]


def create_prompt(row):
    return f"Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi Transliterated: {row['H']} {row['RH']} \nEnglish: {row['E']}"

# Apply the function to each row and create a new 'text' column
train_df['text'] = train_df.apply(create_prompt, axis=1)
val_df['text'] = val_df.apply(create_prompt, axis=1)

columns_to_drop = ['H', 'RH','E']
train_df.drop(columns=columns_to_drop, inplace=True)
val_df.drop(columns=columns_to_drop, inplace=True)

train_dataset = Dataset.from_pandas(train_df)
val_dataset= Dataset.from_pandas(val_df)


In [79]:
train_dataset = train_dataset.map(
        _preprocessing_function,
        batched=True,
    )

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

{'text': ['Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi Transliterated: बहुत कम लोग ऐसे हैं जो ये जानते हैं कि स्वयंभू प्रैस ने अमिताभ बच्चन पर प्रतिबंध लगा दिया था। bahut cum log aise hain zoo yeye jaanate hain kii swayambhu press nay amitabh bachchan para pratibandh lagaa diya tha. \nEnglish: It is known to very few people,a restriction was brought by Swayambhu Press on him', 'Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi Transliterated: उसने सब कुछ अपने-आप किया। usane sub kuchha apane-aap kiya. \nEnglish: She did it all by herself.', "Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi Transliterated: मैं दस बजे वापस आऊँगा। main dasa baje vaapas aaoonga. \nEnglish: I'll be back at ten.", 'Translate the following sentences from Hindi to English. The output should be in E

In [80]:
val_dataset = val_dataset.map(
        _preprocessing_function,
    batched=True,
    )

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

{'text': ['Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi Transliterated: बस वही नहीं, bus vahee nahin, \nEnglish: Not only that,', 'Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi Transliterated: बल्लेबाजों की भूमिका रन बनने के साथ और ओवर पूरे होने के साथ बदलती रहती है। ballebaajon key bhoomika rann banane key saath our over poore honey key saath badalati rahati hai. \nEnglish: Batsman changes its side while making runs or the over is completed.', 'Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi Transliterated: अगर तुममें थोड़ा और सब्र होता तो तुम कामयाब हो जाते। agar tumamen thoda our sabr hota too tum kamyab how jaate. \nEnglish: With a little more patience, you would have succeeded.', 'Translate the following sentences from Hindi to English. The output should be in Eng

In [81]:
base_path = "/content/drive/My Drive/CSCI544ProjOutput/"
output_dir = f'{base_path}/peft-dialogue-summary-training-{str(int(time.time()))}'
peft_training_args = TrainingArguments(
    output_dir = output_dir,
    warmup_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    max_steps=50,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_steps=25,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=25,
    evaluation_strategy="steps",
    eval_steps=25,
    do_eval=True,
    gradient_checkpointing=False,
    report_to="none",
    overwrite_output_dir = 'True',
    group_by_length=True,
)

model.config.use_cache = False

peft_trainer = transformers.Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=peft_training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [82]:
print(type(train_dataset['text'][0]))

<class 'str'>


In [83]:
peft_trainer.train()

Step,Training Loss,Validation Loss
25,1.3648,1.241845
50,0.8855,1.219648


TrainOutput(global_step=50, training_loss=1.1251400756835936, metrics={'train_runtime': 248.7196, 'train_samples_per_second': 0.804, 'train_steps_per_second': 0.201, 'total_flos': 166624753311744.0, 'train_loss': 1.1251400756835936, 'epoch': 0.05})

In [None]:
# model = AutoModelForCausalLM.from_pretrained('/content/drive/My Drive/peft-dialogue-summary-training-1712294967/checkpoint-375')
# tokenizer = AutoTokenizer.from_pretrained('/content/drive/My Drive/CSCI544ProjOutput/peft-dialogue-summary-training-1712294967/checkpoint-375',trust_remote_code=True)

In [84]:
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu

# Assume test_data is a list of tuples: (source_sentence, reference_translation)

rh = test_df['RH'].to_list()
h = test_df['H'].to_list()
references = test_df['E'].to_list()

#sources = [sources[0]]
#references = [references[0]]

s = []
r = []

for i in range(len(rh)):

    inputs = tokenizer(f'''Translate the following sentences from Hindi to English. The output should be in English and no other language. \nHindi: {h[i]} {rh[i]} \nEnglish:''', return_tensors="pt",)

    with torch.no_grad():
        outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=20)
        text = tokenizer.batch_decode(outputs)[0]
        #print(text)

    #print(text)
    t = text.split("\nEnglish: ")
    t = t[1]. split("\n")[0]

    """
    print("t")
    print(t)
    print(t.split())
    print(references[i].split())
    print("---")
    """

    s.append(t.split())
    # translations.append(reference.split())
    r.append(references[i].split())  # Note: reference must be a list of lists for corpus_bleu

In [85]:
scores = []

avg = 0

for i in s:
  # score = sentence_bleu(r, i, weights=[0.001,0.001,0,0])
  score = sentence_bleu(r, i, weights=[0.33,0.33,0.33,0]) # unigram, bigram, trigram, quadrigram
  scores.append(score)
  avg += score

avg = avg/len(s)
# print(scores)
print("Average BLEU score: ", avg)

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Average BLEU score:  0.18099862907111494


In [None]:
cb = corpus_bleu(r, s, weights=[0.25,0.0001,0,0])
print(cb)

In [86]:
results

{'Languages': ['Hindi -> English', 'Romanized Hindi -> English'],
 'Avg Sentence BLEU': [0.18557990626508364, 0.224853130747326]}

In [87]:
results['Languages'].append('Hindi + Romanized Hindi -> English')
results['Avg Sentence BLEU'].append(avg)
# results['Corpus BLEU'].append(cb)

# **Results**

In [88]:
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Languages,Avg Sentence BLEU
0,Hindi -> English,0.18558
1,Romanized Hindi -> English,0.224853
2,Hindi + Romanized Hindi -> English,0.180999
