In [51]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [52]:
!pip install transformers datasets peft accelerate bitsandbytes trl safetensors torch --no-cache



In [53]:
import torch
import pandas as pd
from functools import partial
from datasets import Dataset
from transformers import TrainingArguments
import time
import transformers
from sklearn.model_selection import train_test_split
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

column_names = ['K', 'RK', 'E']
# base_path = '/content/drive/My Drive/Colab Notebooks/CSCI 564 NLP'
# df = pd.read_csv(f'{base_path}/hindi_data/romanized_hindi_english_paper.csv', names=column_names,nrows=8000)

path = '/content/drive/MyDrive/Project_544/data/korean_data/korean_romanized_english.csv'

df = pd.read_csv(path, names=column_names, nrows=7000)

In [54]:
# Hugging Face model name
#model_name = "microsoft/phi-2" # not the larger version - need to look into this
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
use_flash_attention = False


# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # quantization_config=bnb_config,
    use_cache=False,
    use_flash_attention_2=use_flash_attention,
    device_map="auto",
    torch_dtype=torch.float16
)

model.config.pretraining_tp = 1

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [55]:
# LoRA config based on QLoRA paper
peft_config = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.1,
    r=16,
    bias="none",
    task_type="CAUSAL_LM",
)
# Prepare model for training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

In [56]:
# for RK to E
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(val_df, test_size=0.5, random_state=42)

In [57]:
# for K to E
train_df2, val_df2 = train_df.copy(), val_df.copy()
val_df2, test_df2 = val_df.copy(), test_df.copy()

In [58]:
# for RK+K to E
train_df3, val_df3 = train_df.copy(), val_df.copy()
val_df3, test_df3 = val_df.copy(), test_df.copy()

In [59]:
results = {'Languages': [],
           'Avg Bert' : []}
           #'Corpus BLEU' : []}

# **Korean to English**

In [60]:
train_df, val_df = train_df, val_df
val_df, test_df = val_df, test_df

In [61]:
def create_json_record(row, prefix):
    return {
        "input_ids": f"{prefix}_{row.name}",
        "text": f"Translate the following sentences from Korean to English. The output should be in English and no other language. \nKorean: {row['K']} \nEnglish: {row['E']}"
    }
def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenizing a batch
    """
    print(batch)
    return tokenizer(
        batch["text"],
        max_length=max_length,
        truncation=True,
    )


_preprocessing_function = partial(preprocess_batch, max_length=1024, tokenizer=tokenizer)
# Convert train_df and val_df into JSON format
# train_dataset = [create_json_record(row, "train") for _, row in train_df.iterrows()]
# val_dataset = [create_json_record(row, "val") for _, row in val_df.iterrows()]


def create_prompt(row):
    return f"Translate the following sentences from Korean to English. The output should be in English and no other language.\nKorean: {row['K']} \nEnglish: {row['E']}"

# Apply the function to each row and create a new 'text' column
train_df['text'] = train_df.apply(create_prompt, axis=1)
val_df['text'] = val_df.apply(create_prompt, axis=1)

columns_to_drop = ['K', 'RK','E']
train_df.drop(columns=columns_to_drop, inplace=True)
val_df.drop(columns=columns_to_drop, inplace=True)

train_dataset = Dataset.from_pandas(train_df)
val_dataset= Dataset.from_pandas(val_df)


In [62]:
train_dataset = train_dataset.map(
        _preprocessing_function,
        batched=True,
    )

Map:   0%|          | 0/3651 [00:00<?, ? examples/s]

{'text': ['Translate the following sentences from Korean to English. The output should be in English and no other language.\nKorean: 스트레스가 확 풀렸겠다. \nEnglish: Your stress must have been relieved a lot.', 'Translate the following sentences from Korean to English. The output should be in English and no other language.\nKorean: 글쎄요. 저는 청소기를 쓰지 않고 직접 쓸고 닦아요. \nEnglish: Well, I personally sweep and mop without using a vacuum cleaner.', 'Translate the following sentences from Korean to English. The output should be in English and no other language.\nKorean: 너 토마토에 소금 뿌려 먹는 거야? \nEnglish: Are you eating tomatoes with salt?', 'Translate the following sentences from Korean to English. The output should be in English and no other language.\nKorean: 선배들이 외국어를 잘하니까 걱정하지 마세요. \nEnglish: The seniors are all fluent in foreign languages so do not worry about that.', 'Translate the following sentences from Korean to English. The output should be in English and no other language.\nKorean: 이삼일에 한번 물을 주고 햇

In [63]:
val_dataset = val_dataset.map(
        _preprocessing_function,
    batched=True,
    )

Map:   0%|          | 0/456 [00:00<?, ? examples/s]



In [64]:
base_path = "/content/drive/My Drive/CSCI544ProjOutput/"
output_dir = f'{base_path}/peft-dialogue-summary-training-{str(int(time.time()))}'
peft_training_args = TrainingArguments(
    output_dir = output_dir,
    warmup_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    max_steps=50,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_steps=25,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=25,
    evaluation_strategy="steps",
    eval_steps=25,
    do_eval=True,
    gradient_checkpointing=False,
    report_to="none",
    overwrite_output_dir = 'True',
    group_by_length=True,
)

model.config.use_cache = False

peft_trainer = transformers.Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=peft_training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [65]:
print(type(train_dataset['text'][0]))

<class 'str'>


In [66]:
peft_trainer.train()

Step,Training Loss,Validation Loss
25,1.5356,1.197971
50,0.9595,1.102533


TrainOutput(global_step=50, training_loss=1.2475603866577147, metrics={'train_runtime': 120.2521, 'train_samples_per_second': 1.663, 'train_steps_per_second': 0.416, 'total_flos': 112250566471680.0, 'train_loss': 1.2475603866577147, 'epoch': 0.05})

In [67]:
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu

# Assume test_data is a list of tuples: (source_sentence, reference_translation)

sources = test_df['K'].to_list()
references = test_df['E'].to_list()

#sources = [sources[0]]
#references = [references[0]]

s = []
r = []

for i in range(len(sources)):

    inputs = tokenizer(f'''Translate the following sentences from Korean to English. The output should be in English and no other language. \nKorean: {sources[i]} \nEnglish:''', return_tensors="pt",)

    with torch.no_grad():
        outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=20)
        text = tokenizer.batch_decode(outputs)[0]
        #print(text)

    #print(text)
    t = text.split("\nEnglish: ")
    t = t[1]. split("\n")[0]

    """
    print("t")
    print(t)
    print(t.split())
    print(references[i].split())
    print("---")
    """

    s.append(t.split())
    # translations.append(reference.split())
    r.append(references[i].split())  # Note: reference must be a list of lists for corpus_bleu

In [68]:
print(s)
print(r)

[['I', 'have', 'been', 'eating', 'like', 'this', 'for', 'the', 'past', 'two', 'weeks.'], ['You', 'look', 'handsome.'], ['I', 'feel', 'like', 'my', 'headache', 'is', 'getting', 'worse.', "I've", 'been', 'thinking', 'about', 'it', 'lately.'], ['Yes,', 'I', 'know', 'all', 'the', 'songs', 'of', 'the', 'singer.'], ['Yes,', "that's", 'right.', 'A', 'small', 'tree', 'is', 'not', 'safe', 'in', 'the', 'wind.', 'But', 'a', 'big'], ['Yes,', "that's", 'right.', 'The', 'insurance', 'company', 'requires', 'you', 'to', 'submit', 'the', 'documents', 'beforehand.'], ['Thank', 'you.', 'Then,', "let's", 'put', 'the', 'lower', 'part', 'of', 'the', 'chair', 'on', 'the', 'floor.'], ['Yesterday', 'I', 'saw', 'the', 'news.', 'Now', 'we', 'can', 'also', 'go', 'on', 'a', 'trip', 'to', 'the', 'universe.'], ['Please', 'put', 'this', 'in', 'the', 'kitchen.', 'This', 'is', 'our', 'kitchen.'], ['The', 'speed', 'of', 'light', 'is', 'much', 'faster', 'than', 'the', 'speed', 'of', 'sound,', 'so', 'the', 'sound', 'is', 

In [69]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [70]:
# !pip install evaluate

In [71]:
!pip install bert_score



In [72]:
from datasets import load_metric
bertscore_metric = load_metric('bertscore')

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [73]:
hypothesis = [' '.join(i) for i in s]
reference = [' '.join(i) for i in r]

print(len(hypothesis))
print(len(reference))
filtered_data = [(h, r) for h, r in zip(hypothesis, reference) if h and r]

# If you want to separate the filtered data into hypothesis and reference lists again
filtered_hypothesis = [pair[0] for pair in filtered_data]
filtered_reference = [pair[1] for pair in filtered_data]
print(len(filtered_hypothesis))
print(len(filtered_reference))
bert_scores = bertscore_metric.compute(predictions=filtered_hypothesis, references=filtered_reference, lang="en")
bert_results = [round(v, 4) for v in bert_scores["f1"]]
avg = sum(bert_results)/len(bert_results)
avg


457
457
457
457


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0.9074989059080966

In [74]:
avg

0.9074989059080966

In [75]:
results['Languages'].append('Korean -> English')
results['Avg Bert'].append(avg)
print(avg)

0.9074989059080966


#**Romanized Korean to English**

In [76]:
train_df, val_df = train_df2, val_df2
val_df, test_df = val_df2, test_df2

In [77]:
def create_json_record(row, prefix):
    return {
        "input_ids": f"{prefix}_{row.name}",
        "text": f"Translate the following sentences from Korean to English. The output should be in English and no other language. \nKorean Transliterated: {row['RK']} \nEnglish: {row['E']}"
    }
def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenizing a batch
    """
    print(batch)
    return tokenizer(
        batch["text"],
        max_length=max_length,
        truncation=True,
    )


_preprocessing_function = partial(preprocess_batch, max_length=1024, tokenizer=tokenizer)
# Convert train_df and val_df into JSON format
# train_dataset = [create_json_record(row, "train") for _, row in train_df.iterrows()]
# val_dataset = [create_json_record(row, "val") for _, row in val_df.iterrows()]


def create_prompt(row):
    return f"Translate the following sentences from Korean to English. The output should be in English and no other language.\nKorean Transliterated: {row['RK']} ]\nEnglish: {row['E']}"

# Apply the function to each row and create a new 'text' column
train_df['text'] = train_df.apply(create_prompt, axis=1)
val_df['text'] = val_df.apply(create_prompt, axis=1)

columns_to_drop = ['K', 'RK','E']
train_df.drop(columns=columns_to_drop, inplace=True)
val_df.drop(columns=columns_to_drop, inplace=True)

train_dataset = Dataset.from_pandas(train_df)
val_dataset= Dataset.from_pandas(val_df)


In [78]:
train_dataset = train_dataset.map(
        _preprocessing_function,
        batched=True,
    )

Map:   0%|          | 0/3651 [00:00<?, ? examples/s]

{'text': ['Translate the following sentences from Korean to English. The output should be in English and no other language.\nKorean Transliterated: seuteureseuga hwak pulryeotgetda. ]\nEnglish: Your stress must have been relieved a lot.', 'Translate the following sentences from Korean to English. The output should be in English and no other language.\nKorean Transliterated: geulsseyo. jeoneun cheongsogireul sseuji anko jikjeop sseulgo dakkayo. ]\nEnglish: Well, I personally sweep and mop without using a vacuum cleaner.', 'Translate the following sentences from Korean to English. The output should be in English and no other language.\nKorean Transliterated: neo tomatoe sogeum ppuryeo meokneun geoya? ]\nEnglish: Are you eating tomatoes with salt?', 'Translate the following sentences from Korean to English. The output should be in English and no other language.\nKorean Transliterated: seonbaedeuri oegugeoreul jalhanikka geokjeonghaji maseyo. ]\nEnglish: The seniors are all fluent in forei

In [79]:
val_dataset = val_dataset.map(
        _preprocessing_function,
    batched=True,
    )

Map:   0%|          | 0/456 [00:00<?, ? examples/s]



In [80]:
base_path = "/content/drive/My Drive/CSCI544ProjOutput/"
output_dir = f'{base_path}/peft-dialogue-summary-training-{str(int(time.time()))}'
peft_training_args = TrainingArguments(
    output_dir = output_dir,
    warmup_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    max_steps=50,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_steps=25,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=25,
    evaluation_strategy="steps",
    eval_steps=25,
    do_eval=True,
    gradient_checkpointing=False,
    report_to="none",
    overwrite_output_dir = 'True',
    group_by_length=True,
)

model.config.use_cache = False

peft_trainer = transformers.Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=peft_training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [81]:
print(type(train_dataset['text'][0]))

<class 'str'>


In [82]:
peft_trainer.train()

Step,Training Loss,Validation Loss
25,2.8261,2.369986
50,1.8005,2.336595


TrainOutput(global_step=50, training_loss=2.3133017730712893, metrics={'train_runtime': 104.4842, 'train_samples_per_second': 1.914, 'train_steps_per_second': 0.479, 'total_flos': 96045926645760.0, 'train_loss': 2.3133017730712893, 'epoch': 0.05})

In [83]:
# model = AutoModelForCausalLM.from_pretrained('/content/drive/My Drive/peft-dialogue-summary-training-1712294967/checkpoint-375')
# tokenizer = AutoTokenizer.from_pretrained('/content/drive/My Drive/CSCI544ProjOutput/peft-dialogue-summary-training-1712294967/checkpoint-375',trust_remote_code=True)

In [84]:
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu

# Assume test_data is a list of tuples: (source_sentence, reference_translation)

sources = test_df['RK'].to_list()
references = test_df['E'].to_list()

#sources = [sources[0]]
#references = [references[0]]

s = []
r = []

for i in range(len(sources)):

    inputs = tokenizer(f'''Translate the following sentences from Korean to English. The output should be in English and no other language. \nKorean Transliterated: {sources[i]} \nEnglish:''', return_tensors="pt",)

    with torch.no_grad():
        outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=20)
        text = tokenizer.batch_decode(outputs)[0]
        #print(text)

    #print(text)
    t = text.split("\nEnglish: ")
    t = t[1]. split("\n")[0]

    """
    print("t")
    print(t)
    print(t.split())
    print(references[i].split())
    print("---")
    """

    s.append(t.split())
    # translations.append(reference.split())
    r.append(references[i].split())  # Note: reference must be a list of lists for corpus_bleu

In [85]:
print(s)
print(r)

[["I'll", 'go', 'to', 'the', 'park', 'tomorrow.'], ['Yes.'], ["I'm", 'going', 'to', 'the', 'park', 'today.', "I'm", 'going', 'to', 'go', 'to', 'the', 'park.'], ['Can', 'you', 'tell', 'me', 'the', 'price', 'of', 'the', 'coffee?'], ['Yes,', 'I', 'have', 'a', 'lot', 'of', 'things', 'to', 'do.', 'I', 'have', 'to', 'go', 'to', 'the', 'store', 'and', 'buy'], ['Yes,', 'I', 'have', 'a', 'lot', 'of', 'things', 'to', 'do.', 'I', 'have', 'to', 'go', 'to', 'the', 'store', 'and', 'buy'], ['Yes,', "I'll", 'go', 'to', 'the', 'park.', "I'll", 'be', 'there', 'in', 'a', 'few', 'minutes.'], ['Can', 'you', 'tell', 'me', 'the', 'price', 'of', 'the', 'book?', 'I', 'want', 'to', 'buy', 'it.'], ["I'm", 'going', 'to', 'the', 'park.', "I'll", 'go', 'to', 'the', 'park.'], ['I', 'have', 'a', 'lot', 'of', 'things', 'to', 'do.', 'I', 'have', 'to', 'go', 'to', 'the', 'store,', 'buy', 'some', 'things'], ["I'm", 'going', 'to', 'the', 'store', 'to', 'buy', 'some', 'food.', "I'll", 'buy', 'some', 'vegetables', 'and'], [

In [86]:
hypothesis = [' '.join(i) for i in s]
reference = [' '.join(i) for i in r]

print(len(hypothesis))
print(len(reference))
filtered_data = [(h, r) for h, r in zip(hypothesis, reference) if h and r]

# If you want to separate the filtered data into hypothesis and reference lists again
filtered_hypothesis = [pair[0] for pair in filtered_data]
filtered_reference = [pair[1] for pair in filtered_data]
print(len(filtered_hypothesis))
print(len(filtered_reference))
bert_scores = bertscore_metric.compute(predictions=filtered_hypothesis, references=filtered_reference, lang="en")
bert_results = [round(v, 4) for v in bert_scores["f1"]]
avg = sum(bert_results)/len(bert_results)
avg

457
457
457
457


0.8704380743982495

In [87]:
results['Languages'].append('Romanized Korean -> English')
results['Avg Bert'].append(avg)
print(avg)

0.8704380743982495


# **RK + H to English**

In [88]:
train_df, val_df = train_df3, val_df3
val_df, test_df = val_df3, test_df3

In [89]:
def create_json_record(row, prefix):
    return {
        "input_ids": f"{prefix}_{row.name}",
        "text": f"Translate the following sentences from Korean to English. The output should be in English and no other language. \nKorean Transliterated: {row['K']} {row['RK']} \nEnglish: {row['E']}"
    }
def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenizing a batch
    """
    print(batch)
    return tokenizer(
        batch["text"],
        max_length=max_length,
        truncation=True,
    )


_preprocessing_function = partial(preprocess_batch, max_length=1024, tokenizer=tokenizer)
# Convert train_df and val_df into JSON format
# train_dataset = [create_json_record(row, "train") for _, row in train_df.iterrows()]
# val_dataset = [create_json_record(row, "val") for _, row in val_df.iterrows()]


def create_prompt(row):
    return f"Translate the following sentences from Korean to English. The output should be in English and no other language. \nKorean: {row['K']} {row['RK']} \nEnglish: {row['E']}"

# Apply the function to each row and create a new 'text' column
train_df['text'] = train_df.apply(create_prompt, axis=1)
val_df['text'] = val_df.apply(create_prompt, axis=1)

columns_to_drop = ['K', 'RK','E']
train_df.drop(columns=columns_to_drop, inplace=True)
val_df.drop(columns=columns_to_drop, inplace=True)

train_dataset = Dataset.from_pandas(train_df)
val_dataset= Dataset.from_pandas(val_df)


In [90]:
train_dataset = train_dataset.map(
        _preprocessing_function,
        batched=True,
    )

Map:   0%|          | 0/3651 [00:00<?, ? examples/s]

{'text': ['Translate the following sentences from Korean to English. The output should be in English and no other language. \nKorean: 스트레스가 확 풀렸겠다. seuteureseuga hwak pulryeotgetda. \nEnglish: Your stress must have been relieved a lot.', 'Translate the following sentences from Korean to English. The output should be in English and no other language. \nKorean: 글쎄요. 저는 청소기를 쓰지 않고 직접 쓸고 닦아요. geulsseyo. jeoneun cheongsogireul sseuji anko jikjeop sseulgo dakkayo. \nEnglish: Well, I personally sweep and mop without using a vacuum cleaner.', 'Translate the following sentences from Korean to English. The output should be in English and no other language. \nKorean: 너 토마토에 소금 뿌려 먹는 거야? neo tomatoe sogeum ppuryeo meokneun geoya? \nEnglish: Are you eating tomatoes with salt?', 'Translate the following sentences from Korean to English. The output should be in English and no other language. \nKorean: 선배들이 외국어를 잘하니까 걱정하지 마세요. seonbaedeuri oegugeoreul jalhanikka geokjeonghaji maseyo. \nEnglish: The se

In [91]:
val_dataset = val_dataset.map(
        _preprocessing_function,
    batched=True,
    )

Map:   0%|          | 0/456 [00:00<?, ? examples/s]



In [92]:
base_path = "/content/drive/My Drive/CSCI544ProjOutput/"
output_dir = f'{base_path}/peft-dialogue-summary-training-{str(int(time.time()))}'
peft_training_args = TrainingArguments(
    output_dir = output_dir,
    warmup_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    max_steps=50,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_steps=25,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=25,
    evaluation_strategy="steps",
    eval_steps=25,
    do_eval=True,
    gradient_checkpointing=False,
    report_to="none",
    overwrite_output_dir = 'True',
    group_by_length=True,
)

model.config.use_cache = False

peft_trainer = transformers.Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=peft_training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [93]:
print(type(train_dataset['text'][0]))

<class 'str'>


In [94]:
peft_trainer.train()

Step,Training Loss,Validation Loss
25,1.9375,1.734475
50,1.3587,1.719567


TrainOutput(global_step=50, training_loss=1.6481247711181641, metrics={'train_runtime': 154.1132, 'train_samples_per_second': 1.298, 'train_steps_per_second': 0.324, 'total_flos': 146588229365760.0, 'train_loss': 1.6481247711181641, 'epoch': 0.05})

In [95]:
# model = AutoModelForCausalLM.from_pretrained('/content/drive/My Drive/peft-dialogue-summary-training-1712294967/checkpoint-375')
# tokenizer = AutoTokenizer.from_pretrained('/content/drive/My Drive/CSCI544ProjOutput/peft-dialogue-summary-training-1712294967/checkpoint-375',trust_remote_code=True)

In [96]:
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu

# Assume test_data is a list of tuples: (source_sentence, reference_translation)

rk = test_df['RK'].to_list()
k = test_df['K'].to_list()
references = test_df['E'].to_list()

#sources = [sources[0]]
#references = [references[0]]

s = []
r = []

for i in range(len(rk)):

    inputs = tokenizer(f'''Translate the following sentences from Korean to English. The output should be in English and no other language. \nKorean: {k[i]} {rk[i]} \nEnglish: ''', return_tensors="pt",)

    with torch.no_grad():
        outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=20)
        text = tokenizer.batch_decode(outputs)[0]
        #print(text)

    #print(text)
    t = text.split("\nEnglish: ")
    t = t[1]. split("\n")[0]

    """
    print("t")
    print(t)
    print(t.split())
    print(references[i].split())
    print("---")
    """

    s.append(t.split())
    # translations.append(reference.split())
    r.append(references[i].split())  # Note: reference must be a list of lists for corpus_bleu

In [97]:
hypothesis = [' '.join(i) for i in s]
reference = [' '.join(i) for i in r]

print(len(hypothesis))
print(len(reference))
filtered_data = [(h, r) for h, r in zip(hypothesis, reference) if h and r]

# If you want to separate the filtered data into hypothesis and reference lists again
filtered_hypothesis = [pair[0] for pair in filtered_data]
filtered_reference = [pair[1] for pair in filtered_data]
print(len(filtered_hypothesis))
print(len(filtered_reference))
bert_scores = bertscore_metric.compute(predictions=filtered_hypothesis, references=filtered_reference, lang="en")
bert_results = [round(v, 4) for v in bert_scores["f1"]]
avg = sum(bert_results)/len(bert_results)
avg

457
457
457
457


0.9028873085339176

In [98]:
results['Languages'].append('Korean + Romanized Korean -> English')
results['Avg Bert'].append(avg)
print(avg)

0.9028873085339176


# **Results**

In [99]:
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Languages,Avg Bert
0,Korean -> English,0.907499
1,Romanized Korean -> English,0.870438
2,Korean + Romanized Korean -> English,0.902887
