In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers datasets peft accelerate bitsandbytes trl safetensors torch --no-cache

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.10.0-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m56.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m40.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m71.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl
  Downloading trl-0.8.5-py3-none-any.whl (245 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.1/245.1 kB[0m [31m326.

In [3]:
import torch
import pandas as pd
from functools import partial
from datasets import Dataset
from transformers import TrainingArguments
import time
import transformers
from sklearn.model_selection import train_test_split
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

column_names = ['K', 'RK', 'E']
# base_path = '/content/drive/My Drive/Colab Notebooks/CSCI 564 NLP'
# df = pd.read_csv(f'{base_path}/hindi_data/romanized_hindi_english_paper.csv', names=column_names,nrows=8000)

path = '/content/drive/MyDrive/Project_544/data/korean_data/korean_romanized_english.csv'

df = pd.read_csv(path, names=column_names, nrows=5000)

In [4]:
# Hugging Face model name
#model_name = "microsoft/phi-2" # not the larger version - need to look into this
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
use_flash_attention = False


# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # quantization_config=bnb_config,
    use_cache=False,
    use_flash_attention_2=use_flash_attention,
    device_map="auto",
    torch_dtype=torch.float16
)

model.config.pretraining_tp = 1

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

In [5]:
# LoRA config based on QLoRA paper
peft_config = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.1,
    r=16,
    bias="none",
    task_type="CAUSAL_LM",
)
# Prepare model for training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

In [6]:
# for RH to E
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(val_df, test_size=0.5, random_state=42)

In [7]:
# for H to E
train_df2, val_df2 = train_df.copy(), val_df.copy()
val_df2, test_df2 = val_df.copy(), test_df.copy()

In [8]:
# for RH+H to E
train_df3, val_df3 = train_df.copy(), val_df.copy()
val_df3, test_df3 = val_df.copy(), test_df.copy()

In [9]:
results = {'Languages': [],
           'Avg Meteor' : []}
           #'Corpus BLEU' : []}

# **Korean to English**

In [10]:
train_df, val_df = train_df, val_df
val_df, test_df = val_df, test_df

In [11]:

def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenizing a batch
    """
    print(batch)
    return tokenizer(
        batch["text"],
        max_length=max_length,
        truncation=True,
    )


_preprocessing_function = partial(preprocess_batch, max_length=1024, tokenizer=tokenizer)
# Convert train_df and val_df into JSON format
# train_dataset = [create_json_record(row, "train") for _, row in train_df.iterrows()]
# val_dataset = [create_json_record(row, "val") for _, row in val_df.iterrows()]


def create_prompt(row):
    return f"Translate the following sentences from Korean to English. The output should be in English and no other language.\nKorean: {row['K']} \nEnglish: {row['E']}"

# Apply the function to each row and create a new 'text' column
train_df['text'] = train_df.apply(create_prompt, axis=1)
val_df['text'] = val_df.apply(create_prompt, axis=1)

columns_to_drop = ['K', 'RK','E']
train_df.drop(columns=columns_to_drop, inplace=True)
val_df.drop(columns=columns_to_drop, inplace=True)

train_dataset = Dataset.from_pandas(train_df)
val_dataset= Dataset.from_pandas(val_df)


In [12]:
train_dataset = train_dataset.map(
        _preprocessing_function,
        batched=True,
    )

Map:   0%|          | 0/3651 [00:00<?, ? examples/s]

{'text': ['Translate the following sentences from Korean to English. The output should be in English and no other language.\nKorean: 스트레스가 확 풀렸겠다. \nEnglish: Your stress must have been relieved a lot.', 'Translate the following sentences from Korean to English. The output should be in English and no other language.\nKorean: 글쎄요. 저는 청소기를 쓰지 않고 직접 쓸고 닦아요. \nEnglish: Well, I personally sweep and mop without using a vacuum cleaner.', 'Translate the following sentences from Korean to English. The output should be in English and no other language.\nKorean: 너 토마토에 소금 뿌려 먹는 거야? \nEnglish: Are you eating tomatoes with salt?', 'Translate the following sentences from Korean to English. The output should be in English and no other language.\nKorean: 선배들이 외국어를 잘하니까 걱정하지 마세요. \nEnglish: The seniors are all fluent in foreign languages so do not worry about that.', 'Translate the following sentences from Korean to English. The output should be in English and no other language.\nKorean: 이삼일에 한번 물을 주고 햇

In [13]:
val_dataset = val_dataset.map(
        _preprocessing_function,
    batched=True,
    )

Map:   0%|          | 0/456 [00:00<?, ? examples/s]



In [14]:
base_path = "/content/drive/My Drive/CSCI544ProjOutput/"
output_dir = f'{base_path}/peft-dialogue-summary-training-{str(int(time.time()))}'
peft_training_args = TrainingArguments(
    output_dir = output_dir,
    warmup_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    max_steps=50,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_steps=25,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=25,
    evaluation_strategy="steps",
    eval_steps=25,
    do_eval=True,
    gradient_checkpointing=False,
    report_to="none",
    overwrite_output_dir = 'True',
    group_by_length=True,
)

model.config.use_cache = False

peft_trainer = transformers.Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=peft_training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [15]:
print(type(train_dataset['text'][0]))

<class 'str'>


In [16]:
peft_trainer.train()

Step,Training Loss,Validation Loss
25,1.5371,1.19543
50,0.9558,1.100056


TrainOutput(global_step=50, training_loss=1.246475715637207, metrics={'train_runtime': 103.0775, 'train_samples_per_second': 1.94, 'train_steps_per_second': 0.485, 'total_flos': 112250566471680.0, 'train_loss': 1.246475715637207, 'epoch': 0.05})

In [17]:
# from nltk.translate.bleu_score import corpus_bleu, sentence_bleu

# Assume test_data is a list of tuples: (source_sentence, reference_translation)

sources = test_df['K'].to_list()
references = test_df['E'].to_list()

#sources = [sources[0]]
#references = [references[0]]

s = []
r = []

for i in range(len(sources)):

    inputs = tokenizer(f'''Translate the following sentences from Korean to English. The output should be in English and no other language. \nKorean: {sources[i]} \nEnglish:''', return_tensors="pt",)

    with torch.no_grad():
        outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=20)
        text = tokenizer.batch_decode(outputs)[0]
        #print(text)

    #print(text)
    t = text.split("\nEnglish: ")
    t = t[1]. split("\n")[0]

    """
    print("t")
    print(t)
    print(t.split())
    print(references[i].split())
    print("---")
    """

    s.append(t.split())
    # translations.append(reference.split())
    r.append(references[i].split())  # Note: reference must be a list of lists for corpus_bleu

In [18]:
print(s)
print(r)

[['I', 'have', 'been', 'eating', 'like', 'this', 'for', 'the', 'past', 'two', 'weeks.'], ['You', 'look', 'handsome.'], ['I', 'feel', 'stressed', 'a', 'lot,', 'but', 'my', 'headache', 'is', 'getting', 'worse.', 'Do', 'you', 'have', 'any', 'idea'], ['Yes,', 'I', 'know', 'all', 'the', 'songs', 'of', 'the', 'singer.'], ['Yes,', "that's", 'right.', 'A', 'small', 'tree', 'is', 'not', 'safe', 'in', 'the', 'wind.', 'But', 'a', 'big'], ['Yes,', "that's", 'right.', 'The', 'insurance', 'company', 'requires', 'the', 'documents', 'before', 'submitting', 'them.', 'I'], ['Thank', 'you.', 'Then,', 'I', 'will', 'put', 'the', 'left', 'foot', 'on', 'the', 'ground', 'and', 'step', 'on', 'the', 'right', 'foot'], ['Yesterday', 'I', 'saw', 'the', 'news.', 'Now', 'we', 'can', 'also', 'go', 'on', 'a', 'trip', 'to', 'the', 'universe.'], ['Please', 'put', 'it', 'here.', 'This', 'is', 'our', 'house.'], ['The', 'speed', 'of', 'light', 'is', 'much', 'faster', 'than', 'the', 'speed', 'of', 'sound,', 'so', 'the', 'so

In [19]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [20]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━[0m [32m51.2/84.1 kB[0m [31m1.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0


In [21]:
from evaluate import load
meteor_metric = load('meteor')

Downloading builder script:   0%|          | 0.00/6.93k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [22]:
hypothesis = [' '.join(i) for i in s]
reference = [' '.join(i) for i in r]
print(len(hypothesis))
print(len(reference))
filtered_data = [(h, r) for h, r in zip(hypothesis, reference) if h and r]

# If you want to separate the filtered data into hypothesis and reference lists again
filtered_hypothesis = [pair[0] for pair in filtered_data]
filtered_reference = [pair[1] for pair in filtered_data]
print(len(filtered_hypothesis))
print(len(filtered_reference))
# bleurt_results = bleurt_metric.compute(predictions=hypothesis, references=reference)
meteor_scores = meteor_metric.compute(predictions=filtered_hypothesis, references=filtered_reference)
# meteor_results = [round(v, 4) for v in meteor_scores["meteor"]]
# avg = sum(meteor_results)/len(meteor_results)
avg= meteor_scores["meteor"]

457
457
457
457


In [23]:
avg

0.36388504102856967

In [24]:
results['Languages'].append('Korean -> English')
results['Avg Meteor'].append(avg)
print(avg)

0.36388504102856967


#**Romanized Korean to English**

In [25]:
train_df, val_df = train_df2, val_df2
val_df, test_df = val_df2, test_df2

In [26]:

def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenizing a batch
    """
    print(batch)
    return tokenizer(
        batch["text"],
        max_length=max_length,
        truncation=True,
    )


_preprocessing_function = partial(preprocess_batch, max_length=1024, tokenizer=tokenizer)
# Convert train_df and val_df into JSON format
# train_dataset = [create_json_record(row, "train") for _, row in train_df.iterrows()]
# val_dataset = [create_json_record(row, "val") for _, row in val_df.iterrows()]


def create_prompt(row):
    return f"Translate the following sentences from Korean to English. The output should be in English and no other language.\nKorean: {row['RK']} \nEnglish: {row['E']}"

# Apply the function to each row and create a new 'text' column
train_df['text'] = train_df.apply(create_prompt, axis=1)
val_df['text'] = val_df.apply(create_prompt, axis=1)

columns_to_drop = ['K', 'RK','E']
train_df.drop(columns=columns_to_drop, inplace=True)
val_df.drop(columns=columns_to_drop, inplace=True)

train_dataset = Dataset.from_pandas(train_df)
val_dataset= Dataset.from_pandas(val_df)


In [27]:
train_dataset = train_dataset.map(
        _preprocessing_function,
        batched=True,
    )

Map:   0%|          | 0/3651 [00:00<?, ? examples/s]

{'text': ['Translate the following sentences from Korean to English. The output should be in English and no other language.\nKorean: seuteureseuga hwak pulryeotgetda. \nEnglish: Your stress must have been relieved a lot.', 'Translate the following sentences from Korean to English. The output should be in English and no other language.\nKorean: geulsseyo. jeoneun cheongsogireul sseuji anko jikjeop sseulgo dakkayo. \nEnglish: Well, I personally sweep and mop without using a vacuum cleaner.', 'Translate the following sentences from Korean to English. The output should be in English and no other language.\nKorean: neo tomatoe sogeum ppuryeo meokneun geoya? \nEnglish: Are you eating tomatoes with salt?', 'Translate the following sentences from Korean to English. The output should be in English and no other language.\nKorean: seonbaedeuri oegugeoreul jalhanikka geokjeonghaji maseyo. \nEnglish: The seniors are all fluent in foreign languages so do not worry about that.', 'Translate the follow

In [28]:
val_dataset = val_dataset.map(
        _preprocessing_function,
    batched=True,
    )

Map:   0%|          | 0/456 [00:00<?, ? examples/s]



In [29]:
base_path = "/content/drive/My Drive/CSCI544ProjOutput/"
output_dir = f'{base_path}/peft-dialogue-summary-training-{str(int(time.time()))}'
peft_training_args = TrainingArguments(
    output_dir = output_dir,
    warmup_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    max_steps=50,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_steps=25,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=25,
    evaluation_strategy="steps",
    eval_steps=25,
    do_eval=True,
    gradient_checkpointing=False,
    report_to="none",
    overwrite_output_dir = 'True',
    group_by_length=True,
)

model.config.use_cache = False

peft_trainer = transformers.Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=peft_training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [30]:
print(type(train_dataset['text'][0]))

<class 'str'>


In [31]:
peft_trainer.train()

Step,Training Loss,Validation Loss
25,2.7872,2.43263
50,1.8659,2.412821


TrainOutput(global_step=50, training_loss=2.326535491943359, metrics={'train_runtime': 99.6393, 'train_samples_per_second': 2.007, 'train_steps_per_second': 0.502, 'total_flos': 92313571983360.0, 'train_loss': 2.326535491943359, 'epoch': 0.05})

In [32]:
# model = AutoModelForCausalLM.from_pretrained('/content/drive/My Drive/peft-dialogue-summary-training-1712294967/checkpoint-375')
# tokenizer = AutoTokenizer.from_pretrained('/content/drive/My Drive/CSCI544ProjOutput/peft-dialogue-summary-training-1712294967/checkpoint-375',trust_remote_code=True)

In [33]:
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu

# Assume test_data is a list of tuples: (source_sentence, reference_translation)

sources = test_df['RK'].to_list()
references = test_df['E'].to_list()

#sources = [sources[0]]
#references = [references[0]]

s = []
r = []

for i in range(len(sources)):

    inputs = tokenizer(f'''Translate the following sentences from Korean to English. The output should be in English and no other language. \nKorean: {sources[i]} \nEnglish:''', return_tensors="pt",)

    with torch.no_grad():
        outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=20)
        text = tokenizer.batch_decode(outputs)[0]
        #print(text)

    #print(text)
    t = text.split("\nEnglish: ")
    t = t[1]. split("\n")[0]

    """
    print("t")
    print(t)
    print(t.split())
    print(references[i].split())
    print("---")
    """

    s.append(t.split())
    # translations.append(reference.split())
    r.append(references[i].split())  # Note: reference must be a list of lists for corpus_bleu

In [34]:
print(s)
print(r)

[["I'm", 'going', 'to', 'the', 'park', 'today.'], ['Yes,', "I'm", 'fine.'], ["I'm", 'going', 'to', 'the', 'park', 'today.', "I'm", 'going', 'to', 'go', 'to', 'the', 'park.'], ['Can', 'you', 'tell', 'me', 'how', 'to', 'make', 'a', 'salad?'], ['Yes,', 'I', 'understand.', "I'll", 'go', 'to', 'the', 'store', 'and', 'buy', 'some', 'snacks.'], ['Yes,', "I'll", 'go', 'to', 'the', 'park.', "I'll", 'take', 'my', 'dog', 'there.'], ["I'm", 'sorry.', "I'll", 'make', 'it', 'right.'], ['Can', 'you', 'tell', 'me', 'how', 'to', 'make', 'the', 'dish?', 'I', 'want', 'to', 'try', 'it.'], ["I'm", 'going', 'to', 'the', 'park.', "I'll", 'go', 'to', 'the', 'park.'], ['I', 'have', 'a', 'lot', 'of', 'things', 'to', 'do.', 'I', 'have', 'to', 'go', 'to', 'the', 'store', 'and', 'buy', 'some', 'things'], ["I'm", 'sorry,', 'but', 'I', "don't", 'understand.', "I'm", 'not', 'sure', 'what', 'you', 'mean'], ["I'm", 'going', 'to', 'the', 'market.'], ["I'm", 'going', 'to', 'the', 'park.'], ["I'm", 'going', 'to', 'the', '

In [35]:
hypothesis = [' '.join(i) for i in s]
reference = [' '.join(i) for i in r]
print(len(hypothesis))
print(len(reference))
filtered_data = [(h, r) for h, r in zip(hypothesis, reference) if h and r]

# If you want to separate the filtered data into hypothesis and reference lists again
filtered_hypothesis = [pair[0] for pair in filtered_data]
filtered_reference = [pair[1] for pair in filtered_data]
print(len(filtered_hypothesis))
print(len(filtered_reference))
# bleurt_results = bleurt_metric.compute(predictions=hypothesis, references=reference)
meteor_scores = meteor_metric.compute(predictions=filtered_hypothesis, references=filtered_reference)
# meteor_results = [round(v, 4) for v in meteor_scores["meteor"]]
# avg = sum(meteor_results)/len(meteor_results)
avg= meteor_scores["meteor"]

457
457
457
457


In [36]:
results['Languages'].append('Romanized Korean -> English')
results['Avg Meteor'].append(avg)
print(avg)

0.1479645151059336


# **RK + K to English**

In [37]:
train_df, val_df = train_df3, val_df3
val_df, test_df = val_df3, test_df3

In [38]:

def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenizing a batch
    """
    print(batch)
    return tokenizer(
        batch["text"],
        max_length=max_length,
        truncation=True,
    )


_preprocessing_function = partial(preprocess_batch, max_length=1024, tokenizer=tokenizer)
# Convert train_df and val_df into JSON format
# train_dataset = [create_json_record(row, "train") for _, row in train_df.iterrows()]
# val_dataset = [create_json_record(row, "val") for _, row in val_df.iterrows()]


def create_prompt(row):
    return f"Translate the following sentences from Korean to English. The output should be in English and no other language. \Korean: {row['RK']} \Korean: {row['K']} \nEnglish: {row['E']}"

# Apply the function to each row and create a new 'text' column
train_df['text'] = train_df.apply(create_prompt, axis=1)
val_df['text'] = val_df.apply(create_prompt, axis=1)

columns_to_drop = ['K', 'RK','E']
train_df.drop(columns=columns_to_drop, inplace=True)
val_df.drop(columns=columns_to_drop, inplace=True)

train_dataset = Dataset.from_pandas(train_df)
val_dataset= Dataset.from_pandas(val_df)


In [39]:
train_dataset = train_dataset.map(
        _preprocessing_function,
        batched=True,
    )

Map:   0%|          | 0/3651 [00:00<?, ? examples/s]

{'text': ['Translate the following sentences from Korean to English. The output should be in English and no other language. \\Korean: seuteureseuga hwak pulryeotgetda. \\Korean: 스트레스가 확 풀렸겠다. \nEnglish: Your stress must have been relieved a lot.', 'Translate the following sentences from Korean to English. The output should be in English and no other language. \\Korean: geulsseyo. jeoneun cheongsogireul sseuji anko jikjeop sseulgo dakkayo. \\Korean: 글쎄요. 저는 청소기를 쓰지 않고 직접 쓸고 닦아요. \nEnglish: Well, I personally sweep and mop without using a vacuum cleaner.', 'Translate the following sentences from Korean to English. The output should be in English and no other language. \\Korean: neo tomatoe sogeum ppuryeo meokneun geoya? \\Korean: 너 토마토에 소금 뿌려 먹는 거야? \nEnglish: Are you eating tomatoes with salt?', 'Translate the following sentences from Korean to English. The output should be in English and no other language. \\Korean: seonbaedeuri oegugeoreul jalhanikka geokjeonghaji maseyo. \\Korean: 선배

In [40]:
val_dataset = val_dataset.map(
        _preprocessing_function,
    batched=True,
    )

Map:   0%|          | 0/456 [00:00<?, ? examples/s]



In [41]:
base_path = "/content/drive/My Drive/CSCI544ProjOutput/"
output_dir = f'{base_path}/peft-dialogue-summary-training-{str(int(time.time()))}'
peft_training_args = TrainingArguments(
    output_dir = output_dir,
    warmup_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    max_steps=50,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_steps=25,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=25,
    evaluation_strategy="steps",
    eval_steps=25,
    do_eval=True,
    gradient_checkpointing=False,
    report_to="none",
    overwrite_output_dir = 'True',
    group_by_length=True,
)

model.config.use_cache = False

peft_trainer = transformers.Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=peft_training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [42]:
print(type(train_dataset['text'][0]))

<class 'str'>


In [43]:
peft_trainer.train()

Step,Training Loss,Validation Loss
25,1.9153,1.731445
50,1.3651,1.722954


TrainOutput(global_step=50, training_loss=1.6401774597167968, metrics={'train_runtime': 152.8099, 'train_samples_per_second': 1.309, 'train_steps_per_second': 0.327, 'total_flos': 151564702248960.0, 'train_loss': 1.6401774597167968, 'epoch': 0.05})

In [44]:
# model = AutoModelForCausalLM.from_pretrained('/content/drive/My Drive/peft-dialogue-summary-training-1712294967/checkpoint-375')
# tokenizer = AutoTokenizer.from_pretrained('/content/drive/My Drive/CSCI544ProjOutput/peft-dialogue-summary-training-1712294967/checkpoint-375',trust_remote_code=True)

In [45]:
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu

# Assume test_data is a list of tuples: (source_sentence, reference_translation)

rk = test_df['RK'].to_list()
k = test_df['K'].to_list()
references = test_df['E'].to_list()

#sources = [sources[0]]
#references = [references[0]]

s = []
r = []

for i in range(len(rk)):

    inputs = tokenizer(f'''Translate the following sentences from Hindi to English. The output should be in English and no other language. \nKorean: {rk[i]} \Korean: {k[i]} \nEnglish:''', return_tensors="pt",)

    with torch.no_grad():
        outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=20)
        text = tokenizer.batch_decode(outputs)[0]
        #print(text)

    #print(text)
    t = text.split("\nEnglish: ")
    t = t[1]. split("\n")[0]

    """
    print("t")
    print(t)
    print(t.split())
    print(references[i].split())
    print("---")
    """

    s.append(t.split())
    # translations.append(reference.split())
    r.append(references[i].split())  # Note: reference must be a list of lists for corpus_bleu

In [46]:
hypothesis = [' '.join(i) for i in s]
reference = [' '.join(i) for i in r]
print(len(hypothesis))
print(len(reference))
filtered_data = [(h, r) for h, r in zip(hypothesis, reference) if h and r]

# If you want to separate the filtered data into hypothesis and reference lists again
filtered_hypothesis = [pair[0] for pair in filtered_data]
filtered_reference = [pair[1] for pair in filtered_data]
print(len(filtered_hypothesis))
print(len(filtered_reference))
# bleurt_results = bleurt_metric.compute(predictions=hypothesis, references=reference)
meteor_scores = meteor_metric.compute(predictions=filtered_hypothesis, references=filtered_reference)
# meteor_results = [round(v, 4) for v in meteor_scores["meteor"]]
# avg = sum(meteor_results)/len(meteor_results)
avg= meteor_scores["meteor"]

457
457
457
457


In [47]:
results['Languages'].append('Korean + Romanized Korean -> English')
results['Avg Meteor'].append(avg)
print(avg)

0.3418867010892172


# **Results**

In [48]:
results_df = pd.DataFrame(results)
results_df


Unnamed: 0,Languages,Avg Meteor
0,Korean -> English,0.363885
1,Romanized Korean -> English,0.147965
2,Korean + Romanized Korean -> English,0.341887
