In [1]:
# Mount Google Drive at /content/drive; later cells read files from
# ./drive/MyDrive/... relative to the working directory.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install --upgrade openai
!pip install transformers==4.16.0
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.27.4-py3-none-any.whl (70 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.3/70.3 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
Collecting aiosignal>=1.1.2
  Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)
Collecting async-timeout<5.0,>=4.0.0a3
  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Collecting yarl<2.0,>=1.0
  Downloading yarl-1.8.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (264 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m264.6/264.6 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
Collecting multidict<7.0,>=4.5
  Downloading multidict-6.

## Prepare the test dataset that is used for translation by different models

In [3]:
import json
import urllib.request

In [4]:
# Raw GitHub URL of the idiom dataset (filtered.json from the petci repository).
url = 'https://raw.githubusercontent.com/kt2k01/petci/main/data/json/filtered.json'

In [5]:
# Download and parse the dataset. The context manager closes the HTTP
# connection as soon as the body has been read.
with urllib.request.urlopen(url) as response:
  data = json.loads(response.read())

In [6]:
# Build prompt/completion pairs for every idiom: one pair from the 'gold'
# translation (when that key exists) and one from the first 'human'
# translation (when that list is not empty).
training_data = []

for idiom in data:
  chinese = idiom['chinese']
  prompt_text = chinese + '->'

  if 'gold' in idiom:
    gold = idiom['gold']
    training_data.append({'prompt': prompt_text,
                          'completion': ' ' + gold + '\n'})

  if idiom['human'] != []:
    human = idiom['human'][0]
    training_data.append({'prompt': prompt_text,
                          'completion': ' ' + human + '\n'})

In [7]:
import random
# Fixed seed so that the shuffle below is repeatable.
random.seed(10)

# Shuffles training_data in place.
random.shuffle(training_data)

# The first 80% of the shuffled entries form the provisional training portion.
len_train = int(len(training_data )* 0.8)
train_data = training_data[:len_train]
# Test set: every entry that does not compare equal to some training entry.
# NOTE(review): membership is tested by dict equality, so an entry in the last
# 20% that duplicates a training entry is dropped too, and each `not in` is a
# linear scan of the list.
test_data = [x for x in training_data if x not in train_data]
# Validation set: the first 20% of the provisional training portion, which is
# then removed from train_data (again by equality).
validation_data = train_data[:int(len_train * 0.2)]
train_data = [x for x in train_data if x not in validation_data]

In [8]:
from collections import Counter

# Collect the test entries that were built from a 'gold' translation, together
# with every reference translation available for the same idiom.
test_gold_set = []
test_all = []

# Count the test entries by (chinese, translation). prompt[:-2] drops the
# trailing '->' and strip() removes the ' ' / '\n' wrapped around the
# completion. The hashed lookup replaces a full scan of test_data for every
# idiom; keeping the count yields one output row per matching test entry.
test_pair_counts = Counter(
    (test['prompt'][:-2], test['completion'].strip()) for test in test_data
)

for idiom in data:
  if 'gold' not in idiom:
    continue

  for _ in range(test_pair_counts[(idiom['chinese'], idiom['gold'])]):
    test_gold_set.append({'chinese': idiom['chinese'], 'gold': idiom['gold']})

    test_all.append({'chinese': idiom['chinese'],
                     'gold': idiom['gold'],
                     'gold_human': [idiom['gold']] + idiom['human'],
                     'all': [idiom['gold']] + idiom['human'] + idiom['machine']})


In [9]:
# Preview the first ten gold test entries.
test_gold_set[:10]

[{'chinese': '一般见识',
  'gold': 'lower oneself to the same level as somebody else'},
 {'chinese': '一本万利', 'gold': 'make big profits with a small capital'},
 {'chinese': '一抔黃土', 'gold': 'mere dust heaps'},
 {'chinese': '一贫如洗', 'gold': 'penniless'},
 {'chinese': '一毛不拔', 'gold': 'miserly'},
 {'chinese': '一面之交', 'gold': 'have met only once'},
 {'chinese': '一反常态', 'gold': 'act out of character'},
 {'chinese': '一得之功', 'gold': 'just an occasional, minor success'},
 {'chinese': '一塌刮子', 'gold': 'the sum total'},
 {'chinese': '一了百了', 'gold': "death ends all one's troubles"}]

## Run the test set on fine-tuned Davinci model

In [10]:
import openai

In [13]:
# The API key is deliberately not set in this cell, to avoid accidental calls.
# Never paste a key into the notebook: keep it in the OPENAI_API_KEY
# environment variable (or read it with getpass) and assign it at run time.
# NOTE(review): a key that was ever saved in this notebook in plain text should
# be treated as leaked and revoked.
# openai.api_key = os.environ["OPENAI_API_KEY"]

In [None]:
# Look up the fine-tune job by its id and read the name of the model it
# produced; the bare expression at the end displays that name.
response_id = 'ft-B3LZEh45Rcoj1trCQX814uGV'
fine_tuned_model_id = openai.FineTune.retrieve(response_id)['fine_tuned_model']
fine_tuned_model_id

In [None]:
# Translate every gold test idiom with the fine-tuned model.
result_set = []

for idiom in test_gold_set:
  chinese = idiom['chinese']
  gold = idiom['gold']

  # The fine-tuning prompts were built as chinese + '->' and every completion
  # ends with '\n', so send the same separator and stop at the first newline.
  # NOTE(review): the sampling temperature is left at the API default; consider
  # temperature=0 if the evaluation has to be repeatable.
  response = openai.Completion.create(
      model=fine_tuned_model_id,
      prompt=chinese + '->',
      max_tokens=20,
      stop=['\n']
  )

  # With the separator in the prompt the completion is only the translation,
  # so trim surrounding whitespace instead of cutting a fixed number of
  # leading characters.
  output_text = response.choices[0].text
  clean_output = output_text.split("\n", 1)[0].strip()

  result_set.append({'chinese': chinese, 'gold': gold, 'davinci': clean_output})

In [None]:
# Preview the first five Davinci results.
result_set[:5]

In [None]:
# Write the Davinci results as JSON Lines: one JSON object per line.
file_name = "test_result.jsonl"

with open(file_name, "w") as output_file:
  output_file.writelines(json.dumps(entry) + "\n" for entry in result_set)

## Run test on M2M100 Model

To load the fine-tuned model:
1. Copy all files from [M2M_fine_tuned](https://drive.google.com/drive/folders/1U1OQ9qpN0Rl8Dt-BXpVhRrsEAdusB8-F?usp=share_link) to your Google Drive.
2. Update `M2M_model_path` in the next cell so that it points to that folder in your Drive. In our example, the model is stored in the /CSC413Final/M2MModel folder of our Drive.

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


# Folder on the mounted Drive that holds the fine-tuned M2M100 files.
M2M_model_path = './drive/MyDrive/CSC413Final/M2MModel'
# Load the tokenizer and the seq2seq model from that folder. Both names are
# read as notebook-level globals by translate_sentences.
tokenizer = AutoTokenizer.from_pretrained(M2M_model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(M2M_model_path)

In [None]:
def translate_sentences(sentences):
    """Translate the given text to English with the loaded seq2seq model.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        sentences: text handed straight to the tokenizer (the caller in this
            notebook passes one string per call).

    Returns:
        The list of strings produced by ``tokenizer.batch_decode``.
    """
    # NOTE(review): the source language is never set on the tokenizer here;
    # confirm that the saved tokenizer config already targets Chinese input.
    batch = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

    # Run generation on the model's device and force the first generated token
    # to be the English language id.
    generated_ids = model.generate(
        input_ids=batch['input_ids'].to(model.device),
        attention_mask=batch['attention_mask'].to(model.device),
        forced_bos_token_id=tokenizer.get_lang_id("en"),
    )

    # Special tokens are dropped while converting the ids back to text.
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

In [None]:
from tqdm import tqdm

# Translate each gold test idiom with M2M100, one idiom per call, and keep the
# source, the model output and the gold reference side by side.
M2M100_results = []
for idiom in tqdm(test_gold_set):
  source_text = idiom['chinese']
  M2M100_results.append({
      'chinese': source_text,
      'm2m100': translate_sentences(source_text)[0],
      'gold': idiom['gold'],
  })

M2M100_results[:10]

In [None]:
# Write the M2M100 results as JSON Lines: one JSON object per line.
file_name = "m2m_result.jsonl"

with open(file_name, "w") as output_file:
  for entry in M2M100_results:
    print(json.dumps(entry), file=output_file)

## Calculating Metrics for Performance

In [None]:
import nltk
from google.colab import files
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
# Fetch the WordNet corpus — presumably needed by meteor_score for synonym
# matching; confirm against the NLTK documentation.
nltk.download('wordnet')

In [None]:
import re

# Matches any single character that is NOT an ASCII letter, whitespace, or one
# of the punctuation marks . , ! ? ; : ' " -
# Written as a raw string so that \- and \s reach the regex engine untouched
# instead of being parsed as (invalid) Python string escapes, and compiled
# once rather than on every call.
_NON_ENGLISH_CHAR = re.compile(r"[^a-zA-Z.,!?;:'\"\-\s]")


def is_english(text):
    """Return True when ``text`` holds only ASCII letters, whitespace and
    basic punctuation (period, comma, !, ?, ;, :, quotes, hyphen).

    Digits and every other symbol make the result False. An empty string is
    reported as True because it contains no disallowed character.
    """
    return _NON_ENGLISH_CHAR.search(text) is None

### Davinci

In [None]:
# Location on the mounted Drive of the saved Davinci outputs.
test_result_jsonl = './drive/MyDrive/CSC413Final/davinci/test_result.jsonl'

In [None]:
def load_jsonl(input_path) -> list:
    """
    Read a list of objects from a JSON Lines file.

    Args:
        input_path: path of a UTF-8 text file with one JSON document per line.

    Returns:
        The parsed objects in file order. Blank lines are skipped.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    records = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Skip blank lines (for example a trailing empty line) instead of
            # letting json.loads fail on an empty string.
            if not line:
                continue
            records.append(json.loads(line))
    print('Loaded {} records from {}'.format(len(records), input_path))
    return records

In [None]:
# Reload the saved Davinci outputs; this rebinds result_set to the file contents.
result_set = load_jsonl(test_result_jsonl)

In [None]:
# Sentence-level BLEU / METEOR of the Davinci outputs against three reference
# sets: gold only, gold + human, and gold + human + machine.
davinci_scores = {'BLEU_gold': [], 'BLEU_gold_human': [],
                  'BLEU_all': [],
                  'METEOR_gold': [], 'METEOR_gold_human': [],
                  'METEOR_all': []}

# NOTE(review): result_set[index] is paired with test_all[index] purely by
# position. This assumes the results file was produced from the same
# test_gold_set ordering — confirm before trusting the multi-reference scores.
for index, sentence in enumerate(result_set):
  # Outputs with characters outside is_english's whitelist are not scored.
  if not is_english(sentence['davinci']):
    continue

  davinci_sentence = sentence['davinci'].split()

  # Whitespace-tokenized references, keyed by the suffix used in the score
  # names. The third set is deliberately not called `all`, so the builtin
  # all() stays usable in later cells.
  reference_sets = {
      'gold': [sentence['gold'].split()],
      'gold_human': [x.split() for x in test_all[index]['gold_human']],
      'all': [x.split() for x in test_all[index]['all']],
  }

  for suffix, references in reference_sets.items():
    davinci_scores['BLEU_' + suffix].append(
        sentence_bleu(references, davinci_sentence))
    davinci_scores['METEOR_' + suffix].append(
        meteor_score(references, davinci_sentence))


In [None]:
import numpy as np
# Mean and median (0.5 quantile) of the sentence-level BLEU against gold.
print("mean score for BLEU_gold: ", np.mean(davinci_scores['BLEU_gold']))
print("median score for BLEU_gold: ", np.quantile(davinci_scores['BLEU_gold'], 0.5))

In [None]:
# Mean and median (0.5 quantile) of BLEU against the gold + human references.
print("mean score for BLEU_gold_human: ", np.mean(davinci_scores['BLEU_gold_human']))
print("median score for BLEU_gold_human: ", np.quantile(davinci_scores['BLEU_gold_human'], 0.5))

In [None]:
# Mean and median (0.5 quantile) of BLEU against all references.
print("mean score for BLEU_all: ", np.mean(davinci_scores['BLEU_all']))
print("median score for BLEU_all: ", np.quantile(davinci_scores['BLEU_all'], 0.5))

In [None]:
# Mean and median (0.5 quantile) of METEOR against the gold reference.
print("mean score for METEOR_gold: ", np.mean(davinci_scores['METEOR_gold']))
print("median score for METEOR_gold: ", np.quantile(davinci_scores['METEOR_gold'], 0.5))

In [None]:
# Mean and median (0.5 quantile) of METEOR against the gold + human references.
print("mean score for METEOR_gold_human: ",np.mean(davinci_scores['METEOR_gold_human']))
print("median score for METEOR_gold_human: ", np.quantile(davinci_scores['METEOR_gold_human'], 0.5))

In [None]:
# Mean and median (0.5 quantile) of METEOR against all references.
print("mean score for METEOR_all: ",np.mean(davinci_scores['METEOR_all']))
print("median score for METEOR_all: ", np.quantile(davinci_scores['METEOR_all'], 0.5))

### M2M100

In [None]:
# Reload the saved M2M100 outputs from the mounted Drive.
M2M100_result = load_jsonl('./drive/MyDrive/CSC413Final/M2MModel/m2m_result.jsonl')

In [None]:
# Sentence-level BLEU / METEOR of the M2M100 outputs against three reference
# sets: gold only, gold + human, and gold + human + machine.
m2m100_scores = {'BLEU_gold': [], 'BLEU_gold_human': [],
                 'BLEU_all': [],
                 'METEOR_gold': [], 'METEOR_gold_human': [],
                 'METEOR_all': []}

# Score the records loaded from m2m_result.jsonl (M2M100_result), in the same
# way the Davinci section scores its reloaded results. The in-memory
# M2M100_results list only exists when the inference cell was run in the
# current session, so relying on it fails on a fresh kernel.
# NOTE(review): M2M100_result[index] is paired with test_all[index] purely by
# position — confirm the file was produced from the same test_gold_set order.
for index, sentence in enumerate(M2M100_result):
  # Outputs with characters outside is_english's whitelist are not scored.
  if not is_english(sentence['m2m100']):
    continue

  m2m100_sentence = sentence['m2m100'].split()

  # Whitespace-tokenized references, keyed by the suffix used in the score
  # names. The third set is deliberately not called `all`, so the builtin
  # all() stays usable in later cells.
  reference_sets = {
      'gold': [sentence['gold'].split()],
      'gold_human': [x.split() for x in test_all[index]['gold_human']],
      'all': [x.split() for x in test_all[index]['all']],
  }

  for suffix, references in reference_sets.items():
    m2m100_scores['BLEU_' + suffix].append(
        sentence_bleu(references, m2m100_sentence))
    m2m100_scores['METEOR_' + suffix].append(
        meteor_score(references, m2m100_sentence))


In [None]:
import numpy as np

# Mean and median (0.5 quantile) of every M2M100 metric, in a fixed order.
for metric in ('BLEU_gold', 'BLEU_gold_human', 'BLEU_all',
               'METEOR_gold', 'METEOR_gold_human', 'METEOR_all'):
  print("mean score for " + metric + ": ", np.mean(m2m100_scores[metric]))
  print("median score for " + metric + ": ", np.quantile(m2m100_scores[metric], 0.5))
  # Blank line after each metric except the last one.
  if metric != 'METEOR_all':
    print()

### Comet

In [None]:
# Install the package that provides the `comet` module imported further down.
# NOTE(review): the version is not pinned, so results may differ between runs.
!pip install unbabel-comet

In [None]:
# Reshape the Davinci results into src / mt / ref records for the COMET model,
# keeping only the outputs that pass is_english.
comet_formatted = [
    {'src': sentence['chinese'],
     'mt': sentence['davinci'],
     'ref': sentence['gold']}
    for sentence in result_set
    if is_english(sentence['davinci'])
]

In [None]:
# Reshape the reloaded M2M100 results into src / mt / ref records for the
# COMET model, skipping outputs that fail is_english.
comet_m2m = []
for record in M2M100_result:
  translation = record['m2m100']
  if not is_english(translation):
    continue
  comet_m2m.append(dict(src=record['chinese'], mt=translation, ref=record['gold']))

In [None]:
!pip uninstall transformers
!pip install transformers

In [None]:
from comet import download_model, load_from_checkpoint

# Download the wmt22-comet-da checkpoint and load it.
# NOTE(review): this rebinds the notebook-level name `model`, which
# translate_sentences also reads; re-running the M2M100 translation cells
# after this one would pick up the COMET model instead.
model_path = download_model("Unbabel/wmt22-comet-da")
model = load_from_checkpoint(model_path)

# Score the Davinci translations. gpus=1 presumably requires a GPU runtime.
model_output = model.predict(comet_formatted, batch_size=8, gpus=1)
print(model_output)

In [None]:
# Second element of the Davinci prediction output — presumably the
# corpus-level (system) score; confirm against the COMET documentation.
model_output[1]

In [None]:
# Score the M2M100 translations with the same COMET model; this overwrites the
# model_output produced for Davinci.
model_output = model.predict(comet_m2m, batch_size=8, gpus=1)
print(model_output)

In [None]:
# Second element of the M2M100 prediction output — presumably the
# corpus-level (system) score; confirm against the COMET documentation.
model_output[1]