### Importing libraries and initializing dataframes

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
import openai
from sentence_transformers import SentenceTransformer, util
import json
import os
import sys
import glob
from tqdm import tqdm
import argparse
from pathlib import Path
from tenacity import retry, stop_after_attempt, wait_random_exponential
import multiprocessing as mp
from bert_score import score
from pprint import pprint
from torchmetrics.text.bert import BERTScore
import evaluate
from evaluate import load 
from nltk.tokenize import word_tokenize

model = SentenceTransformer('all-MiniLM-L6-v2')

# def cap(x):
#     return x.title()

# new_df = df.applymap(cap)
# print(new_df)
# new_df.to_csv('all_translations_cap.csv',index='False',encoding='utf-8')


### A.  Load new translations into all_translations_scores.csv to prepare for score calculation

In [2]:
try:
    df = pd.read_csv(r'all_translations_scores.csv')
except:
    df = pd.read_csv(r'../outputs/translation_all.csv')

def add_score(translation, file_to, col_name):
    df_trans = pd.read_csv('../outputs/'+translation)
    df_to = pd.read_csv('../score_calculation/'+file_to)
    df_to[col_name] = df_trans[col_name].to_numpy()
    df_to.to_csv('../score_calculation/'+file_to)
# add_score('gpt4_7_new.csv','all_translations_scores.csv','gpt4_7_new')

add_score('gpt4_0_new.csv','all_translations_scores.csv','gpt4_0_new')
add_score('gpt4_1_new.csv','all_translations_scores.csv','gpt4_1_new')
add_score('gpt4_2_new.csv','all_translations_scores.csv','gpt4_2_new')
add_score('gpt4_3_new.csv','all_translations_scores.csv','gpt4_3_new')
add_score('gpt4_4_new.csv','all_translations_scores.csv','gpt4_4_new')
add_score('gpt4_5_new.csv','all_translations_scores.csv','gpt4_5_new')
add_score('gpt4_6_new.csv','all_translations_scores.csv','gpt4_6_new')
add_score('gpt4_8_new.csv','all_translations_scores.csv','gpt4_8_new')
add_score('gpt4_9_new.csv','all_translations_scores.csv','gpt4_9_new')
add_score('gpt4_10_new.csv','all_translations_scores.csv','gpt4_10_new')
add_score('gpt4_11_new.csv','all_translations_scores.csv','gpt4_11_new')

### B. helper functions for various score calculation

#### B.1 Calculating consine similarity using transformer model

In [3]:

# Two lists of sentences
def semantic_compare(data, filename="all_translations_scores.csv", key1 = "English_Name", key2 = "gpt35_translation"):
    
    names1 = data[key1] #"English_Name"

    names2 = data[key2] #chatGPT_translation
#     print("names1 dtype", names1.dtypes)

    #Compute embedding for both lists
    embeddings1 = model.encode(names1, convert_to_tensor=True)
    embeddings2 = model.encode(names2, convert_to_tensor=True)

    #Compute cosine-similarities
    cosine_scores = util.cos_sim(embeddings1, embeddings2)
    scores = []
    #Output the pairs with their score
    for i in range(len(names1)):
        cos = cosine_scores[i][i]
        scores.append("{:.4f}".format(cos))
    #     print("{} \t\t {} \t\t Score: {:.4f}".format(names1[i], names2[i], cos))
    key = key1+key2+'consine'
    data[key] = scores
    data.to_csv(filename, index=False,encoding='utf-8')
    data[key] = pd.to_numeric(data[key], downcast='float')
    print("Naming semantics similarity:",data[key].mean())

# try:
#     df = pd.read_csv(r'all_translations_scores.csv')
# except:
#     df = pd.read_csv(r'../outputs/translation_all.csv')

# semantic_compare(df,"all_translations_scores.csv",'nanxi_translation')
# semantic_compare(df,"all_translations_scores.csv",'rukun_translation')
# semantic_compare(df,"all_translations_scores.csv",'nanxi_translation','gpt4_translation')
# semantic_compare(df,"all_translations_scores.csv",'rukun_translation','gpt4_translation')
# semantic_compare(df,"all_translations_scores.csv",'rukun_translation','gpt4_0')
# semantic_compare(df,"all_translations_scores.csv",'nanxi_translation','gpt4_0')
# semantic_compare(CH_df,"rural_embedding_translation_literal.csv",key2 = "Literal_Translation")
# semantic_compare(CH_urban_df,"urban_embedding_translation_literal.csv",key2 = "Literal_Translation")

#### B.2 Evaluating text similarities with BERTScore

In [4]:

# When you are running this cell for the first time, 
# it will download the BERT model which will take relatively longer. 
# P, R, F1 = score(cands, refs, lang="en", verbose=True)
# print(f"System level F1 score: {F1.mean():.3f}")

def BERT_compare(data, filename,key1 = "English_Name", key2 = "gpt35_translation"):
    
    names1 = data[key1].tolist() #"English_Name"

    names2 = data[key2].tolist() #chatGPT_translation
  
    bertscore = BERTScore()
    # printing F1 scores
    scores = bertscore(names1, names2)
    F1 = scores['f1'].tolist()
    precision = scores['precision'].tolist()
    recall = scores['recall'].tolist()
    data[key1+key2+'BERT_F1'] = F1
    data[key1+key2+'BERT_precision'] = precision
    data[key1+key2+'BERT_recall'] = recall
    data.to_csv(filename, index=False,encoding='utf-8')
    # printing mean F1 scores and all the other scores
#     print(f"System level F1 score: {F1.mean():.3f}")
#     pprint(bertscore(names1, names2))
# df = pd.read_csv(r'all_translations_scores.csv')
# BERT_compare(df,"all_translations_scores.csv",'nanxi_translation')
# BERT_compare(df,"all_translations_scores.csv",'rukun_translation')
# BERT_compare(df,"all_translations_scores.csv",'nanxi_translation','gpt4_translation')
# BERT_compare(df,"all_translations_scores.csv",'rukun_translation','gpt4_translation')
# BERT_compare(df,"all_translations_scores.csv",'nanxi_translation','gpt4_0')
# BERT_compare(df,"all_translations_scores.csv",'rukun_translation','gpt4_0')
# !pip install torchmetrics
# from pprint import pprint
# from torchmetrics.text.bert import BERTScore
# preds = ["hello there", "general kenobi"]
# target = ["hello there", "master kenobi"]
# bertscore = BERTScore()
# pprint(bertscore(preds, target))



#### B.3 Evaluating text similarities with ROUGE

In [5]:

# source: https://huggingface.co/spaces/evaluate-metric/rouge

def ROUGE_compare(data, filename, key1 = "name", key2 = "gpt35_translation"):
    rouge = evaluate.load('rouge')
    references = [x.lower() for x in data[key1].tolist()] #"English_Name"
    predictions = [x.lower() for x in data[key2].tolist()] #chatGPT_translation
    scores = rouge.compute(predictions=predictions, references=references, use_aggregator=False)
    data[key1+key2+'rouge1'] = scores['rouge1']
    data[key1+key2+'rouge2'] = scores['rouge2']
    data[key1+key2+'rougeL'] = scores['rougeL']
    data[key1+key2+'rougeLsum'] = scores['rougeLsum']
    data.to_csv(filename,index=False, encoding='utf-8')
# df = pd.read_csv(r'all_translations_scores.csv')
# ROUGE_compare(df,"all_translations_scores.csv",'nanxi_translation')
# ROUGE_compare(df,"all_translations_scores.csv",'rukun_translation')
# ROUGE_compare(df,"all_translations_scores.csv",'nanxi_translation','gpt4_translation')
# ROUGE_compare(df,"all_translations_scores.csv",'rukun_translation','gpt4_translation')

# ROUGE_compare(df,"all_translations_scores.csv",'nanxi_translation','gpt4_0')
# ROUGE_compare(df,"all_translations_scores.csv",'rukun_translation','gpt4_0')

#### B.4 Evaluating text similarities with BLEU

In [6]:
# source: https://huggingface.co/spaces/evaluate-metric/bleu
# !pip install bleu
def BLEU_compare(data, filename, key1 = "name", key2 = "gpt35_translation"):
    bleu = evaluate.load('bleu')
    references = [x.lower() for x in data[key1].tolist()] #"English_Name"
    predictions = [x.lower() for x in data[key2].tolist()] #chatGPT_translation
    res = []
    for i in range(len(references)):
        res.append(bleu.compute(predictions=[predictions[i]], references=[references[i]])['bleu'])
    scores = bleu.compute(predictions=predictions, references=references) #, tokenizer=word_tokenize)
    print(scores)
    data[key1+key2+'bleu'] = res
    data.to_csv(filename,index=False, encoding='utf-8')
    
# df = pd.read_csv(r'all_translations_scores.csv')

# BLEU_compare(df,"all_translations_scores.csv",'nanxi_translation')
# BLEU_compare(df,"all_translations_scores.csv",'rukun_translation')
# BLEU_compare(df,"all_translations_scores.csv",'nanxi_translation','gpt4_translation')
# BLEU_compare(df,"all_translations_scores.csv",'rukun_translation','gpt4_translation')
# BLEU_compare(df,"all_translations_scores.csv",'nanxi_translation','gpt4_0')
# BLEU_compare(df,"all_translations_scores.csv",'rukun_translation','gpt4_0')



### C. executing all the score calculation function

In [28]:
def get_scores(col):
    df = pd.read_csv(r'all_translations_scores.csv')
    semantic_compare(df,"all_translations_scores.csv",'rukun_translation',col)
    semantic_compare(df,"all_translations_scores.csv",'nanxi_translation',col)

    df = pd.read_csv(r'all_translations_scores.csv')
    BERT_compare(df,"all_translations_scores.csv",'nanxi_translation',col)
    BERT_compare(df,"all_translations_scores.csv",'rukun_translation',col)
    
    df = pd.read_csv(r'all_translations_scores.csv')
    ROUGE_compare(df,"all_translations_scores.csv",'nanxi_translation',col)
    ROUGE_compare(df,"all_translations_scores.csv",'rukun_translation',col)

    df = pd.read_csv(r'all_translations_scores.csv')
    BLEU_compare(df,"all_translations_scores.csv",'nanxi_translation',col)
    BLEU_compare(df,"all_translations_scores.csv",'rukun_translation',col)

# replace the argument in the get_scores function with the column you're hoping to get the scores from
# get_scores('gpt4_0_new')
# get_scores('gpt4_1_new')
# get_scores('gpt4_2_new')
# get_scores('gpt4_3_new')
# get_scores('gpt4_4_new')
# get_scores('gpt4_5_new')
# get_scores('gpt4_6_new')
# get_scores('gpt4_8_new')
# get_scores('gpt4_9_new')
# get_scores('gpt4_10_new')
get_scores('gpt4_11_new')
# get_scores('gpt4_7_new')

Naming semantics similarity: 0.748861
Naming semantics similarity: 0.761592


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'bleu': 0.31072100320984514, 'precisions': [0.5659163987138264, 0.3791469194312796, 0.23893805309734514, 0.18181818181818182], 'brevity_penalty': 1.0, 'length_ratio': 1.0436241610738255, 'translation_length': 311, 'reference_length': 298}
{'bleu': 0.25591972990551154, 'precisions': [0.5659163987138264, 0.33649289099526064, 0.17699115044247787, 0.12727272727272726], 'brevity_penalty': 1.0, 'length_ratio': 1.0798611111111112, 'translation_length': 311, 'reference_length': 288}


### D. getting score averages

In [29]:
try:
    df = pd.read_csv(r'all_translations_scores.csv')
except:
    df = pd.read_csv(r'../outputs/translation_all.csv')

mean = df.mean(axis = 0, numeric_only = 1)
print(type(mean))
print(mean)

mean.to_csv('mean.csv',index=False,encoding='utf-8')

<class 'pandas.core.series.Series'>
Unnamed: 0.26                            49.500000
Unnamed: 0.25                            49.500000
Unnamed: 0.24                            49.500000
Unnamed: 0.23                            49.500000
Unnamed: 0.22                            49.500000
                                           ...    
rukun_translationgpt4_11_newrouge2        0.361508
rukun_translationgpt4_11_newrougeL        0.571465
rukun_translationgpt4_11_newrougeLsum     0.571465
nanxi_translationgpt4_11_newbleu          0.070000
rukun_translationgpt4_11_newbleu          0.023866
Length: 495, dtype: float64
