In [1]:
#!/usr/bin/env python3
import csv
import glob
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import pearsonr, spearmanr

In [2]:
#cherry_pick is used to read specific ids from an all file (e.g., to assess only one lang)
def evaluate(submission, reference_file, verbose, cherry_pick=None):
    """Score a submission TSV against a gold-label TSV.

    Both files are read as tab-separated rows; column 0 is the instance id
    and column 4 is parsed as a float score.

    Args:
        submission: path of the system-output file.
        reference_file: path of the gold-label file.
        verbose: diagnostic level. 0 prints nothing; any value above 0 prints
            a notice when the gold and submission id sets differ; 2 also
            raises on any id mismatch; 3 also prints the submission ids.
        cherry_pick: optional id prefix. When truthy, only rows whose id
            starts with it are kept (in both files).

    Returns:
        Tuple ``(pearson, spearman, mae, mse, r2)`` computed over the gold
        ids, pairing each gold label with the prediction for the same id.

    Raises:
        Exception: if ``verbose == 2`` and the two id sets are not identical.
        KeyError: if a gold id has no prediction (and ``verbose != 2``).
    """
    submission_dict = _read_scores(submission, cherry_pick)
    gold_dict = _read_scores(reference_file, cherry_pick)

    # These ids are dropped from both sides before scoring; an id that is
    # not present is simply skipped.
    for remove_id in ("fr_150", "fr_513", "pt_600", "pt_579"):
        submission_dict.pop(remove_id, None)
        gold_dict.pop(remove_id, None)

    if verbose == 3:
        print(submission_dict.keys())

    # Run the id diagnostics BEFORE pairing labels with predictions, so a
    # missing prediction is reported here instead of surfacing only as a
    # bare KeyError from the pairing loop below.
    if verbose > 0:
        gold_len = len(gold_dict)
        sub_len = len(submission_dict)
        both_len = len(gold_dict.keys() & submission_dict.keys())

        if gold_len != sub_len:
            print("different ids in gold and sub")

        if gold_len != both_len:
            print("check ids: Gold")

        if sub_len != both_len:
            print("check ids: Sub")

        if verbose == 2:
            bad_keys = gold_dict.keys() ^ submission_dict.keys()
            if len(bad_keys) != 0:
                raise Exception("Bad Keys: " + str(bad_keys))

    # Produce aligned vectors of reference labels and predictions.
    gold = []
    predicted = []
    for key, label in gold_dict.items():
        gold.append(label)
        predicted.append(submission_dict[key])

    # Calculate scores.
    pearson_score = pearsonr(gold, predicted)[0]
    spearman_score = spearmanr(gold, predicted)[0]
    mae_score = mean_absolute_error(gold, predicted)
    mse_score = mean_squared_error(gold, predicted)
    rsq_score = r2_score(gold, predicted)

    return (pearson_score, spearman_score, mae_score, mse_score, rsq_score)


def _read_scores(path, cherry_pick=None):
    """Return ``{id: score}`` from a TSV file (id = column 0, score = column 4)."""
    scores = {}
    # newline="" lets the csv module handle line endings itself.
    with open(path, 'r', newline='') as handle:
        for row in csv.reader(handle, delimiter="\t"):
            if cherry_pick and not row[0].startswith(cherry_pick):
                continue
            scores[row[0]] = float(row[4])
    return scores

In [3]:
#verbose = 0 - no errors
#        = 1 - notify errors + continue
#        = 2 - explain errors + fail
def process_lang(gold, sys_files, lang, verbose=1, cherry_pick=None):
    """Evaluate each system file against one gold file and print a result row.

    Args:
        gold: path of the gold-label file.
        sys_files: iterable of system-output paths. Each path is split on
            "/" and must have at least three components.
        lang: language label printed in the third column.
        verbose: 0 skips failing files silently; 1 prints "Failed on <path>"
            and continues; any other value re-raises the failure. The value
            is also forwarded to ``evaluate``, and values >= 2 echo the gold
            and system paths before scoring.
        cherry_pick: optional id prefix forwarded to ``evaluate`` so that
            only matching ids are scored.

    Prints one tab-separated row per successfully scored file: path, second
    path component, ``lang``, a one-character tag, then the five scores.
    """
    for output in sys_files:
        if verbose >= 2:
            print(gold)
            print(output)
        try:
            # cherry_pick was previously accepted but never passed on.
            results = evaluate(output, gold, verbose, cherry_pick)
        except Exception:
            if verbose == 0:
                continue
            if verbose == 1:
                print("Failed on " + output)
                continue
            raise
        parts = output.split("/")
        # parts[2][-5] is the character just before a four-character suffix
        # such as ".tsv" -- presumably the run number; confirm against the
        # participant file naming.
        print("%s\t%s\t%s\t%s\t%f\t%f\t%f\t%f\t%f" % ((output, parts[1], lang, parts[2][-5]) + results))

In [4]:
# Display name for every language code; insertion order is the report order.
lang_dict = {
    "ca": "Catalan",
    "en": "English",
    "fil": "Filipino",
    "fr": "French",
    "de": "German",
    "it": "Italian",
    "ja": "Japanese",
    "pt": "Portuguese",
    "si": "Sinhala",
    "es": "Spanish",
}
# Language codes in report order (dicts keep their insertion order).
lang_ids = list(lang_dict)

def process_lang_all(gold, sys_files, verbose=1):
    """Score each combined-language system file once per language code.

    For every path in ``sys_files`` and every code in ``lang_ids``,
    ``evaluate`` is called with the code as its id prefix, and one
    tab-separated row is printed: path, second path component, language
    name, the literal tag "A", then the five scores.

    ``verbose`` controls failures: 0 skips silently, 1 prints
    "Failed on <path>" and continues, anything else re-raises. Values >= 2
    also echo the gold and system paths before scoring.
    """
    row_format = "%s\t%s\t%s\t%s\t%f\t%f\t%f\t%f\t%f"

    for system_path in sys_files:
        if verbose >= 2:
            print(gold)
            print(system_path)

        for code in lang_ids:
            try:
                scores = evaluate(system_path, gold, verbose, code)
            except Exception as error:
                if verbose not in (0, 1):
                    raise error
                if verbose == 1:
                    print("Failed on " + system_path)
                continue

            team = system_path.split("/")[1]
            print(row_format % ((system_path, team, lang_dict[code], "A") + scores))

In [10]:
def call_process_lang(name1,name2,abbrv):
    """Evaluate every participant file for one language against its gold file.

    Args:
        name1: gold sub-folder name; also passed on as the language label.
        name2: token used in the gold file name.
        abbrv: token used in the participant file names.
    """
    gold = "MLSP_Organisers/Gold/" + name1 + "/multilex_test_" + name2 + "_lcp_labels.tsv"
    # glob returns matches in arbitrary order; sort so the printed rows come
    # out in the same order on every run and machine.
    sys_files = sorted(glob.glob("MLSP_Participants/*/multilex_test_" + abbrv + "_lcp*"))
    process_lang(gold,sys_files,name1)

In [13]:
# One entry per evaluation run:
# (gold folder / language label, gold file token, participant file token).
language_runs = [
    ("Catalan", "catalan", "ca"),
    ("English", "english", "en"),
    ("Filipino", "filipino", "fil"),
    ("French", "french", "fr"),
    ("German", "german", "de"),
    ("Italian", "italian", "it"),
    ("Japanese", "japanese", "ja"),
    ("Portuguese", "portuguese", "pt"),
    ("Sinhala", "sinhala", "si"),
    ("Spanish", "spanish", "es"),
    ("All", "all", "all_combined"),
]

for folder, file_token, abbreviation in language_runs:
    call_process_lang(folder, file_token, abbreviation)

# Per-language breakdown of the combined-language submissions.
all_gold = "MLSP_Organisers/Gold/All/multilex_test_all_lcp_labels.tsv"
all_sys_files = glob.glob("MLSP_Participants/*/multilex_test_all_combined_lcp*")

process_lang_all(all_gold, all_sys_files)

MLSP_Participants/Archaeology/Extra/full_mt/multilex_test_ca_lcp_5.tsv	Archaeology	Catalan	E	0.154346	0.127678	0.211199	0.063610	-1.678674
MLSP_Participants/Archaeology/Extra/base_mt/multilex_test_ca_lcp_3.tsv	Archaeology	Catalan	E	0.243333	0.200048	0.186026	0.050979	-1.146786
MLSP_Participants/Archaeology/Extra/full_orig/multilex_test_ca_lcp_6.tsv	Archaeology	Catalan	E	0.272141	0.277575	0.126424	0.024330	-0.024579
MLSP_Participants/Archaeology/Extra/base_orig/multilex_test_ca_lcp_4.tsv	Archaeology	Catalan	E	0.294091	0.292113	0.125561	0.024036	-0.012164
MLSP_Participants/Archaeology/Extra/full_orig/multilex_test_en_lcp_6.tsv	Archaeology	English	E	0.507326	0.552056	0.122099	0.029520	0.196450
MLSP_Participants/Archaeology/Extra/base_orig/multilex_test_en_lcp_4.tsv	Archaeology	English	E	0.337010	0.243006	0.137923	0.034341	0.065213
MLSP_Participants/Archaeology/Extra/full_mt/multilex_test_fil_lcp_5.tsv	Archaeology	Filipino	E	0.170322	0.200824	0.152792	0.039501	-0.817912
MLSP_Participants/A

MLSP_Participants/GMU/multilex_test_all_combined_lcp_1.tsv	GMU	Spanish	A	0.195741	0.177190	0.164575	0.038193	-0.080626
MLSP_Participants/TMU-HIT/multilex_test_all_combined_lcp_1.tsv	TMU-HIT	Catalan	A	0.527892	0.532703	0.173802	0.046624	-0.963400
MLSP_Participants/TMU-HIT/multilex_test_all_combined_lcp_1.tsv	TMU-HIT	English	A	0.803605	0.701731	0.129761	0.025124	0.316113
MLSP_Participants/TMU-HIT/multilex_test_all_combined_lcp_1.tsv	TMU-HIT	Filipino	A	0.501333	0.524401	0.229518	0.075568	-2.477754
MLSP_Participants/TMU-HIT/multilex_test_all_combined_lcp_1.tsv	TMU-HIT	French	A	0.625277	0.630168	0.166857	0.045194	0.270389
MLSP_Participants/TMU-HIT/multilex_test_all_combined_lcp_1.tsv	TMU-HIT	German	A	0.658246	0.681340	0.174145	0.047695	-0.765369
MLSP_Participants/TMU-HIT/multilex_test_all_combined_lcp_1.tsv	TMU-HIT	Italian	A	0.539149	0.555681	0.231311	0.078275	-1.787445
MLSP_Participants/TMU-HIT/multilex_test_all_combined_lcp_1.tsv	TMU-HIT	Japanese	A	0.644811	0.647914	0.138640	0.030276	-0.0