In [68]:
import os
import json
from datasets import load_dataset
from sklearn.metrics import (
                          f1_score,
                          precision_recall_fscore_support,
                          accuracy_score,
                          balanced_accuracy_score,
                          precision_score,
                          recall_score,
)

import pandas as pd


In [69]:
datasets_pairs = [
                    'Pairs_Cardillo_random_split', 
                    'Pairs_Jankowiac_random_split', 
                    'Pairs_Cardillo_lexical_split', 'Pairs_Jankowiac_lexical_split',
                    "Pairs_Cardillo_set","Pairs_Jankowiac_set"
                    ]
datasets_quadruples = [
                        'Quadruples_Green_lexical_split', 'Quadruples_Green_random_split',
                        'Quadruples_Kmiecik_random_split','Quadruples_Kmiecik_lexical_split_on_CD', 'Quadruples_Kmiecik_lexical_split_on_AB' ,
                        'Quadruples_SAT_MET_FILTERED_lexical_split','Quadruples_SAT_MET_FILTERED_random_split',
                        "Quadruples_SAT_MET_FILTERED_set",'Quadruples_Green_set'
                        ]

In [70]:
int2label, label2int = {},{}
int2label["pair"] = {
  0 : "anomalous",
  1 : "literal",
  2 : "metaphoric",
}
int2label["quadruple"] = {
  0 : "anomalous",
  1 : "analogous",
  2 : "metaphoric",
}
label2int["pair"] = {
  "anomalous" : 0,
  "literal" : 1,
  "metaphoric" : 2,
}
label2int["quadruple"] = {
  "anomalous": 0,
  "analogous": 1,
  "metaphoric": 2,
}
mylabels = {
    "Cardillo":[
        "literal",
        "metaphoric"
        ],
    "Pairs": [
        "anomalous",
        "literal",
        "metaphoric"
        ],
    "Quadruples": [
        "anomalous",
        "analogous",
        "metaphoric"
        ]
}
labelset = [ "anomalous", "literal", "metaphoric","metaphor","analogous"]

In [71]:
all_data ={}

for data in datasets_pairs:
    all_data[data] = load_dataset("Joanne/Metaphors_and_Analogies", data)
    
    
for data in datasets_quadruples:
    all_data[data] = load_dataset("Joanne/Metaphors_and_Analogies", data)

## All datasets utils dictionaries
- datasets_sent2label
- datasets_sent2idx
- datasets_idx2sent
- datasets_datasplits

In [72]:
dsplits= ["train","validation","test"]
pairs_dsets = ["Pairs_Cardillo_set", "Pairs_Jankowiac_set"]
quad_dsets=["Quadruples_SAT_MET_FILTERED_set",'Quadruples_Green_set']

pairs_cl = [
    "Pairs_Cardillo",
    "Pairs_Jankowiac",
]

quad_cl = [
    "Quadruples_SAT_MET_FILTERED",
    'Quadruples_Green',    
]

dlab = ["metaphor","anomaly","literal"]

In [73]:
datasets_sent2label = {}
datasets_sent2idx = {}
datasets_idx2sent = {}


for dataset in pairs_dsets:
    datasets_sent2label[dataset] = {}
    datasets_sent2idx[dataset] = {}
    datasets_idx2sent[dataset] = {}
    n = 0
    for i,e in enumerate(all_data[dataset]["test"]):
        for j,f in enumerate(e["sentences"]):
            datasets_sent2label[dataset][f] = e["labels"][j]
            datasets_sent2idx[dataset][f] = n
            datasets_idx2sent[dataset][n] = f
            n+=1
            
            
for dataset in quad_dsets:
    datasets_sent2label[dataset] = {}
    datasets_sent2idx[dataset] = {}
    datasets_idx2sent[dataset] = {}
    n = 0
    for i,e in enumerate(all_data[dataset]["test"]):
        for j,f in enumerate(e["pairs"]):
            t = eval(e["stem"])
            t.extend(f)
            t = tuple(t)
            datasets_sent2label[dataset][t] = e["labels"][j]
            datasets_sent2idx[dataset][t] = n
            datasets_idx2sent[dataset][n] = t
            n+=1

In [74]:
import copy

datasets_datasplits = {}

for dataset in pairs_cl:
    datasets_datasplits[dataset]={"lexical":{},"random":{}}
    for z in dsplits:
        for x in all_data[dataset+"_random_split"][z]:
            datasets_datasplits[dataset]["random"][x["sentence"]]=z  
        for x in all_data[dataset+"_lexical_split"][z]:
            datasets_datasplits[dataset]["lexical"][x["sentence"]]=z   
            

for dataset in quad_cl:
    datasets_datasplits[dataset]={"lexical":{},"random":{}}
    for z in dsplits:
        for x in all_data[dataset+"_random_split"][z]:
            t = copy.copy(x["AB"])
            t.extend(x["CD"])
            t = tuple(t)
            datasets_datasplits[dataset]["random"][t]=z
        for x in all_data[dataset+"_lexical_split"][z]:
            t = copy.copy(x["AB"])
            t.extend(x["CD"])
            t = tuple(t)
            datasets_datasplits[dataset]["lexical"][t]=z  

    
        

In [75]:
def EvaluationMetrics(y_pred,y_true):
    analysis={}
    #analysis["logits"] = [[y*1 for y in x] for x in logits]
    analysis["accuracy"]={}
    #micro
    analysis["micro-f1"]= round(f1_score(y_true,y_pred,average="micro"),4)
    analysis["micro-precision"]=round(precision_score(y_true,y_pred,average="micro"),4)
    analysis["micro-recall"]=round(recall_score(y_true,y_pred, average="micro"),4)
    # macro
    analysis["macro-f1"]=round(f1_score(y_true,y_pred,average="macro"),4)
    analysis["macro-precision"]=round(precision_score(y_true,y_pred,average="macro"),4)
    analysis["macro-recall"]=round(recall_score(y_true,y_pred,average="macro"),4)
    #weighted
    analysis["weighted-f1"]=round(f1_score(y_true,y_pred,average="weighted"),4)
    analysis["weighted-precision"]=round(precision_score(y_true,y_pred,average="weighted"),4)
    analysis["weighted-recall"]=round(recall_score(y_true,y_pred,average="weighted"),4)
    #Accuracy
    analysis["accuracy"]["balanced"]=round(balanced_accuracy_score(y_true,y_pred),4)
    analysis["accuracy"]["standard"]=round(accuracy_score(y_true,y_pred),4)
    analysis["class_prf1"]=[list(x) for x in precision_recall_fscore_support(y_true,y_pred)]
    for i,x in enumerate(analysis["class_prf1"][:-1]):
        analysis["class_prf1"][i] = [round(y,4) for y in x]
    analysis["class_prf1"][-1]= [int(x*1) for x in analysis["class_prf1"][-1 ]]
    return analysis

# Hsuvas results

In [50]:
hsuvas_exp1 = "hsuvas_results/finetuning-gpt3/"
hsuvas_exp3 = "hsuvas_results/GPT4-0-shot"

### Finetuning gpt3

In [35]:
results = {}
for data_set in os.listdir(hsuvas_exp1):
    res_file = os.listdir(hsuvas_exp1+"/"+data_set)
    for re in res_file:
        p = hsuvas_exp1+"/"+data_set+"/"+re
        r = open(p).readlines()
        r = [x.strip().split(",") for x in r ]
        k = {}
        k["idx"]=[x[0] for x in r[1:]]
        k["text"]=[x[1] for x in r[1:]]
        k["true"]=[int(x[2]) for x in r[1:]]
        k["predicted"]=[int(x[3]) for x in r[1:]]
        try:
            k["logits"]=[[float(x[4]),float(x[5]),float(x[6])] for x in r[1:]]
        except:
            k["logits"]=[[float(x[4]),float(x[5])] for x in r[1:]]
        k["model"]="gpt3-"+re[-5]
        k["dataset"]=re[:-6]
        if not k["dataset"] in results:
            results[k["dataset"]]={}
        results[k["dataset"]][k["model"]]=k
 
        
for x in results:
    for y in results[x]:
        #print(len(results[x][y]["predicted"]),len(results[x][y]["true"]))
        results[x][y]["class_metrics"]=EvaluationMetrics(results[x][y]["predicted"],results[x][y]["true"])
        
head = ["Dataset","model","size", "Acc.","Macro-F1","Anomaly F1","Literal F1","Metaphor F1"]  
lines = []
for x in results:
    print(x)
    if "Cardillo" in x:
        for y in results[x]:
            line = [
                x,
                y,
                "-",
                results[x][y]["class_metrics"]["accuracy"]["standard"],
                results[x][y]["class_metrics"]["macro-f1"],
                "-",
                results[x][y]["class_metrics"]["class_prf1"][2][0],
                results[x][y]["class_metrics"]["class_prf1"][2][1]
            ]
            lines.append(line)
    else:
        for y in results[x]:
            line = [
                x,
                y,
                "-",
                results[x][y]["class_metrics"]["accuracy"]["standard"],
                results[x][y]["class_metrics"]["macro-f1"],
                results[x][y]["class_metrics"]["class_prf1"][2][0],
                results[x][y]["class_metrics"]["class_prf1"][2][1],
                results[x][y]["class_metrics"]["class_prf1"][2][2],
            ]
            print("\t".join([str(x) for x in line]))
            lines.append(line)
            
            
output = "utils_dictionaries/"
with open(output+"results_finetuning_gpt3.json", "w") as outfile:
    json.dump(results,outfile,indent="\t")
    
filename = "GPT3_finetuning.csv"
df = pd.DataFrame(lines,columns=head)
df.to_csv(filename, index=False)         

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Quadruples_Green_random_split
Quadruples_Green_random_split	gpt3-c	-	0.2708	0.2681	0.2703	0.24	0.2941
Quadruples_Green_random_split	gpt3-b	-	0.4167	0.4216	0.3529	0.4118	0.5
Quadruples_Green_random_split	gpt3-a	-	0.3125	0.3222	0.1714	0.4828	0.3125
Quadruples_Green_random_split	gpt3-d	-	0.4167	0.4323	0.3226	0.6154	0.359
Quadruples_SAT_MET_FILTERED_random_split
Quadruples_SAT_MET_FILTERED_random_split	gpt3-a	-	0.7398	0.3073	0.8499	0.0719	0.0
Quadruples_SAT_MET_FILTERED_random_split	gpt3-c	-	0.7412	0.3332	0.8535	0.1111	0.0351
Quadruples_SAT_MET_FILTERED_random_split	gpt3-d	-	0.7646	0.4343	0.8687	0.2619	0.1724
Quadruples_SAT_MET_FILTERED_random_split	gpt3-b	-	0.7398	0.3206	0.8491	0.1127	0.0
Pairs_Jankowiac_lexical_split
Pairs_Jankowiac_lexical_split	gpt3-d	-	0.8015	0.803	0.8081	0.8675	0.7333
Pairs_Jankowiac_lexical_split	gpt3-a	-	0.3603	0.1894	0.0435	0.0	0.5246
Pairs_Jankowiac_lexical_split	gpt3-b	-	0.6985	0.6942	0.7111	0.6571	0.7143
Pairs_Jankowiac_lexical_split	gpt3-c	-	0.5515	0.5314	0.59

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [36]:
head = ["hf_index","prediction","true","item"]
for x in results:
    for y in results[x]:
        lines = []
        filename = "error_analysis/"+y+"_finetuning_"+x+".csv"
        for i,e in enumerate(results[x][y]["true"]): 
            if results[x][y]["true"][i]!=results[x][y]["predicted"][i]:
                line = [
                    results[x][y]["idx"][i],
                    results[x][y]["predicted"][i],
                    results[x][y]["true"][i],
                    results[x][y]["text"][i]
                ]
                lines.append(line)
        df = pd.DataFrame(lines,columns=head)
        df.to_csv(filename, index=False)        

## Zero shot GPT4

In [11]:
import copy    
dsimp = {
    'Quadruples_SAT_MET_FILTERED_random_split':'Quadruples_SAT_MET_FILTERED_set',
    'Quadruples_SAT_MET_FILTERED_lexical_split':'Quadruples_SAT_MET_FILTERED_set',
    'Quadruples_Green_lexical_split':'Quadruples_Green_set',
    'Quadruples_Green_random_split':'Quadruples_Green_set',
    "Pairs_Jankowiac_random_split":"Pairs_Jankowiac_set",
    "Pairs_Jankowiac_lexical_split":"Pairs_Jankowiac_set",
    'Pairs_Cardillo_lexical_split':'Pairs_Cardillo_set',
    'Pairs_Cardillo_random_split':'Pairs_Cardillo_set',
    'Pairs_Cardillo_lexical_split_test':'Pairs_Cardillo_set',
    'Pairs_Cardillo_random_split_test':'Pairs_Cardillo_set',
        }

In [38]:
results_e3 = {}
for res_file in os.listdir(hsuvas_exp3):
    print(res_file)
    d = res_file[:-20]
    results_e3[d]={"true":[],"predicted":[],"model":res_file[21:25],"dataset":d,"experiment":"zero-shot-instruction"}
    if "SAT" in d or "Green" in d:
        results_e3[d]["pair1"]=[]
        results_e3[d]["pair2"]=[]
        results_e3[d]["explanation"]=[]
    elif "Cardillo" in d:
        results_e3[d]["text"]=[]
        results_e3[d]["pair"]=[]
        results_e3[d]["explanation"]=[]           
    else:
        results_e3[d]["text"]=[]    
        #results_e3[d]["true"]=([e["label"] for e in all_data[d]["test"]])
    p = hsuvas_exp3+"/"+res_file
    r = open(p)
    for i,line in enumerate(r):
        try:
            a = eval(eval(line))
        except:
            a = json.loads(line)
        if "SAT" in d or "Green" in d:
            try:
                pr,p1,p2,ex = a["label"],a["pair1"],a["pair2"],a["explanation"]
            except:
                print("k",a)
                pr,p1,p2,ex = a["label"],a["pair"][0],a["pair"][1],a["explanation"]
            results_e3[d]["predicted"].append(pr)
            results_e3[d]["pair1"].append(p1)
            results_e3[d]["pair2"].append(p2)
            results_e3[d]["explanation"].append(ex)         
        elif "Cardillo" in d:
            try:
            #if 1:
                results_e3[d]["predicted"].append(a["labels"])
                results_e3[d]["text"].append(a["sentence"])
                results_e3[d]["explanation"].append(a["explanation"]) 
                results_e3[d]["pair"].append(a["pair"]) 
            except:
                print(a)
        else:
            results_e3[d]["predicted"].append(a["output"])
            results_e3[d]["text"].append(a["input"])

    
for d in results_e3:
    k = dsimp[d]
    if "Quadruple" in d:
        for i in range(len(results_e3[d]["pair1"])): 
            insti = copy.copy(results_e3[d]["pair1"][i])
            insti.extend(results_e3[d]["pair2"][i])
            insti = tuple(insti)
            results_e3[d]["true"].append(datasets_sent2label[k][insti])
    elif "Pair" in d:
        for i in range(len(results_e3[d]["text"])): 
            insti =results_e3[d]["text"][i]
            #print(k,insti)
            try:
                results_e3[d]["true"].append(datasets_sent2label[k][insti])
            except:
                try:
                    results_e3[d]["true"].append(datasets_sent2label[k][insti+" "])
                except:
                    insti = insti[:-1]+" ."
                    results_e3[d]["true"].append(datasets_sent2label[k][insti])
            
for x in results_e3:
    print(x)
    results_e3[x]["class_metrics"]=EvaluationMetrics(results_e3[x]["predicted"],results_e3[x]["true"])
    
o = list(results_e3.keys())
o.sort()
for x in o:
    print(x)
    print("\t\t\t\t\t", results_e3[x]["class_metrics"]["macro-f1"],results_e3[x]["class_metrics"]["class_prf1"][2])
    

head = ["Dataset","model","size", "Acc.","Macro-F1","Anomaly F1","Literal F1","Metaphor F1"]  
lines = []
for x in results_e3:
    print(x)
    if "Cardillo" in x:
        #for y in results_e3[x]:
            line = [
                x,
                "GPT4",
                "-",
                results_e3[x]["class_metrics"]["accuracy"]["standard"],
                results_e3[x]["class_metrics"]["macro-f1"],
                "-",
                results_e3[x]["class_metrics"]["class_prf1"][2][0],
                results_e3[x]["class_metrics"]["class_prf1"][2][1]
            ]
            lines.append(line)
    else:
        #for y in results_e3[x]:
            line = [
                x,
                "GPT4",
                "-",
                results_e3[x]["class_metrics"]["accuracy"]["standard"],
                results_e3[x]["class_metrics"]["macro-f1"],
                results_e3[x]["class_metrics"]["class_prf1"][2][0],
                results_e3[x]["class_metrics"]["class_prf1"][2][1],
                results_e3[x]["class_metrics"]["class_prf1"][2][2],
            ]
            print("\t".join([str(x) for x in line]))
            lines.append(line)
        
output = "utils_dictionaries/"
with open(output+"results_zeroshot_1by1_gpt4.json", "w") as outfile:
    json.dump(results_e3,outfile,indent="\t")
    
filename = "GPT4_zeroshot_1by1.csv"
df = pd.DataFrame(lines,columns=head)
df.to_csv(filename, index=False)     

Pairs_Jankowiac_random_split_gpt4_responses.json
Pairs_Jankowiac_lexical_split_gpt4_responses.json
Pairs_Cardillo_lexical_split_gpt4_responses.json
Quadruples_SAT_MET_FILTERED_random_split_gpt4_responses.json
Quadruples_Green_lexical_split_gpt4_responses.json
Pairs_Cardillo_random_split_test_gpt4_responses.json
Quadruples_SAT_MET_FILTERED_lexical_split_gpt4_responses.json
Pairs_Cardillo_lexical_split_test_gpt4_responses.json
Pairs_Cardillo_random_split_gpt4_responses.json
Quadruples_Green_random_split_gpt4_responses.json
Pairs_Jankowiac_random_split
Pairs_Jankowiac_lexical_split
Pairs_Cardillo_lexical_split
Quadruples_SAT_MET_FILTERED_random_split
Quadruples_Green_lexical_split
Pairs_Cardillo_random_split_test
Quadruples_SAT_MET_FILTERED_lexical_split
Pairs_Cardillo_lexical_split_test
Pairs_Cardillo_random_split
Quadruples_Green_random_split
Pairs_Cardillo_lexical_split
					 0.8565 [0.8381, 0.8748]
Pairs_Cardillo_lexical_split_test
					 0.8376 [0.8156, 0.8596]
Pairs_Cardillo_random_s

In [39]:
p = hsuvas_exp3+"/"+res_file
r = open(p)
for i,line in enumerate(r):
    #try:
    #a = eval(eval(line))
    a = eval(line)

In [40]:
import copy
head = ["hf_index","prediction","true","item"]
for x in results_e3:
    #for y in results_e3[x]:
    lines = []
    filename = "error_analysis/GPT4_zeroshot_1by1_"+x+".csv"
    for i,e in enumerate(results_e3[x]["true"]): 
        if results_e3[x]["true"][i]!=results_e3[x]["predicted"][i]:
            try:
                line = [
                    i,
                    results_e3[x]["predicted"][i],
                    results_e3[x]["true"][i],
                    results_e3[x]["text"][i]
                ]
            except:
                c = copy.copy(results_e3[x]["pair1"][i])
                c.extend(results_e3[x]["pair2"][i])
                line = [
                    i,
                    results_e3[x]["predicted"][i],
                    results_e3[x]["true"][i],
                    c
                ]
            lines.append(line)
    df = pd.DataFrame(lines,columns=head)
    df.to_csv(filename, index=False)   

## Zero shot GPT 3.5

In [25]:
results_e3 = {}
for res_file in os.listdir("hsuvas_results/GPT3.5-0shot"):
    d = res_file[:-20]
    print(d)
    results_e3[d]={"true":[],"predicted":[],"model":res_file[21:25],"dataset":d,"experiment":"zero-shot-instruction"}
    if "SAT" in d or "Green" in d or "Km" in d:
        results_e3[d]["pair1"]=[]
        results_e3[d]["pair2"]=[]
        results_e3[d]["explanation"]=[]
    else:
        results_e3[d]["text"]=[]
    print(res_file)
    p = "hsuvas_results/GPT3.5-0shot/"+res_file
    r = open(p)
    for i,line in enumerate(r):
        try:
            a = eval(eval(line))
        except:
            a = eval(line)
        results_e3[d]["true"].append(all_data[d]["test"][i]["label"])
        if "SAT" in d or "Green" in d or "Km" in d:
            try:
                pr,p1,p2,ex = a["label"],a["pair1"],a["pair2"],a["explanation"]
            except:
                try:
                    pr,p1,p2,ex = a["label"],a["pair"][0],a["pair"][1],a["explanation"]
                except:
                    try:
                        pr,p1,p2,ex = a["labels"],a["pair"][0],a["pair"][1],a["explanation"]
                    except:
                        print(a)
            results_e3[d]["predicted"].append(pr)
            results_e3[d]["pair1"].append(p1)
            results_e3[d]["pair2"].append(p2)
            results_e3[d]["explanation"].append(ex) 
        elif "Cardillo" in d or "Jank" in d:
            try:
                results_e3[d]["predicted"].append(a["label"])
                results_e3[d]["text"].append([[x[0].lower(),x[1].lower()] for x in a["pair"]])
            except:
                #try:
                if 1:
                    results_e3[d]["predicted"].append(a["labels"])
                    results_e3[d]["text"].append([[x[0].lower(),x[1].lower()] for x in a["pair"]])
                #except:
                    #print("fk",a)
        else:
            try:
                results_e3[d]["predicted"].append(a["output"])
                results_e3[d]["text"].append(a["input"])
            except:
                print(a)
    print(i)
        
    #all_data['Pairs_Jankowiac_lexical_split']["tes
    
    
for x in results_e3:
    results_e3[x]["class_metrics"]=EvaluationMetrics(results_e3[x]["predicted"],results_e3[x]["true"])

    
for x in results_e3:
    print(x)
    print("\t\t\t\t\t", results_e3[x]["class_metrics"]["macro-f1"],results_e3[x]["class_metrics"]["class_prf1"][2])   
    
    
    
    
head = ["Dataset","model","size", "Acc.","Macro-F1","Anomaly F1","Literal F1","Metaphor F1"]  
lines = []
for x in results_e3:
    print(x)
    if "Cardillo" in x:
        #for y in results_e3[x]:
            line = [
                x,
                "GPT3.5",
                "-",
                results_e3[x]["class_metrics"]["accuracy"]["standard"],
                results_e3[x]["class_metrics"]["macro-f1"],
                "-",
                results_e3[x]["class_metrics"]["class_prf1"][2][0],
                results_e3[x]["class_metrics"]["class_prf1"][2][1]
            ]
            lines.append(line)
    else:
        #for y in results_e3[x]:
            line = [
                x,
                "GPT3.5",
                "-",
                results_e3[x]["class_metrics"]["accuracy"]["standard"],
                results_e3[x]["class_metrics"]["macro-f1"],
                results_e3[x]["class_metrics"]["class_prf1"][2][0],
                results_e3[x]["class_metrics"]["class_prf1"][2][1],
                results_e3[x]["class_metrics"]["class_prf1"][2][2],
            ]
            print("\t".join([str(x) for x in line]))
            lines.append(line)
        
output = "utils_dictionaries/"
with open(output+"results_zeroshot_1by1_gpt3.5.json", "w") as outfile:
    json.dump(results_e3,outfile,indent="\t")
    
filename = "GPT3.5_zeroshot_1by1.csv"
df = pd.DataFrame(lines,columns=head)
df.to_csv(filename, index=False)     
    
    
    
    


Quadruples_Green_random_split
Quadruples_Green_random_split_gpt3_responses.json
47
Pairs_Jankowiac_lexical_split
Pairs_Jankowiac_lexical_split_gpt3_responses.json


IndexError: Invalid key: 136 is out of bounds for size 136

In [14]:
import copy
head = ["hf_index","prediction","true","item"]
for x in results_e3:
    #for y in results_e3[x]:
    lines = []
    filename = "error_analysis/GPT3.5_zeroshot_1by1_"+x+".csv"
    for i,e in enumerate(results_e3[x]["true"]): 
        if results_e3[x]["true"][i]!=results_e3[x]["predicted"][i]:
            try:
                line = [
                    i,
                    results_e3[x]["predicted"][i],
                    results_e3[x]["true"][i],
                    results_e3[x]["text"][i]
                ]
            except:
                c = copy.copy(results_e3[x]["pair1"][i])
                c.extend(results_e3[x]["pair2"][i])
                line = [
                    i,
                    results_e3[x]["predicted"][i],
                    results_e3[x]["true"][i],
                    c
                ]
            lines.append(line)
    df = pd.DataFrame(lines,columns=head)
    df.to_csv(filename, index=False)   

IndexError: list index out of range

# Lists 

In [76]:
dsplits= ["train","validation","test"]
pairs_dsets = ["Pairs_Cardillo_set", "Pairs_Jankowiac_set"]
quad_dsets=["Quadruples_SAT_MET_FILTERED_set",'Quadruples_Green_set']

pairs_cl = [
    "Pairs_Cardillo",
    "Pairs_Jankowiac",
]

quad_cl = [
    "Quadruples_SAT_MET_FILTERED",
    'Quadruples_Green',    
]

dlab = ["metaphor","anomaly","literal"]

# Asahi results
## All models perplexities
### Score

In [77]:
# Score : is X , SAT_MET
asahi_sat_score = 'asahi_results/metaphor_results/scores_sat/'
s = os.listdir(asahi_sat_score)
asahi_sat_score_met = []
asahi_sat_score_lit = []
asahi_sat_score_ano = []

for y in s:
    x = y.split(".")
    model,t = x[0],x[1]
    if t=="metaphor":
        asahi_sat_score_met.append([y,model])
    elif t=="literal":
        asahi_sat_score_lit.append([y,model])
    elif t == "anomaly":
        asahi_sat_score_ano.append([y,model])

        
# Score : is X , other sets
asahi_scores = "asahi_results/metaphor_results/scores/"
asahi_scores_openai = "asahi_results/metaphor_results/scores_openai/"    
        

scores = os.listdir(asahi_scores)
scores_openai = os.listdir(asahi_scores_openai)

print(scores[:4],'\n\n',scores_openai[:4])

['t5-11b.Metaphors_and_Analogies_Quadruples_Green_set_test.json', 'roberta-base.Metaphors_and_Analogies_Pairs_Cardillo_set_test.json', 'gpt-neo-2.7B.Metaphors_and_Analogies_Pairs_Jankowiac_set_test.json', 'opt-66b.Metaphors_and_Analogies_Pairs_Jankowiac_set_test.json'] 

 ['babbage.Metaphors_and_Analogies_Pairs_Cardillo_set_test.json', 'curie.Metaphors_and_Analogies_Pairs_Cardillo_set_test.json', 'curie.Metaphors_and_Analogies_Pairs_Jankowiac_set_test.json', 'davinci.Metaphors_and_Analogies_Quadruples_Green_set_test.json']


### No prompt

In [78]:
asahi_no_prompt = "asahi_results/metaphor_results/scores_no_prompt/"
asahi_no_prompt_openai = "asahi_results/metaphor_results/scores_openai_no_prompt/"   

# SAT


# Open Models
# Open AI

no_prompt = os.listdir(asahi_no_prompt)
no_prompt_openai = os.listdir(asahi_no_prompt_openai)

print(no_prompt[:4],'\n\n',no_prompt_openai[:4])


['t5-11b.Metaphors_and_Analogies_Quadruples_Green_set_test.json', 'roberta-base.Metaphors_and_Analogies_Pairs_Cardillo_set_test.json', 'gpt-neo-2.7B.Metaphors_and_Analogies_Pairs_Jankowiac_set_test.json', 'opt-66b.Metaphors_and_Analogies_Pairs_Jankowiac_set_test.json'] 

 ['babbage.Metaphors_and_Analogies_Pairs_Cardillo_set_test.json', 'curie.Metaphors_and_Analogies_Pairs_Cardillo_set_test.json', 'curie.Metaphors_and_Analogies_Pairs_Jankowiac_set_test.json', 'davinci.Metaphors_and_Analogies_Quadruples_Green_set_test.json']


### Instructions scores

In [79]:
asahi_instructions = "asahi_results/metaphor_results/scores_instruction/"
asahi_openai_instructions = "asahi_results/metaphor_results/scores_openai_instruction/"   

# Open Models
# OpenAI

instructions = os.listdir(asahi_instructions)
instructions_openai = os.listdir(asahi_openai_instructions)

print(instructions[:4],'\n\n',instructions_openai[:4])

['t5-11b.Metaphors_and_Analogies_Quadruples_Green_set_test.json', 'roberta-base.Metaphors_and_Analogies_Pairs_Cardillo_set_test.json', 'opt-66b.Metaphors_and_Analogies_Pairs_Jankowiac_set_test.json', 't5-large.Metaphors_and_Analogies_Pairs_Cardillo_set_test.json'] 

 ['babbage.Metaphors_and_Analogies_Pairs_Cardillo_set_test.json', 'curie.Metaphors_and_Analogies_Pairs_Cardillo_set_test.json', 'curie.Metaphors_and_Analogies_Pairs_Jankowiac_set_test.json', 'davinci.Metaphors_and_Analogies_Quadruples_Green_set_test.json']


In [80]:
def Flatten(ll):
    l =[]
    for x in ll:
        for y in x:
            l.append(y)
    return l

### Score dictionary - all datasets but SAT_MET

In [81]:
# Open source

results_score ={} 

for x in scores:#[:1]:
    p = json.load(open(asahi_scores+"/"+x))
    x = x.replace(".Metaphors_and_Analogies_","|")
    w = x.split("|")
    model,dataset=w[0],w[1][:-10]
    
    if not dataset in results_score:
        results_score[dataset]={"metaphor":{},"anomaly":{},"literal":{}}
    for z in dlab:
        if not model in results_score[dataset][z]:    
            results_score[dataset][z][model] = {"text":[],"true":Flatten(p["labels"]),"idx":[],"score":[]}
        for i,y in enumerate(p[z]):
            results_score[dataset][z][model]["score"].append(y["score"])
            results_score[dataset][z][model]["idx"].append(y["index"])
            results_score[dataset][z][model]["text"].append(datasets_idx2sent[dataset][i])
            assert results_score[dataset][z][model]["true"][i]==datasets_sent2label[dataset][datasets_idx2sent[dataset][i]]

#    results_score["metaphor"][x[1]]={"text":[],"true":[],"score":[],"binary_labels":[],"idx":[],"label":[],"datastplit":[]}


In [82]:
# Open AI

for x in scores_openai:#[:1]:
    p = json.load(open(asahi_scores_openai+"/"+x))
    x = x.replace(".Metaphors_and_Analogies_","|")
    w = x.split("|")
    model,dataset=w[0],w[1][:-10]
    if not dataset in results_score:
        results_score[dataset]={"metaphor":{},"anomaly":{},"literal":{}}
    for z in dlab:
        if not model in results_score[dataset][z]:    
            results_score[dataset][z][model] = {"text":[],"true":Flatten(p["labels"]),"idx":[],"score":[]}
        for i,y in enumerate(p[z]):
            results_score[dataset][z][model]["score"].append(y["score"])
            results_score[dataset][z][model]["idx"].append(y["index"])
            results_score[dataset][z][model]["text"].append(datasets_idx2sent[dataset][i])
            assert results_score[dataset][z][model]["true"][i]==datasets_sent2label[dataset][datasets_idx2sent[dataset][i]]

In [83]:
results_score['Quadruples_Green_set']["metaphor"]["davinci"]["text"][10]

('baker', 'cake', 'chef', 'meal')

In [84]:
output = "utils_dictionaries/"
with open(output+"results_score_zero_shot.json", "w") as outfile:
    json.dump(results_score,outfile,indent="\t")
    

### No prompt  - all datasets but SAT_MET

In [85]:
# No prompt


results_no_prompt = {}

for x in no_prompt:#[:1]:
    p = json.load(open(asahi_no_prompt+"/"+x))
    x = x.replace(".Metaphors_and_Analogies_","|")
    w = x.split("|")
    model,dataset=w[0],w[1][:-10]
    if not dataset in results_no_prompt:
        results_no_prompt[dataset]={}
    if not model in results_no_prompt[dataset]:    
        results_no_prompt[dataset][model] = {"text":[],"true":Flatten(p["labels"]),"idx":[],"score":[]}
    for i,y in enumerate(p["perplexity"]):
        results_no_prompt[dataset][model]["score"].append(y["score"])
        results_no_prompt[dataset][model]["idx"].append(y["index"])
        results_no_prompt[dataset][model]["text"].append(datasets_idx2sent[dataset][i])
        assert results_no_prompt[dataset][model]["true"][i]==datasets_sent2label[dataset][datasets_idx2sent[dataset][i]]



In [86]:
for x in no_prompt_openai:#[:1]:
    p = json.load(open(asahi_no_prompt_openai+"/"+x))
    x = x.replace(".Metaphors_and_Analogies_","|")
    w = x.split("|")
    model,dataset=w[0],w[1][:-10]
    if not dataset in results_no_prompt:
        results_no_prompt[dataset]={}
    if not model in results_no_prompt[dataset]:    
        results_no_prompt[dataset][model] = {"text":[],"true":Flatten(p["labels"]),"idx":[],"score":[]}
    for i,y in enumerate(p["perplexity"]):
        results_no_prompt[dataset][model]["score"].append(y["score"])
        results_no_prompt[dataset][model]["idx"].append(y["index"])
        results_no_prompt[dataset][model]["text"].append(datasets_idx2sent[dataset][i])
        assert results_no_prompt[dataset][model]["true"][i]==datasets_sent2label[dataset][datasets_idx2sent[dataset][i]]



In [87]:
output = "utils_dictionaries/"
with open(output+"results_no_prompt_zero_shot.json", "w") as outfile:
    json.dump(results_no_prompt,outfile,indent="\t")

### Instruction

In [88]:
# Instruction : all questions

# Open source

results_instructions ={} 

for x in instructions:#[:1]:
    p = json.load(open(asahi_instructions+"/"+x))
    x = x.replace(".Metaphors_and_Analogies_","|")
    w = x.split("|")
    model,dataset=w[0],w[1][:-10]
    if not dataset in results_instructions:
        results_instructions[dataset]={"metaphor":{},"anomaly":{},"literal":{}}
    for z in dlab:
        if not model in results_instructions[dataset][z]:    
            results_instructions[dataset][z][model] = {"text":[],"true":Flatten(p["labels"]),"idx":[],"score":[]}
        for i,y in enumerate(p[z]):
            results_instructions[dataset][z][model]["score"].append(y["score"])
            results_instructions[dataset][z][model]["idx"].append(y["index"])
            results_instructions[dataset][z][model]["text"].append(datasets_idx2sent[dataset][i])
            assert results_instructions[dataset][z][model]["true"][i]==datasets_sent2label[dataset][datasets_idx2sent[dataset][i]]


In [89]:
# Open source


for x in instructions_openai:#[:1]:
    p = json.load(open(asahi_openai_instructions+"/"+x))
    x = x.replace(".Metaphors_and_Analogies_","|")
    w = x.split("|")
    model,dataset=w[0],w[1][:-10]
    if not dataset in results_instructions:
        results_instructions[dataset]={"metaphor":{},"anomaly":{},"literal":{}}
    for z in dlab:
        if not model in results_instructions[dataset][z]:    
            results_instructions[dataset][z][model] = {"text":[],"true":Flatten(p["labels"]),"idx":[],"score":[]}
        for i,y in enumerate(p[z]):
            results_instructions[dataset][z][model]["score"].append(y["score"])
            results_instructions[dataset][z][model]["idx"].append(y["index"])
            results_instructions[dataset][z][model]["text"].append(datasets_idx2sent[dataset][i])
            assert results_instructions[dataset][z][model]["true"][i]==datasets_sent2label[dataset][datasets_idx2sent[dataset][i]]


In [90]:
output = "utils_dictionaries/"
with open(output+"results_instructions_zero_shot.json", "w") as outfile:
    json.dump(results_instructions,outfile,indent="\t")

## SAT_MET Dictionaries 
- satsetid : sat quadruples to set_ids
- satlabs : quadruples to labels
- satplit : quadruples to data_split, randomm and lexical


In [91]:
satlabs = {}
for x in all_data["Quadruples_SAT_MET_FILTERED_set"]['test']:
    for i,pair in enumerate(x["pairs"]):
        target = eval(x["stem"])
        target.extend(pair)
        satlabs[tuple(target)]=x["labels"][i]
        
len(satlabs)

1710

In [92]:
satsetid = {}
for x in all_data["Quadruples_SAT_MET_FILTERED_set"]['test']:
    for i,pair in enumerate(x["pairs"]):
        target = eval(x["stem"])
        target.extend(pair)
        satsetid[tuple(target)]=x["id"]
        #print(x["pair_ids"])
        
len(satsetid)

1710

In [93]:
satsplits = {"lexical":{},"random":{}}
for z in dsplits:
    for x in all_data["Quadruples_SAT_MET_FILTERED_lexical_split"][z]:
        r = x["AB"]
        r.extend(x["CD"])
        satsplits["lexical"][tuple(r)]=z

In [94]:
for z in dsplits:
    for x in all_data["Quadruples_SAT_MET_FILTERED_random_split"][z]:
        r = x["AB"]
        r.extend(x["CD"])
        satsplits["random"][tuple(r)]=z

In [95]:
results_sat_score = {"metaphor":{},"anomaly":{},"analogy":{}}

for x  in asahi_sat_score_met:
    results_sat_score["metaphor"][x[1]]={"text":[],"true":[],"score":[],"binary_labels":[],"idx":[],"label":[]}
    p = json.load(open(asahi_sat_score +x[0]))
    for i, y in enumerate(p):
        if tuple(y["target"]) in satlabs:
            results_sat_score["metaphor"][x[1]]["text"].append(y["target"])
            results_sat_score["metaphor"][x[1]]["binary_labels"].append(y['label'])
            results_sat_score["metaphor"][x[1]]["score"].append(y["ppl"])
            results_sat_score["metaphor"][x[1]]["true"].append(satlabs[tuple(y["target"])])
            results_sat_score["metaphor"][x[1]]["idx"].append(y["index"])
            
for x  in asahi_sat_score_lit:
    results_sat_score["analogy"][x[1]]={"text":[],"true":[],"score":[],"binary_labels":[],"idx":[],"label":[]}
    p = json.load(open(asahi_sat_score +x[0]))
    for i, y in enumerate(p):
        if tuple(y["target"]) in satlabs:
            results_sat_score["analogy"][x[1]]["text"].append(y["target"])
            results_sat_score["analogy"][x[1]]["binary_labels"].append(y['label'])
            results_sat_score["analogy"][x[1]]["score"].append(y["ppl"])
            results_sat_score["analogy"][x[1]]["true"].append(satlabs[tuple(y["target"])])
            results_sat_score["analogy"][x[1]]["idx"].append(y["index"])
            
            
for x  in asahi_sat_score_ano:
    results_sat_score["anomaly"][x[1]]={"text":[],"true":[],"score":[],"binary_labels":[],"idx":[],"label":[]}
    p = json.load(open(asahi_sat_score +x[0]))
    for i, y in enumerate(p):
        if tuple(y["target"]) in satlabs:
            results_sat_score["anomaly"][x[1]]["text"].append(y["target"])
            results_sat_score["anomaly"][x[1]]["binary_labels"].append(y['label'])
            results_sat_score["anomaly"][x[1]]["score"].append(y["ppl"])
            results_sat_score["anomaly"][x[1]]["true"].append(satlabs[tuple(y["target"])])
            results_sat_score["anomaly"][x[1]]["idx"].append(y["index"])
            
        

In [96]:
output = "utils_dictionaries/"
with open(output+"results_sat_score_zero_shot.json", "w") as outfile:
    json.dump(results_sat_score,outfile,indent="\t")

### SAT_met no score

In [97]:
res_dir = "asahi_results/sat-scores-no-prompt-other-models/"
a = os.listdir(res_dir)
a = [x for x in a if "sat_full_None.prompt" in x]
didisat = {}
#print(sattext)
sat_no_score = {}
for x in a:
    p = json.load(open(res_dir+x))
    model = (x[:-26])
    sat_no_score[model]={"text":[],"score":[],"true":[],"idx":[]}
    for i,y in enumerate(p):
        q = {}
        text = y["input"]+y["output"]
        flag = 0
        for quad in satlabs:
            c = 0
            for word in quad:
                if word in text :
                    c+=1
            if c==4:
                mes4mots = quad
                flag = 1
        if flag==1:
            sat_no_score[model]["score"].append(y["score"])
            sat_no_score[model]["text"].append(mes4mots)
            sat_no_score[model]["true"].append(satlabs[mes4mots])
            sat_no_score[model]["idx"].append(satsetid [mes4mots])
            didisat[i]=mes4mots


In [98]:
p = json.load(open("asahi_results/gpt3-scores-noprompt-SAT/davinci_sat_full_None.prompt.json"))
sat_no_score["davinci"] = {"text":sat_no_score[model]["text"],"score":[],"true":sat_no_score[model]["true"],"idx":sat_no_score[model]["idx"]}
n = 0
for x in p:
    for y in x[1]:
        if n in didisat:
            sat_no_score["davinci"]["score"].append(y)
        n+=1

In [99]:
output = "utils_dictionaries/"
with open(output+"results_sat_no_prompt_zero_shot.json", "w") as outfile:
    json.dump(sat_no_score,outfile,indent="\t")

## GPT4 and 3.5 ANSWERS  on SAT original

In [100]:
p = json.load(open(res_dir+r))
p[0]

TypeError: can only concatenate str (not "list") to str

In [None]:
from statistics import mean
res_dir = "asahi_results/generation-sat-original/"
resfiles = os.listdir(res_dir)
sat_openAI = {}

for r in resfiles:
    model = r.split(".")[0]
    if "full.1" in r:
        prompt = "prompt1"
    else:
        prompt="prompt3"
    sat_openAI[prompt+"_"+model] = {"predicted":[],"idx":[],"true":[]}
    p = json.load(open(res_dir+r))
    accuracy =[]
    print(model)
    for i,x in enumerate(p):
        pred = -1
        if "Llama" in model or "Mix" in model:
            reply = x["reply"][len(x["input"]):].strip().strip(":").strip()
            reply = reply.split("\n")[0]
        else:
            reply = x["reply"]
        out = x['input'].split("\n")[1:-1]
        n = 0
        if len(reply)==0:
            n+=1
        elif reply[0] in ['1', '2', '3', '4', '5']:
            pred = int(reply[0]) - 1
        elif reply[1] in ['1', '2', '3', '4', '5']:
            pred = int(reply[1]) - 1
        elif reply.replace("option ", "").replace("Option ", "").replace(": ", "")[0] in ['1', '2', '3', '4', '5']:
            pred = int(reply.replace("option ", "").replace("Option ", "").replace(": ", "")[0]) - 1
        elif any(reply[:-1].lower() in o.lower() for o in out):
            pred = int([o for o in out if reply[:-1].lower() in o.lower()][0][0]) - 1
        elif reply[5] in ['1', '2', '3', '4', '5']:
            pred = int(reply[5]) - 1
        else:
            print()
            #print(out)
            print("|"+reply+"|")
            print()
            #raise ValueError("unknown reply")
        accuracy.append(int(int(x['answer']) == pred))           
        sat_openAI[prompt+"_"+model]["predicted"].append(pred)
        sat_openAI[prompt+"_"+model]["true"].append(x["answer"])
        sat_openAI[prompt+"_"+model]["idx"].append(i)
    print(prompt, model, mean(accuracy),"empty answer",n,"n_parsed",len(sat_openAI[prompt+"_"+model]["predicted"]))


In [None]:
for x in sat_openAI:
    sat_openAI[x]["class_metrics"]=EvaluationMetrics(sat_openAI[x]["predicted"],sat_openAI[x]["true"])

    
for x in sat_openAI:
    print(x)
    print("\t\t\t\t\t", sat_openAI[x]["class_metrics"]["accuracy"],sat_openAI[x]["class_metrics"]["macro-f1"],sat_openAI[x]["class_metrics"]["class_prf1"][2])   
   

        
output = "utils_dictionaries/"
with open(output+"results_original_sat_openAI_zero_shot.json", "w") as outfile:
    json.dump(sat_openAI,outfile,indent="\t")

In [None]:
head = ["Dataset","model","size", "Acc.","Macro-F1"]  
lines = []
for x in sat_openAI:
    line = [
        x,
        "-",
        "-",
        sat_openAI[x]["class_metrics"]["accuracy"]["standard"],
        sat_openAI[x]["class_metrics"]["macro-f1"],
    ]
    print("\t".join([str(x) for x in line]))
    lines.append(line)
        

filename = "original_SAT_zeroshot_set.csv"
df = pd.DataFrame(lines,columns=head)
df.to_csv(filename, index=False)     
    
    
    

In [None]:
head = ["hf_index","prediction","true","set_id"]
for x in sat_openAI:
    #for y in results_e3[x]:
    lines = []
    filename = "error_analysis/GPT3.5&4_zeroshot_set_"+x+".csv"
    for i,e in enumerate(sat_openAI[x]["true"]): 
        if sat_openAI[x]["true"][i]!=sat_openAI[x]["predicted"][i]:
            line = [
                i,
                sat_openAI[x]["predicted"][i],
                sat_openAI[x]["true"][i],
                sat_openAI[x]["idx"][i]
            ]
            lines.append(line)
    df = pd.DataFrame(lines,columns=head)
    df.to_csv(filename, index=False)  

## GPT 3.5 other sets

In [None]:
# see Asahi chat process script

# Sat_met scores results

In [None]:
three_sat_combined = {}

for m in results_sat_score["anomaly"]:
    three_sat_combined[m]={"scores":[],"scaled_scores":[],"labels":[],"metrics":[]}
    max_ano = max(results_sat_score["anomaly"][m]["score"])
    min_ano = min(results_sat_score["anomaly"][m]["score"])
    max_ana = max(results_sat_score["analogy"][m]["score"])
    min_ana = min(results_sat_score["analogy"][m]["score"])
    max_met = max(results_sat_score["metaphor"][m]["score"])
    min_met = min(results_sat_score["metaphor"][m]["score"])
    for i in range(len(results_sat_score["anomaly"][m]["text"])):   
        ano = results_sat_score["anomaly"][m]["score"][i]
        scaled_ano = ((ano-min_ano)/(max_ano - min_ano))*0.75
        ana = results_sat_score["analogy"][m]["score"][i]
        scaled_ana = ((ana-min_ana)/(max_ana - min_ana))
        met = results_sat_score["metaphor"][m]["score"][i]
        scaled_met = ((met - min_met)/(max_met - min_met))
        k = [ano,ana,met]
        scaled_k = [scaled_ano,scaled_ana,scaled_met]
        three_sat_combined[m]["scores"].append(k)
        three_sat_combined[m]["scaled_scores"].append(scaled_k)
        label = scaled_k.index(min(scaled_k))
        #label = scaled_k.index(min(scaled_k))
        three_sat_combined[m]["labels"].append(label)
        
sat_true = results_sat_score["metaphor"][m]["true"]


for m in three_sat_combined:
    three_sat_combined[m]["metrics"]= EvaluationMetrics(three_sat_combined[m]["labels"],sat_true)      
    print(m,'\t',three_sat_combined[m]["metrics"]["macro-f1"],'\t',three_sat_combined[m]["metrics"]["class_prf1"][2])
    

output = "utils_dictionaries/"
with open(output+"results_score3satcombines_complete_set_zero_shot.json", "w") as outfile:
    json.dump(three_sat_combined,outfile,indent="\t")

## SAT_Met results Random split test set

In [None]:
three_sat_combined = {}

test_sc_ana,test_sc_met,test_sc_ano = [],[],[]
test_sat_true = [ ]

for i,s in enumerate(results_sat_score["analogy"][m]["score"]):
    t= tuple(results_sat_score["analogy"][m]["text"][i])
    if satsplits["random"][t]=="test":
        test_sc_ana.append(s)
        test_sat_true.append(results_sat_score["analogy"][m]["true"][i])
    
for i,s in enumerate(results_sat_score["metaphor"][m]["score"]):
    t= tuple(results_sat_score["metaphor"][m]["text"][i])
    if satsplits["random"][t]=="test":
        test_sc_met.append(s)

for i,s in enumerate(results_sat_score["anomaly"][m]["score"]):
    t= tuple(results_sat_score["anomaly"][m]["text"][i])
    if satsplits["random"][t]=="test":
        test_sc_ano.append(s)


for m in results_sat_score["anomaly"]:
    three_sat_combined[m]={"scores":[],"scaled_scores":[],"labels":[],"metrics":[]}
    max_ano = max(test_sc_ano)
    min_ano = min(test_sc_ano)
    max_ana = max(test_sc_ana)
    min_ana = min(test_sc_ana)
    max_met = max(test_sc_met)
    min_met = min(test_sc_met)
    for i in range(len(results_sat_score["anomaly"][m]["text"])):  
        t = tuple(results_sat_score["anomaly"][m]["text"][i])
        if satsplits["random"][t]=="test":
            ano = results_sat_score["anomaly"][m]["score"][i]
            scaled_ano = ((ano-min_ano)/(max_ano - min_ano))* 0.75
            ana = results_sat_score["analogy"][m]["score"][i]
            scaled_ana = ((ana-min_ana)/(max_ana - min_ana))
            met = results_sat_score["metaphor"][m]["score"][i]
            scaled_met = ((met - min_met)/(max_met - min_met))
            k = [ano,ana,met]
            scaled_k = [scaled_ano,scaled_ana,scaled_met]
            three_sat_combined[m]["scores"].append(k)
            three_sat_combined[m]["scaled_scores"].append(scaled_k)
            label = scaled_k.index(min(scaled_k))
            three_sat_combined[m]["labels"].append(label)
        
sat_true = results_sat_score["metaphor"][m]["true"]


for m in three_sat_combined:
    three_sat_combined[m]["metrics"]= EvaluationMetrics(three_sat_combined[m]["labels"],test_sat_true)      
    print(m,'\t',three_sat_combined[m]["metrics"]["macro-f1"],'\t',three_sat_combined[m]["metrics"]["class_prf1"][2])
    

    
output = "utils_dictionaries/"
with open(output+"results_score3satcombines_randomSPlit_testSet_zero_shot.json", "w") as outfile:
    json.dump(three_sat_combined,outfile,indent="\t")

## SAT_Met results Lexical split test set

In [None]:
three_sat_combined = {}


test_sc_ana,test_sc_met,test_sc_ano = [],[],[]
test_sat_true = [ ]

for i,s in enumerate(results_sat_score["analogy"][m]["score"]):
    t= tuple(results_sat_score["analogy"][m]["text"][i])
    if satsplits["lexical"][t]=="test":
        test_sc_ana.append(s)
        test_sat_true.append(results_sat_score["analogy"][m]["true"][i])
    
for i,s in enumerate(results_sat_score["metaphor"][m]["score"]):
    t= tuple(results_sat_score["metaphor"][m]["text"][i])
    if satsplits["lexical"][t]=="test":
        test_sc_met.append(s)

for i,s in enumerate(results_sat_score["anomaly"][m]["score"]):
    t= tuple(results_sat_score["anomaly"][m]["text"][i])
    if satsplits["lexical"][t]=="test":
        test_sc_ano.append(s)


for m in results_sat_score["anomaly"]:
    three_sat_combined[m]={"scores":[],"scaled_scores":[],"labels":[],"metrics":[]}
    max_ano = max(test_sc_ano)
    min_ano = min(test_sc_ano)
    max_ana = max(test_sc_ana)
    min_ana = min(test_sc_ana)
    max_met = max(test_sc_met)
    min_met = min(test_sc_met)
    for i in range(len(results_sat_score["anomaly"][m]["text"])):  
        t = tuple(results_sat_score["anomaly"][m]["text"][i])
        if satsplits["lexical"][t]=="test":
            ano = results_sat_score["anomaly"][m]["score"][i]
            scaled_ano = ((ano-min_ano)/(max_ano - min_ano))*0.75
            ana = results_sat_score["analogy"][m]["score"][i]
            scaled_ana = ((ana-min_ana)/(max_ana - min_ana))
            met = results_sat_score["metaphor"][m]["score"][i]
            scaled_met = ((met - min_met)/(max_met - min_met))
            k = [ano,ana,met]
            scaled_k = [scaled_ano,scaled_ana,scaled_met]
            three_sat_combined[m]["scores"].append(k)
            three_sat_combined[m]["scaled_scores"].append(scaled_k)
            label = scaled_k.index(min(scaled_k))
            three_sat_combined[m]["labels"].append(label)
        
sat_true = results_sat_score["metaphor"][m]["true"]


for m in three_sat_combined:
    three_sat_combined[m]["metrics"]= EvaluationMetrics(three_sat_combined[m]["labels"],test_sat_true)      
    print(m,'\t',three_sat_combined[m]["metrics"]["macro-f1"],'\t',three_sat_combined[m]["metrics"]["class_prf1"][2])
   
output = "utils_dictionaries/"
with open(output+"results_score3satcombines_lexicalSplit_testSet_zero_shot.json", "w") as outfile:
    json.dump(three_sat_combined,outfile,indent="\t")

# SAT_MET score on our labels  binary met-analogy

In [None]:
two_sc_ana,two_sc_met = [],[]
for i,s in enumerate(results_sat_score["analogy"][m]["score"]):
    if results_sat_score["analogy"][m]["true"][i]!=0:
        two_sc_ana.append(s)
    
for i,s in enumerate(results_sat_score["metaphor"][m]["score"]):
    if results_sat_score["metaphor"][m]["true"][i]!=0:
        two_sc_met.append(s)

print(len(two_sc_ana))
    
two_sat_combined = {}
for m in results_sat_score["anomaly"]:
    two_sat_combined[m]={"scores":[],"scaled_scores":[],"labels":[],"metrics":[]}
    max_ana = max(two_sc_ana)
    min_ana = min(two_sc_ana)
    max_met = max(two_sc_met)
    min_met = min(two_sc_met)
    for i in range(len(results_sat_score["anomaly"][m]["text"])): 
        if sat_true[i]!=0:
            ana = results_sat_score["analogy"][m]["score"][i]
            scaled_ana = ((ana-min_ana)/(max_ana - min_ana))
            met = results_sat_score["metaphor"][m]["score"][i]
            scaled_met = ((met - min_met)/(max_met - min_met))
            k = [ana,met]
            scaled_k = [scaled_ana,scaled_met]
            two_sat_combined[m]["scores"].append(k)
            two_sat_combined[m]["scaled_scores"].append(scaled_k)
            #label = k.index(min(k))
            label = scaled_k.index(min(scaled_k))
            two_sat_combined[m]["labels"].append(label+1)
        
sat_true = results_sat_score["metaphor"][m]["true"]
two_sat_true = [x for x in sat_true if x!=0]

for m in two_sat_combined:
    two_sat_combined[m]["metrics"]= EvaluationMetrics(two_sat_combined[m]["labels"],two_sat_true)      
    #print(three_sat_combined[m]["labels"][:10])
    print(m,'\t','\t',two_sat_combined[m]["metrics"]["macro-f1"],'\t',two_sat_combined[m]["metrics"]["class_prf1"][2])
    
    
output = "utils_dictionaries/"
with open(output+"results_score2satcombined_binary_complete_set_zero_shot.json", "w") as outfile:
    json.dump(two_sat_combined,outfile,indent="\t")

# Other datasets : score, zero shot

## GPT 3's perplexity

## GPT4

# Output useful dictionaries

# Joanne results

In [46]:

output = "utils_dictionaries/"

datasets_sent2labelk = {}
datasets_sent2idxk = {}
datasets_datasplitsk = {}


for dataset in datasets_sent2label:
    datasets_sent2labelk[dataset] = {}
    for x in datasets_sent2label[dataset]:
        datasets_sent2labelk[dataset][str(x)]=datasets_sent2label[dataset][x]
        
for dataset in datasets_sent2idx:
    datasets_sent2idxk[dataset] = {}
    for x in datasets_sent2idx[dataset]:
        datasets_sent2idxk[dataset][str(x)]=datasets_sent2idx[dataset][x]
        
for dataset in datasets_datasplits:
    datasets_datasplitsk[dataset] = {"random":{},"lexical":{}}
    for fpl in datasets_datasplits[dataset]:
        for x in datasets_datasplits[dataset][fpl]:
            datasets_datasplitsk[dataset][fpl][str(x)]=datasets_datasplits[dataset][fpl][x]
        

print(1)

with open(output+"datasets_sent2label.json", "w") as outfile:
    json.dump(datasets_sent2labelk,outfile,indent="\t")
  
print(2)

with open(output+"datasets_sent2idx.json", "w") as outfile:
    json.dump(datasets_sent2idxk,outfile,indent="\t")
  
print(3)

with open(output+"datasets_idx2sent.json", "w") as outfile:
    json.dump(datasets_idx2sent,outfile,indent="\t")

print(4)

with open(output+"datasets_datasplits.json", "w") as outfile:
    json.dump(datasets_datasplitsk,outfile,indent="\t")

1
2
3
4


In [None]:
joanne_t5_finetuning = "joanne_results"
joanne_t5_finetuning = "joanne_results"

## Confusion Matrices

In [34]:
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

for data in results:
  if "Cardillo" in data:
    set_labels = mylabels["Cardillo"] 
  elif "Pairs" in data:
    set_labels = mylabels["Pairs"] 
  elif "Quadruples" in data:
    set_labels = mylabels["Quadruples"]
  print(metrics.classification_report(results[data]["true"], results[data]["generated"]))
  cm = metrics.confusion_matrix(results[data]["true"], results[data]["generated"])
  df_cm = pd.DataFrame(cm, index = set_labels, columns = set_labels)
  plt.figure(figsize = (10,7))
  sn.heatmap(df_cm, annot=True, cmap='Purples', fmt='g')

NameError: name 'results' is not defined