## Imports

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.metrics import multilabel_confusion_matrix

In [2]:
import sys

PROJECT_PATH = '/home/adiel/full-temporal-relation'
if PROJECT_PATH not in sys.path:
    sys.path.append(PROJECT_PATH)

from full_temporal_relation.visualization.data import plot_relation_bars
from full_temporal_relation.data.preprocessing import load_data
from full_temporal_relation.metrics import (relation_table, recall, precision, 
                                            calculate_f1, calculate_micro_f1, 
                                            calculate_relax_micro_f1, summary_results)

# Data

In [3]:
DATA_PATH = Path('../data')
TRC_RESULTS_PATH = DATA_PATH / 'TRC' / 'results'

In [4]:
gold_df = load_data(DATA_PATH/ 'MATRES' / 'platinum.txt')
gold_df

Unnamed: 0,docid,verb1,verb2,eiid1,eiid2,relation,label,unique_id
0,WSJ_20130322_159,apologized,happened,e1,e5,VAGUE,VAGUE,e1-e5
1,WSJ_20130322_159,apologized,wrapped,e1,e6,BEFORE,BEFORE,e1-e6
2,WSJ_20130322_159,apologized,seemed,e1,e10,BEFORE,BEFORE,e1-e10
3,WSJ_20130322_159,apologized,yield,e1,e11,VAGUE,VAGUE,e1-e11
4,WSJ_20130322_159,happened,wrapped,e5,e6,BEFORE,BEFORE,e5-e6
...,...,...,...,...,...,...,...,...
832,CNN_20130322_248,sparing,begin,e3,e6,BEFORE,BEFORE,e3-e6
833,CNN_20130322_248,sparing,said,e3,e7,EQUAL,EQUAL,e3-e7
834,CNN_20130322_248,expected,begin,e4,e6,BEFORE,BEFORE,e4-e6
835,CNN_20130322_248,expected,said,e4,e7,BEFORE,BEFORE,e4-e7


# Gemini-1.5-Pro

In [5]:
model_name = 'gemini-1.5-pro'
method = 'zero-shot'

## Zero-Shot

In [6]:
df_results = pd.read_csv(TRC_RESULTS_PATH / method / f'platinum-results-{model_name}-{method}.csv')
df_results

Unnamed: 0,docid,verb1,verb2,eiid1,eiid2,relation,unique_id,model_name,p_label,iter,score,relation_selected,max_score,conflict_rel,n_cycles,min_vote
0,AP_20130322,killed,season,e2,e1000028,BEFORE,e1000028-e2,gemini-1.5-pro,BEFORE,0.0,1,BEFORE,5.0,,,3
1,AP_20130322,started,sparking,e3,e4,BEFORE,e3-e4,gemini-1.5-pro,BEFORE,0.0,1,BEFORE,4.0,,,3
2,AP_20130322,sparking,turn,e4,e5,BEFORE,e4-e5,gemini-1.5-pro,BEFORE,0.0,1,BEFORE,4.0,,,3
3,AP_20130322,turn,ended,e5,e6,BEFORE,e5-e6,gemini-1.5-pro,BEFORE,0.0,1,,,,,3
4,AP_20130322,ended,according,e6,e7,BEFORE,e6-e7,gemini-1.5-pro,BEFORE,0.0,1,BEFORE,4.0,,,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4775,nyt_20130322_strange_computer,use,test,e25,e27,BEFORE,e25-e27,gemini-1.5-pro,BEFORE,4.0,1,BEFORE,5.0,,,3
4776,nyt_20130322_strange_computer,tell,react,e28,e29,BEFORE,e28-e29,gemini-1.5-pro,BEFORE,4.0,1,BEFORE,5.0,,,3
4777,nyt_20130322_strange_computer,react,burst,e29,e30,BEFORE,e29-e30,gemini-1.5-pro,BEFORE,4.0,1,BEFORE,5.0,,,3
4778,nyt_20130322_strange_computer,react,pulse,e29,e31,BEFORE,e29-e31,gemini-1.5-pro,BEFORE,4.0,1,BEFORE,5.0,,,3


In [7]:
preds_df = (df_results[['docid', 'unique_id', 'relation_selected', 'p_label']]
            .copy()
            .dropna()
            .drop_duplicates(['docid', 'unique_id', 'relation_selected'])
            .rename({'relation_selected': 'relation'}, axis='columns'))
preds_df

Unnamed: 0,docid,unique_id,relation,p_label
0,AP_20130322,e1000028-e2,BEFORE,BEFORE
1,AP_20130322,e3-e4,BEFORE,BEFORE
2,AP_20130322,e4-e5,BEFORE,BEFORE
4,AP_20130322,e6-e7,BEFORE,BEFORE
5,AP_20130322,e8-e9,BEFORE,BEFORE
...,...,...,...,...
4674,nyt_20130322_strange_computer,e28-e29,BEFORE,BEFORE
4675,nyt_20130322_strange_computer,e29-e30,BEFORE,BEFORE
4676,nyt_20130322_strange_computer,e29-e31,BEFORE,BEFORE
4677,nyt_20130322_strange_computer,e30-e31,EQUAL,EQUAL


In [63]:
matched_preds = pd.merge(preds_df, gold_df[['docid', 'unique_id']], how='inner', on=['docid', 'unique_id'])
gold_covarage = matched_preds.shape[0] / gold_df.shape[0] * 100
print(f'the covarage of the model is {gold_covarage:.2f}%')

the covarage of the model is 22.10%


In [8]:
df = relation_table(gold_df, preds_df, model_name, target_col='label')
df

Unnamed: 0_level_0,Unnamed: 1_level_0,gemini-1.5-pro,gemini-1.5-pro,gemini-1.5-pro,gemini-1.5-pro,gemini-1.5-pro,gemini-1.5-pro
Unnamed: 0_level_1,Unnamed: 1_level_1,BEFORE,AFTER,EQUAL,VAGUE,no_predictions,sum
gold-labeled,BEFORE,94.0,2.0,1.0,2.0,325.0,424.0
gold-labeled,AFTER,57.0,1.0,3.0,0.0,208.0,269.0
gold-labeled,EQUAL,8.0,0.0,1.0,0.0,22.0,31.0
gold-labeled,VAGUE,16.0,0.0,0.0,0.0,97.0,113.0
gold-labeled,no_label,354.0,2.0,27.0,0.0,0.0,383.0
gold-labeled,sum,529.0,5.0,32.0,2.0,652.0,


In [65]:
df.to_excel(DATA_PATH / f'models-comp-results-{model_name}-{method}.xlsx')

In [9]:
recall_df = recall(df)
recall_df

Unnamed: 0,BEFORE,AFTER,EQUAL,VAGUE
0,0.221698,0.003717,0.032258,0.0


In [10]:
precision_df = precision(df)
precision_df

Unnamed: 0,BEFORE,AFTER,EQUAL,VAGUE
0,0.177694,0.2,0.03125,0.0


In [18]:
calculate_f1(precision_df.to_numpy()[0], recall_df.to_numpy()[0])

array([0.19727177, 0.00729927, 0.03174603, 0.        ])

In [19]:
calculate_micro_f1(precision_df.to_numpy()[0], recall_df.to_numpy()[0])

0.0076591422878803375

In [11]:
calculate_relax_micro_f1(precision_df.to_numpy()[0], recall_df.to_numpy()[0])

0.0076591422878803375

### Completion

In [12]:
df_pro_completion_results = pd.read_csv(TRC_RESULTS_PATH / method / f'platinum-results-{model_name}-{method}-completion.csv')
preds_zero_pro_completion_df = (df_pro_completion_results[['docid', 'unique_id', 'relation_selected', 'p_label']]
            .copy()
            .dropna()
            .drop_duplicates(['docid', 'unique_id', 'relation_selected'])
            .rename({'relation_selected': 'relation'}, axis='columns'))

preds_zero_pro_completion_df

Unnamed: 0,docid,unique_id,relation,p_label
0,AP_20130322,e2-e3,BEFORE,BEFORE
2,AP_20130322,e3-e4,BEFORE,BEFORE
3,AP_20130322,e3-e6,BEFORE,BEFORE
4,AP_20130322,e3-e7,BEFORE,BEFORE
5,AP_20130322,e4-e6,BEFORE,BEFORE
...,...,...,...,...
2592,nyt_20130321_china_pollution,e2-e8,BEFORE,BEFORE
2601,nyt_20130321_china_pollution,e14-e8,BEFORE,BEFORE
2602,nyt_20130321_china_pollution,e15-e8,BEFORE,BEFORE
2603,nyt_20130321_china_pollution,e16-e8,BEFORE,BEFORE


In [13]:
df = relation_table(gold_df, preds_zero_pro_completion_df, f'{model_name}-{method}-completion', target_col='label')
df

Unnamed: 0_level_0,Unnamed: 1_level_0,gemini-1.5-pro-zero-shot-completion,gemini-1.5-pro-zero-shot-completion,gemini-1.5-pro-zero-shot-completion,gemini-1.5-pro-zero-shot-completion,gemini-1.5-pro-zero-shot-completion,gemini-1.5-pro-zero-shot-completion
Unnamed: 0_level_1,Unnamed: 1_level_1,BEFORE,AFTER,EQUAL,VAGUE,no_predictions,sum
gold-labeled,BEFORE,165.0,4.0,12.0,8.0,235.0,424.0
gold-labeled,AFTER,86.0,15.0,7.0,4.0,157.0,269.0
gold-labeled,EQUAL,3.0,1.0,3.0,1.0,23.0,31.0
gold-labeled,VAGUE,30.0,4.0,2.0,3.0,74.0,113.0
gold-labeled,no_label,5.0,0.0,0.0,1.0,0.0,6.0
gold-labeled,sum,289.0,24.0,24.0,17.0,489.0,


In [68]:
matched_preds = pd.merge(preds_zero_pro_completion_df, gold_df[['docid', 'unique_id']], how='inner', on=['docid', 'unique_id'])
gold_covarage = matched_preds.shape[0] / gold_df.shape[0] * 100
print(f'the covarage of the model is {gold_covarage:.2f}%')

the covarage of the model is 41.58%


In [69]:
cycles_score = df_pro_completion_results[['docid', 'n_cycles']].drop_duplicates()['n_cycles'].sum() / df_pro_completion_results['docid'].nunique()
print(f'there is a {cycles_score * 100}% chance for a cycle')

there is a 0.0% chance for a cycle


In [70]:
df.to_excel(DATA_PATH / f'models-comp-results-{model_name}-{method}-completion.xlsx')

In [14]:
recall_df = recall(df)
recall_df

Unnamed: 0,BEFORE,AFTER,EQUAL,VAGUE
0,0.389151,0.055762,0.096774,0.026549


In [15]:
precision_df = precision(df)
precision_df

Unnamed: 0,BEFORE,AFTER,EQUAL,VAGUE
0,0.570934,0.625,0.125,0.176471


In [16]:
calculate_f1(precision_df.to_numpy()[0], recall_df.to_numpy()[0])

array([0.4628331 , 0.10238908, 0.10909091, 0.04615385])

In [17]:
calculate_micro_f1(precision_df.to_numpy()[0], recall_df.to_numpy()[0])

0.03552694661446764

In [18]:
calculate_relax_micro_f1(precision_df.to_numpy()[0], recall_df.to_numpy()[0])

0.04639799879613124

### Explenation

In [19]:
df_pro_completion_explanation_results = pd.read_csv(TRC_RESULTS_PATH / method / f'platinum-results-{model_name}-{method}-completion-explanation.csv')
preds_zero_pro_completion_explanation_df = (df_pro_completion_explanation_results[['docid', 'unique_id', 'relation_selected', 'p_label']]
            .copy()
            .dropna()
            .drop_duplicates(['docid', 'unique_id', 'relation_selected'])
            .rename({'relation_selected': 'relation'}, axis='columns'))

preds_zero_pro_completion_explanation_df

Unnamed: 0,docid,unique_id,relation,p_label
0,AP_20130322,e2-e3,BEFORE,BEFORE
2,AP_20130322,e3-e4,BEFORE,BEFORE
3,AP_20130322,e3-e6,BEFORE,BEFORE
4,AP_20130322,e3-e7,BEFORE,BEFORE
7,AP_20130322,e6-e7,BEFORE,BEFORE
...,...,...,...,...
3783,nyt_20130322_strange_computer,e11-e14,BEFORE,BEFORE
3784,nyt_20130322_strange_computer,e11-e16,BEFORE,BEFORE
3792,nyt_20130322_strange_computer,e19-e22,BEFORE,BEFORE
3793,nyt_20130322_strange_computer,e19-e23,BEFORE,BEFORE


In [20]:
df = relation_table(gold_df, preds_zero_pro_completion_explanation_df, f'{model_name}-{method}-completion-explanation', target_col='label')
df

Unnamed: 0_level_0,Unnamed: 1_level_0,gemini-1.5-pro-zero-shot-completion-explanation,gemini-1.5-pro-zero-shot-completion-explanation,gemini-1.5-pro-zero-shot-completion-explanation,gemini-1.5-pro-zero-shot-completion-explanation,gemini-1.5-pro-zero-shot-completion-explanation,gemini-1.5-pro-zero-shot-completion-explanation
Unnamed: 0_level_1,Unnamed: 1_level_1,BEFORE,AFTER,EQUAL,VAGUE,no_predictions,sum
gold-labeled,BEFORE,276.0,11.0,10.0,28.0,99.0,424.0
gold-labeled,AFTER,141.0,46.0,6.0,21.0,55.0,269.0
gold-labeled,EQUAL,13.0,5.0,4.0,6.0,3.0,31.0
gold-labeled,VAGUE,50.0,8.0,2.0,17.0,36.0,113.0
gold-labeled,no_label,0.0,0.0,0.0,0.0,0.0,0.0
gold-labeled,sum,480.0,70.0,22.0,72.0,193.0,


In [49]:
matched_preds = pd.merge(preds_zero_pro_completion_explanation_df, gold_df[['docid', 'unique_id']], how='inner', on=['docid', 'unique_id'])
gold_covarage = matched_preds.shape[0] / gold_df.shape[0] * 100
print(f'the covarage of the model is {gold_covarage:.2f}%')

the covarage of the model is 76.94%


In [51]:
cycles_score = df_pro_completion_explanation_results[['docid', 'n_cycles']].drop_duplicates()['n_cycles'].sum() / df_pro_completion_explanation_results['docid'].nunique()
print(f'there is a {cycles_score * 100}% chance for a cycle')

there is a 10.0% chance for a cycle


In [52]:
df.to_excel(DATA_PATH / f'models-comp-results-{model_name}-{method}-completion-explanation.xlsx')

In [21]:
recall_df = recall(df)
recall_df

Unnamed: 0,BEFORE,AFTER,EQUAL,VAGUE
0,0.650943,0.171004,0.129032,0.150442


In [22]:
precision_df = precision(df)
precision_df

Unnamed: 0,BEFORE,AFTER,EQUAL,VAGUE
0,0.575,0.657143,0.181818,0.236111


In [10]:
calculate_f1(precision_df.to_numpy()[0], recall_df.to_numpy()[0])

array([0.61061947, 0.27138643, 0.1509434 , 0.18378378])

In [12]:
calculate_micro_f1(precision_df.to_numpy()[0], recall_df.to_numpy()[0])

0.1032992638797949

In [23]:
calculate_relax_micro_f1(precision_df.to_numpy()[0], recall_df.to_numpy()[0])

0.13279603692662253

## Few-Shot

In [6]:
method = 'few-shot'

In [25]:
df_pro_few_results = pd.read_csv(TRC_RESULTS_PATH / method / f'platinum-results-{model_name}-{method}.csv')
preds_few_pro_df = (df_pro_few_results[['docid', 'unique_id', 'relation_selected', 'p_label']]
            .copy()
            .dropna()
            .drop_duplicates(['docid', 'unique_id', 'relation_selected'])
            .rename({'relation_selected': 'relation'}, axis='columns'))


df = relation_table(gold_df, preds_few_pro_df, model_name, target_col='label')
df

Unnamed: 0_level_0,Unnamed: 1_level_0,gemini-1.5-pro,gemini-1.5-pro,gemini-1.5-pro,gemini-1.5-pro,gemini-1.5-pro,gemini-1.5-pro
Unnamed: 0_level_1,Unnamed: 1_level_1,BEFORE,AFTER,EQUAL,VAGUE,no_predictions,sum
gold-labeled,BEFORE,151.0,0.0,0.0,4.0,269.0,424.0
gold-labeled,AFTER,91.0,2.0,0.0,3.0,173.0,269.0
gold-labeled,EQUAL,8.0,0.0,2.0,0.0,21.0,31.0
gold-labeled,VAGUE,26.0,3.0,0.0,3.0,81.0,113.0
gold-labeled,no_label,479.0,13.0,5.0,23.0,0.0,520.0
gold-labeled,sum,755.0,18.0,7.0,33.0,544.0,


In [18]:
relation_table(gold_df, preds_few_pro_df, model_name)

Unnamed: 0_level_0,Unnamed: 1_level_0,gemini-1.5-pro,gemini-1.5-pro,gemini-1.5-pro,gemini-1.5-pro,gemini-1.5-pro,gemini-1.5-pro
Unnamed: 0_level_1,Unnamed: 1_level_1,BEFORE,AFTER,EQUAL,VAGUE,no_predictions,sum
gold-labeled,BEFORE,247.0,0.0,0.0,4.0,442.0,693.0
gold-labeled,AFTER,0.0,0.0,0.0,0.0,0.0,0.0
gold-labeled,EQUAL,8.0,0.0,2.0,0.0,21.0,31.0
gold-labeled,VAGUE,31.0,0.0,0.0,1.0,81.0,113.0
gold-labeled,no_label,505.0,0.0,5.0,10.0,0.0,520.0
gold-labeled,sum,791.0,0.0,7.0,15.0,544.0,


In [73]:
matched_preds = pd.merge(preds_few_pro_df, gold_df[['docid', 'unique_id']], how='inner', on=['docid', 'unique_id'])
gold_covarage = matched_preds.shape[0] / gold_df.shape[0] * 100
print(f'the covarage of the model is {gold_covarage:.2f}%')

the covarage of the model is 35.01%


In [74]:
cycles_score = df_pro_few_results[['docid', 'n_cycles']].drop_duplicates()['n_cycles'].sum() / df_pro_few_results['docid'].nunique()
print(f'there is a {cycles_score * 100}% chance for a cycle')

there is a 5.0% chance for a cycle


In [75]:
df.to_excel(DATA_PATH / f'models-comp-results-{model_name}-{method}.xlsx')

In [26]:
recall_df = recall(df)
recall_df

Unnamed: 0,BEFORE,AFTER,EQUAL,VAGUE
0,0.356132,0.007435,0.064516,0.026549


In [27]:
precision_df = precision(df)
precision_df

Unnamed: 0,BEFORE,AFTER,EQUAL,VAGUE
0,0.2,0.111111,0.285714,0.090909


In [28]:
calculate_f1(precision_df.to_numpy()[0], recall_df.to_numpy()[0])

array([0.25614928, 0.01393728, 0.10526316, 0.04109589])

In [29]:
calculate_micro_f1(precision_df.to_numpy()[0], recall_df.to_numpy()[0])

0.04682637699492519

In [30]:
calculate_relax_micro_f1(precision_df.to_numpy()[0], recall_df.to_numpy()[0])

0.056547534434606105


### Completion

In [14]:
f'platinum-results-{model_name}-{method}-completion.csv'

'platinum-results-gemini-1.5-pro-few-shot-completion.csv'

In [31]:
df_pro_completion_results = pd.read_csv(TRC_RESULTS_PATH / method / f'platinum-results-{model_name}-{method}-completion.csv')
preds_few_pro_completion_df = (df_pro_completion_results[['docid', 'unique_id', 'relation_selected', 'p_label']]
            .copy()
            .dropna()
            .drop_duplicates(['docid', 'unique_id', 'relation_selected'])
            .rename({'relation_selected': 'relation'}, axis='columns'))

preds_few_pro_completion_df

Unnamed: 0,docid,unique_id,relation,p_label
0,AP_20130322,e2-e3,BEFORE,BEFORE
1,AP_20130322,e2-e4,BEFORE,BEFORE
2,AP_20130322,e3-e4,BEFORE,BEFORE
3,AP_20130322,e3-e6,BEFORE,BEFORE
4,AP_20130322,e3-e7,BEFORE,BEFORE
...,...,...,...,...
4067,nyt_20130322_strange_computer,e19-e23,BEFORE,BEFORE
4068,nyt_20130322_strange_computer,e22-e23,BEFORE,BEFORE
4069,nyt_20130322_strange_computer,e19-e24,BEFORE,BEFORE
4070,nyt_20130322_strange_computer,e22-e24,BEFORE,BEFORE


In [32]:
df = relation_table(gold_df, preds_few_pro_completion_df, f'{model_name}-{method}-completion', target_col='label')
df

Unnamed: 0_level_0,Unnamed: 1_level_0,gemini-1.5-pro-few-shot-completion,gemini-1.5-pro-few-shot-completion,gemini-1.5-pro-few-shot-completion,gemini-1.5-pro-few-shot-completion,gemini-1.5-pro-few-shot-completion,gemini-1.5-pro-few-shot-completion
Unnamed: 0_level_1,Unnamed: 1_level_1,BEFORE,AFTER,EQUAL,VAGUE,no_predictions,sum
gold-labeled,BEFORE,372.0,2.0,4.0,27.0,19.0,424.0
gold-labeled,AFTER,219.0,22.0,2.0,13.0,13.0,269.0
gold-labeled,EQUAL,24.0,2.0,0.0,4.0,1.0,31.0
gold-labeled,VAGUE,90.0,7.0,1.0,12.0,3.0,113.0
gold-labeled,no_label,0.0,0.0,0.0,0.0,0.0,0.0
gold-labeled,sum,705.0,33.0,7.0,56.0,36.0,


In [16]:
df = relation_table(gold_df, preds_few_pro_completion_df, f'{model_name}-{method}-completion', target_col='label')
df

Unnamed: 0_level_0,Unnamed: 1_level_0,gemini-1.5-pro-few-shot-completion,gemini-1.5-pro-few-shot-completion,gemini-1.5-pro-few-shot-completion,gemini-1.5-pro-few-shot-completion,gemini-1.5-pro-few-shot-completion,gemini-1.5-pro-few-shot-completion
Unnamed: 0_level_1,Unnamed: 1_level_1,BEFORE,AFTER,EQUAL,VAGUE,no_predictions,sum
gold-labeled,BEFORE,405.0,0.0,0.0,0.0,19.0,424.0
gold-labeled,AFTER,0.0,256.0,0.0,0.0,13.0,269.0
gold-labeled,EQUAL,0.0,0.0,30.0,0.0,1.0,31.0
gold-labeled,VAGUE,0.0,0.0,0.0,110.0,3.0,113.0
gold-labeled,no_label,0.0,0.0,0.0,0.0,0.0,0.0
gold-labeled,sum,405.0,256.0,30.0,110.0,36.0,


In [78]:
matched_preds = pd.merge(preds_few_pro_completion_df, gold_df[['docid', 'unique_id']], how='inner', on=['docid', 'unique_id'])
gold_covarage = matched_preds.shape[0] / gold_df.shape[0] * 100
print(f'the covarage of the model is {gold_covarage:.2f}%')

the covarage of the model is 95.70%


In [79]:
cycles_score = df_pro_completion_results[['docid', 'n_cycles']].drop_duplicates()['n_cycles'].sum() / df_pro_completion_results['docid'].nunique()
print(f'there is a {cycles_score * 100}% chance for a cycle')

there is a 0.0% chance for a cycle


In [80]:
df.to_excel(DATA_PATH / f'models-comp-results-{model_name}-{method}-completion-2.xlsx')

In [33]:
recall_df = recall(df)
recall_df

Unnamed: 0,BEFORE,AFTER,EQUAL,VAGUE
0,0.877358,0.081784,0.0,0.106195


In [34]:
precision_df = precision(df)
precision_df

Unnamed: 0,BEFORE,AFTER,EQUAL,VAGUE
0,0.52766,0.666667,0.0,0.214286


In [35]:
calculate_f1(precision_df.to_numpy()[0], recall_df.to_numpy()[0])

array([0.65899026, 0.14569536, 0.        , 0.14201183])

In [36]:
calculate_micro_f1(precision_df.to_numpy()[0], recall_df.to_numpy()[0])

0.2852412759693945

In [37]:
calculate_relax_micro_f1(precision_df.to_numpy()[0], recall_df.to_numpy()[0])

0.4121016555547362

### Explenation

In [7]:
df_pro_completion_explanation_results = pd.read_csv(TRC_RESULTS_PATH / method / f'platinum-results-{model_name}-{method}-completion-explanation.csv')
preds_few_pro_completion_explanation_df = (df_pro_completion_explanation_results[['docid', 'unique_id', 'relation_selected', 'p_label']]
            .copy()
            .dropna()
            .drop_duplicates(['docid', 'unique_id', 'relation_selected'])
            .rename({'relation_selected': 'relation'}, axis='columns'))

preds_few_pro_completion_explanation_df

Unnamed: 0,docid,unique_id,relation,p_label
0,AP_20130322,e2-e3,BEFORE,BEFORE
1,AP_20130322,e2-e4,BEFORE,BEFORE
2,AP_20130322,e3-e4,BEFORE,BEFORE
3,AP_20130322,e3-e6,BEFORE,BEFORE
4,AP_20130322,e3-e7,BEFORE,BEFORE
...,...,...,...,...
3917,nyt_20130322_strange_computer,e16-e19,VAGUE,VAGUE
3920,nyt_20130322_strange_computer,e19-e22,BEFORE,BEFORE
3921,nyt_20130322_strange_computer,e19-e23,BEFORE,BEFORE
3922,nyt_20130322_strange_computer,e22-e23,BEFORE,BEFORE


In [8]:
df = relation_table(gold_df, preds_few_pro_completion_explanation_df, f'{model_name}-{method}-completion-explanation', target_col='label')
df

Unnamed: 0_level_0,Unnamed: 1_level_0,gemini-1.5-pro-few-shot-completion-explanation,gemini-1.5-pro-few-shot-completion-explanation,gemini-1.5-pro-few-shot-completion-explanation,gemini-1.5-pro-few-shot-completion-explanation,gemini-1.5-pro-few-shot-completion-explanation,gemini-1.5-pro-few-shot-completion-explanation
Unnamed: 0_level_1,Unnamed: 1_level_1,BEFORE,AFTER,EQUAL,VAGUE,no_predictions,sum
gold-labeled,BEFORE,327.0,5.0,4.0,31.0,57.0,424.0
gold-labeled,AFTER,206.0,24.0,0.0,11.0,28.0,269.0
gold-labeled,EQUAL,22.0,2.0,1.0,1.0,5.0,31.0
gold-labeled,VAGUE,83.0,0.0,0.0,7.0,23.0,113.0
gold-labeled,no_label,0.0,0.0,0.0,0.0,0.0,0.0
gold-labeled,sum,638.0,31.0,5.0,50.0,113.0,


In [9]:
matched_preds = pd.merge(preds_few_pro_completion_explanation_df, gold_df[['docid', 'unique_id']], how='inner', on=['docid', 'unique_id'])
gold_covarage = matched_preds.shape[0] / gold_df.shape[0] * 100
print(f'the covarage of the model is {gold_covarage:.2f}%')

the covarage of the model is 86.50%


In [10]:
cycles_score = df_pro_completion_explanation_results[['docid', 'n_cycles']].drop_duplicates()['n_cycles'].sum() / df_pro_completion_explanation_results['docid'].nunique()
print(f'there is a {cycles_score * 100}% chance for a cycle')

there is a 0.0% chance for a cycle


In [11]:
df.to_excel(DATA_PATH / f'models-comp-results-{model_name}-{method}-completion-explanation.xlsx')

In [12]:
recall_df = recall(df)
recall_df

Unnamed: 0,BEFORE,AFTER,EQUAL,VAGUE
0,0.771226,0.089219,0.032258,0.061947


In [13]:
precision_df = precision(df)
precision_df

Unnamed: 0,BEFORE,AFTER,EQUAL,VAGUE
0,0.512539,0.774194,0.2,0.14


In [14]:
calculate_f1(precision_df.to_numpy()[0], recall_df.to_numpy()[0])

array([0.61581921, 0.16      , 0.05555556, 0.08588957])

In [15]:
calculate_micro_f1(precision_df.to_numpy()[0], recall_df.to_numpy()[0])

0.13929604669440937

In [16]:
calculate_relax_micro_f1(precision_df.to_numpy()[0], recall_df.to_numpy()[0])

0.2208661097744532

# Gemini-1.5-Flash

In [38]:
model_name = 'gemini-1.5-flash'

## Zero-Shot

In [39]:
method = 'zero-shot'

In [42]:
df_flash_results = pd.read_csv(TRC_RESULTS_PATH / method / f'platinum-results-{model_name}-{method}.csv')
preds_zero_flash_df = (df_flash_results[['docid', 'unique_id', 'relation_selected', 'p_label']]
            .copy()
            .dropna()
            .drop_duplicates(['docid', 'unique_id', 'relation_selected'])
            .rename({'relation_selected': 'relation'}, axis='columns'))


df = relation_table(gold_df, preds_zero_flash_df, model_name, target_col='label')
df

FileNotFoundError: [Errno 2] No such file or directory: '../data/TRC/results/zero-shot/platinum-results-gemini-1.5-flash-zero-shot.csv'

In [33]:
preds_zero_flash_df

Unnamed: 0,docid,unique_id,relation
0,AP_20130322,e2-e3,BEFORE
1,AP_20130322,e3-e4,BEFORE
2,AP_20130322,e4-e5,BEFORE
3,AP_20130322,e5-e6,BEFORE
5,AP_20130322,e8-e9,BEFORE
...,...,...,...
8740,nyt_20130322_strange_computer,e11-e9,BEFORE
8742,nyt_20130322_strange_computer,e14-e15,BEFORE
8744,nyt_20130322_strange_computer,e18-e19,BEFORE
8749,nyt_20130322_strange_computer,e24-e25,BEFORE


In [62]:
matched_preds = pd.merge(preds_zero_flash_df, gold_df[['docid', 'unique_id']], how='inner', on=['docid', 'unique_id'])
gold_covarage = matched_preds.shape[0] / gold_df.shape[0] * 100
print(f'the covarage of the model is {gold_covarage:.2f}%')

the covarage of the model is 18.52%


In [63]:
cycles_score = df_flash_results[['docid', 'n_cycles']].drop_duplicates()['n_cycles'].sum() / df_flash_results['docid'].nunique()
print(f'there is a {cycles_score * 100}% chance for a cycle')

there is a 0.0% chance for a cycle


In [64]:
df.to_excel(DATA_PATH / f'models-comp-results-{model_name}.xlsx')

### Completion

In [65]:
df_flash_completion_results = pd.read_csv(TRC_RESULTS_PATH / method / f'platinum-results-{model_name}-completion.csv')
preds_zero_flash_completion_df = (df_flash_completion_results[['docid', 'unique_id', 'relation_selected']]
            .copy()
            .dropna()
            .drop_duplicates()
            .rename({'relation_selected': 'relation'}, axis='columns'))


df = relation_table(gold_df, preds_zero_flash_completion_df, model_name)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,gemini-1.5-flash,gemini-1.5-flash,gemini-1.5-flash,gemini-1.5-flash,gemini-1.5-flash
Unnamed: 0_level_1,Unnamed: 1_level_1,BEFORE,EQUAL,VAGUE,no_predictions,sum
gold-labeled,BEFORE,154.0,1.0,0.0,538.0,693.0
gold-labeled,EQUAL,8.0,0.0,0.0,23.0,31.0
gold-labeled,VAGUE,28.0,0.0,0.0,85.0,113.0
gold-labeled,no_label,13.0,0.0,0.0,0.0,13.0
gold-labeled,sum,203.0,1.0,0.0,646.0,


In [66]:
matched_preds = pd.merge(preds_zero_flash_df, gold_df[['docid', 'unique_id']], how='inner', on=['docid', 'unique_id'])
gold_covarage = matched_preds.shape[0] / gold_df.shape[0] * 100
print(f'the covarage of the model is {gold_covarage:.2f}%')

the covarage of the model is 18.52%


In [67]:
cycles_score = df_flash_results[['docid', 'n_cycles']].drop_duplicates()['n_cycles'].sum() / df_flash_results['docid'].nunique()
print(f'there is a {cycles_score * 100}% chance for a cycle')

there is a 0.0% chance for a cycle


In [68]:
df.to_excel(DATA_PATH / f'models-comp-results-{model_name}-completion.xlsx')

## Few-Shot

In [35]:
method = 'few-shot'

In [36]:
df_results = pd.read_csv(TRC_RESULTS_PATH / method / f'platinum-results-{model_name}.csv')
df_results

Unnamed: 0,docid,verb1,verb2,eiid1,eiid2,relation,unique_id,model_name,n_iter,score,relation_selected,max_score,ccr,scr,n_cycles,min_vote
0,AP_20130322,killed,started,e2,e3,BEFORE,e2-e3,gemini-1.5-flash,0.0,1,BEFORE,5.0,0.121212,1.0,,3
1,AP_20130322,started,sparking,e3,e4,BEFORE,e3-e4,gemini-1.5-flash,0.0,1,BEFORE,5.0,0.121212,1.0,,3
2,AP_20130322,started,turn,e3,e5,BEFORE,e3-e5,gemini-1.5-flash,0.0,1,BEFORE,5.0,0.121212,1.0,,3
3,AP_20130322,started,ended,e3,e6,BEFORE,e3-e6,gemini-1.5-flash,0.0,1,,,0.121212,1.0,,3
4,AP_20130322,started,according,e3,e7,BEFORE,e3-e7,gemini-1.5-flash,0.0,1,,,0.121212,1.0,,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14204,nyt_20130322_strange_computer,tell,react,e28,e29,BEFORE,e28-e29,gemini-1.5-flash,4.0,1,BEFORE,5.0,0.000000,1.0,,3
14205,nyt_20130322_strange_computer,react,burst,e29,e30,BEFORE,e29-e30,gemini-1.5-flash,4.0,1,BEFORE,5.0,0.000000,1.0,,3
14206,nyt_20130322_strange_computer,react,pulse,e29,e31,BEFORE,e29-e31,gemini-1.5-flash,4.0,1,BEFORE,5.0,0.000000,1.0,,3
14207,nyt_20130322_strange_computer,burst,pulse,e30,e31,VAGUE,e30-e31,gemini-1.5-flash,4.0,1,,,0.000000,1.0,,3


In [37]:
preds_df = (df_results[['docid', 'unique_id', 'relation_selected']]
            .copy()
            .dropna()
            .drop_duplicates()
            .rename({'relation_selected': 'relation'}, axis='columns'))
preds_df

Unnamed: 0,docid,unique_id,relation
0,AP_20130322,e2-e3,BEFORE
1,AP_20130322,e3-e4,BEFORE
2,AP_20130322,e3-e5,BEFORE
5,AP_20130322,e4-e5,BEFORE
8,AP_20130322,e5-e6,BEFORE
...,...,...,...
14021,nyt_20130322_strange_computer,e25-e27,BEFORE
14022,nyt_20130322_strange_computer,e28-e29,BEFORE
14023,nyt_20130322_strange_computer,e29-e30,BEFORE
14024,nyt_20130322_strange_computer,e29-e31,BEFORE


In [38]:
matched_preds = pd.merge(preds_df, gold_df[['docid', 'unique_id']], how='inner', on=['docid', 'unique_id'])
gold_covarage = matched_preds.shape[0] / gold_df.shape[0] * 100
print(f'the covarage of the model is {gold_covarage:.2f}%')

the covarage of the model is 21.98%


In [39]:
cycles_score = df_results[['docid', 'n_cycles']].drop_duplicates()['n_cycles'].sum() / df_results['docid'].nunique()
print(f'there is a {cycles_score * 100}% chance for a cycle')

there is a 0.0% chance for a cycle


In [41]:
df = relation_table(gold_df, preds_df, model_name)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,gemini-1.5-flash,gemini-1.5-flash,gemini-1.5-flash,gemini-1.5-flash,gemini-1.5-flash
Unnamed: 0_level_1,Unnamed: 1_level_1,BEFORE,EQUAL,VAGUE,no_predictions,sum
gold-labeled,BEFORE,158.0,0.0,0.0,535.0,693.0
gold-labeled,EQUAL,7.0,0.0,0.0,24.0,31.0
gold-labeled,VAGUE,19.0,0.0,0.0,94.0,113.0
gold-labeled,no_label,1081.0,0.0,0.0,0.0,1081.0
gold-labeled,sum,1265.0,0.0,0.0,653.0,


In [42]:
df.to_excel(DATA_PATH / f'models-comp-results-{model_name}.xlsx')

# Llama-3.1-70b-versatile

In [5]:
model_name = 'llama-3.1-70b-versatile'

## Zero-Shot

In [6]:
method = 'zero-shot'

In [10]:
df_flash_results = pd.read_csv(TRC_RESULTS_PATH / method / f'platinum-results-{model_name}-{method}.csv')
preds_zero_flash_df = (df_flash_results[['docid', 'unique_id', 'relation_selected', 'p_label']]
            .copy()
            .dropna()
            .drop_duplicates(['docid', 'unique_id', 'relation_selected'])
            .rename({'relation_selected': 'relation'}, axis='columns'))


df = relation_table(gold_df, preds_zero_flash_df, model_name, target_col='label')
df

Unnamed: 0_level_0,Unnamed: 1_level_0,llama-3.1-70b-versatile,llama-3.1-70b-versatile,llama-3.1-70b-versatile,llama-3.1-70b-versatile,llama-3.1-70b-versatile,llama-3.1-70b-versatile
Unnamed: 0_level_1,Unnamed: 1_level_1,BEFORE,AFTER,EQUAL,VAGUE,no_predictions,sum
gold-labeled,BEFORE,33.0,0.0,1.0,13.0,377.0,424.0
gold-labeled,AFTER,40.0,0.0,0.0,24.0,205.0,269.0
gold-labeled,EQUAL,3.0,0.0,0.0,1.0,27.0,31.0
gold-labeled,VAGUE,8.0,0.0,0.0,6.0,99.0,113.0
gold-labeled,no_label,123.0,4.0,1.0,206.0,0.0,334.0
gold-labeled,sum,207.0,4.0,2.0,250.0,708.0,


In [11]:
matched_preds = pd.merge(preds_zero_flash_df, gold_df[['docid', 'unique_id']], how='inner', on=['docid', 'unique_id'])
gold_covarage = matched_preds.shape[0] / gold_df.shape[0] * 100
print(f'the covarage of the model is {gold_covarage:.2f}%')

the covarage of the model is 15.41%


In [12]:
cycles_score = df_flash_results[['docid', 'n_cycles']].drop_duplicates()['n_cycles'].sum() / df_flash_results['docid'].nunique()
print(f'there is a {cycles_score * 100}% chance for a cycle')

there is a 0.0% chance for a cycle


In [13]:
df.to_excel(DATA_PATH / f'models-comp-results-{model_name}-{method}.xlsx')

# Meta-Llama-3.1-8B-Instruct-Turbo

In [6]:
model_name = 'Meta-Llama-3.1-8B-Instruct-Turbo'

## Zero-Shot

In [7]:
method = 'zero-shot'

In [8]:
summary_results(TRC_RESULTS_PATH / method / f'platinum-results-{model_name}-{method}.csv', 
                gold_df, 
                model_name=f'platinum-results-{model_name}-{method}.csv')

  return pd.DataFrame(columns=[col[1] for col in df.columns][:4], data=[labels_values / sum_per_label])


relation,VAGUE,VAGUE,VAGUE,BEFORE,BEFORE,BEFORE,AFTER,AFTER,AFTER,EQUAL,EQUAL,EQUAL,micro-f1,relax-micro-f1,cycles,coverage
metric,precision,recall,f1,precision,recall,f1,precision,recall,f1,precision,recall,f1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0.077156,0.120283,0.094009,0.222222,0.007435,0.014388,,0.0,,0.006923,0.079646,0.012739,,,1.0 / 20,0.210275


### Completion

In [9]:
df_flash_results = pd.read_csv(TRC_RESULTS_PATH / method / f'platinum-results-{model_name}-{method}-completion.csv')
preds_zero_flash_df = (df_flash_results[['docid', 'unique_id', 'relation_selected', 'p_label']]
            .copy()
            .dropna()
            .drop_duplicates(['docid', 'unique_id', 'relation_selected'])
            .rename({'relation_selected': 'relation'}, axis='columns'))


df = relation_table(gold_df, preds_zero_flash_df, model_name, target_col='label')
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Meta-Llama-3.1-8B-Instruct-Turbo,Meta-Llama-3.1-8B-Instruct-Turbo,Meta-Llama-3.1-8B-Instruct-Turbo,Meta-Llama-3.1-8B-Instruct-Turbo,Meta-Llama-3.1-8B-Instruct-Turbo,Meta-Llama-3.1-8B-Instruct-Turbo
Unnamed: 0_level_1,Unnamed: 1_level_1,BEFORE,AFTER,EQUAL,VAGUE,no_predictions,sum
gold-labeled,BEFORE,88.0,43.0,13.0,129.0,151.0,424.0
gold-labeled,AFTER,56.0,18.0,10.0,65.0,120.0,269.0
gold-labeled,EQUAL,7.0,6.0,0.0,10.0,8.0,31.0
gold-labeled,VAGUE,17.0,17.0,1.0,39.0,39.0,113.0
gold-labeled,no_label,0.0,0.0,0.0,0.0,0.0,0.0
gold-labeled,sum,168.0,84.0,24.0,243.0,318.0,


In [66]:
df_flash_results['docid'].nunique()

18

In [37]:
matched_preds = pd.merge(preds_zero_flash_df, gold_df[['docid', 'unique_id']], how='inner', on=['docid', 'unique_id'])
gold_covarage = matched_preds.shape[0] / gold_df.shape[0] * 100
print(f'the covarage of the model is {gold_covarage:.2f}%')

the covarage of the model is 62.01%


In [65]:
n_cycles = df_flash_results[['docid', 'n_cycles']].drop_duplicates()['n_cycles'].sum()
n_docs = df_flash_results['docid'].nunique()
print(f'there is a {n_cycles}/{n_docs} chance for a cycle')

there is a 2.0/18 chance for a cycle


In [39]:
df.to_excel(DATA_PATH / f'models-comp-results-{model_name}-{method}-completion.csv.xlsx')

In [47]:
recall_df = recall(df)
recall_df

Unnamed: 0,BEFORE,AFTER,EQUAL,VAGUE
0,0.207547,0.066914,0.0,0.345133


In [48]:
precision_df = precision(df)
precision_df

Unnamed: 0,BEFORE,AFTER,EQUAL,VAGUE
0,0.52381,0.214286,0.0,0.160494


In [49]:
calculate_f1(precision_df.to_numpy()[0], recall_df.to_numpy()[0])

array([0.2972973 , 0.101983  , 0.        , 0.21910112])

In [50]:
calculate_micro_f1(precision_df.to_numpy()[0], recall_df.to_numpy()[0])

0.056727423928354774

In [51]:
calculate_relax_micro_f1(precision_df.to_numpy()[0], recall_df.to_numpy()[0])

0.04274112408767239

### Explanation

In [52]:
df_flash_results = pd.read_csv(TRC_RESULTS_PATH / method / f'platinum-results-{model_name}-{method}-completion-explanation.csv')
preds_zero_flash_df = (df_flash_results[['docid', 'unique_id', 'relation_selected', 'p_label']]
            .copy()
            .dropna()
            .drop_duplicates(['docid', 'unique_id', 'relation_selected'])
            .rename({'relation_selected': 'relation'}, axis='columns'))


df = relation_table(gold_df, preds_zero_flash_df, model_name, target_col='label')
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Meta-Llama-3.1-8B-Instruct-Turbo,Meta-Llama-3.1-8B-Instruct-Turbo,Meta-Llama-3.1-8B-Instruct-Turbo,Meta-Llama-3.1-8B-Instruct-Turbo,Meta-Llama-3.1-8B-Instruct-Turbo,Meta-Llama-3.1-8B-Instruct-Turbo
Unnamed: 0_level_1,Unnamed: 1_level_1,BEFORE,AFTER,EQUAL,VAGUE,no_predictions,sum
gold-labeled,BEFORE,77.0,15.0,0.0,213.0,119.0,424.0
gold-labeled,AFTER,64.0,8.0,0.0,134.0,63.0,269.0
gold-labeled,EQUAL,8.0,1.0,0.0,12.0,10.0,31.0
gold-labeled,VAGUE,13.0,7.0,0.0,61.0,32.0,113.0
gold-labeled,no_label,6.0,1.0,0.0,2.0,0.0,9.0
gold-labeled,sum,168.0,32.0,0.0,422.0,224.0,


In [32]:
matched_preds = pd.merge(preds_zero_flash_df, gold_df[['docid', 'unique_id']], how='inner', on=['docid', 'unique_id'])
gold_covarage = matched_preds.shape[0] / gold_df.shape[0] * 100
print(f'the covarage of the model is {gold_covarage:.2f}%')

the covarage of the model is 73.24%


In [34]:
n_cycles = df_flash_results[['docid', 'n_cycles']].drop_duplicates()['n_cycles'].sum()
n_docs = df_flash_results['docid'].nunique()
print(f'there is a {n_cycles}/{n_docs} chance for a cycle')

there is a 3.0/20 chance for a cycle


In [35]:
df.to_excel(DATA_PATH / f'models-comp-results-{model_name}-{method}-completion-explanation.csv.xlsx')

In [53]:
recall_df = recall(df)
recall_df

Unnamed: 0,BEFORE,AFTER,EQUAL,VAGUE
0,0.181604,0.02974,0.0,0.539823


In [60]:
precision_df = precision(df).fillna(0.0)
precision_df

  return pd.DataFrame(columns=[col[1] for col in df.columns][:4], data=[labels_values / sum_per_label])


Unnamed: 0,BEFORE,AFTER,EQUAL,VAGUE
0,0.458333,0.25,0.0,0.14455


In [61]:
calculate_f1(precision_df.to_numpy()[0], recall_df.to_numpy()[0])

array([0.26013514, 0.05315615, 0.        , 0.22803738])

In [62]:
calculate_micro_f1(precision_df.to_numpy()[0], recall_df.to_numpy()[0])

0.36323351604384435

In [63]:
calculate_relax_micro_f1(precision_df.to_numpy()[0], recall_df.to_numpy()[0])

0.03700857493398767