# 1. Imports

In [None]:
import pandas as pd
import numpy as np
from ast import literal_eval as leval
import seaborn as sns
from tqdm.auto import tqdm
tqdm.pandas()
from matplotlib import pyplot as plt

def get_outliers(series):
    """Return the entries of `series` that fall outside the Tukey fences.

    The fences sit 1.5 interquartile ranges below the first quartile and
    above the third quartile. Values strictly beyond either fence are
    returned with their original index labels.
    """
    lower_q, upper_q = series.quantile(0.25), series.quantile(0.75)
    spread = upper_q - lower_q  # interquartile range
    lower_fence = lower_q - 1.5*spread
    upper_fence = upper_q + 1.5*spread
    is_outlier = (series < lower_fence) | (series > upper_fence)
    return series[is_outlier]

from emoji import UNICODE_EMOJI

def is_emoji(s):
    """Return True if `s` contains a regional-indicator (flag) character or is a known emoji.

    Flags are detected by searching `s` for any code point in the
    U+1F1E6..U+1F1FF range; otherwise `s` is looked up as a key of
    `UNICODE_EMOJI`.
    """
    # Imported locally: the notebook only imports `re` in a much later cell,
    # so calling this function before that cell used to raise NameError.
    import re

    if re.search(u'[\U0001F1E6-\U0001F1FF]', s):
        return True
    # NOTE(review): this membership test assumes UNICODE_EMOJI is a flat
    # mapping keyed by emoji characters — confirm against the installed
    # `emoji` package version.
    return s in UNICODE_EMOJI

# 2. Get WTR dataset

In [None]:
reference_text_df = pd.read_csv('text_extraction/reference_html_as_sentences_df.csv')

## 2.1. References

This dataframe holds the information about the references, including the reference IDs and the text/sentences extracted from them.

In [None]:
reference_text_df.info()

In [None]:
def check_column_dist(df, col):
    """Tabulate how often each value of `df[col]` occurs.

    Returns a new dataframe with three columns: `col` (the distinct values,
    cast to str), 'counts' (number of rows per value) and 'per' (that count
    as a percentage of all counted rows).
    """
    dist = df[col].value_counts().reset_index()
    dist.columns = [col, 'counts']
    total = dist['counts'].sum()
    dist['per'] = 100*dist['counts']/total
    # Cast the labels to str (e.g. numeric codes become text labels).
    dist[col] = dist[col].astype(str)
    return dist

# Netlocs are equally spread
# codes are all good
# reasons are good
fig, ax = plt.subplots(1,3,figsize=(15,10))
# One horizontal percentage bar chart per categorical reference column.
for panel, column in zip(ax, ('netloc_agg', 'code', 'reason')):
    sns.barplot(data=check_column_dist(reference_text_df, column), x='per', y=column, ax=panel)
plt.tight_layout()
plt.show()

## 2.2. Claims

This has all the claim data, including labels, aliases, descriptions, and language IDs.

In [None]:
claim_data_df = pd.read_csv('text_extraction/text_reference_claims_df.csv')

In [None]:
claim_data_df.info()

In [None]:
# Headline sizes of the claim table: distinct claims and distinct references.
n_unique_claims = claim_data_df.claim_id.unique().shape[0]
n_unique_references = claim_data_df.reference_id.unique().shape[0]
print('Total counts:')
print(f'{n_unique_claims} unique claims')
print(f'{n_unique_references} unique references')

In [None]:
# Considerable amount of aliases for entities, LOTS for properties, reasonable for objects.
# We can use this to generate multiple verbalisations based on aliases.
fig, ax = plt.subplots(4,3,figsize=(15,10))

# Column shown in each panel, laid out exactly like the axes grid.
# None marks the top-right panel, which is intentionally left empty.
panel_columns = [
    ['rank', 'datatype', None],
    ['entity_label_lan', 'entity_alias_lan', 'entity_desc_lan'],
    ['property_label_lan', 'property_alias_lan', 'property_desc_lan'],
    ['object_label_lan', 'object_alias_lan', 'object_desc_lan'],
]
for axes_row, columns_row in zip(ax, panel_columns):
    for panel, column in zip(axes_row, columns_row):
        if column is None:
            continue
        sns.barplot(data=check_column_dist(claim_data_df, column), x='per', y=column, ax=panel)
plt.tight_layout()
plt.show()

In [None]:
# Distribution of entities and properties involved in the claim data

def _alias_count(raw_alias):
    """Number of aliases in a stringified alias list; the 'no-alias' marker counts as zero."""
    return len(leval(raw_alias)) if raw_alias != 'no-alias' else 0

fig, ax = plt.subplots(4,2,figsize=(10,12))

ax[0][0].set_yscale('log')
entity_count = claim_data_df.entity_id.value_counts()
ax[0][0].set_title('Boxplot of Entity distribution')
sns.boxplot(data = entity_count, ax=ax[0][0])
ax[0][1].set_title('KDE of Entity distribution')
sns.kdeplot(data = entity_count, ax=ax[0][1])

ax[1][0].set_yscale('log')
property_count = claim_data_df.property_id.value_counts()
ax[1][0].set_title('Boxplot of Property distribution')
sns.boxplot(data = property_count, ax=ax[1][0])
ax[1][1].set_title('KDE of Property distribution')
sns.kdeplot(data = property_count, ax=ax[1][1])

# Name both columns explicitly: the column names produced by
# value_counts().reset_index() differ between pandas versions, so relying on
# the implicit 'index'/'datatype' names breaks on newer pandas.
datatype_count = claim_data_df.datatype.value_counts().rename_axis('datatype').reset_index(name='count')
ax[2][0].set_title('Barplot of Datatype distribution')
sns.barplot(data = datatype_count, ax=ax[2][0], x='datatype', y='count')

# One alias-count series per triple component (previously all three reused
# the name `entity_alias_count`, leaving it holding the object counts).
entity_alias_count = claim_data_df.entity_alias.apply(_alias_count)
ax[2][1].set_title('KDE of Entity alias count distribution')
sns.kdeplot(data = entity_alias_count, ax=ax[2][1])

property_alias_count = claim_data_df.property_alias.apply(_alias_count)
ax[3][0].set_title('KDE of Property alias count distribution')
sns.kdeplot(data = property_alias_count, ax=ax[3][0])

object_alias_count = claim_data_df.object_alias.apply(_alias_count)
ax[3][1].set_title('KDE of Object alias count distribution')
sns.kdeplot(data = object_alias_count, ax=ax[3][1])

plt.tight_layout()
plt.show()

# We can see that entities and properties are somewhat fairly spread, with most within a lesser volume and a few outliers.
# Datatype has an OK distribution, nothing wrong there for this analysis.
# KDE of alias counts for ent, prop, and obj show most cases on low count and a few outliers.

# 3. Verbalisation

In [None]:
from verbalisation import verbalisation_module

# If updating the module
#from importlib import reload
#reload(verbalisation_module)

verb_module = verbalisation_module.VerbModule()

In [None]:
import torch
# Quick CUDA sanity check: the cell displays the tuple of the five calls below.
# NOTE(review): current_device() / get_device_name(0) presumably raise on a
# machine without a usable CUDA device — confirm before running on CPU-only hosts.
torch.cuda.is_available(),\
torch.cuda.device_count(),\
torch.cuda.current_device(),\
torch.cuda.device(0),\
torch.cuda.get_device_name(0)

In [None]:
!nvidia-smi

In [None]:
# Smoke tests for the verbalisation module. Each case pins the exact sentence
# the loaded model is expected to produce, so any assertion failure here means
# the module or its weights differ from the ones this notebook was written for.

# Case 1: a raw prompt string containing two linearised triples -> one sentence.
verbs = verb_module.verbalise('translate Graph to English: <H> World Trade Center <R> height <T> 200 meter <H> World Trade Center <R> is a <T> tower')
assert verbs == 'The World Trade Center is a tower with a height of 200 meters.'

# Case 2: a single triple given as a dict -> one sentence.
verbs = verb_module.verbalise({'subject': 'World Trade Center','predicate': 'height','object' : '200 meter'})
assert verbs == 'The height of the World Trade Center is 200 meters.'

# Case 3: a list wrapped in another list -> the inner triples are verbalised
# together as a single sentence.
verbs = verb_module.verbalise([[
        {'subject': 'World Trade Center','predicate': 'is a','object' : 'tower'},
        {'subject': 'World Trade Center','predicate': 'height','object' : '200 meter'}
]])
assert verbs == 'The World Trade Center is a tower with a height of 200 meters.'

# Case 4: a flat list of triples -> one sentence per triple, returned as a list.
verbs = verb_module.verbalise([
    {'subject': 'World Trade Center','predicate': 'is a','object' : 'tower'},
    {'subject': 'World Trade Center','predicate': 'height','object' : '200 meter'}
])
assert verbs == [
    'The World Trade Center is a tower.',
    'The height of the World Trade Center is 200 meters.'
]

# Case 5: a character the model cannot emit comes back as an <unk> token.
verbs = verb_module.verbalise({'subject': 'Cuhppulčohkka','predicate': 'instance of','object' : 'native label'})
assert verbs == 'Cuhppul<unk>ohkka is an instance of a native label.'

# Case 6: registering the original label lets the replacer restore an <unk>
# in the middle of a word.
verb_module.add_label_to_unk_replacer('Cuhppulčohkka')
sent = verb_module.replace_unks_on_sentence('Cuhppul<unk>ohkka is a native label.', empty_after=True)
assert sent == 'Cuhppulčohkka is a native label.'

# Case 7: same, with the <unk> at the end of the word.
verb_module.add_label_to_unk_replacer('Cuhppulohkkač')
sent = verb_module.replace_unks_on_sentence('Cuhppulohkka<unk> is a native label.', empty_after=True)
assert sent == 'Cuhppulohkkač is a native label.'


# Case 8: a label that is entirely one <unk> (a flag emoji).
verb_module.add_label_to_unk_replacer('🇬🇧')
sent = verb_module.replace_unks_on_sentence('The Value Added Tax Order 2016 applies to jurisdiction of <unk>.', empty_after=True)
assert sent == 'The Value Added Tax Order 2016 applies to jurisdiction of 🇬🇧.'

In [None]:
# Load the verbalisations saved by a previous run, if there are any.
# Only a missing or empty cache file means "start from scratch"; any other
# failure (e.g. a corrupt file) is allowed to surface instead of silently
# discarding earlier work and re-verbalising everything.
try:
    verbalised_claims_df = pd.read_csv('verbalisation/verbalised_claims_df.csv')
except (FileNotFoundError, pd.errors.EmptyDataError):
    verbalised_claims_df = None
verbalised_claims_df

In [None]:
import json
import time

BATCH_SIZE = 16

# The original cell had the real model call commented out and filled every
# verbalisation with the literal 'TEST'. That behaviour is kept as the default
# but is now an explicit switch.
# TODO: set to False to run the actual verbaliser (downstream <unk> statistics
# are meaningless with placeholder text).
USE_PLACEHOLDER_VERBALISATIONS = True

claim_data_to_keep = [
    'reference_id', 'entity_id', 'claim_id', 'rank', 'property_id', 'datatype',
    'entity_label', 'entity_desc', 'property_label', 'property_desc', 'object_label', 'object_desc'
] # also add entity_label_is_alias, same for property and object

# Seed the output with the usable rows of a previous run (if one was loaded).
if verbalised_claims_df is not None and not verbalised_claims_df.empty:
    verbalised_claims = json.loads(
        verbalised_claims_df[verbalised_claims_df['verbalisation'] != 'NO_VERBALISATION'].to_json(orient="records")
    )
else:
    verbalised_claims = []

print(len(verbalised_claims))

# Frame used to skip combinations that were already verbalised; None when no
# previous run was loaded.
# NOTE(review): rows marked 'NO_VERBALISATION' are dropped from the seed above
# but still count as "already done" here, so they are never retried — confirm
# this is intended.
previous_run_df = verbalised_claims_df if isinstance(verbalised_claims_df, pd.DataFrame) else None


def _english_variants(row, prefix):
    """Main label followed by aliases for `prefix` ('entity'/'property'/'object'), English only."""
    variants = [row[f'{prefix}_label']] if row[f'{prefix}_label_lan'] == 'en' else []
    if row[f'{prefix}_alias_lan'] == 'en':
        variants += leval(row[f'{prefix}_alias'])
    return variants


def _already_verbalised(row, subject, predicate, object_):
    """True if the previous run already holds this (reference, claim, s, p, o) combination."""
    if previous_run_df is None:
        return False
    return not previous_run_df[
        (previous_run_df['reference_id'] == row['reference_id']) &
        (previous_run_df['claim_id'] == row['claim_id']) &
        (previous_run_df['entity_label'] == subject) &
        (previous_run_df['object_label'] == object_) &
        (previous_run_df['property_label'] == predicate)
    ].empty


def _verbalise_batch(batch):
    """Verbalise every entry of `batch` in place, adding both verbalisation columns."""
    verbalisation_inputs = [{
        'subject':e['entity_label'], 'predicate':e['property_label'], 'object':e['object_label']
    } for e in batch]

    try:
        if USE_PLACEHOLDER_VERBALISATIONS:
            verbalisations = ['TEST' for _ in verbalisation_inputs]
        else:
            verbalisations = verb_module.verbalise(verbalisation_inputs)
    except Exception:
        print('ERROR VERBALISING:', verbalisation_inputs, sep='\n')
        raise

    for j, entry in enumerate(batch):
        try:
            # Register the labels so <unk> tokens can be mapped back to them.
            verb_module.add_label_to_unk_replacer(entry['entity_label'])
            verb_module.add_label_to_unk_replacer(entry['object_label'])
            entry.update({
                'verbalisation' : verbalisations[j],
                'verbalisation_unks_replaced': verb_module.replace_unks_on_sentence(
                    verbalisations[j], empty_after=True
                )
            })
        except Exception:
            # Best effort: keep the raw verbalisation and flag the failure.
            print('REPLACING_ERROR on', entry)
            entry.update({
                'verbalisation' : verbalisations[j],
                'verbalisation_unks_replaced': 'REPLACING_ERROR'
            })


verbalised_claims_this_batch = []

# The log file is created/truncated as in the original cell; nothing is
# written to it at present.
with open('verbalisation.log','w+',encoding='utf-8') as f:

    for i, row in tqdm(claim_data_df.iterrows(), total=claim_data_df.shape[0]):

        try:
            # The variant lists depend only on the row, so build each once.
            # The early `continue`s keep the original laziness: predicates and
            # objects are only parsed when the outer lists are non-empty.
            subjects = _english_variants(row, 'entity')
            if not subjects:
                continue
            predicates = _english_variants(row, 'property')
            if not predicates:
                continue
            objects = _english_variants(row, 'object')

            for i_s, subject in enumerate(subjects):
                for i_p, predicate in enumerate(predicates):
                    for i_o, object_ in enumerate(objects):

                        if _already_verbalised(row, subject, predicate, object_):
                            continue

                        verbalised_claim_entry = {
                            c : row[c] for c in claim_data_to_keep
                        }
                        # Position 0 of each list is the main label; anything
                        # after it is an alias.
                        verbalised_claim_entry.update({
                            'entity_label_is_alias': (i_s != 0),
                            'property_label_is_alias': (i_p != 0),
                            'object_label_is_alias': (i_o != 0),
                            'entity_label': subject,
                            'property_label': predicate,
                            'object_label': object_
                        })

                        verbalised_claims_this_batch.append(verbalised_claim_entry)
                        if len(verbalised_claims_this_batch) >= BATCH_SIZE:
                            _verbalise_batch(verbalised_claims_this_batch)
                            verbalised_claims += verbalised_claims_this_batch
                            verbalised_claims_this_batch = []

        except Exception:
            print(row)
            raise

    # Flush whatever is left. The previous "is this the very last combination?"
    # test lost the final partial batch whenever the last row (or its last
    # combination) was skipped, or the index was not 0..n-1.
    if verbalised_claims_this_batch:
        _verbalise_batch(verbalised_claims_this_batch)
        verbalised_claims += verbalised_claims_this_batch
        verbalised_claims_this_batch = []

# Load
verbalised_claims_df = pd.DataFrame(verbalised_claims)

In [None]:
import re

def _drop_remaining_unks(sentence):
    """Strip leftover '<unk>' tokens from `sentence` and tidy the spacing around them."""
    # Removing remaining <unk> tokens
    sentence = re.sub('<unk>', '', sentence)
    # Collapsing whitespace runs left behind by the removal
    sentence = re.sub(r'\s+', ' ', sentence).strip()
    # Removing spaces before punctuation
    return re.sub(r'\s([?.!",](?:\s|$))', r'\1', sentence)

# REMOVING REMAINING <UNK> TOKENS
# One mapped pass over the column instead of writing cell by cell with .loc
# inside an iterrows() loop. progress_map is available because tqdm.pandas()
# is called in the imports cell.
verbalised_claims_df['verbalisation_unks_replaced_then_dropped'] = (
    verbalised_claims_df['verbalisation_unks_replaced'].progress_map(_drop_remaining_unks)
)

In [None]:
# Checking if every combination of ENGLISH label+alias for (s,p,o) tuples has been covered
# NO OUTPUTS = ALL OK

def _variant_count(claim_rows, alias_col, alias_lan_col):
    """1 for the label plus the number of English aliases, read from the first matching row."""
    # NOTE(review): the label is always counted, even though the generation
    # loop only uses it when its language is 'en' — confirm this cannot cause
    # false mismatch reports.
    if claim_rows[alias_lan_col].iloc[0] != 'en':
        return 1
    return len(leval(claim_rows[alias_col].values[0])) + 1

claims_list = verbalised_claims_df.claim_id.unique().tolist()
for c_id in claims_list:
    row = claim_data_df[claim_data_df['claim_id'] == c_id]
    matching_claims = row.shape[0]
    verbalised_n_rows = verbalised_claims_df[verbalised_claims_df.claim_id == c_id].shape[0]

    # Expected rows per claim row = product of the variant counts of s, p and o.
    verbalised_n_rows_target = (
        _variant_count(row, 'entity_alias', 'entity_alias_lan')
        * _variant_count(row, 'property_alias', 'property_alias_lan')
        * _variant_count(row, 'object_alias', 'object_alias_lan')
    )
    expected_total = verbalised_n_rows_target * matching_claims

    if verbalised_n_rows != expected_total:
        print('Match Error:',c_id)
        print( row.index.values,
            verbalised_n_rows, 'out of', expected_total,
            f'{expected_total - verbalised_n_rows} to go')

In [None]:
verbalised_claims_df.to_csv('verbalisation/verbalised_claims_df.csv', index=None)

## Verbalised data CORRECTION and analysis

In [None]:
verbalised_claims_df = pd.read_csv('verbalisation/verbalised_claims_df.csv')
verbalised_claims_df.info()

In [None]:
verbalised_claims_df.head()

In [None]:
# Define property aliases to use as main verbalisations.
# Maps a property id to the property label that should count as the "main"
# one instead of the property's official label. A value is either:
#   * a str  - the preferred label, or
#   * a dict - {substring: preferred label}; a label applies when the substring
#              occurs in the entity label or the object label. The '' key
#              occurs in every string and therefore serves as the default.
# The trailing comments are reminders of cases that are corrected by hand later.
special_properties = {
    'P1031': 'citation',
    'P106': 'profession',
    'P1066': 'apprentice of',
    'P1196': 'nature of death',
    'P1308': 'position holder',
    'P131': 'is located in',
    'P1346': 'won by',
    'P136': { # dict: the key must occur in either the entity label or the object label
        'film': 'film genre', # Manually replace here, as 'genre of' would be good but is not there
        '': 'genre' # default
    },
    'P1435': 'designation', # Manually replace the Overhailes case
    'P1441': 'featured in work',
    'P1448': 'name',
    'P1476': 'titled',
    'P1542': 'causes',
    'P1559': 'native name',
    'P166' : {
        'Doctor': 'recognition title',
        '': 'award received'
    },
    'P17': {
        'trial': 'host country',
        '': 'land' # Manually correct cases where food is introduced (Gups Ponmala)
    },
    'P186': 'made from',
    'P189': 'found in',
    'P195': 'art collection', # Manually correct cases due to extensive entity label formatting
    'P2017': 'isomeric SMILES', # This is the main label; listed only as a reminder to manually correct them due to extensive label format
    'P21': 'gender',
    'P233': 'SMILES', # Manually correct cases due to extensive entity label formatting
    'P26': 'marry', # Manually correct some cases due to tense
    'P279': 'is a type of',
    'P2896': 'publication frequency', # Manually correct cases due to missing link between "1 week" and "weekly" expressions
    'P31': 'is a',
    'P3373': 'is sibling of',
    'P364': 'original language',
    'P39' : 'held position',
    'P40' : 'has child',
    'P451' : 'is partner of',
    'P452' : 'sector', # Manually correct line of credit cases
    'P485': 'archive location',
    'P5021': 'assessment', # Manually correct inversion cases
    'P527': 'parts',
    'P551': 'resided in',
    'P571': 'created',
    'P580': 'starting',
    'P582': 'ending',
    'P607': 'in conflict',
    'P674': 'characters', # Manually correct cases here
    'P725': 'voice actor', # Manually correct cases here
    'P734': 'last name',
    'P735': 'first name',
    'P780': 'symptoms',
    'P793': 'event',
    'P802': 'students',
    # 'P8045': manually correct this one
    'P915': 'filmed at',
    'P921': 'about',
    'P97': 'hereditary title'
}

# Flag, for every verbalised row, whether it is the "main" verbalisation of its
# claim (is_main_verbalisation) and whether the claim's property is one whose
# preferred label comes from special_properties (alternative_alias_used).
# Both columns start as None and are filled row by row below.
verbalised_claims_df['is_main_verbalisation'] = None
verbalised_claims_df['alternative_alias_used'] = None

# `row` is the snapshot yielded by iterrows(); the flags are written to, and the
# "already has a main verbalisation" lookup reads from, the live dataframe, so
# the outcome depends on row order.
for i, row in verbalised_claims_df.iterrows():
    # If a special property (to be replaced by alias), do a custom logic where
    # an official property_label is elected instead
    if row['property_id'] in special_properties.keys():   
        verbalised_claims_df.loc[i, 'alternative_alias_used'] = True
        preferred_property_label = special_properties[row['property_id']]
        if type(preferred_property_label) == str:
            # if the new label is str, just flag as main verbalisation in case the entity/object are main labels
            # and the property label is the new official label
            if not row['entity_label_is_alias'] and\
                row['property_label'] == preferred_property_label and\
                not row['object_label_is_alias']:
                
                verbalised_claims_df.loc[i, 'is_main_verbalisation'] = True
            else:
                verbalised_claims_df.loc[i, 'is_main_verbalisation'] = False
        elif type(preferred_property_label) == dict:
            # if it's a dict, elect as new official label only if either entity or object labels contain
            # the key, and select the property label that is its value
            # if one is identified this way, skip the rest
            # we start with it not being a main verbalisation until we find a key that matches
            verbalised_claims_df.loc[i, 'is_main_verbalisation'] = False
            
            # check if another row with the same claim id and reference id is 
            # not already the main verbalisation
            if verbalised_claims_df[
                (verbalised_claims_df['claim_id'] == row['claim_id']) &\
                (verbalised_claims_df['reference_id'] == row['reference_id'])
            ].is_main_verbalisation.sum() > 0:
                continue
                
            # Keys are tried in dict order. The '' key is a substring of every
            # label, so it always matches and acts as the default.
            # NOTE(review): `break` only fires when the row's property label
            # equals the candidate label, so a row whose labels matched an
            # earlier key (e.g. 'film') can still be accepted through the ''
            # default — confirm this fall-through is intended.
            for key in preferred_property_label.keys():
                if key in row['entity_label'] or key in row['object_label']:
                    if not row['entity_label_is_alias'] and\
                        row['property_label'] == preferred_property_label[key] and\
                        not row['object_label_is_alias']:
                        #print(row)
                        verbalised_claims_df.loc[i, 'is_main_verbalisation'] = True
                        # Elected through the default key: recorded as not
                        # using an alternative alias.
                        if key=='':                            
                            verbalised_claims_df.loc[i, 'alternative_alias_used'] = False
                        break #stop iterating over keys of preferred_property_label
                
                
    # Otherwise (the property is not in special_properties),
    # just check if all X_is_alias columns are false
    else:        
        verbalised_claims_df.loc[i, 'alternative_alias_used'] = False
        if not row['entity_label_is_alias'] and\
            not row['property_label_is_alias'] and\
            not row['object_label_is_alias']:

            verbalised_claims_df.loc[i, 'is_main_verbalisation'] = True
        else:
            verbalised_claims_df.loc[i, 'is_main_verbalisation'] = False


In [None]:
# Sanity checks on the flags set by the previous cell.
# The number of main verbalisations must equal 972, the total of unique
# ref_claim pairs in this dataset.
assert verbalised_claims_df['is_main_verbalisation'].value_counts()[True] == 972
# Every row must have received a value for both flags: no empty cells.
assert verbalised_claims_df['is_main_verbalisation'].isna().sum() == 0
assert verbalised_claims_df['alternative_alias_used'].isna().sum() == 0

In [None]:
# Main verbalisations whose claim_id already appeared on an earlier main row.
verbalised_claims_df[verbalised_claims_df['is_main_verbalisation'] == True].loc[
    lambda main_rows: main_rows.duplicated('claim_id')
]

In [None]:
print('Total verbalised counts:')
print(f'{verbalised_claims_df.claim_id.unique().shape[0]} unique claims')
print(f'{verbalised_claims_df.reference_id.unique().shape[0]} unique references')

In [None]:
# How often the model emitted <unk> tokens, and how often the replacer could
# not resolve all of them.
n_verbalisations = verbalised_claims_df.shape[0]
n_unk_replacements = verbalised_claims_df[verbalised_claims_df['verbalisation'].apply(lambda x : '<unk>' in x)].shape[0]
# Guard the ratios: with no rows, or no <unk> at all, the divisions used to
# raise ZeroDivisionError.
pct_unk_needed = 100*n_unk_replacements/n_verbalisations if n_verbalisations else 0.0
print(
    f"Unk replacement was needed in {n_unk_replacements} ({pct_unk_needed}%) of verbalisations"
)

# Despite its name, this counts the rows that STILL contain <unk> after replacement.
n_unk_replacements_solved = verbalised_claims_df[verbalised_claims_df['verbalisation_unks_replaced'].apply(lambda x : '<unk>' in x)].shape[0]
pct_unk_unsolved = 100*n_unk_replacements_solved/n_unk_replacements if n_unk_replacements else 0.0
print(
    f"Unk replacement was NOT solved in {n_unk_replacements_solved} ({pct_unk_unsolved}%) of cases"
)

In [None]:
# Distribution of entities and properties involved in the verbalisations

fig, ax = plt.subplots(2,2,figsize=(10,8))

entity_count = verbalised_claims_df.entity_id.value_counts()
property_count = verbalised_claims_df.property_id.value_counts()

# One row of panels per id column: log-scale boxplot on the left, KDE on the right.
panels = [
    (ax[0], entity_count, 'Boxplot of Entity distribution', 'KDE of Entity distribution'),
    (ax[1], property_count, 'Boxplot of Property distribution', 'KDE of Property distribution'),
]
for (box_ax, kde_ax), counts, box_title, kde_title in panels:
    box_ax.set_yscale('log')
    box_ax.set_title(box_title)
    sns.boxplot(data = counts, ax=box_ax)
    kde_ax.set_title(kde_title)
    sns.kdeplot(data = counts, ax=kde_ax)

plt.tight_layout()
plt.show()

# Half of both entities and properties show up between 50 and 200/500 times,
# the majority being between 1 and 1000. A few outliers are beyond that.

# Entities/Properties that are over-represented here are due to having too many aliases.

In [None]:
# Number of distinct (cleaned) verbalisations per claim id.
unique_verbalisation_counts = verbalised_claims_df[['claim_id', 'verbalisation_unks_replaced_then_dropped']].\
    drop_duplicates().claim_id.value_counts()

# Distribution of the number of unique verbalisations per claim

fig, ax = plt.subplots(1,2,figsize=(10,5))

ax[0].set_yscale('log')
ax[0].set_title('Boxplot of Unique verbalisation count\n distribution per unique claim id')
sns.boxplot(data = unique_verbalisation_counts, ax=ax[0])
ax[1].set_title('KDE of Unique verbalisation count\n distribution per unique claim id')
sns.kdeplot(data = unique_verbalisation_counts, ax=ax[1])

plt.tight_layout()
plt.show()

# This shows the majority of claims have up to 100 unique verbalisations due to aliases, with some having way more.
# The biggest one, for instance, has 33 subject aliases, 22 predicate aliases, and 8 object aliases

# Name both output columns explicitly. Renaming the implicit
# 'index'/'claim_id' columns depended on how the installed pandas names the
# result of value_counts(), and yields two 'count' columns on newer versions.
get_outliers(unique_verbalisation_counts).rename_axis('claim_id').reset_index(name='count')

In [None]:
# Next step is manually correcting some verbalisations before SENTENCE SELECTION
# Create 'corrected_verbalisation' and 'is_corrected_verbalisation' columns for
# manual annotation before sentence selection
is_main_row = verbalised_claims_df['is_main_verbalisation'] == True
verbalised_claims_df_main = (
    verbalised_claims_df[is_main_row]
    .reset_index(drop=True)
    .drop('is_main_verbalisation', axis=1)
    .copy()
)

# The corrected text starts out as a copy of the cleaned verbalisation.
verbalised_claims_df_main['corrected_verbalisation'] = verbalised_claims_df_main['verbalisation_unks_replaced_then_dropped']
verbalised_claims_df_main['is_corrected_verbalisation'] = False

verbalised_claims_df_main.to_csv('verbalisation/verbalised_claims_df_main.csv', index=None)

In [None]:
print('Percentage of main verbalisations where an alias was used for better verbalisation')
print(100*verbalised_claims_df_main['alternative_alias_used'].sum()/verbalised_claims_df_main.shape[0])
# Percentage of main verbalisations where an alias was used for better verbalisation

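Now, we manually go over the verbalisations and correct those that need correcting. The `is_corrected_verbalisation` flag is not filled in by hand; the next cell derives it by comparing each corrected verbalisation with the original one.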

In [None]:
from Levenshtein import distance as levenshtein_distance

verbalised_claims_df_main = pd.read_csv('verbalisation/verbalised_claims_df_main.csv')
verbalised_claims_df_main_corrected = pd.read_csv('verbalisation/verbalised_claims_df_main_corrected.csv')

# is_corrected_verbalisation has NOT been filled during correction, as this is quicker and less error-prone
# It is derived here: a row is "corrected" when the two texts differ.
verbalised_claims_df_main_corrected['is_corrected_verbalisation'] = verbalised_claims_df_main_corrected.apply(
    lambda row : row['corrected_verbalisation'] != row['verbalisation_unks_replaced_then_dropped'], axis=1
)

print('Percentage of main verbalisations where a manual correction was used.')
# Divide by the row count of the frame being summed (the corrected file), not
# by the row count of the uncorrected export.
print(100*verbalised_claims_df_main_corrected['is_corrected_verbalisation'].sum()/verbalised_claims_df_main_corrected.shape[0])


def _normalised_levenshtein(row):
    """Edit distance between corrected and original text, divided by the longer length."""
    corrected = row['corrected_verbalisation']
    original = row['verbalisation_unks_replaced_then_dropped']
    longest = max(len(corrected), len(original))
    # Two empty strings are identical; avoid dividing by zero.
    return levenshtein_distance(corrected, original)/longest if longest else 0.0


norm_levenshtein_distances = verbalised_claims_df_main_corrected.apply(_normalised_levenshtein, axis=1)
# Only rows that were actually changed are of interest.
norm_levenshtein_distances = norm_levenshtein_distances[norm_levenshtein_distances>0].reset_index(drop=True)
sns.boxplot(data=norm_levenshtein_distances, orient='h')
print('Distribution of normalised levenshtein distance after corrections.')
print(norm_levenshtein_distances.describe())

In [None]:
# Properties whose claims are dropped before sentence selection.
# P1448 (official name), P1476 (title) and P1889 (different from) are removed
# as they are redundant and non-informative; the remaining ids are properties
# that were deleted during dataset creation and are deleted here too.
# Each entry is annotated with the property's label.

BAD_PROPERTIES = [
    'P1448', # official name
    'P1476', # title
    'P1889',# different
    'P31', # instance of
    'P279',# subclass of
    'P373',# commons category
    'P910',# Topic's main category
    'P7561',# category for the interior of the item
    'P5008',# on focus list of Wikimedia project
    'P2670',# has parts of the class
    'P1740',# category for films shot at this location
    'P1612',# Commons Institution page
    'P8989',# category for the view of the item
    'P2959',# permanent duplicated item
    'P7867',# category for maps
    'P935' ,# Commons gallery
    'P1472',# Commons Creator page
    'P8596',# category for the exterior of the item
    'P5105',# Deutsche Bahn station category
    'P8933',# category for the view from the item
    'P642',# of
    'P3876',# category for alumni of educational institution
    'P1791',# category of people buried here
    'P7084',# related category
    'P1465',# category for people who died here
    'P1687',# Wikidata property
    'P6104',# maintained by WikiProject
    'P4195',# category for employees of the organization
    'P1792',# category of associated people
    'P5869',# model item
    'P1659',# see also
    'P1464',# category for people born here
    'P2354',# has list
    'P1424',# topic's main template
    'P7782',# category for ship name
    'P179',# part of the series
    'P7888',# merged into
    'P6365',# member category
    'P8464',# content partnership category
    'P360',# is a list of
    'P805',# statement is subject of
    'P8703',# entry in abbreviations table
    'P1456',# list of monuments
    'P1012',# including
    'P1151',# topic's main Wikimedia portal
    'P2490',# page at OSTIS Belarus Wiki
    'P593',# HomoloGene ID
    'P8744',# economy of topic
    'P2614',# World Heritage criteria
    'P2184',# history of topic
    'P9241',# demographics of topic
    'P487',# Unicode character
    'P1754',# category related to list
    'P2559',# Wikidata usage instructions
    'P2517',# category for recipients of this award
    'P971',# category combines topics
    'P6112',# category for members of a team
    'P4224',# category contains
    'P301',# category's main topic
    'P1753',# list related to category
    'P1423',# template has topic
    'P1204',# Wikimedia portal's main topic
    'P3921',# Wikidata SPARQL query equivalent
    'P1963',# properties for this type
    'P5125',# Wikimedia outline
    'P3176',# uses property
    'P8952',# inappropriate property for this type
    'P2306',# property
    'P5193',# Wikidata property example for forms
    'P5977',# Wikidata property example for senses
    'P1748',# NCI Thesaurus ID
    'P1692',# ICD-9-CM
    'P248',# stated in
]

# Keep only the claims whose property is not in BAD_PROPERTIES, then report
# how much was removed.
has_bad_property = verbalised_claims_df_main_corrected['property_id'].isin(BAD_PROPERTIES)
verbalised_claims_df_main_corrected_badpropdrop = verbalised_claims_df_main_corrected[~has_bad_property]

n_rows_before = verbalised_claims_df_main_corrected.shape[0]
n_rows_after = verbalised_claims_df_main_corrected_badpropdrop.shape[0]
print('Percentage [Number] of claims dropped due to bad properties')
print(
    f'{100 - 100*n_rows_after/n_rows_before}',
    f'[{n_rows_before - n_rows_after}]'
)

In [None]:
# Correct some final_urls in the reference_text_df dataframe before joining
import re

# Replace the final URL by the archived page if the page was behind a paywall when parsed.
# The archive.ph detection is computed once; `na=False` leaves rows with missing HTML untouched.
_is_archived = reference_text_df['html'].str.contains('://archive.ph/', regex=False, na=False)

# First archive.ph URL found in the HTML (dots escaped so that only the literal host matches).
# Rows where no URL can be extracted are dropped instead of raising an IndexError.
_archived_urls = reference_text_df.loc[_is_archived, 'html'].str.extract(
    r'(https?://archive\.ph/[a-zA-Z0-9]*)', expand=False
).dropna()

reference_text_df.loc[_archived_urls.index, 'final_url'] = _archived_urls

In [None]:
# Attach each claim's final URL by joining on the reference id (inner join, pandas default).
_reference_urls = reference_text_df[['reference_id', 'final_url']]
verbalised_claims_df_main_corrected_badpropdrop_url = \
    verbalised_claims_df_main_corrected_badpropdrop.merge(_reference_urls, on='reference_id')
verbalised_claims_df_main_corrected_badpropdrop_url

In [None]:
# Qualifier dependency produces repeated (verbalisation, url) pairs; keep the first of each pair.
verbalised_claims_df_main_corrected_badpropdrop_url_duplidrop = \
    verbalised_claims_df_main_corrected_badpropdrop_url.drop_duplicates(
        subset=['corrected_verbalisation', 'final_url'], keep='first'
    )

# Report how much of the data the de-duplication removed.
_n_before_duplidrop = verbalised_claims_df_main_corrected_badpropdrop_url.shape[0]
_n_after_duplidrop = verbalised_claims_df_main_corrected_badpropdrop_url_duplidrop.shape[0]
print('Percentage [Number] of claims dropped due to duplicated verbalisation and url pair')
print(
    f'{100 - 100*_n_after_duplidrop/_n_before_duplidrop}',
    f'[{_n_before_duplidrop - _n_after_duplidrop}]'
)

In [None]:
# Drop the three archinform.net references whose pages are written in German.
_GERMAN_ARCHINFORM_PAGE_IDS = ['19632', '11996', '45859']


def _is_german_archinform_url(url):
    """Return True when `url` is an archinform.net URL containing one of the German page ids."""
    if 'www.archinform.net' not in url:
        return False
    return any(page_id in url for page_id in _GERMAN_ARCHINFORM_PAGE_IDS)


_is_german_page = verbalised_claims_df_main_corrected_badpropdrop_url_duplidrop['final_url'].map(
    _is_german_archinform_url
)
verbalised_claims_df_main_corrected_badpropdrop_url_duplidrop = \
    verbalised_claims_df_main_corrected_badpropdrop_url_duplidrop[~_is_german_page]

In [None]:
# Renumber the rows 0..n-1 so the hard-coded row selection further down refers to stable labels.
verbalised_claims_df_main_corrected_badpropdrop_url_duplidrop = \
    verbalised_claims_df_main_corrected_badpropdrop_url_duplidrop.reset_index(drop=True)
verbalised_claims_df_main_corrected_badpropdrop_url_duplidrop

In [None]:
# The following claims were selected randomly such that
# each reference_id had only one claim selected.
# However, we did not keep the seed which generated it, so the selection is
# hard-coded here to keep the notebook reproducible.
# The values are row labels of the frame whose index was reset in the previous cell
# (they are used with `.loc` in the next cell), so they only stay valid as long as
# the upstream filtering steps produce the same rows in the same order.
randomly_selected_rows = [
    0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 18, 22, 23, 27, 29, 32, 36, 39, 43, 44, 49,
    51, 58, 62, 63, 64, 65, 66, 67, 68, 69, 70, 72, 73, 74, 75, 76, 80, 81, 82, 83, 84, 85, 86,
    87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
    108, 109, 110, 111, 112, 113, 114, 115, 116, 118, 120, 122, 123, 125, 126, 128, 130, 131,
    132, 133, 135, 136, 140, 141, 142, 147, 151, 152, 153, 154, 155, 158, 159, 160, 161, 163,
    164, 166, 168, 169, 170, 171, 174, 175, 176, 177, 178, 185, 186, 188, 190, 193, 194, 197,
    198, 200, 202, 204, 206, 207, 211, 212, 214, 217, 219, 220, 222, 224, 229, 230, 231, 233,
    235, 239, 240, 241, 243, 244, 246, 247, 250, 251, 253, 254, 255, 256, 257, 258, 259, 260,
    261, 262, 263, 264, 265, 266, 268, 269, 270, 271, 272, 274, 276, 277, 280, 281, 283, 285,
    286, 288, 290, 292, 294, 299, 300, 301, 302, 303, 312, 319, 323, 324, 325, 326, 327, 328,
    331, 332, 335, 336, 338, 340, 342, 344, 345, 347, 350, 352, 353, 354, 355, 356, 357, 358,
    359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 371, 372, 373, 375, 376, 377, 378,
    379, 380, 382, 389, 391, 399, 407, 409, 414, 415, 416, 418, 419, 420, 422, 423, 424, 425,
    426, 427, 428, 429, 432, 439, 442, 443, 445, 446, 448, 454, 456, 458, 461, 464, 465, 467,
    468, 470, 472, 475, 476, 478, 481, 483, 484, 486, 488, 491, 492, 496, 499, 503, 508, 509,
    516, 519, 524, 526, 528, 536, 537, 539, 540, 542, 544, 545, 546, 547, 548, 551, 552, 557,
    561, 570, 571, 573, 576, 578, 579, 581, 583, 584, 586, 588, 589, 590, 592, 593, 594, 595,
    596, 597, 598, 599, 600, 601, 604, 606, 607, 613, 614, 615, 616, 617, 619, 620, 621, 629,
    635, 642, 644, 645, 646, 650, 656, 665, 666, 673, 681, 687, 694, 696, 707, 713, 717, 721,
    729, 730, 731, 732, 736, 737, 738, 739, 740, 741, 743, 744, 745, 747, 748, 749, 750, 751,
    752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 764, 766, 768, 769, 772, 773, 774,
    775, 776, 777, 779, 780, 781, 782, 784, 785, 786, 787, 790, 791, 792, 793, 795
]

In [None]:
# Keep only the hard-coded selection (one claim per reference), renumber the rows and persist it.
verbalised_claims_df_final = (
    verbalised_claims_df_main_corrected_badpropdrop_url_duplidrop
    .loc[randomly_selected_rows]
    .reset_index(drop=True)
)
verbalised_claims_df_final.to_csv('verbalisation/verbalised_claims_df_final.csv', index=None)
verbalised_claims_df_final

# 4. Sentence Selection

Due to the HUGE amounts of verbalisations per unique claim, we will now only look at the **MAIN verbalisations**, defined previously (see last section) as those having main labels only or entity/object main labels and a preferred predicate label.

As next steps, we **COULD**:
1. Select main label verbalisations + 10 others at random from the whole set
2. Select main label verbalisations + 10 others at random given that they (subj and obj) *appear on the document*
3. Select main label verbalisations + 10 others based on some scoring function

Here, we **WILL**:
1. Combine the 1302 (formerly 1570 before cutting some entries down in the last few cells of last section) verbalised claims with their respective reference contents (as sentences).
2. Then, we will calculate a relevance score for each sentence, and keep only the (up-to) 5 best positive scores.

In [None]:
# Reload the final verbalised claims from the CSV written at the end of the previous section,
# then print a column/dtype/non-null summary.
verbalised_claims_df_final = pd.read_csv('verbalisation/verbalised_claims_df_final.csv')
verbalised_claims_df_final.info()

In [None]:
# Keep only the claim columns needed for the join with the reference contents, and give the
# verbalisation column its final name.
verbalised_claims_df_final = (
    verbalised_claims_df_final
    .loc[:, ['reference_id', 'claim_id', 'corrected_verbalisation']]
    .rename(columns={'corrected_verbalisation': 'final_verbalisation'})
)


# Take only the data relevant for joining with the verbalisation data and calculating sentence relevance scores

# The sampling weights CAN ONLY BE USED to average-out any score or quantifiable property that is assigned to EACH REFERENCE,
# such as: percentage of claims actually supported by the reference out of all claims linked to it according
# to pipeline results

_reference_columns_for_join = [
    'reference_id', 'sampling_weight', 'final_url', 'netloc_agg', 'nlp_sentences', 'nlp_sentences_slide_2'
]
reference_text_df_for_sentence_selection_join = reference_text_df[_reference_columns_for_join]


# Left join: every claim is kept, with the sentences of its reference attached.
sentence_relevance_df = verbalised_claims_df_final.merge(
    reference_text_df_for_sentence_selection_join,
    how='left',
    on='reference_id'
)

# The sentence lists were stored as strings in the CSV; parse them back into Python lists.
for _sentence_column in ('nlp_sentences', 'nlp_sentences_slide_2'):
    sentence_relevance_df[_sentence_column] = sentence_relevance_df[_sentence_column].apply(leval)

In [None]:
# Project-local module providing the sentence relevance scorer used in the cells below.
from sentence_retrieval import sentence_retrieval_module

# If updating the module
#from importlib import reload
#reload(sentence_retrieval_module)

# Scorer used via `sr_module.score_sentence_pairs(...)` below.
# NOTE(review): `max_len=512` is presumably the maximum input length of the underlying model - confirm in the module.
sr_module = sentence_retrieval_module.SentenceRetrievalModule(max_len=512)

**SHIFTING MAIN LABELS OF PROBLEMATIC LABELS FOR VERBALISATION**

Some properties, such as "child" and "date of inception", are problematic for reasons we discussed in a previous paper. So, for better performance, we might shift the main label flags over to more appropriate aliases, like shifting from "child" to "has child", and "date of inception" to "date of creation".

To change:
    - P?: "child" to "has child"
    - P?: "inception" to "?"
    - ??
    
**DONE**: Check section 3, where this and other changes were made

In [None]:
import pdb
# Number of (claim, sentence) pairs sent to the scorer per call.
BATCH_SIZE = 16

# Create the score columns up front; they are filled in by the scoring loops below.
for _score_column in ('nlp_sentences_scores', 'nlp_sentences_slide_2_scores'):
    sentence_relevance_df[_score_column] = None

def chunks(l, n):
    """Split `l` into consecutive slices of at most `n` items.

    A chunk size below 1 is treated as 1. The last slice may be shorter than
    `n`; an empty input yields an empty list.
    """
    step = n if n > 1 else 1
    pieces = []
    for start in range(0, len(l), step):
        pieces.append(l[start:start + step])
    return pieces

# Score every single sentence of each reference against its claim verbalisation, batch by batch.
all_outputs = []
for _, claim_row in tqdm(sentence_relevance_df.iterrows(), total=sentence_relevance_df.shape[0]):

    row_scores = []
    for sentence_batch in chunks(claim_row['nlp_sentences'], BATCH_SIZE):
        claim_sentence_pairs = [
            (claim_row['final_verbalisation'], sentence) for sentence in sentence_batch
        ]
        row_scores.extend(sr_module.score_sentence_pairs(claim_sentence_pairs))
    all_outputs.append(row_scores)

all_outputs = pd.Series(all_outputs)
sentence_relevance_df['nlp_sentences_scores'] = all_outputs

# Sanity check: exactly one score per sentence.
_sentence_score_lengths_match = sentence_relevance_df.apply(
    lambda r : len(r['nlp_sentences']) == len(r['nlp_sentences_scores']),
    axis=1
)
assert all(_sentence_score_lengths_match)

In [None]:
# Same scoring as for the single sentences, applied to the sliding windows of two sentences.
all_outputs = []
for _, claim_row in tqdm(sentence_relevance_df.iterrows(), total=sentence_relevance_df.shape[0]):

    row_scores = []
    for window_batch in chunks(claim_row['nlp_sentences_slide_2'], BATCH_SIZE):
        claim_window_pairs = [
            (claim_row['final_verbalisation'], sentence) for sentence in window_batch
        ]
        row_scores.extend(sr_module.score_sentence_pairs(claim_window_pairs))
    all_outputs.append(row_scores)

all_outputs = pd.Series(all_outputs)
sentence_relevance_df['nlp_sentences_slide_2_scores'] = all_outputs

# Sanity check: exactly one score per two-sentence window.
_window_score_lengths_match = sentence_relevance_df.apply(
    lambda r : len(r['nlp_sentences_slide_2']) == len(r['nlp_sentences_slide_2_scores']),
    axis=1
)
assert all(_window_score_lengths_match)

In [None]:
N_TOP_SENTENCES = 5
SCORE_THRESHOLD = 0  # not applied in this cell; the threshold is only used by later cells


def _rank_by_score(evidence_list):
    """Return the evidence dicts sorted by descending relevance score (stable for ties)."""
    return sorted(evidence_list, key=lambda ev: ev['score'], reverse=True)


def _drop_overlapping_evidence(ranked_evidence):
    """Greedily keep evidence that shares no sentence id with better-ranked kept evidence.

    `ranked_evidence` must already be sorted by descending score. A single sentence has
    the id 'j' and a two-sentence window the id 'j;j+1'; an item is kept only when none
    of its ids was used by an item kept before it. The used ids are tracked in a set so
    each lookup is constant time instead of a scan over everything kept so far.
    """
    kept = []
    used_sentence_ids = set()
    for evidence in ranked_evidence:
        sentence_ids = evidence['sentence_id'].split(';')
        if used_sentence_ids.isdisjoint(sentence_ids):
            kept.append(evidence)
            used_sentence_ids.update(sentence_ids)
    return kept


nlp_sentences_TOP_N = []
nlp_sentences_slide_2_TOP_N = []
nlp_sentences_all_TOP_N = []

for _, row in tqdm(sentence_relevance_df.iterrows(), total=sentence_relevance_df.shape[0]):

    # Single sentences, identified by their position in the document.
    nlp_sentences_with_scores = _rank_by_score([
        {'sentence': sentence, 'score': score, 'sentence_id': str(j)}
        for j, (sentence, score) in enumerate(zip(row['nlp_sentences'], row['nlp_sentences_scores']))
    ])
    nlp_sentences_TOP_N.append(nlp_sentences_with_scores[:N_TOP_SENTENCES])

    # Two-sentence windows, identified by the positions of both sentences they span.
    nlp_sentences_slide_2_with_scores = _rank_by_score([
        {'sentence': sentence, 'score': score, 'sentence_id': str(j)+';'+str(j+1)}
        for j, (sentence, score) in enumerate(
            zip(row['nlp_sentences_slide_2'], row['nlp_sentences_slide_2_scores'])
        )
    ])
    nlp_sentences_slide_2_TOP_N.append(nlp_sentences_slide_2_with_scores[:N_TOP_SENTENCES])

    # Combined ranking of both kinds of evidence.
    # We might not want to allow overlaps, so for each evidence in descending order of score
    # we discard all overlapping evidence scored lower than it.
    nlp_sentences_all_with_scores = _rank_by_score(
        nlp_sentences_with_scores + nlp_sentences_slide_2_with_scores
    )
    nlp_sentences_all_with_scores_filtered_for_overlap = _drop_overlapping_evidence(
        nlp_sentences_all_with_scores
    )

    # Later cells expect a full set of N_TOP_SENTENCES evidence items for every claim.
    assert len(nlp_sentences_all_with_scores_filtered_for_overlap) >= N_TOP_SENTENCES, (
        f'fewer than {N_TOP_SENTENCES} non-overlapping evidence items for a claim'
    )
    nlp_sentences_all_TOP_N.append(nlp_sentences_all_with_scores_filtered_for_overlap[:N_TOP_SENTENCES])

# Build each Series on the frame's own index so that the rows line up by position
# even if the frame does not carry a default 0..n-1 index.
sentence_relevance_df['nlp_sentences_TOP_N'] = pd.Series(
    nlp_sentences_TOP_N, index=sentence_relevance_df.index
)
sentence_relevance_df['nlp_sentences_slide_2_TOP_N'] = pd.Series(
    nlp_sentences_slide_2_TOP_N, index=sentence_relevance_df.index
)
sentence_relevance_df['nlp_sentences_all_TOP_N'] = pd.Series(
    nlp_sentences_all_TOP_N, index=sentence_relevance_df.index
)

In [None]:
# Preview the first rows, now including the three TOP_N evidence columns.
sentence_relevance_df.head()

In [None]:
# Inspect the combined, overlap-filtered top evidence of the second claim (position 1).
sentence_relevance_df.iloc[1].nlp_sentences_all_TOP_N

In [None]:
# Persist the scored sentences as a JSON list of records (one object per claim/reference row),
# so the list- and dict-valued columns survive the round trip.
sentence_relevance_df.to_json(
    'sentence_retrieval/sentence_relevance_df.json', orient='records', indent=4
)

In [None]:
# Reload the scored sentences from disk so the following sections can start from the saved file.
sentence_relevance_df = pd.read_json('sentence_retrieval/sentence_relevance_df.json')
sentence_relevance_df

## Manual analysis of selected sentences
Quantitative metrics will come later; for now, let's just look at some examples by hand.

We will, for all netloc_aggs, choose a random sample and check if selected evidence make sense given claim.

**Notes**:
- Threshold variation might be something we can use the crowdsourcing to better define.
- *Sometimes* even sentences with negative scores, when ALL sentences are very poorly scored, might be useful. We could show that to users and say "our models ranked this sentence as low relevance" (or maybe with a color scheme or a 1/5 stars symbol), but show it anyway, and ask users how helpful they truly were.
    - Basically, ask the crowd if our scores of relevancy/support make sense and correlate to their own judgements.
- Many cases of failure involve information that is written in tabular form and depends on humans inferring information/context from the page's layout.
    - Example is with "The chemical formula for radium ra-223 dichloride is Cl2Ra." and the url https://go.drugbank.com/drugs/DB08913. No sentence extracted links the compound's name to the formula.
- Many cases of failure involve information that requires multi-hop reasoning, where the information is broken in multiple places in the page.
    - Such an example is with "Naravoor Lps is located in India." and the url https://schools.org.in/Kannur/32020700608/, where one sentence shows the school is in Kannur, and another that Kannur is in India.
- Some cases of failure involve common sense or background knowledge, which is needed to infer the information from the page.
    - For example, the "Salafi Eng.Medium Lps is a lower primary school." sentence and the url https://schools.org.in/malappuram/32051000519/. There, it is stated the school has grades from 1 to 4, which is what a lower primary school consists of, but that information is not found in the page and depends on the reader's knowledge.
    - Or "William Sleator died on 03/08/2011." and http://archive.today/vSGie., which only states he "died on Wednesday".
    - Another is that the title of something is something, or that it is written in English, etc. Things we can clearly check, but are not written anywhere in the page.
    - Or "HealthRight International is based in the United States of America." and https://www.charitynavigator.org/ein/133791391, stating an address in NY, but we have to infer having a contact address in NY means it is based on NY, which is in the US.
- Some are a mix of layout and common sense
    - Like "Julius (painting) has a thickness of 1.5 centimetres." and the url https://www.rct.uk/collection/402419/julius, which has a sentence "Julius Signed and dated 1846. Oil on canvas | 53.1 x 65.4 x 1.5 cm" receiving low score because the fact that the LAST metric is the thickness is inferred through common sense.
- Some are because the information is on a page REACHED THROUGH the linked page, not on itself.
- Some are due to highly domain-specific jargon and acronyms
    - "Capsazepine interacts physically with Transient receptor potential cation channel, subfamily M, member 8." links to https://www.guidetopharmacology.org/GRAC/LigandDisplayForward?ligandId=2461, where the object is called TRPM8
- Some are due to redundant info that is not going to be on the source anyway, making the source rather unnecessary
    - Like "Supply Act (No. 2) 1965-66 has the title Supply Act (No. 2) 1965-66."
- Some are subjective:
    - "Agnes Alinschaw's trial was a significant event."
- Some depend on qualifiers:
    - "2001: A Space Odyssey was assessed using the Bechdel test." depends on the qualifier "assessment outcome: fails".


To alter:
- **Remember** to alter the URLs leading to paywall-protected pages by using their https://archive.ph/ version.
    - You can build a list by checking if 'archive.ph' is in the html.
- A handful of websites are not in English, we can get a list from netloc archinform.net.

In [None]:
## To see the specific sentence/score breakdown for a specific verbalisation
#row = sentence_relevance_df[sentence_relevance_df['final_verbalisation'] == 'Pokinatcha is a genre of pop rock.']
#list(zip(row.nlp_sentences_slide_2.tolist()[0], row.nlp_sentences_slide_2_scores.tolist()[0]))

In [None]:
# Print, for every netloc, up to three randomly sampled claims with their selected evidence.
for netloc in sentence_relevance_df.netloc_agg.unique().tolist():
    print('-'*10)
    print(netloc,'\n')
    netloc_rows = sentence_relevance_df[sentence_relevance_df['netloc_agg'] == netloc]
    # Cap the sample size at the group size: sampling 3 rows from a netloc with fewer
    # than 3 claims would raise a ValueError.
    sampled_rows = netloc_rows.sample(min(3, len(netloc_rows)), random_state=42)
    for _, row in sampled_rows.iterrows():
        print('->', row['final_verbalisation'])
        print('->>', row['final_url'])
        # A separate counter is used so the row loop variable is not shadowed.
        for rank, sentence in enumerate(row['nlp_sentences_all_TOP_N'], start=1):
            print(f"\t{rank}. {sentence['sentence']} ({sentence['score']})")
        print()

# 5. Textual Entailment

Now, for each claim, we take the top sentences alongside their scores and pass them through the textual entailment module, outputting the following:
- Scores (three scores, each for a label) of each of the N sentences
- Labels of each of the N sentences based on the scores
- Final weighted scores of the sentences (used on the weighted sum method)
- Final label of the claim based on two methods, weighted sum and malon

In [None]:
# Project-local module providing the textual entailment classifier used in the cells below.
from textual_entailment import textual_entailment_module

# Re-import the module so that edits made to its source are picked up without restarting the kernel.
from importlib import reload
reload(textual_entailment_module)

# Classifier used below via get_batch_scores, get_label_from_scores and get_label_malon.
te_module = textual_entailment_module.TextualEntailmentModule()

In [None]:
# Work on a copy so the textual entailment columns do not modify sentence_relevance_df.
textual_entailment_df = sentence_relevance_df.copy()

In [None]:
# Evidence must score strictly above this relevance threshold to count towards the claim label.
SCORE_THRESHOLD = 0

# Suffixes of the three evidence sets stored in the `nlp_sentences_<key>` columns.
keys = ['TOP_N', 'slide_2_TOP_N', 'all_TOP_N']

# Results collected per claim for every evidence set; one output column per (name, key) pair.
_te_result_names = [
    'evidence_TE_prob',
    'evidence_TE_prob_weighted',
    'evidence_TE_labels',
    'claim_TE_prob_weighted_sum',
    'claim_TE_label_weighted_sum',
    'claim_TE_label_malon',
]
te_columns = {f'{name}_{key}': [] for key in keys for name in _te_result_names}


for i, row in tqdm(textual_entailment_df.iterrows(), total=textual_entailment_df.shape[0]):
    try:
        claim = row['final_verbalisation']

        result_sets = {key : {'evidence': row[f'nlp_sentences_{key}']} for key in keys}

        for key, rs in result_sets.items():

            evidence_list = rs['evidence']
            evidence_size = len(evidence_list)

            # Entailment scores of the claim against each evidence sentence.
            rs['evidence_TE_prob'] = te_module.get_batch_scores(
                claims = [claim] * evidence_size,
                evidence = [ev['sentence'] for ev in evidence_list]
            )

            rs['evidence_TE_labels'] = list(map(te_module.get_label_from_scores, rs['evidence_TE_prob']))

            # Weight the entailment scores by relevance, keeping only evidence above the threshold.
            rs['evidence_TE_prob_weighted'] = [
                probs*ev['score']
                for probs, ev in zip(rs['evidence_TE_prob'], evidence_list)
                if ev['score'] > SCORE_THRESHOLD
            ]

            # Weighted-sum method: with no evidence above the threshold the claim defaults to NEI.
            if rs['evidence_TE_prob_weighted']:
                rs['claim_TE_prob_weighted_sum'] = np.sum(rs['evidence_TE_prob_weighted'], axis=0)
                rs['claim_TE_label_weighted_sum'] = \
                    te_module.get_label_from_scores(rs['claim_TE_prob_weighted_sum'])
            else:
                rs['claim_TE_prob_weighted_sum'] = [0,0,0]
                rs['claim_TE_label_weighted_sum'] = 'NOT ENOUGH INFO'

            # Malon method: label derived from the unweighted scores of the evidence above the threshold.
            rs['claim_TE_label_malon'] = te_module.get_label_malon(
                probs for probs, ev in zip(rs['evidence_TE_prob'], evidence_list)
                if ev['score'] > SCORE_THRESHOLD
            )

            # Only store the results once everything for this evidence set was computed.
            for name in _te_result_names:
                te_columns[f'{name}_{key}'].append(rs[name])

    except Exception:
        # Show the offending row and the partial results before re-raising the error.
        print(row)
        print(result_sets)

        raise

In [None]:
# Attach every collected result list to the dataframe as a column named '<result>_<key>'.
for key in keys:
    for _result_name in (
        'evidence_TE_prob',
        'evidence_TE_prob_weighted',
        'evidence_TE_labels',
        'claim_TE_prob_weighted_sum',
        'claim_TE_label_weighted_sum',
        'claim_TE_label_malon',
    ):
        _column = f'{_result_name}_{key}'
        textual_entailment_df[_column] = pd.Series(te_columns[_column])

In [None]:
# Display the dataframe with the textual entailment columns attached.
textual_entailment_df

In [None]:
# Persist the textual entailment results as a JSON list of records (one object per row).
textual_entailment_df.to_json('textual_entailment/textual_entailment_df.json', orient="records", indent=4)

In [None]:
# Reload the saved results so the analysis below can run without recomputing the entailment scores.
textual_entailment_df = pd.read_json('textual_entailment/textual_entailment_df.json')
# Relevance threshold used by the plots below; redefined here so this part can run on its own.
SCORE_THRESHOLD = 0

In [None]:
# (rows, columns) of the reloaded results.
textual_entailment_df.shape

In [None]:
# Inspect all columns of the first row.
textual_entailment_df.iloc[0]

In [None]:
# Distribution of the number of evidence items above the score threshold, one panel per evidence set.
fig, ax = plt.subplots(1,3,figsize=(15,5))


def _count_above_threshold(evidence_list):
    """Number of evidence items whose score is strictly above SCORE_THRESHOLD."""
    return len([ev for ev in evidence_list if ev['score'] > SCORE_THRESHOLD])


_evidence_columns = ['nlp_sentences_TOP_N', 'nlp_sentences_slide_2_TOP_N', 'nlp_sentences_all_TOP_N']
for _panel, _evidence_column in zip(ax, _evidence_columns):
    sns.histplot(
        textual_entailment_df[_evidence_column].apply(_count_above_threshold),
        ax=_panel, stat='probability', discrete=True
    )

plt.tight_layout()
plt.show()


In [None]:
fig, ax = plt.subplots(1,1,figsize=(10,10))

# Per-claim mean / max / min relevance score over ALL top-N evidence (no score threshold here).
df = textual_entailment_df[['netloc_agg', 'nlp_sentences_all_TOP_N']].copy()

for _stat_name, _stat_fn in (('mean', np.mean), ('max', np.max), ('min', np.min)):
    df[f'nlp_sentences_all_TOP_N_score_{_stat_name}'] = df.nlp_sentences_all_TOP_N.apply(
        # Claims without any evidence get a score of 0.
        lambda evidence, fn=_stat_fn : fn([ev['score'] for ev in evidence] if len(evidence) > 0 else 0)
    )

# Average the per-claim statistics per netloc. `numeric_only=True` keeps the list-valued
# evidence column out of the aggregation; without it pandas >= 2.0 raises a TypeError.
df = df.groupby('netloc_agg').mean(numeric_only=True).reset_index()
df = df.sort_values('nlp_sentences_all_TOP_N_score_mean', ascending=False).reset_index(drop=True)

# One horizontal line per netloc, from its mean minimum to its mean maximum score.
# y=i matches the categorical position because `order` below follows the sorted rows.
for i, row in df.iterrows():
    ax.hlines(
        y=i,
        xmin=row['nlp_sentences_all_TOP_N_score_min'],
        xmax=row['nlp_sentences_all_TOP_N_score_max'],
        color='black'
    )

for _score_column in (
    'nlp_sentences_all_TOP_N_score_mean',
    'nlp_sentences_all_TOP_N_score_max',
    'nlp_sentences_all_TOP_N_score_min',
):
    sns.swarmplot(
        data=df,
        x=_score_column,
        y='netloc_agg',
        order=df.netloc_agg,
        ax=ax,
        linewidth=1
    )

# Vertical guide lines.
for _guide_x in (1.00, 0.75, 0.50, 0.25, 0.00, -0.25, -0.50, -0.75):
    ax.axvline(x=_guide_x, c='black', ls='--', alpha=0.25)

ax.set_ylabel('Netlocs')
# No threshold is applied in this cell, so the label must not claim one.
ax.set_xlabel('Means of evidence scores (no threshold applied)')

plt.tight_layout()
plt.show()
# TODO: generate a version of this without bad netlocs

In [None]:
fig, ax = plt.subplots(1,1,figsize=(10,10))

# Per-claim mean / max / min relevance score over the top-N evidence scoring above SCORE_THRESHOLD.
df = textual_entailment_df[['netloc_agg', 'nlp_sentences_all_TOP_N']].copy()

# Apply the threshold once and reuse the filtered scores for all three statistics.
_scores_above_threshold = df.nlp_sentences_all_TOP_N.apply(
    lambda evidence : [ev['score'] for ev in evidence if ev['score'] > SCORE_THRESHOLD]
)

for _stat_name, _stat_fn in (('mean', np.mean), ('max', np.max), ('min', np.min)):
    df[f'nlp_sentences_all_TOP_N_score_{_stat_name}'] = _scores_above_threshold.apply(
        # Claims without any evidence above the threshold get a score of 0.
        lambda scores, fn=_stat_fn : fn(scores if len(scores) > 0 else 0)
    )

# Average the per-claim statistics per netloc. `numeric_only=True` keeps the list-valued
# evidence column out of the aggregation; without it pandas >= 2.0 raises a TypeError.
df = df.groupby('netloc_agg').mean(numeric_only=True).reset_index()
df = df.sort_values('nlp_sentences_all_TOP_N_score_mean', ascending=False).reset_index(drop=True)

# One horizontal line per netloc, from its mean minimum to its mean maximum score.
# y=i matches the categorical position because `order` below follows the sorted rows.
for i, row in df.iterrows():
    ax.hlines(
        y=i,
        xmin=row['nlp_sentences_all_TOP_N_score_min'],
        xmax=row['nlp_sentences_all_TOP_N_score_max'],
        color='black'
    )

for _score_column in (
    'nlp_sentences_all_TOP_N_score_mean',
    'nlp_sentences_all_TOP_N_score_max',
    'nlp_sentences_all_TOP_N_score_min',
):
    sns.swarmplot(
        data=df,
        x=_score_column,
        y='netloc_agg',
        order=df.netloc_agg,
        ax=ax,
        linewidth=1
    )

# Vertical guide lines.
for _guide_x in (0.75, 0.50, 0.25):
    ax.axvline(x=_guide_x, c='black', ls='--', alpha=0.25)

ax.set_ylabel('Netlocs')
ax.set_xlabel('Means of evidence scores above threshold')

plt.tight_layout()
plt.show()

In [None]:
fig, ax = plt.subplots(1,1,figsize=(10,10))

df = textual_entailment_df[['netloc_agg', 'nlp_sentences_all_TOP_N']].copy()

# 1 when the claim has at least one evidence item scoring above the threshold, else 0.
df['nlp_sentences_all_TOP_N_nonzero'] = df.nlp_sentences_all_TOP_N\
    .apply(lambda x : any(xx['score'] > SCORE_THRESHOLD for xx in x))\
    .astype(int)
df['count'] = 1

# Only the two numeric columns are aggregated; summing the list-valued evidence column
# would concatenate (or fail on) the lists and is not needed for the plot.
df = df[['netloc_agg', 'nlp_sentences_all_TOP_N_nonzero', 'count']]\
    .groupby('netloc_agg').sum().reset_index()
df['nlp_sentences_all_TOP_N_nonzero_perc'] = 100 * df['nlp_sentences_all_TOP_N_nonzero'] / df['count']
sns.barplot(
    data=df,
    x='nlp_sentences_all_TOP_N_nonzero_perc',
    y='netloc_agg',
    order=df.sort_values('nlp_sentences_all_TOP_N_nonzero_perc', ascending=False).netloc_agg,
    orient='h',
    ax=ax
)

# The x axis is a percentage (0-100), so the guide lines belong at 25/50/75 rather than 0.25/0.5/0.75.
for _guide_x in (75, 50, 25):
    ax.axvline(x=_guide_x, c='black', ls='--', alpha=0.25)

ax.set_ylabel('Netlocs')
ax.set_xlabel('Percentage of claims with relevance evidence collected')

plt.tight_layout()
plt.show()

In [None]:
fig, ax = plt.subplots(1,1,figsize=(15,15))

df = textual_entailment_df[['netloc_agg', 'evidence_TE_labels_all_TOP_N']].copy()


def _label_percentage(labels, label):
    """Percentage of `labels` equal to `label`; 0 when the labels are missing or empty."""
    if labels is None or len(labels) == 0:
        return 0
    return 100*len([xx for xx in labels if xx == label])/len(labels)


# Per-claim share of each evidence label; the column order defines the legend order below.
for _suffix, _label in (('SUP', 'SUPPORTS'), ('REF', 'REFUTES'), ('NEI', 'NOT ENOUGH INFO')):
    df[f'evidence_TE_labels_all_TOP_N_{_suffix}_perc'] = df.evidence_TE_labels_all_TOP_N.apply(
        lambda x, label=_label : _label_percentage(x, label)
    )

# `numeric_only=True` keeps the list-valued label column out of the aggregation (and out of
# the melt below); without it pandas >= 2.0 raises a TypeError.
df = df.groupby('netloc_agg').mean(numeric_only=True).reset_index()
df = df.sort_values('evidence_TE_labels_all_TOP_N_NEI_perc', ascending=True).reset_index(drop=True)
df_m = df.melt(id_vars='netloc_agg')

sns.barplot(
    data=df_m,
    x='value',
    y='netloc_agg',
    hue='variable',
    order=df.netloc_agg,
    ax=ax
)

h, l = ax.get_legend_handles_labels()
ax.legend(h, ['SUP','REF','NEI'])

ax.set_ylabel('Netlocs')
# The percentages are on the x axis; the original set this text as a second y label,
# which overwrote 'Netlocs'.
ax.set_xlabel('Breakdown of mean percentage of evidence labels')

plt.tight_layout()
plt.show()

In [None]:
fig, ax = plt.subplots(1,2,figsize=(10,5))

df = textual_entailment_df.copy()


def _label_percentages(labels):
    """Return a frame with the columns 'label' and 'percentage' for a Series of labels.

    The column names are set explicitly because the names produced by
    `value_counts().reset_index()` differ between pandas 1.x and pandas >= 2.0.
    """
    return (
        (labels.value_counts(normalize=True)*100)
        .rename_axis('label')
        .reset_index(name='percentage')
    )


# Left panel: weighted-sum method. Right panel: Malon method.
for _panel, _label_column in zip(
    ax, ['claim_TE_label_weighted_sum_all_TOP_N', 'claim_TE_label_malon_all_TOP_N']
):
    sns.barplot(
        data=_label_percentages(df[_label_column]),
        x='label',
        y='percentage',
        ax=_panel,
        order=['SUPPORTS','REFUTES','NOT ENOUGH INFO']
    )

ax[0].set(ylabel='Percentage of claim labels (WEIGHTED SUM)', xlabel='Labels')
ax[1].set(ylabel='Percentage of claim labels (MALON)', xlabel='Labels')

plt.tight_layout()
plt.show()

## Manual analysis of selected sentences PLUS textual entailment
In addition to quantitative metrics at this step, let's try just looking at some examples by hand.

We will, for all netloc_aggs, choose a random sample and check if selected evidence make sense given claim.

**Notes**:
- The Weighted Sum method provides a better distinction between Refutes and Supports (specifically, it reduces classifying true REFUTE as false SUPPORT from 0.12 to 0.6), but has a harder time with NOT ENOUGH INFO.
- The Malon method is the opposite: it provides a better distinction between NOT ENOUGH INFO and the other two, but has a higher chance of classifying true REFUTE as false SUPPORT.
- *We should come with a method that mixes both*, as weighted sum considers the evidence scores while malon does not. Perhaps modifying Malon's in a way to consider the positive evidence scores.
- Utilising negative evidence scores is not something we do now. So, when it comes to R3 of showing negatively scored evidence, we might have to think on a way to incorporate them in the final label.
- Past comments on the manual analysis of selected sentences all fit here.

- In some cases, TE is UNLIKELY to rate a pair as NEI if the evidence sentence is long. This is because often the TE module will fix favourable (S or R) attention to parts of the information that are not necessarily pertinent to the subject.
    - "Noel Short died in 2001", for example. The evidences repeatedly talk about 2001 and Short, and in dying and Short, but not all three together. This moves the labels away from NEI, as these sentences are relevant, but not in the exact combination of information that only by imbuing the module with common sense it would be able to spot (like when talking about his wife dying through 'she', the module should know 'she' could not possibly mean Short). In other sentence, it mentions "Sir Noel Short" ... "who has died" ... "in the 1970s", which leads to a REFUTES label, as this long sentence spans lots of information that fool the module.
- This might also be an issue when the subject is information-heavy, as the object might get overlooked. Some of these are "solved" because they receive very low evidence scores too.
    - In the "The former wash house, Blackness Castle, is located in the administrative territorial entity of Falkirk.", the evidence "Address/Name of Site. Former barracks block including outbuildings and yard walls to rear and wash house to northeast, excluding custodian's cottage to northeast and scheduled monument SM90036, Blackness Castle, Blackness LB52456." is assigned as SUPPORTS as it cites "former", "wash house", "Blackness Castle", and location names such as "Address of site".
    - Same with "Tetradrachm of Elagabalus, Emperor of Rome, from Antioch, Yale University Art Gallery, inv. 1938.6000.104 is an example of an archaeological artifact." and the evidence "Tetradrachm of Elagabalus, Emperor of Rome from Antioch."
    - Same with "The Restrictive Trade Practices (Jewellery, Watches and Clocks) Order, 1968 applies to the jurisdiction of Ireland."
- A similar issue happens when the subject/object is information-heavy on the claim but abbreviated on the evidence.
    - Like "administrative territorial entity of Anaheim" becoming just "Anaheim".
- Obviously there are issues when whatever is said in the sentence needs some common sense/knowledge to conclude the claim, like in the "The former wash house, Blackness Castle, is located in the administrative territorial entity of Falkirk." case, where the quoted evidence mentions Blackness but never names Falkirk.
    - For instance, "A Rocky Shore, with the Tour de Croy in the Distance is located in Tate." is true because the link leads to the Tate website with the art piece on it, not because it particularly says so in its text.
    - "Naravoor Lps is located in India." needs knowledge that Kannur is in India.
- Rare ways of writing information throw the modules off.
    - Like "Deathdate" instead of "Death date", on 'Alec Waugh died on 03/09/1981.' with evidence "Birthdate: 8 July 1898. Death date: 3 September 1981."

To alter before crowdsourcing:
- **Remember** to alter the URLs leading to paywall-protected pages by using their https://archive.ph/ version.
    - You can build a list by checking if 'archive.ph' is in the html.
- A handful of websites are not in English; we can get a list from netloc archinform.net.

In [None]:
# Preview the first five rows of the textual entailment results.
textual_entailment_df.head(n=5)

In [None]:
# Manual inspection helper: for every aggregated netloc, print a few sampled
# claims with their claim-level TE labels, followed by each sentence found in
# `nlp_sentences_all_TOP_N` and its evidence-level scores/labels.
df = textual_entailment_df.copy()

# Upper bound on how many claims are printed per netloc.
N_SAMPLES_PER_NETLOC = 3

# Bound up-front so the except handler never references an undefined name
# (which would mask the real error with a NameError on a fresh kernel).
row = None

try:
    for netloc in df.netloc_agg.unique().tolist():
        print('-'*10)
        print(netloc,'\n')
        netloc_rows = df[df['netloc_agg'] == netloc]
        # DataFrame.sample raises ValueError when asked for more rows than
        # are available, so cap the sample size at the group size.
        sampled_rows = netloc_rows.sample(
            min(N_SAMPLES_PER_NETLOC, len(netloc_rows)),
            random_state=42,
        )
        # The frame index is not needed; `_` avoids clashing with the
        # sentence index `i` used in the inner loop.
        for _, row in sampled_rows.iterrows():
            print(
                '->', row['final_verbalisation'],'\n',
                '\t-WS:', row['claim_TE_label_weighted_sum_all_TOP_N'], f"({row['claim_TE_prob_weighted_sum_all_TOP_N']})\n",
                '\t-M: ', row['claim_TE_label_malon_all_TOP_N']
            )
            print('->>', row['final_url'])
            for i, sentence in enumerate(row['nlp_sentences_all_TOP_N']):
                # Sentences whose score is not above SCORE_THRESHOLD are shown
                # with a zeroed weighted probability instead of the stored one.
                if sentence['score'] > SCORE_THRESHOLD:
                    weighted_prob = row['evidence_TE_prob_weighted_all_TOP_N'][i]
                else:
                    weighted_prob = [0,0,0]
                print(
                    f"\t{i+1}. {sentence['sentence']}\n",
                    f"\t-Evidence Score: {sentence['score']}\n",
                    f"\t-Label Prob: {row['evidence_TE_prob_all_TOP_N'][i]}\n",
                    f"\t-Label Prob Weighted: {weighted_prob}\n",
                    f"\t-Label: {row['evidence_TE_labels_all_TOP_N'][i]}\n",
                )
            print()
except Exception:
    # Show the most recently processed row (if any) to help debugging, then
    # re-raise so the original error is not swallowed.
    if row is not None:
        print(row)
    raise