# Libraries

In [3]:
import spacy
import pandas as pd
import numpy as np
from spacy.tokens import Doc, Span
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('dbpedia_spotlight', config={'confidence': 0.5})
preprocessing = spacy.load('en_core_web_sm')
from fastcoref import LingMessCoref
coref_model = LingMessCoref()
from SPARQLWrapper import SPARQLWrapper, JSON
from datasets import load_dataset
dataset = load_dataset("potsawee/wiki_bio_gpt3_hallucination")
import os
from tqdm.notebook import tqdm
import concurrent.futures
import re
import requests

07/04/2024 12:35:38 - INFO - 	 missing_keys: []
07/04/2024 12:35:38 - INFO - 	 unexpected_keys: []
07/04/2024 12:35:38 - INFO - 	 mismatched_keys: []
07/04/2024 12:35:38 - INFO - 	 error_msgs: []
07/04/2024 12:35:38 - INFO - 	 Model Parameters: 590.0M, Transformer: 434.6M, Coref head: 155.4M


In [4]:
import wikipedia

In [5]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-mpnet-base-v2")

07/04/2024 12:35:54 - INFO - 	 Use pytorch device_name: mps
07/04/2024 12:35:54 - INFO - 	 Load pretrained SentenceTransformer: all-mpnet-base-v2


In [6]:
import numpy as np
from urllib.parse import urlparse

# Coreference Resolution

In [7]:

def get_cluster_spans(doc, clusters):
    fast_clusters = []
    for cluster in clusters:
        new_group = []
        for start, end in cluster:
            span = doc.char_span(start, end)
            if span is not None:
                new_group.append([span.start, span.end - 1])
        fast_clusters.append(new_group)
    return fast_clusters

def get_clusters(doc, text):
    preds = coref_model.predict(texts=[text])
    # print(f"\nThe clusters of same entities are as follows: {preds[0].get_clusters(as_strings=True)} \n")
    clusters = preds[0].get_clusters(as_strings=False)
    cluster_spans = get_cluster_spans(doc, clusters)
    return cluster_spans

def get_span_noun_indices(doc, cluster):    
    spans = [doc[start:end+1] for start, end in cluster]

    spans_pos = []
    for span in spans:
        pos_tags = [token.pos_ for token in span]
        spans_pos.append(pos_tags)

    noun_indices = []
    for i, pos_list in enumerate(spans_pos):
        if 'NOUN' in pos_list or 'PROPN' in pos_list:
            noun_indices.append(i)
    return noun_indices

def get_cluster_head(doc, cluster, noun_indices):
    head_idx = noun_indices[0]
    head_start, head_end = cluster[head_idx]
    head_span = doc[head_start:head_end+1]
    return head_span, [head_start, head_end]

def is_containing_other_spans(span, all_spans):
    for s in all_spans:
        if s[0] >= span[0] and s[1] <= span[1] and s != span:
            return True  
    return False

def replacement(coref, resolved, mention_span):
    start, end = coref
    mention_text = mention_span.text_with_ws + " "
    resolved[start] = mention_text
    for i in range(start + 1, end + 1):
        resolved[i] = ""
    return resolved

def replace_corefs(document, clusters):
    resolved = [token.text_with_ws for token in document]
    all_spans = [span for cluster in clusters for span in cluster]

    for cluster in clusters:
        noun_indices = get_span_noun_indices(document, cluster)

        if noun_indices:
            mention_span, mention = get_cluster_head(document, cluster, noun_indices)
        else:
            start, end = cluster[0]
            mention_span = document[start:end+1]
            mention = cluster[0]
            
        for coref in cluster:
            resolved = replacement(coref, resolved, mention_span)

    
    return ("".join(resolved))


def coreference_resolution(text):
    doc = nlp(text)
    clusters = get_clusters(doc, text) 
    answer= replace_corefs(doc, clusters) 
    return answer

# Pre-Processing

In [8]:
def is_three_word_name(entity):
    return len(entity.text.split()) >= 3 and entity.label_ == "PERSON"

In [9]:
def replace_three_worded_names(text):
    doc = preprocessing(text)
    new_text = text
    
    for entity in doc.ents:
        if is_three_word_name(entity):
            words = entity.text.split()
            new_name = f"{words[0]} {words[-1]}"
            new_text = new_text.replace(entity.text, new_name)
    return new_text

In [10]:
def preprocess_text(text): 
    # text = replace_three_worded_names(text) 
    text = re.sub(r'[^\w\s.,()\'"\-]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r"(['\"])\1+", r"\1", text)
    # preprocessed_text = coreference_resolution(text)
    return text

# Spotlight Based System

## Sentence Based Link Making

In [None]:
def get_sentence_based_links(text):
    final_text = coreference_resolution(text)
    doc = nlp(final_text)
    final_sents = [sents for sents in doc.sents]
    # entities = list(doc.ents)
    # print("Entities found by spaCy:", entities)
    
    subjects = []
    subject_set = set([])
    for sent in doc.sents:
        found_subject = False
        for token in sent:
            # print(f"{token},{token.dep_}")
            if token.dep_ in ['nsubj', 'nsubjpass'] and token.ent_kb_id_:
                subjects.append(token.ent_kb_id_)
                subject_set.add(token.ent_kb_id_)
                found_subject = True
                break
        if not found_subject:
            for i, token in enumerate(sent):
                if token.dep_ in ["dobj","iobj","pobj"]:
                    subjects.append(token.ent_kb_id_)
                    subject_set.add(token.ent_kb_id_)
                    found_subject = True
                    break
        if not found_subject:
            subjects.append(None)
    ("Subjects identified:", subjects)
    
    sentence_forms = []
    for sent in doc.sents:
        entities = [(ent.text, ent.start, ent.end) for ent in sent.ents]
        sentence_forms.append(entities)
    # print("Entities in each sentence:", sentence_forms)
    
    # print(subjects)
    
    pairs = []
    count = 0
    for i in range(len(sentence_forms)):
        tmp_storage = []
        if subjects[i] is not None and len(sentence_forms[i]) > 1:
            for entity, ent_start, ent_end in sentence_forms[i]:
                check_sub = False
                for token in doc[ent_start:ent_end]:
                    if token.dep_ in ['nsubj', 'nsubjpass']:
                        check_sub = True
                        break
                if check_sub == True or entity in subjects:
                    continue
                tmp_storage.append([subjects[i], entity])
                count += 1
        pairs.append(tmp_storage)
    print(f"The number of pairs is: {count}\n")
    
    return pairs, final_sents, subject_set

In [None]:
def get_sentence_based_links(text):
    final_text = coreference_resolution(text)
    doc = nlp(final_text)
    final_sents = [sents for sents in doc.sents]
    # entities = list(doc.ents)
    # print("Entities found by spaCy:", entities)
    
    subjects = []
    subject_set = set([])
    for sent in doc.sents:
        found_subject = False
        for token in sent:
            # print(f"{token},{token.dep_}")
            if token.dep_ in ['nsubj', 'nsubjpass'] and token.ent_kb_id_:
                subjects.append(token.ent_kb_id_)
                subject_set.add(token.ent_kb_id_)
                found_subject = True
                break

        if not found_subject:
            subjects.append(None)
    ("Subjects identified:", subjects)
    
    new_doc = preprocessing(final_text)
    sentence_forms = []
    for sent in new_doc.sents:
        entities = [(ent.text, ent.start, ent.end) for ent in sent.ents]
        sentence_forms.append(entities)
    
    # print("Entities in each sentence:", sentence_forms)
    
    # print(subjects)
    
    pairs = []
    count = 0
    for i in range(len(sentence_forms)):
        tmp_storage = []
        if subjects[i] is not None and len(sentence_forms[i]) > 1:
            for entity, ent_start, ent_end in sentence_forms[i]:
                check_sub = False
                for token in new_doc[ent_start:ent_end]:
                    # print(f"{token} and {token.dep_}")
                    if token.dep_ in ['nsubj', 'nsubjpass']:
                        check_sub = True
                        break
                if check_sub == True:
                    continue
                tmp_storage.append([subjects[i], entity])
                count += 1
        pairs.append(tmp_storage)
    print(f"The number of pairs is: {count}\n")
    
    return pairs, final_sents, subject_set

In [None]:
text = """Wilhelm Windelband (May 11, 1848 - October 22, 1915) was a German philosopher of the Baden School. Wilhelm Windelband is now mainly remembered for the terms "nomothetic" and "idiographic", which Wilhelm Windelband introduced. the terms "nomothetic" and "idiographic", which he introducedhave currency in psychology and other areas, though not necessarily in line with Wilhelm Windelband original meanings. Wilhelm Windelband was a Neo-Kantian who protested other Neo-Kantians of Wilhelm Windelband time and maintained that "to understand Kant rightly means to go beyond Kant ". Against Wilhelm Windelband positivist contemporaries, Wilhelm Windelband argued that philosophy should engage in humanistic dialogue with the natural sciences rather than uncritically appropriating the natural sciences methodologies. Wilhelm Windelband interests in psychology and cultural sciences represented an opposition to psychologism and historicism schools by a critical philosophic system. Wilhelm Windelband relied in Wilhelm Windelband effort to reach beyond Kant on such philosophers as Georg Wilhelm Friedrich Hegel, Johann Friedrich Herbart, and Hermann Lotze. Closely associated with Wilhelm Windelband was Heinrich Rickert. Wilhelm Windelband disciples were not only noted philosophers, but sociologists like Max Weber and theologians like Ernst Troeltsch and Albert Schweitzer."""
pairs, final_sents, subject_set = (get_sentence_based_links(text))
print(pairs)

## Making Direct Subject Link Embeddings

In [None]:
def subjects_direct_links(subjects_set):
    subject_direct_dict = {}
    for subj in subjects_set:
        sparql = SPARQLWrapper("https://dbpedia.org/sparql")
        sparql.setMethod('POST')  

        # Dynamically insert the subject into the query
        query_source_to_target = f"""
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX dbo: <http://dbpedia.org/ontology/>
        PREFIX dbr: <http://dbpedia.org/resource/>

        SELECT DISTINCT ?label
        WHERE {{
        {{
            <{subj}> ?property ?entity .
            ?entity rdfs:label ?label .
        }} UNION {{
            ?entity ?property <{subj}> .
            ?entity rdfs:label ?label .
        }}
        FILTER (lang(?label) = "en")
        }}
        """
        sparql.setQuery(query_source_to_target)
        sparql.setReturnFormat(JSON)

        try:
            result = sparql.query().convert()
            tmp = [item['label']['value'] for item in result["results"]["bindings"]]
            subject_direct_dict[subj] = model.encode(tmp,show_progress_bar=False)
        except Exception as e:
            print(f"Query for {subj} didn't work: {e}")

    return subject_direct_dict


### Testing

In [None]:
# Example usage
result_dict = subjects_direct_links(subject_set)
print(result_dict)

In [None]:
word = "Psychologist"
entity_embedding = model.encode(word)
embeddings = result_dict['http://dbpedia.org/resource/Wilhelm_Windelband']
cosine_similarities = np.dot(embeddings, entity_embedding)
print(np.argmax(cosine_similarities))
cosine_similarities[np.argmax(cosine_similarities)]

## Checking the Direct Links

In [None]:
def check_direct_link(source_target):
    source_uri, target_uri = source_target
    sparql = SPARQLWrapper("https://dbpedia.org/sparql")
    sparql.setMethod('POST')  
    
    query_source_to_target = f"""
    ASK WHERE {{
      <{source_uri}> ?p <{target_uri}> .
    }}
    """
    sparql.setQuery(query_source_to_target)
    sparql.setReturnFormat(JSON)

    try:
        result_source_to_target = sparql.query().convert()
        has_link_source_to_target = result_source_to_target['boolean']
    except Exception as e:
        print(f"Error querying {source_uri} -> {target_uri}: {e}")
        has_link_source_to_target = False

    # Query from target to source
    query_target_to_source = f"""
    ASK WHERE {{
      <{target_uri}> ?p <{source_uri}> .
    }}
    """
    sparql.setQuery(query_target_to_source)

    try:
        result_target_to_source = sparql.query().convert()
        has_link_target_to_source = result_target_to_source['boolean']
    except Exception as e:
        print(f"Error querying {target_uri} -> {source_uri}: {e}")
        has_link_target_to_source = False

    # Combine the results
    has_link = has_link_source_to_target or has_link_target_to_source
    return source_uri, target_uri, has_link

In [None]:
print(check_direct_link(["http://dbpedia.org/resource/John_Russell_Reynolds","http://dbpedia.org/resource/Judge"])[2])

## Scoring

In [None]:
def scoring_linear_version(text):
    pairs,final_sents,subject_set = get_sentence_based_links(text)
    subject_dict = subjects_direct_links(subject_set)
    
    if not pairs:
        print("No entity pairs found here.")
        return 0,[],final_sents
    
    fractions = []
    pair_and_values = []
    
    for sent_sets in pairs:
        score = 0
        for pair in sent_sets:
            if check_direct_link(pair)[2]:
                score+=1
                pair_and_values.append([pair[0],pair[1],1])
            else:
                local_subject= pair[0]
                entity_link = pair[1]
                last_part = entity_link.split('/')[-1]
                entity = last_part.replace('_', ' ')
                print(entity)
                entity_embeddings = model.encode(entity,show_progress_bar=False)
                embeddings = subject_dict[local_subject]
                cosine_similarities = np.dot(embeddings, entity_embeddings)
                max_similarity = np.max(cosine_similarities)
                print(max_similarity)
                if max_similarity>0.65:
                    score+=1
                    pair_and_values.append([pair[0],pair[1],1])
                else:
                    pair_and_values.append([pair[0],pair[1],0])
        if len(sent_sets)==0:
            fractions.append(-1)
        else:
            fractions.append(score/len(sent_sets))
    
    return fractions, pair_and_values, final_sents

In [None]:
def process_sentence_set(args):
    index, sent_sets, subject_dict = args
    score = 0
    local_pair_and_values = []
    for pair in sent_sets:
        _, _, is_direct = check_direct_link(pair)
        if is_direct:
            score += 1
            local_pair_and_values.append([pair[0], pair[1], 1])
        else:
            local_subject= pair[0]
            entity_link = pair[1]
            last_part = entity_link.split('/')[-1]
            entity = last_part.replace('_', ' ')
            entity_embeddings = model.encode(entity,show_progress_bar=False)
            embeddings = subject_dict[local_subject]
            
            if embeddings.size == 0:
                local_pair_and_values.append([pair[0], pair[1], 0])
                continue
                
            
            cosine_similarities = np.dot(embeddings, entity_embeddings)
            max_similarity = np.max(cosine_similarities)
            if max_similarity>0.65:
                score+=1
                local_pair_and_values.append([pair[0],pair[1],1])
            else:
                local_pair_and_values.append([pair[0],pair[1],0])
    fraction = score / len(sent_sets) if len(sent_sets) > 0 else -1
    return index, fraction, local_pair_and_values

def scoring_parallel_version(text):
    pairs, final_sents, subject_set = get_sentence_based_links(text)
    subject_dict = subjects_direct_links(subject_set)

    if not pairs:
        print("No entity pairs found here.")
        return 0, [], final_sents

    fractions = [None] * len(pairs)
    pair_and_values = [None] * len(pairs)

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # Prepare the arguments for each task
        task_args = [(index, sent_sets, subject_dict) for index, sent_sets in enumerate(pairs)]
        # Process each sentence set in parallel
        results = list(executor.map(process_sentence_set, task_args))

    # Place results back into the correct order
    for index, fraction, local_pair_and_values in results:
        fractions[index] = fraction
        pair_and_values[index] = local_pair_and_values

    # Flatten pair_and_values list
    flat_pair_and_values = [item for sublist in pair_and_values for item in sublist]

    return fractions, flat_pair_and_values, final_sents


In [None]:
# #test =  ["John Russell Reynolds (1820–1876) was an English lawyer, judge, and author.", "He was born in London, the son of a barrister, and was educated at Eton College and Trinity College, Cambridge.", "He was called to the bar in 1845, and became a Queen's Counsel in 1859.", "He was appointed a judge of the Court of Common Pleas in 1867, and was knighted in 1871.", "Reynolds was a prolific author, writing on a wide range of topics.", "He wrote several books on legal topics, including The Law of Libel and Slander (1863), The Law of Copyright (1865), and The Law of Patents for Inventions (1868).", "He also wrote on a variety of other topics, including history, biography, and literature.", "He was a frequent contributor to the Saturday Review, and wrote several books on Shakespeare, including The Mystery of William Shakespeare (1848) and The Authorship of Shakespeare (1875).", "He also wrote a biography of the poet John Keats (1848)." ]
# test = [ "Gordon David Strachan (born 9 February 1957) is a Scottish football manager and former player.", "He is the manager of the Scotland national team.", "Strachan played for Dundee, Aberdeen, Manchester United, Leeds United and Coventry City, as well as the Scotland national team.", "He has also managed Coventry City, Southampton, Celtic and Middlesbrough.", "Strachan began his managerial career at Coventry City in 1996, leading them to the 1997 FA Cup Final, where they lost to Tottenham Hotspur.", "He then moved to Southampton in 2001, where he guided them to the 2003 FA Cup Final, which they lost to Arsenal.", "In 2005, he was appointed manager of Celtic, where he won three consecutive Scottish Premier League titles and the Scottish League Cup twice.", "He left Celtic in 2009 and was appointed manager of Middlesbrough in October 2010.", "He left Middlesbrough in October 2013.", "In January 2013, Strachan was appointed manager of the Scotland national team.", "He has since led Scotland to the UEFA Euro 2016 qualifying playoffs, where they were eliminated by eventual finalists, and to the 2018 FIFA World Cup" ]
# text = ""
# for i in test:
#    text += " " + i
# print(text)
print(scoring_parallel_version(text))

# print(end_time - start_time)


## Experimentation

In [None]:
accurate = []
minor_inaccurate = []
major_inaccurate = []

In [None]:
folder_name = "Output"

if not os.path.exists(folder_name):
        os.makedirs(folder_name)

for i in tqdm(range(28,100), desc="Processing entries", unit="entry"):
    list_of_sentences = (dataset["evaluation"][i]["gpt3_sentences"])
    sentences = ''''''
    for s in list_of_sentences:
        tmp = preprocess_text(s)
        if tmp[-1]!='.':
            tmp+='.'
        sentences = sentences + tmp + "\n" 
    ground_truth_doc = preprocessing(dataset["evaluation"][i]["wiki_bio_text"])
    ground_truth = ''''''
    for sent in ground_truth_doc.sents:
        temp = preprocess_text(sent.text)
        if temp[-1]!='.':
            temp+='.'
        ground_truth = ground_truth + temp + "\n" 
    annotation = dataset["evaluation"][i]["annotation"]
    
    sentences_scores, sentence_pairs_and_values , sentence_coref_sents= scoring_parallel_version(sentences)
    ground_truth_scores, ground_pairs_and_values, ground_coref_sents = scoring_parallel_version(ground_truth)
        
    filename = os.path.join(folder_name, f"entry_{i+1}.txt")
    with open(filename, 'w') as file:
        file.write("#############SENTENCE_PAIRS############\n\n")
        for x in range(len(sentence_pairs_and_values)):
            file.write(f"{sentence_pairs_and_values[x][0]} and {sentence_pairs_and_values[x][1]} and the value is : {sentence_pairs_and_values[x][2]}\n")
        file.write("\n")
        file.write("#############GROUND_PAIRS############\n\n")
        for x in range(len(ground_pairs_and_values)):
            file.write(f"{ground_pairs_and_values[x][0]} and {ground_pairs_and_values[x][1]} and the value is : {ground_pairs_and_values[x][2]}\n")
        file.write("\n\n")
        file.write("%%%%%%%%%%%%%%%%%%%%SENTENCES%%%%%%%%%%%%%%%%%\n")
        file.write(f"Sentences : \n\n{sentences}\n\n")
        file.write(f"Coref Resolved: \n\n")
        for y,x in enumerate(sentence_coref_sents):
            file.write(f"{y}. {x}")
        file.write("\n")
        file.write("\n")
        file.write("%%%%%%%%%%%%%%%%%%%%GROUND_TRUTH%%%%%%%%%%%%%%%\n")
        file.write(f"Ground Truth : \n\n{ground_truth} \n\n")
        file.write(f"Coref Resolved: \n\n")
        for y,x in enumerate(ground_coref_sents):
            file.write(f"{y}. {x}")
        file.write("\n")
        file.write("%%%%%%%%%%%%%%%%%%%%FRACTIONS%%%%%%%%%%%%%%%%%%\n")
        file.write(f"Value for sentences is : \n")
        for x in sentences_scores:
            file.write(f"{x} ")
        file.write("\n")
        file.write(f"Value for ground truth is : \n")
        for x in ground_truth_scores:
            file.write(f"{x} ")
        file.write("\n\n")
        file.write("%%%%%%%%%%%%%%%%%%%%ANNOTATIONS%%%%%%%%%%%%%%%%\n")
        for x in annotation:
            file.write(f"{x} ")
        file.write("\n\n")
        
    suma = 0
    leng = 0
    for x in ground_truth_scores:
        if x!=-1:
            suma+=x
            leng+=1
    
    if leng>0:
        print(f"The ground truth scores are: {suma/leng}\n")

    if (leng)>0 and (suma/leng)>0.25:
        if len(sentences_scores)!=len(annotation):
            continue
        for scores in ground_truth_scores:
            if scores==-1:
                continue
            else:
                accurate.append(scores)
        for t,score in enumerate(sentences_scores):
            if score==-1:
                continue
            if annotation[t-1]=="accurate":
                accurate.append(score) 
            elif annotation[t-1]=="minor_inaccurate":
                minor_inaccurate.append(score)
            elif annotation[t-1]=="major_inaccurate":
                major_inaccurate.append(score)

In [None]:
sum(accurate)/len(accurate)

In [None]:
len(accurate)

In [None]:
sum(minor_inaccurate)/len(minor_inaccurate)

In [None]:
len(minor_inaccurate)

In [None]:
sum(major_inaccurate)/len(major_inaccurate)

In [None]:
len(major_inaccurate)

In [None]:
text = "Traded to the San Diego Chargers, Aldridge played two seasons in San Diego before retiring from professional football in 1973."

In [None]:
doc = preprocessing(text)

In [None]:
for ents in doc.ents:
    print(f"{ents} __ {ents.label_}")

In [None]:
import spacy
import requests

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Define the sentence
sentence = "Albert Einstein was a theoretical physicist who developed the theory of relativity."

# Parse the sentence using spaCy
doc = nlp(sentence)

# Extract subjects from the sentence
subjects = [token.text for token in doc if token.dep_ == "nsubj"]

print("Extracted Subjects:", subjects)

def link_to_dbpedia(subject):
    spotlight_url = "https://api.dbpedia-spotlight.org/en/annotate"
    headers = {"accept": "application/json"}
    params = {"text": subject, "confidence": 0.5}

    response = requests.get(spotlight_url, headers=headers, params=params)
    if response.status_code == 200:
        data = response.json()
        if 'Resources' in data:
            resources = data['Resources']
            return resources[0]['@URI']
    return None

# Link each extracted subject to a DBpedia entry
for subject in subjects:
    dbpedia_uri = link_to_dbpedia(subject)
    if dbpedia_uri:
        print(f"Subject: {subject} -> DBpedia URI: {dbpedia_uri}")
    else:
        print(f"Subject: {subject} -> No DBpedia URI found")


# Wikipedia Based System

Wikipediaapi Doesnt work ->

Switched to wikipedia

## Subject Extraction

In [227]:
base = "Produce a list of common words in the english language."
text = preprocessing(base)

In [226]:
def extract_compound_subject_span(sent):
    compound_subject = []

    for i, token in enumerate(sent):
        # print(f"{token} and {token.dep_}")
        if token.dep_ in ["nsubj","nsubjpass"]:
            compound_subject.append(token.text)
            for j in range(i - 1, -1, -1):
                if sent[j].dep_ == "compound":
                    # print(f"{sent[j]} and {sent[j].dep_}")
                    compound_subject.insert(0, sent[j].text) 
                else:
                    break  
            break
    if compound_subject == []:
        for i, token in enumerate(sent):
            # print(f"{token} and {token.dep_}")
            if token.dep_ in ["dobj","iobj","pobj"]:
                compound_subject.append(token.text)
                for j in range(i - 1, -1, -1):
                    if sent[j].dep_ == "compound":
                        # print(f"{sent[j]} and {sent[j].dep_}")
                        compound_subject.insert(0, sent[j].text) 
                    else:
                        break  
                break

    return " ".join(compound_subject) if compound_subject else None

In [228]:
extract_compound_subject_span(text)

AttributeError: 'str' object has no attribute 'dep_'

In [165]:
text ="""After the family moved to Kilburn, Tommy Nutter and Tommy Nutter brother David attended Willesden Technical College."""
text = """Windelband relied in his effort to reach beyond Kant on such philosophers as Georg Wilhelm Friedrich Hegel, Johann Friedrich Herbart, and Hermann Lotze."""

text = coreference_resolution(text)
input = preprocessing(text)

# for token in input:
#     print(f"{token} and {token.dep_}")
# for sent in input.sents:
#     print(extract_compound_subject_span(sent))
for entities in input.ents:
    print(entities)

07/04/2024 16:39:20 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:39:21 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

Windelband
Windelband
Kant
Georg Wilhelm Friedrich Hegel
Johann Friedrich Herbart
Hermann Lotze


## Sentence Based Link Formation

In [215]:
def get_sentence_based_links_wiki(text):
    output = {}
    
    output["Initial_Text"] = text
    # print(text)
    
    t1 = coreference_resolution(text)
    final_text = preprocess_text(t1)
    
    # print(final_text)
    output["Coref_Resolved_Text"] = final_text
    
    doc = nlp(final_text)
    
    # print([ents for ents in doc.noun_chunks])
    
    final_sents = [sents for sents in doc.sents]
    output["Coref_Resolved_Sentences"] = final_sents 
    
    subjects = []
    subject_set = set([])
    subjects_with_context = []
    
    for sent in doc.sents:
        tmp_subj = extract_compound_subject_span(sent)
        subjects.append(tmp_subj)
        if tmp_subj!=None:
            if tmp_subj not in subject_set:
                subject_set.add(tmp_subj)
                subjects_with_context.append([tmp_subj,sent.text])

    
    sentence_forms = []
    base_form = []
    
    # for sent in final_sents:
    #     tmp_text = sent.text
    #     tmp_text = tmp_text.strip()
    #     new_sent = preprocessing(tmp_text)
        
    #     entities = []
    #     for ents in new_sent.noun_chunks:
    #         start_idx = ents.start
    #         end_idx = ents.end

    #         start_context = max(0, start_idx - 5) 
    #         end_context = min(len(sent), end_idx + 5) 

    #         context_tokens = sent[start_context:end_context]
    #         entity_context = " ".join(token.text for token in context_tokens)

    #         new_text = f"{ents.text}: {entity_context}"
    #         entities.append(new_text)
        
    #     sentence_forms.append(entities)
        
    
    for i,sent in enumerate(final_sents):
        tmp_text = sent.text
        tmp_text = tmp_text.strip()
        new_sent = preprocessing(tmp_text)
        
        entities = []
        base_entities = []
        for ents in new_sent.noun_chunks:
            if ents.text not in subjects:
                new_text = f"{ents.text}"
                entities.append(new_text)
                base_entities.append(ents.text)
        
        sentence_forms.append(entities)
        base_form.append(base_entities)
    
    # print("Entities in each sentence:", sentence_forms)
    
    output["Sentence_wise_Entities"] = sentence_forms
    output["Sentence_wise_subjects"] = subjects
    # print(subjects)
    
    pairs = []
    count = 0
    for i in range(len(sentence_forms)):
        tmp_storage = []
        if subjects[i] is not None and len(sentence_forms[i]) > 0:
            for j,entity in enumerate(sentence_forms[i]):
                tmp_storage.append([subjects[i], entity])
                count += 1
        pairs.append(tmp_storage)
    # print(f"The number of pairs is: {count}\n")
    
    output["Sentence_Wise_pairs"] = pairs
    
    return pairs, final_sents, subject_set, subjects_with_context,output

In [216]:
text ="""Wilhelm Windelband (May 11, 1848 - October 22, 1915) was a German philosopher of the Baden School.
Windelband is now mainly remembered for the terms "nomothetic" and "idiographic", which he introduced.
These have currency in psychology and other areas, though not necessarily in line with his original meanings.
Windelband was a Neo-Kantian who protested other Neo-Kantians of his time and maintained that "to understand Kant rightly means to go beyond him".
Against his positivist contemporaries, Windelband argued that philosophy should engage in humanistic dialogue with the natural sciences rather than uncritically appropriating its methodologies.
His interests in psychology and cultural sciences represented an opposition to psychologism and historicism schools by a critical philosophic system.
Windelband relied in his effort to reach beyond Kant on such philosophers as Georg Wilhelm Friedrich Hegel, Johann Friedrich Herbart, and Hermann Lotze.
Closely associated with Windelband was Heinrich Rickert.
Windelband's disciples were not only noted philosophers, but sociologists like Max Weber and theologians like Ernst Troeltsch and Albert Schweitzer."""

pairs, final_sents, subject_set, subjects_with_context,output = (get_sentence_based_links_wiki(text))
print(pairs)
print(output)

07/04/2024 17:08:22 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:08:22 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

[[['Wilhelm Windelband', 'May 11, 1848 - October'], ['Wilhelm Windelband', 'a German philosopher'], ['Wilhelm Windelband', 'the Baden School']], [['Wilhelm Windelband', 'the terms'], ['Wilhelm Windelband', 'which'], ['Wilhelm Windelband', 'he']], [['terms', 'the terms'], ['terms', 'which'], ['terms', 'he'], ['terms', 'currency'], ['terms', 'psychology'], ['terms', 'other areas'], ['terms', 'line'], ['terms', 'Wilhelm Windelband original meanings']], [['Wilhelm Windelband', 'a Neo-Kantian'], ['Wilhelm Windelband', 'who'], ['Wilhelm Windelband', 'other Neo-Kantians'], ['Wilhelm Windelband', 'Wilhelm Windelband time'], ['Wilhelm Windelband', 'Kant'], ['Wilhelm Windelband', 'Kant']], [['Wilhelm Windelband', 'Wilhelm Windelband positivist contemporaries'], ['Wilhelm Windelband', 'philosophy'], ['Wilhelm Windelband', 'humanistic dialogue'], ['Wilhelm Windelband', 'the natural sciences'], ['Wilhelm Windelband', 'the natural sciences methodologies']], [['Wilhelm Windelband interests', 'psychol

## Subject Direct Links Linear

In [217]:
def subjects_direct_links_wiki(subject_with_context):
    
    subject_direct_dict = {}
    subject_direct_word_dict = {}
    
    result_set = set([])
    result_dict = {}
    
    for pair in subject_with_context:
        subject = pair[0]
        context = pair[1]
        
        try:
            subj = subject + ": " + context
            if len(subj) > 290:
                subj = subj[:290]  # Truncate to 250 characters if it exceeds 300
                # print(f"Truncated query: {subj}")
            
            result = wikipedia.search(subj, results = 1)
            if result == []:
                result = wikipedia.search(subject, results = 1)
            # print(result)
            for results in result:
                if results in result_set:
                    subject_direct_dict[subject] = result_dict[results]
                else:
                    page = wikipedia.page(results)
                    tmp = [link for link in page.links]
                    subject_direct_word_dict[subject] = tmp
                    subject_direct_dict[subject] = model.encode(tmp,show_progress_bar=False)
                    result_set.add(results)
                    result_dict[results] = subject_direct_dict[subject]
                    
            if not result:
                subject_direct_word_dict[subject] = []
                subject_direct_dict[subject] = np.array([],dtype="float32")
        except Exception as e:
            print(f"Query for {subj} didn't work: {e}")
            subject_direct_dict[subject] = np.array([],dtype="float32")

    return subject_direct_dict, subject_direct_word_dict


In [218]:
page = wikipedia.page("Rick Mahler",auto_suggest=False, redirect=True)

## Subject Direct Links Fast

In [168]:
import wikipedia
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed

In [169]:
def get_links_with_context(page):
    links_with_context = []
    for link in page.links:
        try:
            link_page = wikipedia.page(link, auto_suggest=False, redirect=True)
            context = link_page.content.split('\n')[0]  # Get the first paragraph
            links_with_context.append(f"{link}: {context}")
        except Exception as e:
            print(f"Could not fetch context for {link}: {e}")
            links_with_context.append(f"{link}")
    return links_with_context

In [170]:
import wikipedia
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed

def subjects_direct_links_wiki_fast(subject_with_context):
    subject_direct_dict = {}
    subject_direct_word_dict = {}
    subject_kg_link_dict = {}
    result_set = set([])
    result_dict = {}
    
    def process_subject(pair):
        subject = pair[0]
        context = pair[1]
        try:
            subj = subject + ": " + context
            if len(subj) > 290:
                subj = subj[:290] 
                # print(f"Truncated query: {subj}")
            
            result = wikipedia.search(subj, results=1)
            if result == []:
                result = wikipedia.search(subject,results=1)
            for results in result:
                KG_Link = results
                if results in result_set:
                    return subject, result_dict[results], [], KG_Link
                else:
                    page = wikipedia.page(results,auto_suggest=False,redirect=True)
                    links = [link for link in page.links]
                    return subject, None, links, KG_Link
            return subject, None, [], ""
        except Exception as e:
            print(f"Query for {subj} didn't work: {e}")
            return subject, None, [], ""

    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_subject, pair) for pair in subject_with_context]
        for future in as_completed(futures):
            subject, encoded_data, links, KG_Link = future.result()
            if encoded_data is not None:
                subject_direct_dict[subject] = encoded_data
                subject_kg_link_dict[subject] = KG_Link
            else:
                if links:
                    encoded_links = model.encode(links, show_progress_bar=False)
                    subject_direct_word_dict[subject] = links
                    subject_direct_dict[subject] = encoded_links
                    subject_kg_link_dict[subject] = KG_Link
                    result_set.add(subject)
                    result_dict[subject] = encoded_links
                else:
                    subject_direct_word_dict[subject] = []
                    subject_direct_dict[subject] = np.array([], dtype="float32")
                    subject_kg_link_dict[subject] = KG_Link

    return subject_direct_dict, subject_direct_word_dict, subject_kg_link_dict


In [171]:
search_term = "Steve Jobs"
page = wikipedia.page((wikipedia.search(search_term,results=1))[0],auto_suggest=False,redirect=True)
print([links for links in page.links])

['1984 (advertisement)', '1984 (television commercial)', '20th Century Animation', '22 vs. Earth', '8-bit', 'ABC News (United States)', 'AIM alliance', 'AP English Literature and Composition', "A Bug's Land", "A Bug's Life", 'A Computer Animated Hand', 'A Spark Story', 'Academy Award for Best Animated Feature', 'Ad Age', 'Ahwahnee Hotel', 'AirPods', 'AirPods Max', 'AirPods Pro', 'AirTag', 'Akamai Technologies', 'Al Eisenstat', 'Al Gore', 'Alan Kay', 'Alex Gorsky', 'Alien Swirling Saucers', 'All-in-one PC', 'All One Farm', 'Allusion', 'Alta Mesa Memorial Park', 'Alternative medicine', 'Alvy Ray Smith', 'American University of Beirut', 'Andrea Jung', 'Andreas Deja', 'Andy Hertzfeld', 'Angela Ahrendts', 'Anika Noni Rose', 'Animation studio', 'Annual meeting', 'Anobit', 'App Store', 'App Store (iOS)', "Apple's EU tax dispute", 'Apple.com', 'AppleCare+', 'AppleInsider', 'AppleMasters', 'AppleToo', 'Apple Arcade', 'Apple Authorized Service Provider', 'Apple Books', 'Apple Campus', 'Apple Car

In [172]:
page

<WikipediaPage 'Steve Jobs'>

In [173]:
subjects_with_context

[['Wilhelm Windelband',
  'Wilhelm Windelband (May 11, 1848 - October 22, 1915) was a German philosopher of the Baden School.'],
 ['terms',
  'the terms "nomothetic" and "idiographic", which he introduced have currency in psychology and other areas, though not necessarily in line with Wilhelm Windelband original meanings.'],
 ['Wilhelm Windelband interests',
  'Wilhelm Windelband interests in psychology and cultural sciences represented an opposition to psychologism and historicism schools by a critical philosophic system.'],
 ['Wilhelm Windelband disciples',
  'Wilhelm Windelband disciples were not only noted philosophers, but sociologists like Max Weber and theologians like Ernst Troeltsch and Albert Schweitzer.']]

In [219]:
subjects_direct_links_wiki(subjects_with_context)


({'Wilhelm Windelband': array([[ 0.0129078 ,  0.07744548,  0.02033362, ...,  0.02409213,
           0.02910504,  0.00767747],
         [-0.01241453, -0.00914223, -0.00579345, ...,  0.02605269,
           0.01314212, -0.00260564],
         [ 0.03970159,  0.02328221,  0.01568113, ..., -0.03660033,
           0.03326635,  0.00548261],
         ...,
         [ 0.02180419,  0.06057727,  0.00269464, ...,  0.01583836,
           0.01757044,  0.00485697],
         [ 0.03798955,  0.08162113,  0.00288995, ...,  0.01411823,
           0.03410124, -0.03442033],
         [ 0.02450549,  0.07101952,  0.01709009, ...,  0.01251805,
          -0.01737141, -0.02389679]], dtype=float32),
  'terms': array([[ 0.0129078 ,  0.07744548,  0.02033362, ...,  0.02409213,
           0.02910504,  0.00767747],
         [-0.01241453, -0.00914223, -0.00579345, ...,  0.02605269,
           0.01314212, -0.00260564],
         [ 0.03970159,  0.02328221,  0.01568113, ..., -0.03660033,
           0.03326635,  0.00548261],
  

In [220]:
subjects_direct_links_wiki_fast(subjects_with_context)


({'Wilhelm Windelband interests': array([[ 0.0129078 ,  0.07744548,  0.02033362, ...,  0.02409213,
           0.02910504,  0.00767747],
         [-0.01241453, -0.00914223, -0.00579345, ...,  0.02605269,
           0.01314212, -0.00260564],
         [ 0.03970159,  0.02328221,  0.01568113, ..., -0.03660033,
           0.03326635,  0.00548261],
         ...,
         [ 0.02180419,  0.06057727,  0.00269464, ...,  0.01583836,
           0.01757044,  0.00485697],
         [ 0.03798955,  0.08162113,  0.00288995, ...,  0.01411823,
           0.03410124, -0.03442033],
         [ 0.02450549,  0.07101952,  0.01709009, ...,  0.01251805,
          -0.01737141, -0.02389679]], dtype=float32),
  'Wilhelm Windelband disciples': array([[ 0.0129078 ,  0.07744548,  0.02033362, ...,  0.02409213,
           0.02910504,  0.00767747],
         [-0.01241453, -0.00914223, -0.00579345, ...,  0.02605269,
           0.01314212, -0.00260564],
         [ 0.03970159,  0.02328221,  0.01568113, ..., -0.03660033,
      

## Scoring

In [221]:
def process_sentence_set_wiki(args):
    index, sent_sets, subject_dict, subject_word_dict = args
    score = 0
    local_pair_and_values = []
    for pair in sent_sets:
        local_subject= pair[0]
        entity = pair[1]
        
        entity_embeddings = model.encode(entity,show_progress_bar=False)
        embeddings = subject_dict[local_subject]
        words = subject_word_dict[local_subject]
        
        if embeddings.size == 0:
            local_pair_and_values.append([pair[0], pair[1], 0.5])
            score+=0.5
            continue
            
        cosine_similarities = np.dot(embeddings, entity_embeddings)
        max_similarity = np.max(cosine_similarities)
        max_similarity_index = np.argmax(cosine_similarities)
        
        # print(f"Entity: {entity}\nMost Similar: {words[max_similarity_index]} \n Score: {max_similarity}\n\n ")
        
        if max_similarity>0.6:
            score+=1
            local_pair_and_values.append([pair[0],pair[1],1])
        else:
            local_pair_and_values.append([pair[0],pair[1],0])
            
    fraction = score / len(sent_sets) if len(sent_sets) > 0 else 0.5
    return index, fraction, local_pair_and_values

def scoring_parallel_version_wiki(text):
    # print(text)
    pairs, final_sents, subject_set, subjects_with_context, output = get_sentence_based_links_wiki(text)
    subject_dict, subject_word_dict, subject_to_KG_links = subjects_direct_links_wiki_fast(subjects_with_context)
    
    output["Subject_direct_links"] = subject_word_dict
    output["Subject_direct_links_embeddings"] = subject_dict
    output["Subject_to_KG_links"] = subject_to_KG_links
    
    # print(subject_set)

    if not pairs:
        print("No entity pairs found here.")
        return 0.5, [], final_sents

    fractions = [None] * len(pairs)
    pair_and_values = [None] * len(pairs)

    with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
        task_args = [(index, sent_sets, subject_dict, subject_word_dict) for index, sent_sets in enumerate(pairs)]
        results = list(executor.map(process_sentence_set_wiki, task_args))

    for index, fraction, local_pair_and_values in results:
        fractions[index] = fraction
        pair_and_values[index] = local_pair_and_values

    output["Sentence_wise_pairs_and_values"] = pair_and_values
    output["Sentence_wise_scores"] = fractions
    
    flat_pair_and_values = [item for sublist in pair_and_values for item in sublist]
    return fractions, flat_pair_and_values, final_sents,output


In [24]:
text = "Virat Kohli is an Indian Cricketer who plays for India."

In [181]:
text

'Wilhelm Windelband (May 11, 1848 - October 22, 1915) was a German philosopher of the Baden School.\nWindelband is now mainly remembered for the terms "nomothetic" and "idiographic", which he introduced.\nThese have currency in psychology and other areas, though not necessarily in line with his original meanings.\nWindelband was a Neo-Kantian who protested other Neo-Kantians of his time and maintained that "to understand Kant rightly means to go beyond him".\nAgainst his positivist contemporaries, Windelband argued that philosophy should engage in humanistic dialogue with the natural sciences rather than uncritically appropriating its methodologies.\nHis interests in psychology and cultural sciences represented an opposition to psychologism and historicism schools by a critical philosophic system.\nWindelband relied in his effort to reach beyond Kant on such philosophers as Georg Wilhelm Friedrich Hegel, Johann Friedrich Herbart, and Hermann Lotze.\nClosely associated with Windelband w

In [222]:
a,b,c,o = scoring_parallel_version_wiki(text)

07/04/2024 17:08:56 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:08:56 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

In [183]:
a

[0.3333333333333333, 0.5, 0.5, 1.0, 0.5, 0.5, 1.0, 1.0, 1.0]

In [223]:
def print_dict_schema(data, indent=0):
    for key, value in data.items():
        print(' ' * indent + f'{key}:', end=' ')
        if isinstance(value, dict):
            print('{')
            print_dict_schema(value, indent + 4)
            print(' ' * indent + '}')
        else:
            print(type(value).__name__)

In [224]:
_,_,_,output = (scoring_parallel_version_wiki(text))

print_dict_schema(output)

07/04/2024 17:09:07 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:09:07 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

Initial_Text: str
Coref_Resolved_Text: str
Coref_Resolved_Sentences: list
Sentence_wise_Entities: list
Sentence_wise_subjects: list
Sentence_Wise_pairs: list
Subject_direct_links: {
    Wilhelm Windelband disciples: list
    terms: list
    Wilhelm Windelband interests: list
    Wilhelm Windelband: list
}
Subject_direct_links_embeddings: {
    Wilhelm Windelband disciples: ndarray
    terms: ndarray
    Wilhelm Windelband interests: ndarray
    Wilhelm Windelband: ndarray
}
Subject_to_KG_links: {
    Wilhelm Windelband disciples: str
    terms: str
    Wilhelm Windelband interests: str
    Wilhelm Windelband: str
}
Sentence_wise_pairs_and_values: list
Sentence_wise_scores: list


## Testing

In [225]:
accurate = []
minor_inaccurate = []
major_inaccurate = []

In [195]:
outputs = []

In [197]:
folder_name = "Output"

if not os.path.exists(folder_name):
        os.makedirs(folder_name)

for i in tqdm(range(0,50), desc="Processing entries", unit="entry"):
    list_of_sentences = (dataset["evaluation"][i]["gpt3_sentences"])
    sentences = ''''''
    for s in list_of_sentences:
        tmp = preprocess_text(s)
        if tmp[-1]!='.':
            tmp+='.'
        sentences = sentences + tmp + "\n" 
        
    # sentences = sentences.strip()
    # raw = dataset["evaluation"][i]["wiki_bio_text"]
    
    ground_truth_doc = preprocessing(dataset["evaluation"][i]["wiki_bio_text"])
    ground_truth = ''''''
    for sent in ground_truth_doc.sents:
        temp = preprocess_text(sent.text.strip())  
        if temp:
            if temp[-1] not in '.!?': 
                temp += '.'
            ground_truth += temp + "\n"

    ground_truth = ground_truth.strip()
    # print(ground_truth)
    
    annotation = dataset["evaluation"][i]["annotation"]
    
    sentences_scores, sentence_pairs_and_values , sentence_coref_sents, output1= scoring_parallel_version_wiki(sentences)
    ground_truth_scores, ground_pairs_and_values, ground_coref_sents, output2 = scoring_parallel_version_wiki(ground_truth)
    output1["Sentence_Wise_Annotations"] = annotation
    output2["Sentence_Wise_Annotations"] = ["accurate"]*len(output2["Coref_Resolved_Sentences"])
    
    outputs.append(output2)
    if len(sentences_scores)==len(annotation):
        outputs.append(output1)
    
        
    filename = os.path.join(folder_name, f"entry_{i+1}.txt")
    with open(filename, 'w') as file:
        file.write("#############SENTENCE_PAIRS############\n\n")
        for x in range(len(sentence_pairs_and_values)):
            file.write(f"{sentence_pairs_and_values[x][0]} and {sentence_pairs_and_values[x][1]} and the value is : {sentence_pairs_and_values[x][2]}\n")
        file.write("\n")
        file.write("#############GROUND_PAIRS############\n\n")
        for x in range(len(ground_pairs_and_values)):
            file.write(f"{ground_pairs_and_values[x][0]} and {ground_pairs_and_values[x][1]} and the value is : {ground_pairs_and_values[x][2]}\n")
        file.write("\n\n")
        file.write("%%%%%%%%%%%%%%%%%%%%SENTENCES%%%%%%%%%%%%%%%%%\n")
        file.write(f"Sentences : \n\n{sentences}\n\n")
        file.write(f"Coref Resolved: \n\n")
        for y,x in enumerate(sentence_coref_sents):
            file.write(f"{y}. {x}\n")
        file.write("\n")
        file.write("\n")
        file.write("%%%%%%%%%%%%%%%%%%%%GROUND_TRUTH%%%%%%%%%%%%%%%\n")
        file.write(f"Ground Truth : \n\n{ground_truth} \n\n")
        file.write(f"Coref Resolved: \n\n")
        for y,x in enumerate(ground_coref_sents):
            file.write(f"{y}. {x}\n")
        file.write("\n")
        file.write("%%%%%%%%%%%%%%%%%%%%FRACTIONS%%%%%%%%%%%%%%%%%%\n")
        file.write(f"Value for sentences is : \n")
        for x in sentences_scores:
            file.write(f"{x} ")
        file.write("\n")
        file.write("\n")
        file.write(f"Value for ground truth is : \n")
        for x in ground_truth_scores:
            file.write(f"{x} ")
        file.write("\n\n")
        file.write("%%%%%%%%%%%%%%%%%%%%ANNOTATIONS%%%%%%%%%%%%%%%%\n")
        for x in annotation:
            file.write(f"{x} ")
        file.write("\n\n")
        
    
    if len(ground_truth_scores)>0:
        print(f"The ground truth scores are: {sum(ground_truth_scores)/len(ground_truth_scores)}\n")

    for scores in ground_truth_scores:
            accurate.append(scores)
            
    if len(sentences_scores)!=len(annotation):
        continue
    
    for t,score in enumerate(sentences_scores):
        if annotation[t-1]=="accurate":
            accurate.append(score) 
        elif annotation[t-1]=="minor_inaccurate":
            minor_inaccurate.append(score)
        elif annotation[t-1]=="major_inaccurate":
            major_inaccurate.append(score)

Processing entries:   0%|          | 0/50 [00:00<?, ?entry/s]

07/04/2024 16:49:59 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:49:59 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 16:50:10 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:50:11 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.686611090059366



07/04/2024 16:50:24 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:50:24 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 16:50:31 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:50:31 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.6709183673469388



07/04/2024 16:50:37 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:50:37 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 16:50:47 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:50:47 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.4961861667744021



07/04/2024 16:50:56 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:50:56 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 16:51:05 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:51:05 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.4258333333333333



07/04/2024 16:51:11 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:51:11 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 16:51:47 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:51:47 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.26486928104575164



07/04/2024 16:51:56 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:51:56 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 16:52:03 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:52:03 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.3888888888888889



07/04/2024 16:52:10 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:52:10 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 16:52:19 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:52:19 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.4398809523809523



07/04/2024 16:52:25 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:52:25 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 16:52:35 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:52:35 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.7037037037037037



07/04/2024 16:52:41 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:52:41 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 16:52:46 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:52:46 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.137037037037037



07/04/2024 16:52:55 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:52:55 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 16:53:04 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:53:04 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.5634920634920636



07/04/2024 16:53:09 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:53:10 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 16:53:14 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:53:15 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.5246031746031746



07/04/2024 16:53:20 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:53:20 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 16:53:33 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:53:34 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.34023754023754027



07/04/2024 16:53:46 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:53:46 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 16:54:08 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:54:08 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]



  lis = BeautifulSoup(html).find_all('li')


Query for Winnebago: As well as playing Mondo Generator gigs, two Bens, Ben Perrier (vocals, guitar) and Ben Thomas (drums) still toured as Winnebago Deal and even supported Mondo Generator , earning Winnebago Deal the moniker "Winnebago Generator" from fans. didn't work: "Winnebago" may refer to: 
Ho-Chunk
Winnebago Tribe of Nebraska
Winnebago language
Winnebago (chicken)
Winnebago Council
Winnebago Industries
Lake Winnebago
Winnebago Pool
Winnebago Scout Reservation
Winnebago, Illinois
Winnebago, Minnesota
Winnebago, Nebraska
Winnebago, Wisconsin
Winnebago Mission, Wisconsin
Winnebago County (disambiguation)
Winnebago Township (disambiguation)
Query for it: After recording tracks at Dave Grohl's Studio 606 for the next Mondo Generator album with producer Nick Raskulinecz, it was announced in July 2006 2006 that two Bens, Ben Perrier (vocals, guitar) and Ben Thomas (drums) had left Mondo Generator for "undisclosed reasons". didn't work: "It" may refer to: 
It (pronoun)
Information tec

07/04/2024 16:54:29 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:54:29 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 16:54:36 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:54:36 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.4627705627705627



07/04/2024 16:54:48 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:54:48 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 16:54:54 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:54:54 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.553641456582633



07/04/2024 16:55:03 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:55:03 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

Query for Quinn: William "Bill" Quinn (April 28, 1912 April 22, 1994) was an American actor, best known for William "Bill" Quinn role as Ralph Hourback on the CBS television series The Rifleman . didn't work: "Quinn" may refer to: 
Quinn (soccer)
Quinn (given name)
Quinn (surname)
Quinn (musician)
Quinn, Kentucky
Quinn, Michigan
Quinn, Missouri
Quinn, South Dakota
Quinn River
Quinn House, San Francisco
A. V. Quinn House
Masten-Quinn House
Quin House
Quinn (album)
Mannok
Quinn Industrial Holdings
University College Dublin
Quinn the Eskimo
Quin (disambiguation)
Quinns (disambiguation)
Harley Quinn (disambiguation)


07/04/2024 16:55:08 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:55:08 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.3869047619047619



07/04/2024 16:55:14 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:55:14 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 16:55:21 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:55:21 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.6916666666666667



07/04/2024 16:55:30 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:55:30 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 16:55:39 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:55:39 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.46190476190476193



07/04/2024 16:55:45 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:55:45 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 16:55:50 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:55:50 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.3572751322751322



07/04/2024 16:55:57 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:55:57 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

Query for he: In addition to Rod Morgenstein (born April 18, 1959) work with the progressive rock band Winger, which he joined in 1987 , Rod Morgenstein (born April 18, 1959) has also performed and recorded with Steve Morse Band, Jordan Rudess Jordan Rudess , and many other artists. didn't work: "He" may refer to: 
He (letter)
He (pronoun)
He (kana)
Ge (Cyrillic)
Hebrew language
He County
He River
Hebei
Hessen
He (surname)
Zheng He
He-He er xian
Immortal Woman He
"He" (short story)
Katherine Anne Porter
He (film)
"He" (song)
Jars of Clay (album)
John Connolly
HE...
Hé (Chinese pastry)
His Eminence
Excellency
Hektoen enteric agar
Helium
Hemagglutinin esterase
Hematoxylin and eosin stain
Hepatic encephalopathy
High explosive
Holocene calendar
Holocene
Homomorphic encryption
High-explosive anti-tank
High-explosive incendiary
High-explosive incendiary/armor-piercing ammunition
Heathrow Express
Heinkel
Higher education
Hurricane Electric
Lobotomy Corporation
Hezhou (disambiguation)


07/04/2024 16:56:03 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:56:04 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.6757575757575757



07/04/2024 16:56:10 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:56:10 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 16:56:16 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:56:16 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.33312324929971987



07/04/2024 16:56:27 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:56:27 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 16:56:38 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:56:38 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.23148148148148145



07/04/2024 16:56:45 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:56:45 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 16:56:51 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:56:51 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.4290674603174603



07/04/2024 16:56:58 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:56:58 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 16:57:10 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:57:10 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

Query for Castellano: Heather "Torry" Castellano (born January 8, 1979, in San Francisco, California) is the cousin of actress Laura San Giacomo. didn't work: "Castellano" may refer to: 
Castilian (disambiguation)
Castile (historical region)
Spanish language
Castilian Spanish
Castellano (surname)
Castellano (grape)
Castellano, Trentino
Castellano (river)
All pages with titles beginning with Castellano
All pages with titles containing Castellano
Castellanos (disambiguation)
Castellani (disambiguation)
Carea Castellano Manchego
The ground truth scores are: 0.38571428571428573



07/04/2024 16:57:19 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:57:19 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 16:57:29 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:57:29 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

Query for conflict: Nevertheless, the conflict with his younger brother Dietrich continued as Albert I, the proud (de "Albrecht I der Stolze") (1158 24 June 1195) tried to regain the title . didn't work: "Conflict" may refer to: 
Conflict (process)
Conflict continuum
Conflict of interest
Cultural conflict
Ethnic conflict
Group conflict
Intragroup conflict
Organizational conflict
Role conflict
Social conflict
Work–family conflict
Violence
war
Conflict (narrative)
Conflict (air traffic control)
Conflict (revision control)
HMS Conflict
HMS Conflict (1873)
HMS Conflict (1894)
Conflict (1921 film)
Conflict (1936 film)
Conflict (1937 film)
Conflict (1938 film)
Conflict (1945 film)
Catholics: A Fable (1973 film)
Judith (1966 film)
Samar (1999 film)
Conflict (series)
Conflict (video game)
Conflict: Middle East Political Simulator
Conflict (novel)
Gerard Cosloy
Margie Harris
board wargames
Conflict (band)
Conflict (Sy Smith album)
Conflict (Jimmy Woods album)
The Sickness
Conflict (1978 TV seri

07/04/2024 16:57:37 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:57:37 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

Query for she: In addition to Sirið Stenberg work with the Faroese band Týr, in which she is the lead vocalist and plays the violin , Sirið Stenberg has released two solo albums, and has collaborated with various other Faroese and international artists. didn't work: "She" may refer to: 
She (pronoun)
She County, Anhui
She Prefecture
She County, Hebei
She River
She people
She Chinese
She language
She (surname)
She (Qi)
Empress She
She: A History of Adventure
She (1911 film)
She (1916 film)
She (1917 film)
She (1925 film)
She (1935 film)
She (1965 film)
She (1984 film)
Ophélie Winter
She (1954 film)
She (1967 film)
She (magazine)
She (Netflix series)
S.H.E
Solid HarmoniE
She (American band)
she (Swedish band)
she (Dalbello album)
She (Harry Connick Jr. album)
She (Jerusalem album)
She (Stiltskin album)
She (Viktor Lazlo album)
She (Wendy Matthews album)
s/he (album)
She (EP)
Monni
Sheryn Regis
"She" (Charles Aznavour song)
"She" (Green Day song)
"She" (Groove Coverage song)
"She" (Kiss s

07/04/2024 16:57:42 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:57:42 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.4365079365079365



07/04/2024 16:57:47 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:57:47 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 16:57:52 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:57:52 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.64421768707483



07/04/2024 16:57:58 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:57:58 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 16:58:11 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:58:11 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.6599999999999999



07/04/2024 16:58:17 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:58:17 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 16:58:27 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:58:27 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.4964113181504486



07/04/2024 16:58:46 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:58:46 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 16:58:53 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:58:53 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.33141025641025645



07/04/2024 16:59:01 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:59:01 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 16:59:07 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:59:07 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.5409226190476191



07/04/2024 16:59:14 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:59:14 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 16:59:49 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:59:49 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.4203703703703704



07/04/2024 16:59:55 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 16:59:55 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 17:00:00 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:00:00 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.356060606060606



07/04/2024 17:00:06 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:00:06 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 17:00:14 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:00:14 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.45



07/04/2024 17:00:21 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:00:21 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

Query for Zhuang: "Zhuang of Chu Chu (died 621 BC ) was the last ruler of the state of Chu during the Spring and Autumn period of ancient China. didn't work: "Zhuang" may refer to: 
Zhuang people
Zhuang languages
Zhuang logogram
Zhuang Zhou
Zhuang (surname)


07/04/2024 17:00:27 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:00:27 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

Query for which: In 611 BC King Zhuang of Chu (died 591 BC) annexed the state of Yong (庸国), a move which made Chu much stronger. didn't work: "Which" may refer to: 
relative pronoun
English interrogative word
which (command)
Which?
English relative clauses
Interrogative clause
Whicher (disambiguation)
All pages with titles containing Which
Query for output: Chu agricultural output improved significantly during his reign , aided by Sunshu Ao comprehensive dam-works and an enormous planned reservoir created in modern-day northern Anhui province. didn't work: "Output" may refer to: 
Input/output
state (computer science)
Output (economics)
Gross output
Net output
Power (physics)
Dependent variable
Output (album)
Input (disambiguation)
Query for minister: When a particularly senior minister challenged King Zhuang of Chu (died 591 BC) through a riddle, King Zhuang of Chu (died 591 BC) responded that King Zhuang of Chu (died 591 BC) had been waiting for three years for someone from King Zhuan

07/04/2024 17:00:34 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:00:35 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 17:00:47 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:00:47 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

Query for Flanagan household: The Flanagan household consisted of eight children Patricia Mary, Admiral William J. Flanagan, Jr., born on March 27, 1943, , Kathleen, John J., Peter A., Mary Margaret, Anne, and Joseph M. William J. Flanagan, Sr. was a member of the Massachusetts National Guard. didn't work: "Mark Flanagan" may refer to: 
Mark Flanagan (actor)
Mark Flanagan (boxer)
Mark Flanagan (chef)
Mark Flanagan (communications)
Mark Flanagan (musician)
Mark Flanagan (rugby league)
Mark Flanagan (rugby union)
Largo
Mark G. Flanagan
Marc Flanagan
The ground truth scores are: 0.31020408163265306



07/04/2024 17:01:05 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:01:05 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 17:01:12 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:01:12 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

Query for identity: Despite the existence of Hendrik van Rheede's over the last three centuries, the correct taxonomic identity of many plants listed in Hortus Malabaricus , many plants listed in Hortus Malabaricus medicinal properties, methods of use, etc., as described and codified by renowned tra didn't work: "Identity" may refer to: 
Identity document
Identity (philosophy)
Identity (social science)
Identity (mathematics)
Identity (1987 film)
Identity (2003 film)
Identity (game show)
Identity (TV series)
"Identity" (Arrow)
"Identity" (Burn Notice)
"Identity" (Charlie Jade)
"Identity" (Legend of the Seeker)
"Identity" (Law & Order: Special Victims Unit episode)
"Identity" (NCIS: Los Angeles)
Identity (3T album)
Identity (BoA album)
Identity (Far East Movement album)
Identity (Robert Pierre album)
Identity (Raghav album)
Identity (Victon EP)
Identity (Zee album)
"Identity" (Sakanaction song)
"Identity" (X-Ray Spex song)
London Town
The Art of Survival
Identity (music)
Identity (tuning

07/04/2024 17:01:22 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:01:22 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 17:01:33 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:01:33 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.15782312925170067



07/04/2024 17:01:40 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:01:40 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 17:01:45 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:01:46 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.3222222222222222



07/04/2024 17:01:51 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:01:51 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

Query for work: Hilda Kuper most famous work, An African Aristocracy Rank Among the Swazi (1944), is considered a classic in the field of anthropology. didn't work: "Work" may refer to: 
5 See also
Work (human activity)
Manual labour
House work
Working animal
Work (physics)
Work (electric field)
Work (thermodynamics)
Creative work
Work of art
WORK (FM)
WORK-LP
WOYK
The Work (band)
Work Group
Work (EP)
Work!
Work 1989–2002
Work (album)
"Work" (ASAP Ferg song)
"Work" (Iggy Azalea song)
"Work" (Ciara song)
"Work" (Jars of Clay song)
"Work" (Jimmy Eat World song)
"Work" (Rihanna song)
"Work" (Kelly Rowland song)
"Work" (The Saturdays song)
"Work" (The 2 Bears song)
Uprising
Grand Hustle Presents: In da Streetz Volume 4
Moment of Truth
Everyone Afraid to Be Forgotten
Jme
Wanderlust
Songs for Drella
Thelonious Monk
Charlotte Day Wilson
Golden Hour: Part 1
Work (film)
Work (painting)
Work (professional wrestling)
Work (vehicle)
"Work" (The Armando Iannucci Shows)
Work: A Story of Experience
G

07/04/2024 17:01:58 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:01:58 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.37050264550264544



07/04/2024 17:02:05 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:02:05 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 17:02:12 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:02:12 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.5777777777777778



07/04/2024 17:02:23 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:02:23 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 17:02:33 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:02:33 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.40740740740740744



07/04/2024 17:02:39 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:02:39 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 17:02:44 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:02:45 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.375



07/04/2024 17:02:50 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:02:50 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 17:02:59 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:02:59 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.5006410256410256



07/04/2024 17:03:05 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:03:05 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 17:03:16 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:03:16 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.36271929824561405



07/04/2024 17:03:30 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:03:30 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 17:03:39 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:03:39 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.32863969363969364



07/04/2024 17:03:54 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:03:54 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 17:04:04 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:04:04 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.5333333333333333



07/04/2024 17:04:14 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:04:14 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 17:04:19 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:04:19 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.6534722222222222



07/04/2024 17:04:29 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:04:29 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 17:04:37 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:04:37 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.3034920634920635



07/04/2024 17:04:45 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:04:45 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

07/04/2024 17:04:52 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:04:52 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The ground truth scores are: 0.2702947845804989



In [198]:
avrg_accurate = sum(accurate)/len(accurate)
print(avrg_accurate)

0.4302239242785771


In [199]:
len(accurate)

677

In [200]:
avrg_minor = sum(minor_inaccurate)/len(minor_inaccurate)
print(avrg_minor)

0.4238847837985769


In [201]:
len(minor_inaccurate)

116

In [202]:
avrg_major = sum(major_inaccurate)/len(major_inaccurate)
print(avrg_major)

0.42376283846872087


In [203]:
len(major_inaccurate)

153

In [204]:
avrg_inaccurate = (sum(minor_inaccurate) + sum(major_inaccurate))/(len(minor_inaccurate) + len(major_inaccurate))
print(avrg_inaccurate)

0.423815424558919


In [205]:
threshold_accurate = ((avrg_accurate)+(avrg_inaccurate))/2

In [206]:
predicted_labels = []
for score in accurate + minor_inaccurate + major_inaccurate:
    if score >= threshold_accurate:
        predicted_labels.append("accurate")
    else:
        predicted_labels.append("inaccurate")


In [207]:
true_labels = ["accurate"] * len(accurate) + ["inaccurate"] * len(minor_inaccurate) + ["inaccurate"] * len(major_inaccurate)


In [208]:
from collections import defaultdict

# Initialize counters
confusion_matrix = defaultdict(lambda: {"TP": 0, "FP": 0, "FN": 0, "TN": 0})

# Calculate confusion matrix
for true, pred in zip(true_labels, predicted_labels):
    for category in ["accurate", "inaccurate"]:
        if true == category and pred == category:
            confusion_matrix[category]["TP"] += 1
        elif true == category and pred != category:
            confusion_matrix[category]["FN"] += 1
        elif true != category and pred == category:
            confusion_matrix[category]["FP"] += 1
        elif true != category and pred != category:
            confusion_matrix[category]["TN"] += 1

metrics = {}
for category in confusion_matrix:
    TP = confusion_matrix[category]["TP"]
    FP = confusion_matrix[category]["FP"]
    FN = confusion_matrix[category]["FN"]

    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0

    metrics[category] = {"Precision": precision, "Recall": recall}
    
for category in metrics:
    precision = metrics[category]["Precision"]
    recall = metrics[category]["Recall"]

    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    metrics[category]["F1 Score"] = f1_score

print(metrics)


{'accurate': {'Precision': 0.712890625, 'Recall': 0.5391432791728212, 'F1 Score': 0.6139613120269133}, 'inaccurate': {'Precision': 0.28110599078341014, 'Recall': 0.45353159851301117, 'F1 Score': 0.3470839260312945}}


In [209]:
sentence = []
prediction = []
temp_actual = []
scores = []
subject = []
subject_to_kg = []
direct_links_from_subject = []
direct_link_embeddings = []
sentence_pairs_and_values = []
base_text = []
base_text_coref_resolved = []
sentence_wise_entities = []


In [210]:
for output in outputs:
    sentence += output["Coref_Resolved_Sentences"]
    scores += output["Sentence_wise_scores"]
    
    for score in output["Sentence_wise_scores"]:
        if score >= threshold_accurate:
            prediction.append("accurate")
        else:
            prediction.append("inaccurate")
            
    temp_actual += output["Sentence_Wise_Annotations"]
    subject += output["Sentence_wise_subjects"]
    sentence_wise_entities += output["Sentence_wise_Entities"]
    
    subject_link_map = output["Subject_direct_links"]
    subject_link_embedding_map = output["Subject_direct_links_embeddings"]
    subject_kg_map = output["Subject_to_KG_links"]
    
    direct_links_from_subject += [subject_link_map[subj] if subj is not None else [] for subj in output["Sentence_wise_subjects"]]
    direct_link_embeddings += [subject_link_embedding_map[subj] if subj is not None else [] for subj in output["Sentence_wise_subjects"]]
    subject_to_kg += [subject_kg_map[subj] if subj is not None else "" for subj in output["Sentence_wise_subjects"]]
    
    sentence_pairs_and_values += output["Sentence_wise_pairs_and_values"]
    base_text += ([output["Initial_Text"]]*len(output["Coref_Resolved_Sentences"]))
    base_text_coref_resolved += ([output["Coref_Resolved_Text"]]*len(output["Coref_Resolved_Sentences"]))
    
actual = ["inaccurate" if item in ["minor_inaccurate", "major_inaccurate"] else item for item in temp_actual]


In [211]:
print("Length of Sentence:", len(sentence)) 
print("Length of Prediction:", len(prediction))
print("Length of Actual:", len(actual))
print("Length of Scores:", len(scores))
print("Length of Subject:", len(subject))
print("Length of Direct Links from Subject:", len(direct_links_from_subject))
print("Length of Direct Link Embeddings:", len(direct_link_embeddings))
print("Length of Subject to Knowledge Graph Linking :", len(subject_to_kg))
print("Length of Sentence Pairs and Values:", len(sentence_pairs_and_values))
print("Length of Base Text:", len(base_text))
print("Length of Base Text Coref Resolved:", len(base_text_coref_resolved))
print("Length of Sentence Wise Entities:", len(sentence_wise_entities))

Length of Sentence: 946
Length of Prediction: 946
Length of Actual: 946
Length of Scores: 946
Length of Subject: 946
Length of Direct Links from Subject: 946
Length of Direct Link Embeddings: 946
Length of Subject to Knowledge Graph Linking : 946
Length of Sentence Pairs and Values: 946
Length of Base Text: 946
Length of Base Text Coref Resolved: 946
Length of Sentence Wise Entities: 946


In [152]:
data= {
    "sentence": sentence,
    "prediction": prediction,
    "actual": actual,
    "entities": sentence_wise_entities,
    "pairs": sentence_pairs_and_values,
    "subject": subject,
    "subject-KB Link": subject_to_kg,
    "direct_links_from_subject": direct_links_from_subject,
    "scores": scores,
    "direct_link_embeddings": direct_link_embeddings,
    "base_text": base_text,
    "base_text_coref_resolved": base_text_coref_resolved
}

In [212]:
df = pd.DataFrame(data)
excel_file = 'output_data.xlsx'
df.to_excel(excel_file, index=False, sheet_name='Sheet1')
print(f"DataFrame successfully exported to Excel: '{excel_file}'")

DataFrame successfully exported to Excel: 'output_data.xlsx'


In [None]:
lst = ['Addiction medicine', 'Addiction psychiatry', 'Adolescent medicine', 'Allergy', 'Allied health professions', 'Alternative medicine', 'Anatomical pathology', 'Andrology', 'Anesthesiology', 'Angiology', 'Aviation medicine', 'Bachelor of Medical Sciences', 'Bachelor of Medicine, Bachelor of Surgery', 'Bertrand Dawson, 1st Viscount Dawson of Penn', 'Cardiac surgery', 'Cardiology', 'Cardiothoracic surgery', 'Carol M. Black', 'Charles Dodds', 'Charles Goodall (physician)', 'Charles Wilson, 1st Baron Moran', 'Chief physician', 'Clinical chemistry', 'Clinical neurophysiology', 'Clinical pathology', 'College of Physicians', 'Colorectal surgery', 'Cyril Clarke', 'Cytopathology', 'Dame Commander of the Most Excellent Order of the British Empire', 'Daniel Whistler', 'Dermatology', 'Dictionary of National Biography', 'Digestive system surgery', 'Disaster medicine', 'Diving medicine', 'Doctor of Medicine', 'Doctor of Osteopathic Medicine', 'Douglas Black (physician)', 'Edward Alston', 'Edward Browne (physician)', 'Edward Wotton (zoologist)', 'Emergency medicine', 'Endocrine surgery', 'Endocrinology', 'Evolutionary medicine', 'Eye surgery', 'Family medicine', 'Francis Glisson', 'Francis Prujean', 'Gastroenterology', 'General practice', 'General surgery', 'George Alberti', 'George Baker, 1st Baronet', 'George Ent', 'George Owen (physician)', 'George Rogers (physician)', 'Geriatrics', 'Gynaecology', 'Gynecologic oncology', 'Hampshire', 'Hand surgery', 'Harveian oration', 'Hematology', 'Henry Atkins (physician)', 'Henry Plumptre', 'Henry Revell Reynolds', 'Hepatology', 'History of medicine', 'Hospital medicine', 'Ian Gilmore', 'Immunology', 'Infectious diseases (medical specialty)', 'Intensive care medicine', 'Internal medicine', 'Interventional radiology', 'James Alderson', 'James Jurin', 'James Risdon Bennett', 'Jane Dacre', 'John Argent', 'John Ayrton Paris', 'John Bateman (physician)', 'John Burgess (physician)', 'John Caius', 'John Clark (17th-century physician)', 'John Clement (physician)', 'John Fryer (physician, died 1563)', 'John Giffard (physician)', 'John Latham (physician)', 'John Lawson (physician)', 'John Micklethwaite', 'John Symings', 'Josiah Clerk', 'Knight Commander of the Royal Victorian Order', 'Leeds', 'Leslie Turnberg, Baron Turnberg', 'List of extant baronetcies', 'List of presidents of the Royal College of Physicians', 'London', 'Lumleian Lectures', 'MD–PhD', 'Margaret Turner-Warwick', 'Marshall Hall (physiologist)', 'Mass gathering medicine', 'Master of Medicine', 'Master of Surgery', 'Maternal–fetal medicine', 'Max Rosenheim, Baron Rosenheim', 'Medical Scientist Training Program', 'Medical diagnosis', 'Medical education', 'Medical genetics', 'Medical microbiology', 'Medical school', 'Medical specialty', 'Medicine', 'Molecular oncology', 'Nanomedicine', 'Narcology', 'Neonatology', 'Nephrology', 'Neurologist', 'Neurology', 'Neuroradiology', 'Neurosurgery', 'Neurosurgical anesthesia', 'Nuclear medicine', 'Obstetric anesthesiology', 'Obstetrics', 'Obstetrics and gynaecology', 'Occupational medicine', 'Oncology', 'Ophthalmology', 'Oral and maxillofacial surgery', 'Oral medicine', 'Organ transplantation', 'Orthopedic surgery', 'Othowell Meverall', 'Otorhinolaryngology', 'Outline of medicine', 'Pain management', 'Palliative care', 'Pathology', 'Pediatric surgery', 'Pediatrics', 'Personalized medicine', 'Phlebologist', 'Physical medicine and rehabilitation', 'Physician', 'Plastic surgery', 'Preventive healthcare', 'Prison healthcare', 'Psychiatry', 'Public domain', 'Public health', 'Pulmonary congestion', 'Pulmonology', 'Radiation therapy', 'Radiology', 'Raymond Hoffenberg', 'Reproductive endocrinology and infertility', 'Reproductive medicine', 'Reproductive surgery', 'Reynolds Baronets', 'Reynolds baronets', 'Rheumatology', 'Richard Bartlot', 'Richard Caldwell', 'Richard Forster (physician)', 'Richard Master', 'Richard Palmer (physician)', 'Richard Smith (physician)', 'Richard Thompson (physician)', 'Richard Tyson (physician, 1680–1750)', 'Robert Huick', 'Robert Platt, Baron Platt', 'Roger Giffard', 'Romsey', 'Royal College of Physicians', 'Royal Medical and Chirurgical Society', 'Royal Society', 'Rural health', 'Samuel Collins (physician, born 1618)', 'Samuel Wilks', 'Sarah Clarke (doctor)', 'Sexual medicine', 'Simeon Fox', 'Sir Andrew Clark, 1st Baronet', 'Sir Andrew Goddard', 'Sir Francis Milman, 1st Baronet', 'Sir Frederick Taylor, 1st Baronet', 'Sir George Burrows, 1st Baronet', 'Sir Hans Sloane', 'Sir Henry Halford, 1st Baronet', 'Sir Humphry Rolleston, 1st Baronet', 'Sir John Bradford, 1st Baronet', 'Sir John Russell Reynolds, 1st Baronet', 'Sir Norman Moore, 1st Baronet', 'Sir Richard Powell, 1st Baronet', 'Sir Robert Hutchison, 1st Baronet', 'Sir Thomas Barlow, 1st Baronet', 'Sir William Church, 1st Baronet', 'Sir William Paddy', 'Sleep medicine', 'Sports medicine', 'Subspecialty', 'Surgery', 'Surgical oncology', 'The Hospital for Sick Children, Toronto', 'The London Gazette', 'Therapy', 'Thomas Bentley (physician)', 'Thomas Burwell', 'Thomas Coxe', 'Thomas Francis (16th-century physician)', 'Thomas Gisborne (physician)', 'Thomas Langton (physician)', 'Thomas Lawrence (physician)', 'Thomas Linacre', 'Thomas Mayo (physician)', 'Thomas Millington (physician)', 'Thomas Moundeford', 'Thomas Pellett', 'Thomas Reeve (physician)', 'Thomas Watson (physician)', 'Thomas Witherley', 'Traditional medicine', 'Transfusion medicine', 'Trauma surgery', 'Travel medicine', 'Tropical medicine', 'University College, London', 'University College Hospital', 'University of London', 'Urogynecology', 'Urology', 'Vascular surgery', 'Venereology', 'Veterinary medicine', 'Walter Charleton', 'Walter Hayle Walshe', 'Walter Russell Brain', 'Westminster Hospital', 'William Baronsdale', 'William Battie', 'William Browne (physician)', 'William Dawes (physician)', 'William Freeman (physician)', 'William Gilbert (physicist)', 'William Jenner, 1st Baronet', 'William Pitcairn', 'William Wasey']

In [None]:
list_embeddings = model.encode(lst)

In [None]:
word = "the latter year"

In [None]:
word_embedding = model.encode(word)

In [None]:
maxi = -1; 
new_word = ""

In [None]:
for i,local in enumerate(list_embeddings):
    if np.dot(word_embedding,local)>maxi:
        maxi = np.dot(word_embedding,local)
        new_word = lst[i]

In [None]:
new_word

In [None]:
maxi

In [None]:
word1 = "albert"

In [None]:
word2 = "albert Einstien"

In [None]:
print(np.dot(model.encode(word1),model.encode(word2)))

In [None]:
import concurrent.futures

def process_sentence_set_wiki(task_args):
    # Extract the index, sent_sets, and subject_dict from the arguments
    index, sent_sets, subject_dict = task_args
    # Process the sentence set (this is a placeholder for your actual logic)
    fraction = 0.5  # Example result
    local_pair_and_values = "example_pair_and_values"  # Example result
    return index, fraction, local_pair_and_values

# Example data for the task arguments
pairs = [("sentence_set1", "sentence_set2"), ("sentence_set3", "sentence_set4")]
subject_dict = {"subject1": "value1", "subject2": "value2"}

# Using ThreadPoolExecutor
with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
    task_args = [(index, sent_sets, subject_dict) for index, sent_sets in enumerate(pairs)]
    results = list(executor.map(process_sentence_set_wiki, task_args))

# Extract and use the results
for index, fraction, local_pair_and_values in results:
    print(f"Index: {index}, Fraction: {fraction}, Pair and Values: {local_pair_and_values}")
