In [1]:
import spacy
from spacy.tokens import Doc, Span
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('dbpedia_spotlight', config={'confidence': 0.5})
preprocessing = spacy.load('en_core_web_sm')
from fastcoref import LingMessCoref
coref_model = LingMessCoref()
from SPARQLWrapper import SPARQLWrapper, JSON
from datasets import load_dataset
dataset = load_dataset("potsawee/wiki_bio_gpt3_hallucination")
import os
from tqdm.notebook import tqdm
import concurrent.futures
import re
import requests
import pandas as pd
import numpy as np

07/04/2024 17:17:30 - INFO - 	 missing_keys: []
07/04/2024 17:17:30 - INFO - 	 unexpected_keys: []
07/04/2024 17:17:30 - INFO - 	 mismatched_keys: []
07/04/2024 17:17:30 - INFO - 	 error_msgs: []
07/04/2024 17:17:30 - INFO - 	 Model Parameters: 590.0M, Transformer: 434.6M, Coref head: 155.4M


# Coreference Resolution

In [2]:

def get_cluster_spans(doc, clusters):
    fast_clusters = []
    for cluster in clusters:
        new_group = []
        for start, end in cluster:
            span = doc.char_span(start, end)
            if span is not None:
                new_group.append([span.start, span.end - 1])
        fast_clusters.append(new_group)
    return fast_clusters

def get_clusters(doc, text):
    preds = coref_model.predict(texts=[text])
    # print(f"\nThe clusters of same entities are as follows: {preds[0].get_clusters(as_strings=True)} \n")
    clusters = preds[0].get_clusters(as_strings=False)
    cluster_spans = get_cluster_spans(doc, clusters)
    return cluster_spans

def get_span_noun_indices(doc, cluster):    
    spans = [doc[start:end+1] for start, end in cluster]

    spans_pos = []
    for span in spans:
        pos_tags = [token.pos_ for token in span]
        spans_pos.append(pos_tags)

    noun_indices = []
    for i, pos_list in enumerate(spans_pos):
        if 'NOUN' in pos_list or 'PROPN' in pos_list:
            noun_indices.append(i)
    return noun_indices

def get_cluster_head(doc, cluster, noun_indices):
    head_idx = noun_indices[0]
    head_start, head_end = cluster[head_idx]
    head_span = doc[head_start:head_end+1]
    return head_span, [head_start, head_end]

def is_containing_other_spans(span, all_spans):
    for s in all_spans:
        if s[0] >= span[0] and s[1] <= span[1] and s != span:
            return True  
    return False

def replacement(coref, resolved, mention_span):
    start, end = coref
    mention_text = mention_span.text_with_ws 
    resolved[start] = mention_text
    for i in range(start + 1, end + 1):
        resolved[i] = ""
    return resolved

def replace_corefs(document, clusters):
    resolved = [token.text_with_ws for token in document]
    all_spans = [span for cluster in clusters for span in cluster]

    for cluster in clusters:
        noun_indices = get_span_noun_indices(document, cluster)

        if noun_indices:
            mention_span, mention = get_cluster_head(document, cluster, noun_indices)
        else:
            start, end = cluster[0]
            mention_span = document[start:end+1]
            mention = cluster[0]
            
        for coref in cluster:
            if coref != mention and not is_containing_other_spans(coref, all_spans):
                resolved = replacement(coref, resolved, mention_span)

    
    return ("".join(resolved))


def coreference_resolution(text):
    doc = nlp(text)
    clusters = get_clusters(doc, text) 
    answer= replace_corefs(doc, clusters) 
    return answer

# Pre-Processing

In [3]:
def is_three_word_name(entity):
    return len(entity.text.split()) >= 3 and entity.label_ == "PERSON"

In [4]:
def replace_three_worded_names(text):
    doc = preprocessing(text)
    new_text = text
    
    for entity in doc.ents:
        if is_three_word_name(entity):
            words = entity.text.split()
            new_name = f"{words[0]} {words[-1]}"
            new_text = new_text.replace(entity.text, new_name)
    return new_text

In [5]:
def preprocess_text(text): 
    # text = replace_three_worded_names(text) 
    text = re.sub(r'[^\w\s.,()\'"\-]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r"(['\"])\1+", r"\1", text)
    # preprocessed_text = coreference_resolution(text)
    return text

# Link Formation from Text

In [6]:
def get_sentence_based_links_subj(text):
    final_text = coreference_resolution(text)
    
    doc = nlp(final_text)
    
    entities = list(doc.ents)
    # print("Entities found by spaCy:", entities)
    
    subjects = []
    for sent in doc.sents:
        found_subject = False
        for token in sent:
            print(f"{token},{token.dep_}")
            if token.dep_ in ['nsubj', 'nsubjpass'] and token.ent_kb_id_:
                subjects.append(token.ent_kb_id_)
                found_subject = True
                break
        if not found_subject:
            subjects.append(None)
    # print("Subjects identified:", subjects)
    
    sentence_forms = []
    for sent in doc.sents:
        entities_set = set()
        entities = []
        for ent in sent.ents:
            if ent.kb_id_ and ent.kb_id_ not in entities_set:
                entities.append(ent.kb_id_)
                entities_set.add(ent.kb_id_)
        sentence_forms.append(entities)
    # print("Entities in each sentence:", sentence_forms)
    
    pairs = []
    count = 0
    for i in range(len(sentence_forms)):
        if subjects[i] is not None and len(sentence_forms[i]) > 1:
            for entity in sentence_forms[i]:
                if subjects[i] != entity:
                    pairs.append([subjects[i], entity])
                    count += 1
    print(f"The number of pairs is: {count}\n")
    
    return pairs, final_text

Testing - 

In [7]:
text = "Sir John Russell Reynolds, 1st Baronet (22 May 1828 29 May 1896) was a British neurologist and physician. Reynolds was born in Romsey, Hampshire, as the son of John Reynolds, an independent minister, and the grandson of Dr. Henry Revell Reynolds."

pairs,text = get_sentence_based_links_subj(text)
print(pairs)

07/04/2024 17:17:54 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:17:54 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

Sir,compound
John,compound
Russell,compound
Reynolds,nsubj
Sir,compound
John,compound
Russell,compound
Reynolds,nsubjpass
The number of pairs is: 5

[['http://dbpedia.org/resource/John_Russell_Reynolds', 'http://dbpedia.org/resource/Neurology'], ['http://dbpedia.org/resource/John_Russell_Reynolds', 'http://dbpedia.org/resource/Physician'], ['http://dbpedia.org/resource/John_Russell_Reynolds', 'http://dbpedia.org/resource/Romsey'], ['http://dbpedia.org/resource/John_Russell_Reynolds', 'http://dbpedia.org/resource/John_F._Reynolds'], ['http://dbpedia.org/resource/John_Russell_Reynolds', 'http://dbpedia.org/resource/Henry_Revell_Reynolds']]


In [8]:
def get_sentence_based_links_all(text):
    final_text = coreference_resolution(text)
    doc = nlp(final_text)
    sentence_forms = []
    for sent in doc.sents:
        entities_set = set()  
        entities = []  
        for ent in sent.ents:
            # print(ent)
            if ent.kb_id_ != "" and ent.kb_id_ not in entities_set:
                entities.append(ent.kb_id_)
                entities_set.add(ent.kb_id_)
        sentence_forms.append(entities)
        # sentence_forms.append([ent.kb_id_ for ent in sent.ents if ent.kb_id_ != ""])
    pairs = []
    count = 0
    for entities in sentence_forms:
        if len(entities)>1:
            for i in range(len(entities)):
                for j in range(i+1,len(entities)):
                    pairs.append([entities[i],entities[j]])
                    count+=1
    print(f"The number of pairs is: {count}\n")
    
    return pairs,final_text

# Spotlight Based System

## Link Making

In [9]:
def get_sentence_based_links(text):
    final_text = coreference_resolution(text)
    doc = nlp(final_text)
    final_sents = [sents for sents in doc.sents]
    # entities = list(doc.ents)
    # print("Entities found by spaCy:", entities)
    
    subjects = []
    for sent in doc.sents:
        found_subject = False
        for token in sent:
            # print(f"{token},{token.dep_}")
            if token.dep_ in ['nsubj', 'nsubjpass'] and token.ent_kb_id_:
                subjects.append(token.ent_kb_id_)
                found_subject = True
                break

        if not found_subject:
            subjects.append(None)
    # print("Subjects identified:", subjects)
    
    sentence_forms = []
    for sent in doc.sents:
        entities = []
        for ent in sent.ents:
            if ent.kb_id_:
                entities.append(ent.kb_id_)
        sentence_forms.append(entities)
    # print("Entities in each sentence:", sentence_forms)
    
    pairs = []
    count = 0
    for i in range(len(sentence_forms)):
        tmp_storage = []
        if subjects[i] is not None and len(sentence_forms[i]) > 1:
            for entity in sentence_forms[i]:
                if subjects[i] != entity:
                    tmp_storage.append([subjects[i], entity])
                    count += 1
        pairs.append(tmp_storage)
    print(f"The number of pairs is: {count}\n")
    
    return pairs, final_sents

In [10]:
text = """Wilhelm Windelband (May 11, 1848 - October 22, 1915) was a German philosopher of the Baden School. Wilhelm Windelband is now mainly remembered for the terms "nomothetic" and "idiographic", which Wilhelm Windelband introduced. the terms "nomothetic" and "idiographic", which he introducedhave currency in psychology and other areas, though not necessarily in line with Wilhelm Windelband original meanings. Wilhelm Windelband was a Neo-Kantian who protested other Neo-Kantians of Wilhelm Windelband time and maintained that "to understand Kant rightly means to go beyond Kant ". Against Wilhelm Windelband positivist contemporaries, Wilhelm Windelband argued that philosophy should engage in humanistic dialogue with the natural sciences rather than uncritically appropriating the natural sciences methodologies. Wilhelm Windelband interests in psychology and cultural sciences represented an opposition to psychologism and historicism schools by a critical philosophic system. Wilhelm Windelband relied in Wilhelm Windelband effort to reach beyond Kant on such philosophers as Georg Wilhelm Friedrich Hegel, Johann Friedrich Herbart, and Hermann Lotze. Closely associated with Wilhelm Windelband was Heinrich Rickert. Wilhelm Windelband disciples were not only noted philosophers, but sociologists like Max Weber and theologians like Ernst Troeltsch and Albert Schweitzer."""
print(get_sentence_based_links(text))

07/04/2024 17:18:06 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:18:06 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 14

([[['http://dbpedia.org/resource/Wilhelm_Windelband', 'http://dbpedia.org/resource/Philosopher'], ['http://dbpedia.org/resource/Wilhelm_Windelband', 'http://dbpedia.org/resource/Neo-Kantianism']], [['http://dbpedia.org/resource/Wilhelm_Windelband', 'http://dbpedia.org/resource/Nomothetic_and_idiographic'], ['http://dbpedia.org/resource/Wilhelm_Windelband', 'http://dbpedia.org/resource/Nomothetic_and_idiographic']], [], [['http://dbpedia.org/resource/Wilhelm_Windelband', 'http://dbpedia.org/resource/Neo-Kantianism'], ['http://dbpedia.org/resource/Wilhelm_Windelband', 'http://dbpedia.org/resource/Immanuel_Kant'], ['http://dbpedia.org/resource/Wilhelm_Windelband', 'http://dbpedia.org/resource/Immanuel_Kant']], [['http://dbpedia.org/resource/Wilhelm_Windelband', 'http://dbpedia.org/resource/Positivism'], ['http://dbpedia.org/resource/Wilhelm_Windelband', 'http://dbpedia.org/resource/Philosophy']], [], [['http://dbpedia.org/resource/Wilhelm_Windelband', 'http://d

## Link Checking

In [11]:
def check_direct_link(source_target):
    source_uri, target_uri = source_target
    sparql = SPARQLWrapper("https://dbpedia.org/sparql")
    sparql.setMethod('POST')  
    
    query_source_to_target = f"""
    ASK WHERE {{
      <{source_uri}> ?p <{target_uri}> .
    }}
    """
    sparql.setQuery(query_source_to_target)
    sparql.setReturnFormat(JSON)

    try:
        result_source_to_target = sparql.query().convert()
        has_link_source_to_target = result_source_to_target['boolean']
    except Exception as e:
        print(f"Error querying {source_uri} -> {target_uri}: {e}")
        has_link_source_to_target = False

    # Query from target to source
    query_target_to_source = f"""
    ASK WHERE {{
      <{target_uri}> ?p <{source_uri}> .
    }}
    """
    sparql.setQuery(query_target_to_source)

    try:
        result_target_to_source = sparql.query().convert()
        has_link_target_to_source = result_target_to_source['boolean']
    except Exception as e:
        print(f"Error querying {target_uri} -> {source_uri}: {e}")
        has_link_target_to_source = False

    # Combine the results
    has_link = has_link_source_to_target or has_link_target_to_source
    return source_uri, target_uri, has_link

In [12]:
def new_check_direct_link(source_target):
    source_uri, target_uri = source_target
    sparql = SPARQLWrapper("https://dbpedia.org/sparql")
    sparql.setMethod('POST')  
    
    query_source_to_target = f"""
    ASK WHERE {{
      <{source_uri}> ?p <{target_uri}> .
    }}
    """
    sparql.setQuery(query_source_to_target)
    sparql.setReturnFormat(JSON)

    try:
        result_source_to_target = sparql.query().convert()
        has_link_source_to_target = result_source_to_target['boolean']
    except Exception as e:
        print(f"Error querying {source_uri} -> {target_uri}: {e}")
        has_link_source_to_target = False

    # Query from target to source
    query_target_to_source = f"""
    ASK WHERE {{
      <{target_uri}> ?p <{source_uri}> .
    }}
    """
    sparql.setQuery(query_target_to_source)

    try:
        result_target_to_source = sparql.query().convert()
        has_link_target_to_source = result_target_to_source['boolean']
    except Exception as e:
        print(f"Error querying {target_uri} -> {source_uri}: {e}")
        has_link_target_to_source = False

    # Combine the results
    has_link = has_link_source_to_target or has_link_target_to_source
    return source_uri, target_uri, has_link

In [13]:
print(check_direct_link(["http://dbpedia.org/resource/John_Russell_Reynolds","http://dbpedia.org/resource/Judge"])[2])

False


## Scoring

In [14]:
def making_the_linear_version(text):
    pairs,final_sents = get_sentence_based_links(text)
    
    if not pairs:
        print("No entity pairs found here.")
        return 0,[],final_sents
    
    fractions = []
    pair_and_values = []
    
    for sent_sets in pairs:
        score = 0
        for pair in sent_sets:
            if check_direct_link(pair):
                score+=1
                pair_and_values.append([pair[0],pair[1],1])
            else:
                pair_and_values.append([pair[0],pair[1],0])
        if len(sent_sets)==0:
            fractions.append(-1)
        else:
            fractions.append(score/len(sent_sets))
    
    return fractions, pair_and_values, final_sents

In [15]:
def process_sentence_set(index_and_sent_sets):
    index, sent_sets = index_and_sent_sets
    score = 0
    local_pair_and_values = []
    for pair in sent_sets:
        if check_direct_link(pair)[2]:
            score += 1
            local_pair_and_values.append([pair[0], pair[1], 1])
        else:
            local_pair_and_values.append([pair[0], pair[1], 0])
    fraction = score / len(sent_sets) if len(sent_sets) > 0 else -1
    return index, fraction, local_pair_and_values

def making_the_parallel_version(text):
    pairs, final_sents = get_sentence_based_links(text)

    if not pairs:
        print("No entity pairs found here.")
        return 0, [], final_sents

    fractions = [None] * len(pairs)
    pair_and_values = [None] * len(pairs)

    # Set max_workers to 50
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # Process each sentence set in parallel
        results = list(executor.map(process_sentence_set, enumerate(pairs)))

    # Place results back into the correct order
    for index, fraction, local_pair_and_values in results:
        fractions[index] = fraction
        pair_and_values[index] = local_pair_and_values

    # Flatten pair_and_values list
    flat_pair_and_values = [item for sublist in pair_and_values for item in sublist]

    return fractions, flat_pair_and_values, final_sents


In [13]:
def count_direct_links_and_fraction(text, num_workers=50):
    pairs,final_text = get_sentence_based_links(text)
    
    if not pairs:
        print("No entity pairs found.")
        return 0,[],final_text
    
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = {executor.submit(check_direct_link, pair): pair for pair in pairs}
        
        results = []
        for future in concurrent.futures.as_completed(futures):
            try:
                result = future.result()
                results.append(result)
            except Exception as e:
                source_target = futures[future]
                print(f"Error processing pair {source_target}: {e}")
    
    pairs_and_values = [[a, b, 1] if has_link else [a, b, 0] for a, b, has_link in results]
    direct_links_count = sum(entity[2] for entity in pairs_and_values)
    
    fraction = 0
    if(len(pairs)>0):
        fraction = direct_links_count/len(pairs)
    
    return fraction, pairs_and_values, final_text

In [16]:
# #test =  ["John Russell Reynolds (1820–1876) was an English lawyer, judge, and author.", "He was born in London, the son of a barrister, and was educated at Eton College and Trinity College, Cambridge.", "He was called to the bar in 1845, and became a Queen's Counsel in 1859.", "He was appointed a judge of the Court of Common Pleas in 1867, and was knighted in 1871.", "Reynolds was a prolific author, writing on a wide range of topics.", "He wrote several books on legal topics, including The Law of Libel and Slander (1863), The Law of Copyright (1865), and The Law of Patents for Inventions (1868).", "He also wrote on a variety of other topics, including history, biography, and literature.", "He was a frequent contributor to the Saturday Review, and wrote several books on Shakespeare, including The Mystery of William Shakespeare (1848) and The Authorship of Shakespeare (1875).", "He also wrote a biography of the poet John Keats (1848)." ]
# test = [ "Gordon David Strachan (born 9 February 1957) is a Scottish football manager and former player.", "He is the manager of the Scotland national team.", "Strachan played for Dundee, Aberdeen, Manchester United, Leeds United and Coventry City, as well as the Scotland national team.", "He has also managed Coventry City, Southampton, Celtic and Middlesbrough.", "Strachan began his managerial career at Coventry City in 1996, leading them to the 1997 FA Cup Final, where they lost to Tottenham Hotspur.", "He then moved to Southampton in 2001, where he guided them to the 2003 FA Cup Final, which they lost to Arsenal.", "In 2005, he was appointed manager of Celtic, where he won three consecutive Scottish Premier League titles and the Scottish League Cup twice.", "He left Celtic in 2009 and was appointed manager of Middlesbrough in October 2010.", "He left Middlesbrough in October 2013.", "In January 2013, Strachan was appointed manager of the Scotland national team.", "He has since led Scotland to the UEFA Euro 2016 qualifying playoffs, where they were eliminated by eventual finalists, and to the 2018 FIFA World Cup" ]
# text = ""
# for i in test:
#    text += " " + i
# print(text)
print(making_the_parallel_version(text))

# print(end_time - start_time)

# def number_of_direct_links_in_dbpedia(text):
#     pairs = get_sentence_based_links(text)
#     number_of_pairs = len(pairs)
#     existing_links = 0
    
#     for i in tqdm(range(len(pairs)), desc="Processing entries", unit="entry"):
#         if(check_direct_link(pairs[i][0], pairs[i][1])):
#             existing_links= existing_links+1
    
#     fraction = -1
#     if(number_of_pairs>0):
#         fraction = existing_links/number_of_pairs
#     print(f"The fraction of correct links = {fraction}")
#     return fraction


07/04/2024 17:18:25 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:18:25 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 14

([1.0, 1.0, -1, 1.0, 0.0, -1, 0.8, -1, -1], [['http://dbpedia.org/resource/Wilhelm_Windelband', 'http://dbpedia.org/resource/Philosopher', 1], ['http://dbpedia.org/resource/Wilhelm_Windelband', 'http://dbpedia.org/resource/Neo-Kantianism', 1], ['http://dbpedia.org/resource/Wilhelm_Windelband', 'http://dbpedia.org/resource/Nomothetic_and_idiographic', 1], ['http://dbpedia.org/resource/Wilhelm_Windelband', 'http://dbpedia.org/resource/Nomothetic_and_idiographic', 1], ['http://dbpedia.org/resource/Wilhelm_Windelband', 'http://dbpedia.org/resource/Neo-Kantianism', 1], ['http://dbpedia.org/resource/Wilhelm_Windelband', 'http://dbpedia.org/resource/Immanuel_Kant', 1], ['http://dbpedia.org/resource/Wilhelm_Windelband', 'http://dbpedia.org/resource/Immanuel_Kant', 1], ['http://dbpedia.org/resource/Wilhelm_Windelband', 'http://dbpedia.org/resource/Positivism', 0], ['http://dbpedia.org/resource/Wilhelm_Windelband', 'http://dbpedia.org/resource/Philosophy', 0], ['http:

## IO Testing

In [None]:

def check_i_th_entry_in_database(i):
    sentences = dataset["evaluation"][i]["gpt3_text"]
    ground_truth = dataset["evaluation"][i]["wiki_bio_text"]
    annotation = dataset["evaluation"][i]["annotation"]

    ground_truth_pairs = get_sentence_based_links(ground_truth)
    sentence_pairs = get_sentence_based_links(sentences)
    
    fraction = 0; 
    count = len(sentence_pairs)
    match = 0
    for pair in sentence_pairs:
        temp = [pair[1],pair[0]]
        if pair in ground_truth_pairs or temp in ground_truth_pairs:
            match+=1
    if(count!=0):
        fraction=match/count
    return sentences, ground_truth,fraction,annotation,sentence_pairs,ground_truth_pairs
  
def write_entries_to_files(entries, folder_name="Self_GPT_Testing"):
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
    
    for i, entry in enumerate(entries):
        filename = os.path.join(folder_name, f"entry_{i+1}.txt")
        with open(filename, 'w') as file:
            file.write("#############GROUND_PAIRS############\n\n")
            for x in entry[5]:
                file.write(f"{x[0]} and {x[1]}\n\n")
            file.write("#############SENTENCE_PAIRS############\n\n")
            for x in entry[4]:
                file.write(f"{x[0]} and {x[1]}\n\n")
            file.write("%%%%%%%%%%%%%%%%%%%%SENTENCES%%%%%%%%%%%%%%%%%\n")
            file.write(entry[0])
            file.write("\n\n")
            file.write("%%%%%%%%%%%%%%%%%%%%GROUND_TRUTH%%%%%%%%%%%%%%%\n")
            file.write(entry[1])
            file.write("\n\n")
            file.write("%%%%%%%%%%%%%%%%%%%%FRACTIONS%%%%%%%%%%%%%%%%%%\n")
            file.write(f"{entry[2]}")
            file.write("\n\n")
            file.write("%%%%%%%%%%%%%%%%%%%%ANNOTATIONS%%%%%%%%%%%%%%%%\n")
            for i in entry[3]:
                file.write(f"{i} ")
            file.write("\n\n")

# WikiData Link Formation

In [None]:
import spacy
import requests

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Function to link entities to Wikidata/Wikipedia
def link_entity(entity):
    url = f"https://www.wikidata.org/w/api.php?action=wbsearchentities&search={entity}&language=en&format=json"
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)
        data = response.json()
        if 'search' in data and data['search']:
            entity_id = data['search'][0]['id']
            entity_name = data['search'][0].get('label', 'No name available')
            entity_description = data['search'][0].get('description', 'No description available')
            return entity_id, entity_name, entity_description
        else:
            return None, None, None
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None, None, None
    except ValueError as e:
        print(f"Failed to parse JSON: {e}")
        return None, None, None
    
# Function to verify direct links in Wikidata
def verify_link(entity1, entity2):
    url = f"https://query.wikidata.org/sparql?query=SELECT ?item WHERE {{ wd:{entity1} ?p wd:{entity2} }}&format=json"
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)
        data = response.json()
        return bool(data.get('results', {}).get('bindings', []))
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return False
    except ValueError as e:
        print(f"Failed to parse JSON: {e}")
        return False

# Function to analyze text
def analyze_text(text):
    doc = nlp(text)
    entities = {ent.text: link_entity(ent.text) for ent in doc.ents if link_entity(ent.text) is not None}
    links = []
    direct_links = 0
    direct_link_bools = []
    sentences = list(doc.sents)
    for sent in sentences:
        sent_entities = [ent.text for ent in sent.ents]
        for i in range(len(sent_entities)):
            for j in range(i + 1, len(sent_entities)):
                entity1 = entities[sent_entities[i]]
                entity2 = entities[sent_entities[j]]
                if entity1 and entity2:
                    links.append((entity1, entity2))
                    # is_direct = verify_link(entity1[0], entity2[0])
                    is_direct = 1
                    direct_link_bools.append(1 if is_direct else 0)
                    if is_direct:
                        direct_links += 1
    ratio = direct_links / len(links) if links else 0
    return ratio, links, direct_link_bools

# Example usage:
text = "John F. Kennedy was the 35th President of the United States. He was assassinated in 1963."
ratio, links, direct_link_bools = analyze_text(text)
print(f"Ratio: {ratio}")
print(f"Links: {links}")
print(f"Direct Link Bools: {direct_link_bools}")

# Normal Experimentation

In [15]:
correct_scores = []
incorrect_scores = []

In [17]:
accurate = []
minor_inaccurate = []
major_inaccurate = []

In [18]:
folder_name = "Output"

if not os.path.exists(folder_name):
        os.makedirs(folder_name)

for i in tqdm(range(0,100), desc="Processing entries", unit="entry"):
    list_of_sentences = (dataset["evaluation"][i]["gpt3_sentences"])
    sentences = ''''''
    for s in list_of_sentences:
        tmp = preprocess_text(s)
        if tmp[-1]!='.':
            tmp+='.'
        sentences = sentences + tmp + "\n" 
    ground_truth_doc = preprocessing(dataset["evaluation"][i]["wiki_bio_text"])
    ground_truth = ''''''
    for sent in ground_truth_doc.sents:
        temp = preprocess_text(sent.text)
        if temp[-1]!='.':
            temp+='.'
        ground_truth = ground_truth + temp + "\n" 
    annotation = dataset["evaluation"][i]["annotation"]
    
    sentences_scores, sentence_pairs_and_values , sentence_coref_sents,= making_the_parallel_version(sentences)
    ground_truth_scores, ground_pairs_and_values, ground_coref_sents = making_the_parallel_version(ground_truth)
        
    filename = os.path.join(folder_name, f"entry_{i+1}.txt")
    with open(filename, 'w') as file:
        file.write("#############SENTENCE_PAIRS############\n\n")
        for x in range(len(sentence_pairs_and_values)):
            file.write(f"{sentence_pairs_and_values[x][0]} and {sentence_pairs_and_values[x][1]} and the value is : {sentence_pairs_and_values[x][2]}\n")
        file.write("\n")
        file.write("#############GROUND_PAIRS############\n\n")
        for x in range(len(ground_pairs_and_values)):
            file.write(f"{ground_pairs_and_values[x][0]} and {ground_pairs_and_values[x][1]} and the value is : {ground_pairs_and_values[x][2]}\n")
        file.write("\n\n")
        file.write("%%%%%%%%%%%%%%%%%%%%SENTENCES%%%%%%%%%%%%%%%%%\n")
        file.write(f"Sentences : \n\n{sentences}\n\n")
        file.write(f"Coref Resolved: \n\n")
        for y,x in enumerate(sentence_coref_sents):
            file.write(f"{y}. {x}")
        file.write("\n")
        file.write("\n")
        file.write("%%%%%%%%%%%%%%%%%%%%GROUND_TRUTH%%%%%%%%%%%%%%%\n")
        file.write(f"Ground Truth : \n\n{ground_truth} \n\n")
        file.write(f"Coref Resolved: \n\n")
        for y,x in enumerate(ground_coref_sents):
            file.write(f"{y}. {x}")
        file.write("\n")
        file.write("%%%%%%%%%%%%%%%%%%%%FRACTIONS%%%%%%%%%%%%%%%%%%\n")
        file.write(f"Value for sentences is : \n")
        for x in sentences_scores:
            file.write(f"{x} ")
        file.write("\n\n")
        file.write(f"Value for ground truth is : \n")
        for x in ground_truth_scores:
            file.write(f"{x} ")
        file.write("\n\n")
        file.write("%%%%%%%%%%%%%%%%%%%%ANNOTATIONS%%%%%%%%%%%%%%%%\n")
        for x in annotation:
            file.write(f"{x} ")
        file.write("\n\n")
        
    suma = 0
    leng = 0
    for x in ground_truth_scores:
        if x!=-1:
            suma+=x
            leng+=1
    
    if leng>0:
        print(f"The ground truth scores are: {suma/leng}\n")

    if (sum(ground_truth_scores)/len(ground_truth_scores))>0.1:
        if len(sentences_scores)!=len(annotation):
            continue
        for scores in ground_truth_scores:
            if scores!=-1:
                accurate.append(scores)
        for t,score in enumerate(sentences_scores):
            if score==-1:
                continue
            if annotation[t-1]=="accurate":
                accurate.append(score) 
            elif annotation[t-1]=="minor_inaccurate":
                minor_inaccurate.append(score)
            elif annotation[t-1]=="major_inaccurate":
                major_inaccurate.append(score)

Processing entries:   0%|          | 0/100 [00:00<?, ?entry/s]

07/04/2024 17:18:46 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:18:46 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 12



07/04/2024 17:18:55 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:18:55 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 40

The ground truth scores are: 0.3134920634920635



07/04/2024 17:19:08 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:19:08 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 16



07/04/2024 17:19:17 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:19:17 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 29

The ground truth scores are: 0.3365800865800866



07/04/2024 17:19:37 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:19:37 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 22



07/04/2024 17:19:54 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:19:54 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 65

The ground truth scores are: 0.3624603174603175



07/04/2024 17:20:13 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:20:14 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 8



07/04/2024 17:20:19 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:20:20 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 27

The ground truth scores are: 0.027777777777777776



07/04/2024 17:20:34 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:20:34 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 9



07/04/2024 17:20:41 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:20:42 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 0



07/04/2024 17:20:45 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:20:45 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 21



07/04/2024 17:20:55 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:20:55 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 9

The ground truth scores are: 0.1111111111111111



07/04/2024 17:21:11 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:21:11 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 21



07/04/2024 17:21:20 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:21:20 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 17

The ground truth scores are: 0.0



07/04/2024 17:21:31 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:21:31 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 11



07/04/2024 17:21:37 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:21:37 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 14

The ground truth scores are: 0.76



07/04/2024 17:21:47 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:21:47 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 9



07/04/2024 17:21:56 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:21:56 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 45

The ground truth scores are: 0.056818181818181816



07/04/2024 17:22:13 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:22:13 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 19



07/04/2024 17:22:26 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:22:26 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 27

The ground truth scores are: 0.548611111111111



07/04/2024 17:22:43 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:22:44 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 11



07/04/2024 17:23:05 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:23:05 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 0



07/04/2024 17:23:08 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:23:08 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 0



07/04/2024 17:23:12 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:23:12 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 3

The ground truth scores are: 0.0



07/04/2024 17:23:19 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:23:19 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 14



07/04/2024 17:23:30 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:23:30 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 47

The ground truth scores are: 0.2126984126984127



07/04/2024 17:23:47 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:23:47 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 0



07/04/2024 17:23:50 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:23:50 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 35

The ground truth scores are: 0.5238095238095238



07/04/2024 17:24:06 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:24:06 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 11



07/04/2024 17:24:14 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:24:14 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 50

The ground truth scores are: 0.48561507936507936



07/04/2024 17:24:31 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:24:31 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 2



07/04/2024 17:24:37 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:24:37 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 15

The ground truth scores are: 0.16666666666666666



07/04/2024 17:24:49 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:24:49 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 14



07/04/2024 17:25:00 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:25:00 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 19

The ground truth scores are: 0.5047619047619047



07/04/2024 17:25:10 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:25:11 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 8



07/04/2024 17:25:23 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:25:23 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 10

The ground truth scores are: 0.5



07/04/2024 17:25:32 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:25:32 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 14



07/04/2024 17:25:39 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:25:39 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 20

The ground truth scores are: 0.37777777777777777



07/04/2024 17:25:53 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:25:53 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 19



07/04/2024 17:26:02 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:26:03 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 22

The ground truth scores are: 0.6666666666666666



07/04/2024 17:26:12 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:26:12 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 13



07/04/2024 17:26:20 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:26:20 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 66

The ground truth scores are: 0.011764705882352941



07/04/2024 17:26:40 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:26:40 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 14



07/04/2024 17:26:47 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:26:47 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 23

The ground truth scores are: 0.0



07/04/2024 17:26:57 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:26:58 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 11



07/04/2024 17:27:06 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:27:06 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 11

The ground truth scores are: 0.36363636363636365



07/04/2024 17:27:26 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:27:26 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 13



07/04/2024 17:27:37 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:27:37 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 37

The ground truth scores are: 0.6410256410256411



07/04/2024 17:27:50 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:27:50 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 0



07/04/2024 17:27:53 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:27:54 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 14

The ground truth scores are: 0.05714285714285715



07/04/2024 17:28:05 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:28:05 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 22



07/04/2024 17:28:18 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:28:18 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 16

The ground truth scores are: 0.8166666666666668



07/04/2024 17:28:29 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:28:29 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 26



07/04/2024 17:28:43 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:28:43 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 28

The ground truth scores are: 0.769047619047619



07/04/2024 17:29:00 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:29:00 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 17



07/04/2024 17:29:07 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:29:08 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 10

The ground truth scores are: 0.625



07/04/2024 17:29:17 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:29:17 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 3



07/04/2024 17:29:25 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:29:25 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 93

The ground truth scores are: 0.019005847953216373



07/04/2024 17:29:54 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:29:55 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 18



07/04/2024 17:30:03 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:30:03 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 3

The ground truth scores are: 0.6666666666666666



07/04/2024 17:30:11 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:30:11 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 13



07/04/2024 17:30:22 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:30:22 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 24

The ground truth scores are: 0.5388888888888889



07/04/2024 17:30:33 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:30:33 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 15



07/04/2024 17:30:40 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:30:40 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 24

The ground truth scores are: 0.0



07/04/2024 17:30:51 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:30:51 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 0



07/04/2024 17:30:54 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:30:54 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 0



07/04/2024 17:30:57 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:30:57 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 1



07/04/2024 17:31:02 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:31:02 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 24

The ground truth scores are: 0.33766233766233766



07/04/2024 17:31:22 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:31:22 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 33



07/04/2024 17:31:34 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:31:34 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 39

The ground truth scores are: 0.17748917748917747



07/04/2024 17:31:50 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:31:50 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 0



07/04/2024 17:31:53 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:31:53 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 0



07/04/2024 17:31:56 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:31:56 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 0



07/04/2024 17:31:59 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:31:59 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 8

The ground truth scores are: 0.5



07/04/2024 17:32:14 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:32:14 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 0



07/04/2024 17:32:17 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:32:18 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 0



07/04/2024 17:32:21 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:32:21 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 11



07/04/2024 17:32:29 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:32:29 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 19

The ground truth scores are: 0.0



07/04/2024 17:32:42 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:32:42 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 17



07/04/2024 17:32:53 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:32:53 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 12

The ground truth scores are: 0.0



07/04/2024 17:33:13 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:33:13 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 9



07/04/2024 17:33:20 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:33:21 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 43

The ground truth scores are: 0.0



07/04/2024 17:33:38 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:33:38 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 14



07/04/2024 17:33:46 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:33:46 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 22

The ground truth scores are: 0.4476190476190475



07/04/2024 17:33:58 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:33:58 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 13



07/04/2024 17:34:08 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:34:08 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 10

The ground truth scores are: 0.4



07/04/2024 17:34:18 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:34:18 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 18



07/04/2024 17:34:27 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:34:27 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 0



07/04/2024 17:34:30 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:34:30 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 24



07/04/2024 17:34:43 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:34:43 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 61

The ground truth scores are: 0.3998015873015873



07/04/2024 17:35:04 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:35:04 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 12



07/04/2024 17:35:12 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:35:12 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 48

The ground truth scores are: 0.6617647058823529



07/04/2024 17:35:29 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:35:29 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 23



07/04/2024 17:35:40 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:35:40 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 15

The ground truth scores are: 0.5333333333333334



07/04/2024 17:35:52 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:35:52 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 12



07/04/2024 17:36:05 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:36:05 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 12

The ground truth scores are: 0.15



07/04/2024 17:36:22 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:36:22 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 33



07/04/2024 17:36:36 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:36:36 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 0



07/04/2024 17:36:40 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:36:40 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 25



07/04/2024 17:36:53 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:36:53 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 22

The ground truth scores are: 0.2857142857142857



07/04/2024 17:37:07 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:37:07 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 22



07/04/2024 17:37:21 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:37:21 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 23

The ground truth scores are: 0.34375



07/04/2024 17:37:30 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/04/2024 17:37:30 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 11



KeyboardInterrupt: 

In [19]:
sum(accurate)/len(accurate)

0.5133620689655171

In [20]:
len(accurate)

116

In [21]:
sum(minor_inaccurate)/len(minor_inaccurate)

0.33339285714285716

In [22]:
len(minor_inaccurate)

40

In [23]:
sum(major_inaccurate)/len(major_inaccurate)

0.30000000000000004

In [24]:
len(major_inaccurate)

29

In [25]:
avrg_accurate = sum(accurate)/len(accurate)
print(avrg_accurate)
len(accurate)
avrg_minor = sum(minor_inaccurate)/len(minor_inaccurate)
print(avrg_minor)
len(minor_inaccurate)
avrg_major = sum(major_inaccurate)/len(major_inaccurate)
print(avrg_major)
len(major_inaccurate)
avrg_inaccurate = (sum(minor_inaccurate) + sum(major_inaccurate))/(len(minor_inaccurate) + len(major_inaccurate))
print(avrg_inaccurate)
threshold_accurate = ((avrg_accurate)+(avrg_inaccurate))/2

0.5133620689655171
0.33339285714285716
0.30000000000000004
0.3193581780538302


In [26]:
threshold_accurate = ((avrg_accurate)+(avrg_inaccurate))/2
predicted_labels = []
for score in accurate + minor_inaccurate + major_inaccurate:
    if score >= threshold_accurate:
        predicted_labels.append("accurate")
    else:
        predicted_labels.append("inaccurate")

true_labels = ["accurate"] * len(accurate) + ["inaccurate"] * len(minor_inaccurate) + ["inaccurate"] * len(major_inaccurate)

from collections import defaultdict

# Initialize counters
confusion_matrix = defaultdict(lambda: {"TP": 0, "FP": 0, "FN": 0, "TN": 0})

# Calculate confusion matrix
for true, pred in zip(true_labels, predicted_labels):
    for category in ["accurate", "inaccurate"]:
        if true == category and pred == category:
            confusion_matrix[category]["TP"] += 1
        elif true == category and pred != category:
            confusion_matrix[category]["FN"] += 1
        elif true != category and pred == category:
            confusion_matrix[category]["FP"] += 1
        elif true != category and pred != category:
            confusion_matrix[category]["TN"] += 1

metrics = {}
for category in confusion_matrix:
    TP = confusion_matrix[category]["TP"]
    FP = confusion_matrix[category]["FP"]
    FN = confusion_matrix[category]["FN"]

    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0

    metrics[category] = {"Precision": precision, "Recall": recall}
    
for category in metrics:
    precision = metrics[category]["Precision"]
    recall = metrics[category]["Recall"]

    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    metrics[category]["F1 Score"] = f1_score

print(metrics)


{'accurate': {'Precision': 0.7319587628865979, 'Recall': 0.6120689655172413, 'F1 Score': 0.6666666666666666}, 'inaccurate': {'Precision': 0.48863636363636365, 'Recall': 0.6231884057971014, 'F1 Score': 0.5477707006369427}}


In [46]:
threshold_accurate = 0.53
threshold_minor_inaccurate = 0.34
predicted_labels = []
for score in accurate + minor_inaccurate + major_inaccurate:
    if score >= threshold_accurate:
        predicted_labels.append("accurate")
    elif threshold_minor_inaccurate <= score < threshold_accurate:
        predicted_labels.append("minor_inaccurate")
    else:
        predicted_labels.append("major_inaccurate")

true_labels = ["accurate"] * len(accurate) + ["minor_inaccurate"] * len(minor_inaccurate) + ["major_inaccurate"] * len(major_inaccurate)

from collections import defaultdict

# Initialize counters
confusion_matrix = defaultdict(lambda: {"TP": 0, "FP": 0, "FN": 0, "TN": 0})

# Calculate confusion matrix
for true, pred in zip(true_labels, predicted_labels):
    for category in ["accurate", "minor_inaccurate", "major_inaccurate"]:
        if true == category and pred == category:
            confusion_matrix[category]["TP"] += 1
        elif true == category and pred != category:
            confusion_matrix[category]["FN"] += 1
        elif true != category and pred == category:
            confusion_matrix[category]["FP"] += 1
        elif true != category and pred != category:
            confusion_matrix[category]["TN"] += 1

# Calculate Precision and Recall
metrics = {}
for category in confusion_matrix:
    TP = confusion_matrix[category]["TP"]
    FP = confusion_matrix[category]["FP"]
    FN = confusion_matrix[category]["FN"]

    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0

    metrics[category] = {"Precision": precision, "Recall": recall}
    
# Calculate F1 score
for category in metrics:
    precision = metrics[category]["Precision"]
    recall = metrics[category]["Recall"]

    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    metrics[category]["F1 Score"] = f1_score

print(metrics)

{'accurate': {'Precision': 0.8012422360248447, 'Recall': 0.5397489539748954, 'F1 Score': 0.6449999999999999}, 'minor_inaccurate': {'Precision': 0.288135593220339, 'Recall': 0.22077922077922077, 'F1 Score': 0.25}, 'major_inaccurate': {'Precision': 0.23333333333333334, 'Recall': 0.6481481481481481, 'F1 Score': 0.3431372549019608}}


# Wiki Experimentation

In [None]:
checking = []
correct_scores = []
incorrect_scores = []

In [None]:
folder_name="Wiki_Method"

if not os.path.exists(folder_name):
        os.makedirs(folder_name)

for i in tqdm(range(0,1), desc="Processing entries", unit="entry"):
    sentences = preprocess_text(dataset["evaluation"][i]["gpt3_text"])
    ground_truth = preprocess_text(dataset["evaluation"][i]["wiki_bio_text"])
    annotation = dataset["evaluation"][i]["annotation"]
    
    sentences_score, sentence_pairs, sentence_links= analyze_text(sentences)
    ground_truth_score, ground_pairs, ground_links = analyze_text(ground_truth)
    correct_scores.append(ground_truth_score)
    print(f"The ground truth scores are: {ground_truth_score}\n")
    
    correct_count = annotation.count("accurate")
    incorrect_count = annotation.count("minor_inaccurate") + annotation.count("major_inaccurate")
    total = correct_count + incorrect_count
    if(correct_count==total):
        correct_scores.append(sentences_score)
    if(incorrect_count/total>0.75):
        incorrect_scores.append(sentences_score)
        
    filename = os.path.join(folder_name, f"entry_{i+1}.txt")
    with open(filename, 'w') as file:
        file.write("#############SENTENCE_PAIRS############\n\n")
        for i in range(len(sentence_pairs)):
            file.write(f"{sentence_pairs[i][0][1]} and {sentence_pairs[i][1][1]} and the value is : {sentence_links[i]}\n")
        file.write("\n\n")
        file.write("#############GROUND_PAIRS############\n\n")
        for i in range(len(ground_pairs)):
            file.write(f"{ground_pairs[i][0][1]} and {ground_pairs[i][1][1]} and the value is : {ground_links[i]}\n")
        file.write("\n\n")
        file.write("%%%%%%%%%%%%%%%%%%%%SENTENCES%%%%%%%%%%%%%%%%%\n")
        file.write(f"Sentences : \n{sentences} \n\n")
        file.write("\n\n")
        file.write("%%%%%%%%%%%%%%%%%%%%GROUND_TRUTH%%%%%%%%%%%%%%%\n")
        file.write(f"Ground Truth : \n{ground_truth} \n\n")
        file.write("\n\n")
        file.write("%%%%%%%%%%%%%%%%%%%%FRACTIONS%%%%%%%%%%%%%%%%%%\n")
        file.write(f"Value for sentences is : {sentences_score} and for ground truth is : {ground_truth_score}")
        file.write("\n\n")
        file.write("%%%%%%%%%%%%%%%%%%%%ANNOTATIONS%%%%%%%%%%%%%%%%\n")
        for i in annotation:
            file.write(f"{i} ")
        file.write("\n\n")

In [22]:
print(correct_scores)

[]


In [23]:
print(incorrect_scores)

[]


In [31]:
anvg_coreect_value = 0
for val in correct_scores:
    anvg_coreect_value+= val
anvg_coreect_value/= len(correct_scores)
print(anvg_coreect_value)

0.5077271430379288


In [32]:
anvg_incoreect_value = 0
count = 0
for val in incorrect_scores:
    if val>=0:
        anvg_incoreect_value+= val
        count += 1; 
anvg_incoreect_value/= count
print(anvg_incoreect_value)

0.31714452101961915


In [None]:
negative = 0
for val in incorrect_scores:
    if val<0:
        negative+= 1
print(negative)

In [None]:
len(correct_scores)

In [None]:
len(incorrect_scores)

In [17]:
import matplotlib.pyplot as plt
import numpy as np

plt.hist(correct_scores, bins=20, edgecolor='black')
plt.xlabel('Correct Scores')
plt.ylabel('Frequency')
plt.title('Histogram of Scores')
plt.show()

ValueError: object __array__ method not producing an array

<Figure size 640x480 with 1 Axes>

In [None]:
plt.hist(incorrect_scores, bins=20, edgecolor='black')
plt.xlabel('Correct Scores')
plt.ylabel('Frequency')
plt.title('Histogram of Scores')
plt.show()

In [None]:
correct_scores_GPT_Only = []
incorrect_scores_GPT_Only = []

In [None]:
folder_name="Only_GPT_Generated"

if not os.path.exists(folder_name):
        os.makedirs(folder_name)

for i in tqdm(range(130,238), desc="Processing entries", unit="entry"):
    sentences = preprocess_text(dataset["evaluation"][i]["gpt3_text"])
    annotation = dataset["evaluation"][i]["annotation"]
    
    sentences_score, sentence_pairs, sentence_links, sentence_coref_resolved= count_direct_links_and_fraction(sentences)
    
    correct_count = annotation.count("accurate")
    incorrect_count = annotation.count("major_inaccurate")
    total = len(annotation)
    if(correct_count==total):
        correct_scores_GPT_Only.append(sentences_score)
    if(incorrect_count/total>0.80):
        incorrect_scores_GPT_Only.append(sentences_score)
        
    filename = os.path.join(folder_name, f"entry_{i+1}.txt")
    with open(filename, 'w') as file:
        file.write("#############SENTENCE_PAIRS############\n\n")
        for i in range(len(sentence_pairs)):
            file.write(f"{sentence_pairs[i][0]} and {sentence_pairs[i][1]} and the value is : {sentence_links[i]}\n")
        file.write("\n\n")
        file.write("%%%%%%%%%%%%%%%%%%%%SENTENCES%%%%%%%%%%%%%%%%%\n")
        file.write(f"Sentences : \n{sentences} \n\n")
        file.write(f"Coref Resolved : \n{sentence_coref_resolved}")
        file.write("\n\n")
        file.write("%%%%%%%%%%%%%%%%%%%%FRACTIONS%%%%%%%%%%%%%%%%%%\n")
        file.write(f"Value for sentences is : {sentences_score}")
        file.write("\n\n")
        file.write("%%%%%%%%%%%%%%%%%%%%ANNOTATIONS%%%%%%%%%%%%%%%%\n")
        for i in annotation:
            file.write(f"{i} ")
        file.write("\n\n")

In [None]:
print(correct_scores_GPT_Only)

In [None]:
print(incorrect_scores_GPT_Only)

In [None]:
print(sum(correct_scores_GPT_Only)/len(correct_scores_GPT_Only))

In [None]:
print(sum(incorrect_scores_GPT_Only)/len(incorrect_scores_GPT_Only))

# Experimentation GPT

In [None]:
folder_name="Name_reduction_check"

if not os.path.exists(folder_name):
        os.makedirs(folder_name)

for i in tqdm(range(0,3), desc="Processing entries", unit="entry"):
    sentences = preprocess_text(dataset["evaluation"][i]["gpt3_text"])
    ground_truth = preprocess_text(dataset["evaluation"][i]["wiki_bio_text"])
    annotation = dataset["evaluation"][i]["annotation"]
    
    
    correct_scores.append(ground_truth_score)
    print(f"The ground truth scores are: {ground_truth_score}\n")
    
    correct_count = annotation.count("accurate")
    incorrect_count = annotation.count("minor_inaccurate") + annotation.count("major_inaccurate")
    total = correct_count + incorrect_count
    if(correct_count==total):
        correct_scores.append(sentences_score)
    if(incorrect_count/total>0.75):
        incorrect_scores.append(sentences_score)
        
    filename = os.path.join(folder_name, f"entry_{i+1}.txt")
    with open(filename, 'w') as file:
        file.write("#############SENTENCE_PAIRS############\n\n")
        for i in range(len(sentence_pairs)):
            file.write(f"{sentence_pairs[i][0]} and {sentence_pairs[i][1]} and the value is : {sentence_links[i]}\n")
        file.write("\n\n")
        file.write("#############GROUND_PAIRS############\n\n")
        for i in range(len(ground_pairs)):
            file.write(f"{ground_pairs[i][0]} and {ground_pairs[i][1]} and the value is : {ground_links[i]}\n")
        file.write("\n\n")
        file.write("%%%%%%%%%%%%%%%%%%%%SENTENCES%%%%%%%%%%%%%%%%%\n")
        file.write(f"Sentences : \n{sentences} \n\n")
        file.write(f"Coref Resolved : \n{sentence_coref_resolved}")
        file.write("\n\n")
        file.write("%%%%%%%%%%%%%%%%%%%%GROUND_TRUTH%%%%%%%%%%%%%%%\n")
        file.write(f"Ground Truth : \n{ground_truth} \n\n")
        file.write(f"Coref Resolved : \n{ground_coref_resolved}")
        file.write("\n\n")
        file.write("%%%%%%%%%%%%%%%%%%%%FRACTIONS%%%%%%%%%%%%%%%%%%\n")
        file.write(f"Value for sentences is : {sentences_score} and for ground truth is : {ground_truth_score}")
        file.write("\n\n")
        file.write("%%%%%%%%%%%%%%%%%%%%ANNOTATIONS%%%%%%%%%%%%%%%%\n")
        for i in annotation:
            file.write(f"{i} ")
        file.write("\n\n")

# Embeddings based Name Extraction (Linear)

In [None]:
def process_entity(uri, label):
        file.write(f"URI: {uri}, Label: {label}\n")
        print(f"URI: {uri}, Label: {label}")

In [None]:
DBPedia_Information = []
DBpedia_Embeddings = []

In [None]:
sparql = SPARQLWrapper("http://dbpedia.org/sparql")
limit = 500
offset = 0

while True:
    try:
        query = f"""
        SELECT ?entity ?label ?abstract WHERE {{
          ?entity a dbo:Person .
          OPTIONAL {{ ?entity rdfs:label ?label . FILTER (lang(?label) = "en") }}
          OPTIONAL {{ ?entity dbo:abstract ?abstract . FILTER (lang(?abstract) = "en") }}
        }}
        LIMIT {limit} OFFSET {offset}
        """
        
        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        
        results = sparql.query().convert()
        
        bindings = results["results"]["bindings"]
        if not bindings:
            break
        
        for result in bindings:
            entry = {
                "uri": result["entity"]["value"],
                "label": result["label"]["value"] if "label" in result else "",
                "abstract": result["abstract"]["value"] if "abstract" in result else ""
            }
            DBPedia_Information.append(entry)
            print(entry)
            context = entry["label"] + ": " + entry["abstract"]
            current_embedding = model.encode(context,show_progress_bar=False)
            DBpedia_Embeddings.append(current_embedding)  
        
        offset += limit
              
    except Exception as e:
        print(f"Error while creating embeddings/extracting entities: {e}")

# PARALLEL

In [None]:
def fetch_entities(offset, limit):
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    query = f"""
    SELECT ?entity ?label ?abstract WHERE {{
        ?entity a dbo:Person .
        OPTIONAL {{ ?entity rdfs:label ?label . FILTER (lang(?label) = "en") }}
        OPTIONAL {{ ?entity dbo:abstract ?abstract . FILTER (lang(?abstract) = "en") }}
    }}
    LIMIT {limit} OFFSET {offset}
    """
    
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    
    results = sparql.query().convert()
    
    fetched_data = []
    for result in results["results"]["bindings"]:
        label = result["label"]["value"] if "label" in result else ""
        abstract = result["abstract"]["value"] if "abstract" in result else ""
        context = label + ": " + abstract
        current_embedding = model.encode(context, show_progress_bar=False)
        
        entry = {
            "uri": result["entity"]["value"],
            "label": result["label"]["value"] if "label" in result else "",
            "abstract": result["abstract"]["value"] if "abstract" in result else "",
            "embedding": current_embedding
        }
        fetched_data.append(entry)
    return fetched_data


In [None]:
DBpedia_Database= []

In [None]:
limit = 100  # Adjust the batch size as needed
total_entities = 2200000  # Total number of entities to fetch

with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    # Calculate offsets for pagination
    offsets = range(0, total_entities, limit)
    
    # Submit tasks for each offset to fetch entities
    future_to_offset = {executor.submit(fetch_entities, offset, limit): offset for offset in offsets}
    
    for future in concurrent.futures.as_completed(future_to_offset):
        try:
            entities = future.result()
            # Extend DBpedia_Database with fetched entities
            DBpedia_Database.extend(entities)
            # Process fetched entities as needed
            for entity in entities:
                # Access entity["uri"], entity["label"], entity["abstract"], entity["embedding"]
                print(entity["uri"], entity["label"], entity["abstract"], entity["embedding"])
        except Exception as e:
            print(f"Error fetching entities: {e}")

In [None]:
print(len(DBPedia_Information))

In [None]:
import numpy as np
import faiss

embeddings = np.random.rand(2300000, 768).astype('float32')
names = ['Embedding_' + str(i) for i in range(2300000)] 

index = faiss.IndexFlatL2(768)

index.add(embeddings)

query_embedding = np.random.rand(1, 768).astype('float32')  
k = 5 

In [None]:
k=1
distances, indices = index.search(query_embedding, k)

# Step 4: Display results
print("Query Embedding:")
print(query_embedding)
print("\nNearest Neighbors:")
for i in range(k):
    print(f"Name: {names[indices[0][i]]}, Distance: {distances[0][i]}")

# Extracting Entities

In [None]:
DBpedia_Entities = []

In [None]:
sparql = SPARQLWrapper("http://dbpedia.org/sparql")
sparql.setReturnFormat(JSON)

offset = 298000
limit = 2000

total_entities = 2293328  # Total number of entities in DBpedia
progress_bar = tqdm(total=total_entities, desc="Fetching Entities", unit=" entities")

# Open a text file for writing
with open('dbpedia_entities.txt', 'a', encoding='utf-8') as file:
    while True:
        # Set the SPARQL query with limit and offset
        query = f"""
        SELECT DISTINCT ?entity ?label (COALESCE(?abstract_en, "") AS ?abstract) WHERE {{
          ?entity a dbo:Person .
          OPTIONAL {{ 
            ?entity rdfs:label ?label .
            FILTER(LANG(?label) = 'en')
          }}
          OPTIONAL {{ 
            ?entity dbo:abstract ?abstract_en .
            FILTER(LANG(?abstract_en) = 'en')
          }}
        }}
        LIMIT {limit} OFFSET {offset}
        """
        
        sparql.setQuery(query)
        
        # Execute the query
        results = sparql.query().convert()
        
        # Check if results are returned
        if len(results["results"]["bindings"]) == 0:
            break  # No more results
        
        # Process each result and write to the file
        for result in results["results"]["bindings"]:
            entity_uri = result["entity"]["value"]
            label = result["label"]["value"] if "label" in result else ""
            abstract = result["abstract"]["value"] if "abstract" in result else ""
            
            # Write entity information to the file
            file.write(f"Entity URI: {entity_uri}\n")
            file.write(f"Label: {label}\n")
            file.write(f"Abstract: {abstract}\n")
            file.write("\n")
            
            # Create dictionary for the entry
            entry = {
                "entity_uri": entity_uri,
                "label": label,
                "abstract": abstract
            }
            
            # Append entry to the list
            DBpedia_Entities.append(entry)
            
            # Update progress bar
            progress_bar.update(1)
        
        # Increment the offset for the next iteration
        offset += limit

# Close the progress bar
progress_bar.close()


In [None]:
print(offset)

In [None]:
print(DBpedia_Entities)