In [1]:
import spacy
from spacy.tokens import Doc, Span
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('dbpedia_spotlight', config={'confidence': 0.5})
preprocessing = spacy.load('en_core_web_sm')
from fastcoref import LingMessCoref
coref_model = LingMessCoref()
from SPARQLWrapper import SPARQLWrapper, JSON
from datasets import load_dataset
dataset = load_dataset("potsawee/wiki_bio_gpt3_hallucination")
import os
from tqdm.notebook import tqdm
import concurrent.futures
import re
import requests
import pandas as pd

07/02/2024 16:20:50 - INFO - 	 missing_keys: []
07/02/2024 16:20:50 - INFO - 	 unexpected_keys: []
07/02/2024 16:20:50 - INFO - 	 mismatched_keys: []
07/02/2024 16:20:50 - INFO - 	 error_msgs: []
07/02/2024 16:20:50 - INFO - 	 Model Parameters: 590.0M, Transformer: 434.6M, Coref head: 155.4M


In [2]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-mpnet-base-v2")

07/02/2024 16:21:20 - INFO - 	 Use pytorch device_name: mps
07/02/2024 16:21:20 - INFO - 	 Load pretrained SentenceTransformer: all-mpnet-base-v2


In [3]:
import numpy as np
from urllib.parse import urlparse

# Coreference Resolution

In [5]:

def get_cluster_spans(doc, clusters):
    fast_clusters = []
    for cluster in clusters:
        new_group = []
        for start, end in cluster:
            span = doc.char_span(start, end)
            if span is not None:
                new_group.append([span.start, span.end - 1])
        fast_clusters.append(new_group)
    return fast_clusters

def get_clusters(doc, text):
    preds = coref_model.predict(texts=[text])
    # print(f"\nThe clusters of same entities are as follows: {preds[0].get_clusters(as_strings=True)} \n")
    clusters = preds[0].get_clusters(as_strings=False)
    cluster_spans = get_cluster_spans(doc, clusters)
    return cluster_spans

def get_span_noun_indices(doc, cluster):    
    spans = [doc[start:end+1] for start, end in cluster]

    spans_pos = []
    for span in spans:
        pos_tags = [token.pos_ for token in span]
        spans_pos.append(pos_tags)

    noun_indices = []
    for i, pos_list in enumerate(spans_pos):
        if 'NOUN' in pos_list or 'PROPN' in pos_list:
            noun_indices.append(i)
    return noun_indices

def get_cluster_head(doc, cluster, noun_indices):
    head_idx = noun_indices[0]
    head_start, head_end = cluster[head_idx]
    head_span = doc[head_start:head_end+1]
    return head_span, [head_start, head_end]

def is_containing_other_spans(span, all_spans):
    for s in all_spans:
        if s[0] >= span[0] and s[1] <= span[1] and s != span:
            return True  
    return False

def replacement(coref, resolved, mention_span):
    start, end = coref
    mention_text = mention_span.text_with_ws + " "
    resolved[start] = mention_text
    for i in range(start + 1, end + 1):
        resolved[i] = ""
    return resolved

def replace_corefs(document, clusters):
    resolved = [token.text_with_ws for token in document]
    all_spans = [span for cluster in clusters for span in cluster]

    for cluster in clusters:
        noun_indices = get_span_noun_indices(document, cluster)

        if noun_indices:
            mention_span, mention = get_cluster_head(document, cluster, noun_indices)
        else:
            start, end = cluster[0]
            mention_span = document[start:end+1]
            mention = cluster[0]
            
        for coref in cluster:
            resolved = replacement(coref, resolved, mention_span)

    
    return ("".join(resolved))


def coreference_resolution(text):
    doc = nlp(text)
    clusters = get_clusters(doc, text) 
    answer= replace_corefs(doc, clusters) 
    return answer

# Pre-Processing

In [6]:
def is_three_word_name(entity):
    return len(entity.text.split()) >= 3 and entity.label_ == "PERSON"

In [7]:
def replace_three_worded_names(text):
    doc = preprocessing(text)
    new_text = text
    
    for entity in doc.ents:
        if is_three_word_name(entity):
            words = entity.text.split()
            new_name = f"{words[0]} {words[-1]}"
            new_text = new_text.replace(entity.text, new_name)
    return new_text

In [8]:
def preprocess_text(text): 
    # text = replace_three_worded_names(text) 
    text = re.sub(r'[^\w\s.,()\'"\-]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r"(['\"])\1+", r"\1", text)
    # preprocessed_text = coreference_resolution(text)
    return text

# Spotlight Based System

## Sentence Based Link Making

In [None]:
def get_sentence_based_links(text):
    final_text = coreference_resolution(text)
    doc = nlp(final_text)
    final_sents = [sents for sents in doc.sents]
    # entities = list(doc.ents)
    # print("Entities found by spaCy:", entities)
    
    subjects = []
    subject_set = set([])
    for sent in doc.sents:
        found_subject = False
        for token in sent:
            # print(f"{token},{token.dep_}")
            if token.dep_ in ['nsubj', 'nsubjpass'] and token.ent_kb_id_:
                subjects.append(token.ent_kb_id_)
                subject_set.add(token.ent_kb_id_)
                found_subject = True
                break

        if not found_subject:
            subjects.append(None)
    ("Subjects identified:", subjects)
    
    sentence_forms = []
    for sent in doc.sents:
        entities = [(ent.text, ent.start, ent.end) for ent in sent.ents]
        sentence_forms.append(entities)
    # print("Entities in each sentence:", sentence_forms)
    
    # print(subjects)
    
    pairs = []
    count = 0
    for i in range(len(sentence_forms)):
        tmp_storage = []
        if subjects[i] is not None and len(sentence_forms[i]) > 1:
            for entity, ent_start, ent_end in sentence_forms[i]:
                check_sub = False
                for token in doc[ent_start:ent_end]:
                    if token.dep_ in ['nsubj', 'nsubjpass']:
                        check_sub = True
                        break
                if check_sub == True:
                    continue
                tmp_storage.append([subjects[i], entity])
                count += 1
        pairs.append(tmp_storage)
    print(f"The number of pairs is: {count}\n")
    
    return pairs, final_sents, subject_set

In [None]:
def get_sentence_based_links(text):
    final_text = coreference_resolution(text)
    doc = nlp(final_text)
    final_sents = [sents for sents in doc.sents]
    # entities = list(doc.ents)
    # print("Entities found by spaCy:", entities)
    
    subjects = []
    subject_set = set([])
    for sent in doc.sents:
        found_subject = False
        for token in sent:
            # print(f"{token},{token.dep_}")
            if token.dep_ in ['nsubj', 'nsubjpass'] and token.ent_kb_id_:
                subjects.append(token.ent_kb_id_)
                subject_set.add(token.ent_kb_id_)
                found_subject = True
                break

        if not found_subject:
            subjects.append(None)
    ("Subjects identified:", subjects)
    
    new_doc = preprocessing(final_text)
    sentence_forms = []
    for sent in new_doc.sents:
        entities = [(ent.text, ent.start, ent.end) for ent in sent.ents]
        sentence_forms.append(entities)
    
    # print("Entities in each sentence:", sentence_forms)
    
    # print(subjects)
    
    pairs = []
    count = 0
    for i in range(len(sentence_forms)):
        tmp_storage = []
        if subjects[i] is not None and len(sentence_forms[i]) > 1:
            for entity, ent_start, ent_end in sentence_forms[i]:
                check_sub = False
                for token in new_doc[ent_start:ent_end]:
                    # print(f"{token} and {token.dep_}")
                    if token.dep_ in ['nsubj', 'nsubjpass']:
                        check_sub = True
                        break
                if check_sub == True:
                    continue
                tmp_storage.append([subjects[i], entity])
                count += 1
        pairs.append(tmp_storage)
    print(f"The number of pairs is: {count}\n")
    
    return pairs, final_sents, subject_set

In [None]:
text = """Wilhelm Windelband (May 11, 1848 - October 22, 1915) was a German philosopher of the Baden School. Wilhelm Windelband is now mainly remembered for the terms "nomothetic" and "idiographic", which Wilhelm Windelband introduced. the terms "nomothetic" and "idiographic", which he introducedhave currency in psychology and other areas, though not necessarily in line with Wilhelm Windelband original meanings. Wilhelm Windelband was a Neo-Kantian who protested other Neo-Kantians of Wilhelm Windelband time and maintained that "to understand Kant rightly means to go beyond Kant ". Against Wilhelm Windelband positivist contemporaries, Wilhelm Windelband argued that philosophy should engage in humanistic dialogue with the natural sciences rather than uncritically appropriating the natural sciences methodologies. Wilhelm Windelband interests in psychology and cultural sciences represented an opposition to psychologism and historicism schools by a critical philosophic system. Wilhelm Windelband relied in Wilhelm Windelband effort to reach beyond Kant on such philosophers as Georg Wilhelm Friedrich Hegel, Johann Friedrich Herbart, and Hermann Lotze. Closely associated with Wilhelm Windelband was Heinrich Rickert. Wilhelm Windelband disciples were not only noted philosophers, but sociologists like Max Weber and theologians like Ernst Troeltsch and Albert Schweitzer."""
pairs, final_sents, subject_set = (get_sentence_based_links(text))
print(pairs)

## Making Direct Subject Link Embeddings

In [None]:
def subjects_direct_links(subjects_set):
    subject_direct_dict = {}
    for subj in subjects_set:
        sparql = SPARQLWrapper("https://dbpedia.org/sparql")
        sparql.setMethod('POST')  

        # Dynamically insert the subject into the query
        query_source_to_target = f"""
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX dbo: <http://dbpedia.org/ontology/>
        PREFIX dbr: <http://dbpedia.org/resource/>

        SELECT DISTINCT ?label
        WHERE {{
        {{
            <{subj}> ?property ?entity .
            ?entity rdfs:label ?label .
        }} UNION {{
            ?entity ?property <{subj}> .
            ?entity rdfs:label ?label .
        }}
        FILTER (lang(?label) = "en")
        }}
        """
        sparql.setQuery(query_source_to_target)
        sparql.setReturnFormat(JSON)

        try:
            result = sparql.query().convert()
            tmp = [item['label']['value'] for item in result["results"]["bindings"]]
            subject_direct_dict[subj] = model.encode(tmp,show_progress_bar=False)
        except Exception as e:
            print(f"Query for {subj} didn't work: {e}")

    return subject_direct_dict


### Testing

In [None]:
# Example usage
result_dict = subjects_direct_links(subject_set)
print(result_dict)

In [None]:
word = "Psychologist"
entity_embedding = model.encode(word)
embeddings = result_dict['http://dbpedia.org/resource/Wilhelm_Windelband']
cosine_similarities = np.dot(embeddings, entity_embedding)
print(np.argmax(cosine_similarities))
cosine_similarities[np.argmax(cosine_similarities)]

## Checking the Direct Links

In [None]:
def check_direct_link(source_target):
    source_uri, target_uri = source_target
    sparql = SPARQLWrapper("https://dbpedia.org/sparql")
    sparql.setMethod('POST')  
    
    query_source_to_target = f"""
    ASK WHERE {{
      <{source_uri}> ?p <{target_uri}> .
    }}
    """
    sparql.setQuery(query_source_to_target)
    sparql.setReturnFormat(JSON)

    try:
        result_source_to_target = sparql.query().convert()
        has_link_source_to_target = result_source_to_target['boolean']
    except Exception as e:
        print(f"Error querying {source_uri} -> {target_uri}: {e}")
        has_link_source_to_target = False

    # Query from target to source
    query_target_to_source = f"""
    ASK WHERE {{
      <{target_uri}> ?p <{source_uri}> .
    }}
    """
    sparql.setQuery(query_target_to_source)

    try:
        result_target_to_source = sparql.query().convert()
        has_link_target_to_source = result_target_to_source['boolean']
    except Exception as e:
        print(f"Error querying {target_uri} -> {source_uri}: {e}")
        has_link_target_to_source = False

    # Combine the results
    has_link = has_link_source_to_target or has_link_target_to_source
    return source_uri, target_uri, has_link

In [None]:
print(check_direct_link(["http://dbpedia.org/resource/John_Russell_Reynolds","http://dbpedia.org/resource/Judge"])[2])

## Scoring

In [None]:
def scoring_linear_version(text):
    pairs,final_sents,subject_set = get_sentence_based_links(text)
    subject_dict = subjects_direct_links(subject_set)
    
    if not pairs:
        print("No entity pairs found here.")
        return 0,[],final_sents
    
    fractions = []
    pair_and_values = []
    
    for sent_sets in pairs:
        score = 0
        for pair in sent_sets:
            if check_direct_link(pair)[2]:
                score+=1
                pair_and_values.append([pair[0],pair[1],1])
            else:
                local_subject= pair[0]
                entity_link = pair[1]
                last_part = entity_link.split('/')[-1]
                entity = last_part.replace('_', ' ')
                print(entity)
                entity_embeddings = model.encode(entity,show_progress_bar=False)
                embeddings = subject_dict[local_subject]
                cosine_similarities = np.dot(embeddings, entity_embeddings)
                max_similarity = np.max(cosine_similarities)
                print(max_similarity)
                if max_similarity>0.65:
                    score+=1
                    pair_and_values.append([pair[0],pair[1],1])
                else:
                    pair_and_values.append([pair[0],pair[1],0])
        if len(sent_sets)==0:
            fractions.append(-1)
        else:
            fractions.append(score/len(sent_sets))
    
    return fractions, pair_and_values, final_sents

In [None]:
def process_sentence_set(args):
    index, sent_sets, subject_dict = args
    score = 0
    local_pair_and_values = []
    for pair in sent_sets:
        _, _, is_direct = check_direct_link(pair)
        if is_direct:
            score += 1
            local_pair_and_values.append([pair[0], pair[1], 1])
        else:
            local_subject= pair[0]
            entity_link = pair[1]
            last_part = entity_link.split('/')[-1]
            entity = last_part.replace('_', ' ')
            entity_embeddings = model.encode(entity,show_progress_bar=False)
            embeddings = subject_dict[local_subject]
            
            if embeddings.size == 0:
                local_pair_and_values.append([pair[0], pair[1], 0])
                continue
                
            
            cosine_similarities = np.dot(embeddings, entity_embeddings)
            max_similarity = np.max(cosine_similarities)
            if max_similarity>0.65:
                score+=1
                local_pair_and_values.append([pair[0],pair[1],1])
            else:
                local_pair_and_values.append([pair[0],pair[1],0])
    fraction = score / len(sent_sets) if len(sent_sets) > 0 else -1
    return index, fraction, local_pair_and_values

def scoring_parallel_version(text):
    pairs, final_sents, subject_set = get_sentence_based_links(text)
    subject_dict = subjects_direct_links(subject_set)

    if not pairs:
        print("No entity pairs found here.")
        return 0, [], final_sents

    fractions = [None] * len(pairs)
    pair_and_values = [None] * len(pairs)

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # Prepare the arguments for each task
        task_args = [(index, sent_sets, subject_dict) for index, sent_sets in enumerate(pairs)]
        # Process each sentence set in parallel
        results = list(executor.map(process_sentence_set, task_args))

    # Place results back into the correct order
    for index, fraction, local_pair_and_values in results:
        fractions[index] = fraction
        pair_and_values[index] = local_pair_and_values

    # Flatten pair_and_values list
    flat_pair_and_values = [item for sublist in pair_and_values for item in sublist]

    return fractions, flat_pair_and_values, final_sents


In [None]:
# #test =  ["John Russell Reynolds (1820–1876) was an English lawyer, judge, and author.", "He was born in London, the son of a barrister, and was educated at Eton College and Trinity College, Cambridge.", "He was called to the bar in 1845, and became a Queen's Counsel in 1859.", "He was appointed a judge of the Court of Common Pleas in 1867, and was knighted in 1871.", "Reynolds was a prolific author, writing on a wide range of topics.", "He wrote several books on legal topics, including The Law of Libel and Slander (1863), The Law of Copyright (1865), and The Law of Patents for Inventions (1868).", "He also wrote on a variety of other topics, including history, biography, and literature.", "He was a frequent contributor to the Saturday Review, and wrote several books on Shakespeare, including The Mystery of William Shakespeare (1848) and The Authorship of Shakespeare (1875).", "He also wrote a biography of the poet John Keats (1848)." ]
# test = [ "Gordon David Strachan (born 9 February 1957) is a Scottish football manager and former player.", "He is the manager of the Scotland national team.", "Strachan played for Dundee, Aberdeen, Manchester United, Leeds United and Coventry City, as well as the Scotland national team.", "He has also managed Coventry City, Southampton, Celtic and Middlesbrough.", "Strachan began his managerial career at Coventry City in 1996, leading them to the 1997 FA Cup Final, where they lost to Tottenham Hotspur.", "He then moved to Southampton in 2001, where he guided them to the 2003 FA Cup Final, which they lost to Arsenal.", "In 2005, he was appointed manager of Celtic, where he won three consecutive Scottish Premier League titles and the Scottish League Cup twice.", "He left Celtic in 2009 and was appointed manager of Middlesbrough in October 2010.", "He left Middlesbrough in October 2013.", "In January 2013, Strachan was appointed manager of the Scotland national team.", "He has since led Scotland to the UEFA Euro 2016 qualifying playoffs, where they were eliminated by eventual finalists, and to the 2018 FIFA World Cup" ]
# text = ""
# for i in test:
#    text += " " + i
# print(text)
print(scoring_parallel_version(text))

# print(end_time - start_time)


# Experimentation

In [None]:
accurate = []
minor_inaccurate = []
major_inaccurate = []

In [None]:
folder_name = "Output"

if not os.path.exists(folder_name):
        os.makedirs(folder_name)

for i in tqdm(range(28,100), desc="Processing entries", unit="entry"):
    list_of_sentences = (dataset["evaluation"][i]["gpt3_sentences"])
    sentences = ''''''
    for s in list_of_sentences:
        tmp = preprocess_text(s)
        if tmp[-1]!='.':
            tmp+='.'
        sentences = sentences + tmp + "\n" 
    ground_truth_doc = preprocessing(dataset["evaluation"][i]["wiki_bio_text"])
    ground_truth = ''''''
    for sent in ground_truth_doc.sents:
        temp = preprocess_text(sent.text)
        if temp[-1]!='.':
            temp+='.'
        ground_truth = ground_truth + temp + "\n" 
    annotation = dataset["evaluation"][i]["annotation"]
    
    sentences_scores, sentence_pairs_and_values , sentence_coref_sents= scoring_parallel_version(sentences)
    ground_truth_scores, ground_pairs_and_values, ground_coref_sents = scoring_parallel_version(ground_truth)
        
    filename = os.path.join(folder_name, f"entry_{i+1}.txt")
    with open(filename, 'w') as file:
        file.write("#############SENTENCE_PAIRS############\n\n")
        for x in range(len(sentence_pairs_and_values)):
            file.write(f"{sentence_pairs_and_values[x][0]} and {sentence_pairs_and_values[x][1]} and the value is : {sentence_pairs_and_values[x][2]}\n")
        file.write("\n")
        file.write("#############GROUND_PAIRS############\n\n")
        for x in range(len(ground_pairs_and_values)):
            file.write(f"{ground_pairs_and_values[x][0]} and {ground_pairs_and_values[x][1]} and the value is : {ground_pairs_and_values[x][2]}\n")
        file.write("\n\n")
        file.write("%%%%%%%%%%%%%%%%%%%%SENTENCES%%%%%%%%%%%%%%%%%\n")
        file.write(f"Sentences : \n\n{sentences}\n\n")
        file.write(f"Coref Resolved: \n\n")
        for y,x in enumerate(sentence_coref_sents):
            file.write(f"{y}. {x}")
        file.write("\n")
        file.write("\n")
        file.write("%%%%%%%%%%%%%%%%%%%%GROUND_TRUTH%%%%%%%%%%%%%%%\n")
        file.write(f"Ground Truth : \n\n{ground_truth} \n\n")
        file.write(f"Coref Resolved: \n\n")
        for y,x in enumerate(ground_coref_sents):
            file.write(f"{y}. {x}")
        file.write("\n")
        file.write("%%%%%%%%%%%%%%%%%%%%FRACTIONS%%%%%%%%%%%%%%%%%%\n")
        file.write(f"Value for sentences is : \n")
        for x in sentences_scores:
            file.write(f"{x} ")
        file.write("\n")
        file.write(f"Value for ground truth is : \n")
        for x in ground_truth_scores:
            file.write(f"{x} ")
        file.write("\n\n")
        file.write("%%%%%%%%%%%%%%%%%%%%ANNOTATIONS%%%%%%%%%%%%%%%%\n")
        for x in annotation:
            file.write(f"{x} ")
        file.write("\n\n")
        
    suma = 0
    leng = 0
    for x in ground_truth_scores:
        if x!=-1:
            suma+=x
            leng+=1
    
    if leng>0:
        print(f"The ground truth scores are: {suma/leng}\n")

    if (leng)>0 and (suma/leng)>0.25:
        if len(sentences_scores)!=len(annotation):
            continue
        for scores in ground_truth_scores:
            if scores==-1:
                continue
            else:
                accurate.append(scores)
        for t,score in enumerate(sentences_scores):
            if score==-1:
                continue
            if annotation[t-1]=="accurate":
                accurate.append(score) 
            elif annotation[t-1]=="minor_inaccurate":
                minor_inaccurate.append(score)
            elif annotation[t-1]=="major_inaccurate":
                major_inaccurate.append(score)

In [None]:
sum(accurate)/len(accurate)

In [None]:
len(accurate)

In [None]:
sum(minor_inaccurate)/len(minor_inaccurate)

In [None]:
len(minor_inaccurate)

In [None]:
sum(major_inaccurate)/len(major_inaccurate)

In [None]:
len(major_inaccurate)

In [None]:
text = "Traded to the San Diego Chargers, Aldridge played two seasons in San Diego before retiring from professional football in 1973."

In [None]:
doc = preprocessing(text)

In [None]:
for ents in doc.ents:
    print(f"{ents} __ {ents.label_}")

In [None]:
import spacy
import requests

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Define the sentence
sentence = "Albert Einstein was a theoretical physicist who developed the theory of relativity."

# Parse the sentence using spaCy
doc = nlp(sentence)

# Extract subjects from the sentence
subjects = [token.text for token in doc if token.dep_ == "nsubj"]

print("Extracted Subjects:", subjects)

def link_to_dbpedia(subject):
    spotlight_url = "https://api.dbpedia-spotlight.org/en/annotate"
    headers = {"accept": "application/json"}
    params = {"text": subject, "confidence": 0.5}

    response = requests.get(spotlight_url, headers=headers, params=params)
    if response.status_code == 200:
        data = response.json()
        if 'Resources' in data:
            resources = data['Resources']
            return resources[0]['@URI']
    return None

# Link each extracted subject to a DBpedia entry
for subject in subjects:
    dbpedia_uri = link_to_dbpedia(subject)
    if dbpedia_uri:
        print(f"Subject: {subject} -> DBpedia URI: {dbpedia_uri}")
    else:
        print(f"Subject: {subject} -> No DBpedia URI found")


# Wikipedia Based System

Wikipediaapi Doesnt work ->

In [None]:
import wikipediaapi

def search_wikipedia(query):
    wiki_wiki = wikipediaapi.Wikipedia('en')  # Initialize Wikipedia wrapper for English

    # Search for the query
    search_result = wiki_wiki.page(query)

    if search_result.exists():
        print(f"Title: {search_result.title}")
        print(f"Summary: {search_result.summary}")
    else:
        print(f"Page for '{query}' does not exist.")

# Example usage
search_query = "Artificial intelligence"
search_wikipedia(search_query)

Switched to wikipedia

In [None]:
pip install wikipedia

In [9]:
import wikipedia
 
# getting suggestions
result = wikipedia.search("Wilhelm Windelband interests", results = 1)
for results in result:
    print(wikipedia.summary(result))
    page = wikipedia.page(result)
    print("\n")
    for link in page.links:
            print(link)
# printing the result
print(result)

Wilhelm Windelband (; German: [ˈvɪndl̩bant]; 11 May 1848 – 22 October 1915) was a German philosopher of the Baden School.


19th-century philosophy
A priori and a posteriori
Albert Schweitzer
Alfred North Whitehead
Analytic–synthetic distinction
Anti-realism
Archive.org
Auguste Comte
Baden School
Bas van Fraassen
Bertrand Russell
C. D. Broad
Carl Gustav Hempel
Causality
Charles Sanders Peirce
Coherentism
Commensurability (philosophy of science)
Confirmation holism
Consilience
Construct (philosophy)
Constructive empiricism
Constructive realism
Constructivist epistemology
Contextualism
Conventionalism
Creative synthesis
Criticism of science
David Hume
Deductive-nomological model
Demarcation problem
Descriptive research
Determinism
Doctoral advisor
Empirical evidence
Empiricism
Epistemological anarchism
Epistemology
Ernst Troeltsch
Evidence-based practice
Evolutionism
Explanatory power
Fact
Faith and rationality
Fallibilism
Falsifiability
Feminist method
Foundationalism
Francis Bacon
Fred

In [10]:
def extract_compound_subject_span(sent):
    compound_subject = []

    for i, token in enumerate(sent):
        # print(f"{token} and {token.dep_}")
        if token.dep_ in ["nsubj","nsubjpass"]:
            compound_subject.append(token.text)
            for j in range(i - 1, -1, -1):
                if sent[j].dep_ == "compound":
                    # print(f"{sent[j]} and {sent[j].dep_}")
                    compound_subject.insert(0, sent[j].text) 
                else:
                    break  
            break
    if compound_subject == []:
        for i, token in enumerate(sent):
            # print(f"{token} and {token.dep_}")
            if token.dep_ in ["dobj","iobj","pobj"]:
                compound_subject.append(token.text)
                for j in range(i - 1, -1, -1):
                    if sent[j].dep_ == "compound":
                        # print(f"{sent[j]} and {sent[j].dep_}")
                        compound_subject.insert(0, sent[j].text) 
                    else:
                        break  
                break

    return " ".join(compound_subject) if compound_subject else None

In [11]:
text ="""After the family moved to Kilburn, Tommy Nutter and Tommy Nutter brother David attended Willesden Technical College."""
text = """Windelband relied in his effort to reach beyond Kant on such philosophers as Georg Wilhelm Friedrich Hegel, Johann Friedrich Herbart, and Hermann Lotze."""

text = coreference_resolution(text)
input = preprocessing(text)

# for token in input:
#     print(f"{token} and {token.dep_}")
# for sent in input.sents:
#     print(extract_compound_subject_span(sent))
for entities in input.ents:
    print(entities)

07/02/2024 16:22:45 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 16:22:45 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

Windelband
Windelband
Kant
Georg Wilhelm Friedrich Hegel
Johann Friedrich Herbart
Hermann Lotze


In [54]:
def get_sentence_based_links_wiki(text):
    # print(text)
    t1 = coreference_resolution(text)
    final_text = preprocess_text(t1)
    
    # print(final_text)
    
    doc = nlp(final_text)
    final_sents = [sents for sents in doc.sents]
    
    subjects = []
    subject_set = set([])
    subjects_with_context = []
    for sent in doc.sents:
        tmp_subj = extract_compound_subject_span(sent)
        subjects.append(tmp_subj)
        if tmp_subj!=None:
            if tmp_subj not in subject_set:
                subject_set.add(tmp_subj)
                subjects_with_context.append([tmp_subj,sent.text])
    ("Subjects identified:", subjects)
    
    sentence_forms = []
    for sent in doc.sents:
        # print(sent)
        entities = [ent.text for ent in sent.ents]
        # print(entities)
        sentence_forms.append(entities)
    # print("Entities in each sentence:", sentence_forms)
    
    # print(subjects)
    
    pairs = []
    count = 0
    for i in range(len(sentence_forms)):
        tmp_storage = []
        if subjects[i] is not None and len(sentence_forms[i]) > 1:
            for entity in sentence_forms[i]:
                if entity not in subjects:
                    tmp_storage.append([subjects[i], entity])
                    count += 1
        pairs.append(tmp_storage)
    print(f"The number of pairs is: {count}\n")
    
    return pairs, final_sents, subject_set, subjects_with_context

In [55]:
text ="""Richard Keith Mahler (August 5, 1953 in Austin, Texas - March 2, 2005 in Jupiter, Florida) was a starting pitcher in Major League Baseball who played for the Atlanta Braves (1979-1988, 1991), Cincinnati Reds (1989-1990) and Montreal Expos (1991).
His brother Mickey was also a Major League pitcher, with the two being teammates in 1979.
The two had previously been teammates playing for the Triple-A Richmond Braves.
In his 13-year career, Mahler posted a 96-111 record with 952 strikeouts and a 3.99 ERA in 1951.1 innings.
Born in Austin, Texas, Mahler graduated from John Jay High School and then attended Trinity University, both in San Antonio, Texas.
After being signed by the Braves as an amateur free agent in 1975, he made his debut in the 1979 season.
Mahler started on Opening Day for the Braves in 1982, when Atlanta won the National League West title.
He made four straight Opening Day starts beginning in 1985.
In 1987, he tied an NL record with his third Opening Day shutout.
His best season came in 1985, when he went 17-15 with a 3.48 ERA.
He pitched twice in the postseason, with the Braves in 1982, and with the 1990 World Series champion Cincinnati Reds.
Mahler was a key member of that Reds' pitching staff as a spot starter and reliever, going 7-6 and contributing four saves.
He also appeared in ten games with the Montreal Expos in 1991 before returning to Atlanta in mid-season.
After retiring, Mahler served as a minor league pitching coach for the Kansas City Royals and the Florida Marlins, and he was a roving instructor for the St. Louis Cardinals.
He also managed St. Louis' Double-A affiliate in the Texas League from 1996 to 1997.
Mahler died at age 51 of a heart attack at home in Jupiter, Florida, where he was preparing for his second season as a minor league pitching coach for the New York Mets.
He was survived by his wife, Sheryl, and five children Ricky, Robby, Timothy, Tyler and Shannon."""

pairs, final_sents, subject_set, subjects_with_context = (get_sentence_based_links_wiki(text))
print(pairs)
print(final_sents)
print(subject_set)

07/02/2024 17:06:04 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:06:04 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 95

[[['Richard Keith Mahler', 'Richard Keith'], ['Richard Keith Mahler', 'Mahler'], ['Richard Keith Mahler', 'Austin, Texas'], ['Richard Keith Mahler', 'Texas'], ['Richard Keith Mahler', 'Jupiter, Florida'], ['Richard Keith Mahler', 'starting pitcher'], ['Richard Keith Mahler', 'Major League Baseball'], ['Richard Keith Mahler', 'Braves'], ['Richard Keith Mahler', 'Reds'], ['Richard Keith Mahler', 'Montreal Expos']], [['Richard Keith Mahler brother Mickey', 'Richard Keith'], ['Richard Keith Mahler brother Mickey', 'Mahler'], ['Richard Keith Mahler brother Mickey', 'Major League Baseball'], ['Richard Keith Mahler brother Mickey', 'pitcher']], [['two', 'Triple-A'], ['two', 'Braves']], [['Richard Keith Mahler', 'Richard Keith'], ['Richard Keith Mahler', 'Mahler'], ['Richard Keith Mahler', 'Richard Keith'], ['Richard Keith Mahler', 'Mahler'], ['Richard Keith Mahler', 'ERA']], [['Austin', 'Austin, Texas'], ['Austin', 'Richard Keith'], ['Austin', 'Mahler'], ['Austin',

In [80]:
def subjects_direct_links_wiki(subject_with_context):
    
    subject_direct_dict = {}
    subject_direct_word_dict = {}
    
    result_set = set([])
    result_dict = {}
    
    for pair in subject_with_context:
        subject = pair[0]
        context = pair[1]
        
        try:
            subj = subject + ": " + context
            if len(subj) > 250:
                subj = subj[:250]  # Truncate to 250 characters if it exceeds 300
                # print(f"Truncated query: {subj}")
            
            result = wikipedia.search(subj, results = 1)
            # print(result)
            for results in result:
                if results in result_set:
                    subject_direct_dict[subject] = result_dict[results]
                else:
                    page = wikipedia.page(results)
                    tmp = [links for links in page.links]
                    subject_direct_word_dict[subject] = tmp
                    subject_direct_dict[subject] = model.encode(tmp,show_progress_bar=False)
                    result_set.add(results)
                    result_dict[results] = subject_direct_dict[subject]
                    
            if not result:
                subject_direct_word_dict[subject] = []
                subject_direct_dict[subject] = np.array([],dtype="float32")
        except Exception as e:
            print(f"Query for {subj} didn't work: {e}")
            subject_direct_dict[subject] = np.array([],dtype="float32")

    return subject_direct_dict, subject_direct_word_dict


In [57]:
page = wikipedia.page("Rick Mahler")

PageError: Page id "rick miller" does not match any pages. Try another id!

In [90]:
import wikipedia
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed

def subjects_direct_links_wiki_fast(subject_with_context):
    subject_direct_dict = {}
    subject_direct_word_dict = {}
    result_set = set([])
    result_dict = {}
    
    def process_subject(pair):
        subject = pair[0]
        context = pair[1]
        try:
            subj = subject + ": " + context
            if len(subj) > 250:
                subj = subj[:250] 
                # print(f"Truncated query: {subj}")
            
            result = wikipedia.search(subj, results=1)
            for results in result:
                if results in result_set:
                    return subject, result_dict[results], []
                else:
                    page = wikipedia.page(results)
                    links = [link for link in page.links]
                    return subject, None, links
            return subject, None, []
        except Exception as e:
            print(f"Query for {subj} didn't work: {e}")
            return subject, None, []

    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_subject, pair) for pair in subject_with_context]
        for future in as_completed(futures):
            subject, encoded_data, links = future.result()
            if encoded_data is not None:
                subject_direct_dict[subject] = encoded_data
            else:
                if links:
                    encoded_links = model.encode(links, show_progress_bar=False)
                    subject_direct_word_dict[subject] = links
                    subject_direct_dict[subject] = encoded_links
                    result_set.add(subject)
                    result_dict[subject] = encoded_links
                else:
                    subject_direct_word_dict[subject] = []
                    subject_direct_dict[subject] = np.array([], dtype="float32")

    return subject_direct_dict, subject_direct_word_dict


In [91]:
subjects_with_context

[['Richard Keith Mahler',
  'Richard Keith Mahler (August 5, 1953 in Austin, Texas Texas - March 2, 2005 in Jupiter, Florida ) was a starting pitcher in Major League Baseball who played for the Atlanta Braves (1979 -1988, 1991), Cincinnati Reds (1989-1990) and Montreal Expos (1991).'],
 ['Richard Keith Mahler brother Mickey',
  'Richard Keith Mahler brother Mickey was also a Major League Baseball pitcher, with the two being teammates in 1979 .'],
 ['two',
  'the two had previously been teammates playing for the Triple-A Richmond Braves.'],
 ['Austin',
  'Born in Austin, Texas , Richard Keith Mahler graduated from John Jay High School and then attended Trinity University, both in San Antonio, Texas .'],
 ['season',
  'Richard Keith Mahler best season came in 1985 , when Richard Keith Mahler went 17-15 with a 3.48 ERA.'],
 ['Virat Kohli', 'Indian Cricketer']]

In [60]:
subjects_with_context.append(['Virat Kohli','Indian Cricketer'])

In [92]:
subjects_direct_links_wiki(subjects_with_context)


Query for Richard Keith Mahler brother Mickey: Richard Keith Mahler brother Mickey was also a Major League Baseball pitcher, with the two being teammates in 1979 . didn't work: Page id "rick miller" does not match any pages. Try another id!
Query for Austin: Born in Austin, Texas , Richard Keith Mahler graduated from John Jay High School and then attended Trinity University, both in San Antonio, Texas . didn't work: Page id "rick miller" does not match any pages. Try another id!
Query for season: Richard Keith Mahler best season came in 1985 , when Richard Keith Mahler went 17-15 with a 3.48 ERA. didn't work: Page id "rick miller" does not match any pages. Try another id!


({'Richard Keith Mahler': array([], dtype=float32),
  'Richard Keith Mahler brother Mickey': array([], dtype=float32),
  'two': array([[-0.02265206,  0.0010879 ,  0.01370007, ..., -0.032479  ,
           0.01720726,  0.00445213],
         [-0.0365615 ,  0.02709321,  0.02345949, ..., -0.01994519,
           0.03132329,  0.00293135],
         [-0.01592126,  0.05201155,  0.02858504, ..., -0.01279398,
           0.02660117,  0.00396994],
         ...,
         [-0.03668725, -0.00275293,  0.01194312, ...,  0.04677752,
          -0.01120493,  0.01350194],
         [ 0.01197165,  0.00140239,  0.02752407, ...,  0.0183214 ,
           0.01224163, -0.03842102],
         [ 0.00618787, -0.00136719, -0.03693965, ...,  0.00438008,
          -0.00909533,  0.00257993]], dtype=float32),
  'Austin': array([], dtype=float32),
  'season': array([], dtype=float32),
  'Virat Kohli': array([[-0.03902974, -0.06275151, -0.01127133, ...,  0.00084654,
           0.01985146,  0.02155641],
         [-0.04373354, -

In [93]:
subjects_direct_links_wiki_fast(subjects_with_context)


Query for season: Richard Keith Mahler best season came in 1985 , when Richard Keith Mahler went 17-15 with a 3.48 ERA. didn't work: Page id "rick miller" does not match any pages. Try another id!
Query for Austin: Born in Austin, Texas , Richard Keith Mahler graduated from John Jay High School and then attended Trinity University, both in San Antonio, Texas . didn't work: Page id "rick miller" does not match any pages. Try another id!
Query for Richard Keith Mahler brother Mickey: Richard Keith Mahler brother Mickey was also a Major League Baseball pitcher, with the two being teammates in 1979 . didn't work: Page id "rick miller" does not match any pages. Try another id!


({'Richard Keith Mahler': array([], dtype=float32),
  'season': array([], dtype=float32),
  'Austin': array([], dtype=float32),
  'Richard Keith Mahler brother Mickey': array([], dtype=float32),
  'Virat Kohli': array([[-0.03902974, -0.06275151, -0.01127133, ...,  0.00084654,
           0.01985146,  0.02155641],
         [-0.04373354, -0.05799465, -0.00368122, ..., -0.00078541,
           0.01103885,  0.01862861],
         [-0.03322043, -0.02028006,  0.01349401, ...,  0.02990761,
           0.00912451,  0.01396054],
         ...,
         [-0.00613717,  0.01893144,  0.00105132, ...,  0.0493106 ,
           0.01782053, -0.00718726],
         [ 0.03920937,  0.03370506, -0.00279895, ...,  0.04726687,
          -0.07020499,  0.00093569],
         [-0.00245601, -0.02317117, -0.03176381, ..., -0.01290523,
          -0.035393  ,  0.00284538]], dtype=float32),
  'two': array([[-0.02265206,  0.0010879 ,  0.01370007, ..., -0.032479  ,
           0.01720726,  0.00445213],
         [-0.0365615 ,  

In [94]:
def process_sentence_set_wiki(args):
    index, sent_sets, subject_dict = args
    score = 0
    local_pair_and_values = []
    for pair in sent_sets:
        local_subject= pair[0]
        entity = pair[1]

        entity_embeddings = model.encode(entity,show_progress_bar=False)
        embeddings = subject_dict[local_subject]
        
        if embeddings.size == 0:
            local_pair_and_values.append([pair[0], pair[1], -1])
            score -= 1
            continue
            
        cosine_similarities = np.dot(embeddings, entity_embeddings)
        max_similarity = np.max(cosine_similarities)
        if max_similarity>0.65:
            score+=1
            local_pair_and_values.append([pair[0],pair[1],1])
        else:
            local_pair_and_values.append([pair[0],pair[1],0])
    fraction = score / len(sent_sets) if len(sent_sets) > 0 else -1
    return index, fraction, local_pair_and_values

def scoring_parallel_version_wiki(text):
    # print(text)
    pairs, final_sents, subject_set, subjects_with_context = get_sentence_based_links_wiki(text)
    subject_dict, subject_word_dict = subjects_direct_links_wiki_fast(subjects_with_context)
    
    # print(subject_set)

    if not pairs:
        print("No entity pairs found here.")
        return 0, [], final_sents

    fractions = [None] * len(pairs)
    pair_and_values = [None] * len(pairs)

    with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
        task_args = [(index, sent_sets, subject_dict) for index, sent_sets in enumerate(pairs)]
        results = list(executor.map(process_sentence_set_wiki, task_args))

    for index, fraction, local_pair_and_values in results:
        fractions[index] = fraction
        pair_and_values[index] = local_pair_and_values

    flat_pair_and_values = [item for sublist in pair_and_values for item in sublist]
    return fractions, flat_pair_and_values, final_sents


In [95]:
text

"Richard Keith Mahler (August 5, 1953 in Austin, Texas - March 2, 2005 in Jupiter, Florida) was a starting pitcher in Major League Baseball who played for the Atlanta Braves (1979-1988, 1991), Cincinnati Reds (1989-1990) and Montreal Expos (1991).\nHis brother Mickey was also a Major League pitcher, with the two being teammates in 1979.\nThe two had previously been teammates playing for the Triple-A Richmond Braves.\nIn his 13-year career, Mahler posted a 96-111 record with 952 strikeouts and a 3.99 ERA in 1951.1 innings.\nBorn in Austin, Texas, Mahler graduated from John Jay High School and then attended Trinity University, both in San Antonio, Texas.\nAfter being signed by the Braves as an amateur free agent in 1975, he made his debut in the 1979 season.\nMahler started on Opening Day for the Braves in 1982, when Atlanta won the National League West title.\nHe made four straight Opening Day starts beginning in 1985.\nIn 1987, he tied an NL record with his third Opening Day shutout.\n

In [96]:
print(scoring_parallel_version_wiki(text))

07/03/2024 10:01:25 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/03/2024 10:01:26 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 95

Query for Richard Keith Mahler brother Mickey: Richard Keith Mahler brother Mickey was also a Major League Baseball pitcher, with the two being teammates in 1979 . didn't work: Page id "rick miller" does not match any pages. Try another id!
Query for season: Richard Keith Mahler best season came in 1985 , when Richard Keith Mahler went 17-15 with a 3.48 ERA. didn't work: Page id "rick miller" does not match any pages. Try another id!
Query for Austin: Born in Austin, Texas , Richard Keith Mahler graduated from John Jay High School and then attended Trinity University, both in San Antonio, Texas . didn't work: Page id "rick miller" does not match any pages. Try another id!
([-1.0, -1.0, 0.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0], [['Richard Keith Mahler', 'Richard Keith', -1], ['Richard Keith Mahler', 'Mahler', -1], ['Richard Keith Mahler', 'Austin, Texas', -1], ['Richard Keith Mahler', 'Texas', -1], ['Richard Kei

In [97]:
accurate = []
minor_inaccurate = []
major_inaccurate = []

In [79]:
folder_name = "Output"

if not os.path.exists(folder_name):
        os.makedirs(folder_name)

for i in tqdm(range(0,100), desc="Processing entries", unit="entry"):
    list_of_sentences = (dataset["evaluation"][i]["gpt3_sentences"])
    sentences = ''''''
    for s in list_of_sentences:
        tmp = preprocess_text(s)
        if tmp[-1]!='.':
            tmp+='.'
        sentences = sentences + tmp + "\n" 
        
    # sentences = sentences.strip()
    # raw = dataset["evaluation"][i]["wiki_bio_text"]
    
    ground_truth_doc = preprocessing(dataset["evaluation"][i]["wiki_bio_text"])
    ground_truth = ''''''
    for sent in ground_truth_doc.sents:
        temp = preprocess_text(sent.text.strip())  
        if temp:
            if temp[-1] not in '.!?': 
                temp += '.'
            ground_truth += temp + "\n"

    ground_truth = ground_truth.strip()
    # print(ground_truth)
    
    annotation = dataset["evaluation"][i]["annotation"]
    
    sentences_scores, sentence_pairs_and_values , sentence_coref_sents= scoring_parallel_version_wiki(sentences)
    ground_truth_scores, ground_pairs_and_values, ground_coref_sents = scoring_parallel_version_wiki(ground_truth)
        
    filename = os.path.join(folder_name, f"entry_{i+1}.txt")
    with open(filename, 'w') as file:
        file.write("#############SENTENCE_PAIRS############\n\n")
        for x in range(len(sentence_pairs_and_values)):
            file.write(f"{sentence_pairs_and_values[x][0]} and {sentence_pairs_and_values[x][1]} and the value is : {sentence_pairs_and_values[x][2]}\n")
        file.write("\n")
        file.write("#############GROUND_PAIRS############\n\n")
        for x in range(len(ground_pairs_and_values)):
            file.write(f"{ground_pairs_and_values[x][0]} and {ground_pairs_and_values[x][1]} and the value is : {ground_pairs_and_values[x][2]}\n")
        file.write("\n\n")
        file.write("%%%%%%%%%%%%%%%%%%%%SENTENCES%%%%%%%%%%%%%%%%%\n")
        file.write(f"Sentences : \n\n{sentences}\n\n")
        file.write(f"Coref Resolved: \n\n")
        for y,x in enumerate(sentence_coref_sents):
            file.write(f"{y}. {x}\n")
        file.write("\n")
        file.write("\n")
        file.write("%%%%%%%%%%%%%%%%%%%%GROUND_TRUTH%%%%%%%%%%%%%%%\n")
        file.write(f"Ground Truth : \n\n{ground_truth} \n\n")
        file.write(f"Coref Resolved: \n\n")
        for y,x in enumerate(ground_coref_sents):
            file.write(f"{y}. {x}\n")
        file.write("\n")
        file.write("%%%%%%%%%%%%%%%%%%%%FRACTIONS%%%%%%%%%%%%%%%%%%\n")
        file.write(f"Value for sentences is : \n")
        for x in sentences_scores:
            file.write(f"{x} ")
        file.write("\n")
        file.write("\n")
        file.write(f"Value for ground truth is : \n")
        for x in ground_truth_scores:
            file.write(f"{x} ")
        file.write("\n\n")
        file.write("%%%%%%%%%%%%%%%%%%%%ANNOTATIONS%%%%%%%%%%%%%%%%\n")
        for x in annotation:
            file.write(f"{x} ")
        file.write("\n\n")
        
    
    if len(ground_truth)>0:
        print(f"The ground truth scores are: {suma/leng}\n")

    if (len(ground_truth))>0 and (sum(ground_truth)/len(ground_truth))>0.1:
        if len(sentences_scores)!=len(annotation):
            continue
        for scores in ground_truth_scores:
            if scores==-1:
                continue
            else:
                accurate.append(scores)
        for t,score in enumerate(sentences_scores):
            if score==-1:
                continue
            if annotation[t-1]=="accurate":
                accurate.append(score) 
            elif annotation[t-1]=="minor_inaccurate":
                minor_inaccurate.append(score)
            elif annotation[t-1]=="major_inaccurate":
                major_inaccurate.append(score)

Processing entries:   0%|          | 0/100 [00:00<?, ?entry/s]

07/02/2024 17:12:55 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:12:55 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 13



07/02/2024 17:13:07 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:13:07 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 76

The ground truth scores are: 0.7992753623188406



07/02/2024 17:13:20 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:13:20 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 23



07/02/2024 17:13:29 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:13:29 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 34

The ground truth scores are: 0.6282312925170068



07/02/2024 17:13:38 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:13:38 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 28



07/02/2024 17:13:51 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:13:52 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 95

Query for Austin: Born in Austin, Texas , Richard Keith Mahler graduated from John Jay High School and then attended Trinity University, both in San Antonio, Texas . didn't work: Page id "rick miller" does not match any pages. Try another id!
Query for Richard Keith Mahler brother Mickey: Richard Keith Mahler brother Mickey was also a Major League Baseball pitcher, with the two being teammates in 1979 . didn't work: Page id "rick miller" does not match any pages. Try another id!
Query for season: Richard Keith Mahler best season came in 1985 , when Richard Keith Mahler went 17-15 with a 3.48 ERA. didn't work: Page id "rick miller" does not match any pages. Try another id!
The ground truth scores are: 0.0



07/02/2024 17:14:01 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:14:01 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 9



07/02/2024 17:14:10 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:14:10 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 23

The ground truth scores are: 0.5555555555555556



07/02/2024 17:14:18 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:14:18 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 8



07/02/2024 17:15:00 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:15:00 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 39

The ground truth scores are: 0.25



07/02/2024 17:15:12 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:15:12 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 18



07/02/2024 17:15:20 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:15:21 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 28

Query for Dananjaya: Officially a left-hand batsman and an off-break bowler, Dananjaya is said to have seven variations, including the leg-break, googly, carrom ball, doosra, and Dananjaya stock off-spinner that had impressed national team captain Mahela Jayawardene so much in the nets as to make national team captain Mahela Jayawardene request Dananjaya to be fast-tracked into the national squad. didn't work: An unknown error occured: "Search request is longer than the maximum allowed length. (Actual: 390; allowed: 300)". Please report it on GitHub!
Query for turn: took in turn led to a place in the final squad for the 2012 ICC World Twenty20. didn't work: Page id "kate williamson" does not match any pages. Try another id!
The ground truth scores are: 0.25



07/02/2024 17:15:29 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:15:29 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 22

Query for Derek King: Derek King (born 28 April 1965) is an Australian former professional footballer who played as a midfielder. didn't work: Page id "deaths in may 2023" does not match any pages. Try another id!


07/02/2024 17:15:36 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:15:36 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 15

The ground truth scores are: 0.7261904761904762



07/02/2024 17:15:44 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:15:44 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 15



07/02/2024 17:15:56 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:15:56 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 25

The ground truth scores are: 0.9277777777777777



07/02/2024 17:16:05 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:16:05 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 9



07/02/2024 17:16:09 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:16:09 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 53

The ground truth scores are: 0.53



07/02/2024 17:16:19 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:16:19 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 19



07/02/2024 17:16:29 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:16:29 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 27

The ground truth scores are: 0.7430555555555555



07/02/2024 17:16:38 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:16:38 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 13



07/02/2024 17:16:46 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:16:46 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 36

The ground truth scores are: 0.2333333333333333



07/02/2024 17:16:53 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:16:53 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 34

Query for Carter Henry Harrison Sr . tenure: During Carter Henry Harrison Sr. tenure, Carter Henry Harrison Sr. was a strong advocate for labor unions and was instrumental in the passage of the Eight-Hour Law in 1885. didn't work: Page id "martin can burden" does not match any pages. Try another id!


07/02/2024 17:17:04 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:17:04 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 47

Query for Carter Henry Harrison Sr: Carter Henry Harrison Sr. (February 15, 1825 October 28, 1893) was an American politician who served as Mayor of Chicago, Illinois from 1879 until 1887 Carter Henry Harrison Sr. (February 15, 1825 October 28, 1893) was subsequently elected to a fifth term in 1893 but was assassinated before completing a fifth term . didn't work: An unknown error occured: "Search request is longer than the maximum allowed length. (Actual: 343; allowed: 300)". Please report it on GitHub!
Query for Carter Henry Harrison II: Born near Lexington, Kentucky to Carter Henry Harrison II and Caroline Russell, Carter Henry Harrison Sr. (February 15, 1825 October 28, 1893) was only a few months old when Carter Henry Harrison II died. didn't work: Page id "martin can burden" does not match any pages. Try another id!
Query for E. Stearns: Margarette (or Margaret) E. Stearns was the daughter of Chicago, Illinois pioneer Marcus C. Stearns. didn't work: Pa

07/02/2024 17:17:14 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:17:14 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 17

Query for Winnebago Deal: Winnebago Deal is an American indie rock band from Brooklyn, New York. didn't work: Page id "2022 deaths in the united states" does not match any pages. Try another id!


07/02/2024 17:17:21 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:17:21 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 81

Query for length album: their second full-length album "Flight of the Raven" features guest appearances from other artists including Nick Oliveri, former Queens of the Stone Age bassist-vocalist , Jack Endino, producer of the first Nirvana album "Bleach" , Paul Morrill and Edward "Raven" Heaton from the band Sicarios, and Rusty Needles of Oxford band Deguello, and was cited as the sixth best album in 2006 by "Kerrang ". didn't work: An unknown error occured: "Search request is longer than the maximum allowed length. (Actual: 413; allowed: 300)". Please report it on GitHub!
The ground truth scores are: 0.3138888888888889



07/02/2024 17:17:46 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:17:46 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 39



07/02/2024 17:17:56 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:17:56 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 45

The ground truth scores are: 0.17052154195011338



07/02/2024 17:18:12 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:18:12 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 11



07/02/2024 17:18:20 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:18:20 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 63

The ground truth scores are: 0.5366402116402116



07/02/2024 17:18:32 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:18:32 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 16



07/02/2024 17:18:36 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:18:36 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 14

The ground truth scores are: 0.09999999999999999



07/02/2024 17:18:44 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:18:44 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 14



07/02/2024 17:18:54 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:18:54 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 20



07/02/2024 17:19:07 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:19:07 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 14



07/02/2024 17:19:16 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:19:16 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 15

The ground truth scores are: 0.25



07/02/2024 17:19:25 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:19:25 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 14



07/02/2024 17:19:33 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:19:34 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 28

The ground truth scores are: 0.41309523809523807



07/02/2024 17:19:43 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:19:43 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 22



07/02/2024 17:19:48 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:19:48 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 28

The ground truth scores are: 0.7766666666666666



07/02/2024 17:19:56 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:19:57 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 13

Query for Terry Alderman: Terry Alderman (born 28 April 1952) is a former Australian cricketer who played in 41 Tests and 53 One Day Internationals between 1979 and 1991 . didn't work: Page id "rodney hood" does not match any pages. Try another id!


07/02/2024 17:20:03 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:20:04 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 87

Query for which: Following a 3-year ban from international cricket which disqualified him from playing in the 1985 Ashes series in England , Terence Michael Alderman returned to the Australian national team and resumed Terence Michael Alderman success against England , taking 41 wickets in the 1989 Ashes series and another 16 in the 1990 91 series, Terence Michael Alderman final Ashes appearance. didn't work: An unknown error occured: "Search request is longer than the maximum allowed length. (Actual: 389; allowed: 300)". Please report it on GitHub!
The ground truth scores are: 0.35



07/02/2024 17:20:16 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:20:16 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 27



07/02/2024 17:20:28 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:20:28 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 31

The ground truth scores are: 0.15740740740740738



07/02/2024 17:20:36 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:20:36 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 14



07/02/2024 17:20:45 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:20:45 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 42

The ground truth scores are: 0.20284090909090907



07/02/2024 17:20:55 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:20:55 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 20



07/02/2024 17:21:10 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:21:10 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 36

The ground truth scores are: 0.6666666666666666



07/02/2024 17:21:18 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:21:19 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 53



07/02/2024 17:21:32 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:21:32 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 42

The ground truth scores are: 0.4822222222222222



07/02/2024 17:21:41 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:21:42 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 24



07/02/2024 17:21:46 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:21:46 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 16

The ground truth scores are: 0.6166666666666667



07/02/2024 17:21:53 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:21:54 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 26



07/02/2024 17:21:58 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:21:58 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 29

Query for Thomas Harriot: Thomas Harriot (Oxford, c. 1560 London, 2 July 1621 also spelled Harriott, Hariot, or Heriot) was an English astronomer, mathematician, ethnographer, and translator. didn't work: Page id "thomas harriet" does not match any pages. Try another id!


07/02/2024 17:22:05 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:22:06 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 17



07/02/2024 17:22:24 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:22:24 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 15

Query for Tadeusz Szeligowski achievements: Tadeusz Szeligowski achievements include the creation of the Poznań Philharmonic , where Tadeusz Szeligowski served as the Poznań Philharmonic first director between 1947 and 1949, and the founding of the Poznań Musical Spring, one of the most important festivals of contemporary music at the time. didn't work: An unknown error occured: "Search request is longer than the maximum allowed length. (Actual: 332; allowed: 300)". Please report it on GitHub!
The ground truth scores are: 0.85



07/02/2024 17:22:33 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:22:33 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 56



07/02/2024 17:22:47 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:22:47 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 124

The ground truth scores are: 0.7366804692891649



07/02/2024 17:23:07 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:23:07 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 18

Query for Steven Threet: Steven Threet (born August 8, 1985) is a former American football quarterback. didn't work: Page id "chad benne" does not match any pages. Try another id!


07/02/2024 17:23:15 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:23:15 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 43

Query for head coach Lloyd Carr: head coach Lloyd Carr was replaced before the 2008 season by Rich Rodriguez . didn't work: Page id "lloyd car" does not match any pages. Try another id!
The ground truth scores are: 0.4772727272727273



07/02/2024 17:23:24 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:23:25 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 17



07/02/2024 17:23:33 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:23:33 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 27

The ground truth scores are: 0.6461538461538461



07/02/2024 17:23:42 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:23:43 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 18



07/02/2024 17:24:24 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:24:24 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 33

The ground truth scores are: 0.32407407407407407



07/02/2024 17:24:32 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:24:32 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 21



07/02/2024 17:24:36 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:24:36 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 28

The ground truth scores are: 0.46851851851851856



07/02/2024 17:24:44 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:24:44 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 16



07/02/2024 17:24:57 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:24:58 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 45

Query for Blood Divine: The Blood Divine was a British gothic metal band, founded in the summer of 1995 by Paul Allender , after Paul Allender, brothers Paul and Benjamin Ryan left Cradle of Filth during the recording of the "V Empire (or Dark Faerytales in Phallustein)" EP and ousted Anathema singer Darren White. didn't work: An unknown error occured: "Search request is longer than the maximum allowed length. (Actual: 305; allowed: 300)". Please report it on GitHub!
The ground truth scores are: 0.4166666666666667



07/02/2024 17:25:07 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:25:07 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 37



07/02/2024 17:25:12 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:25:12 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 75

Query for minister: When a particularly senior minister challenged King Zhuang of Chu (died 591 BC) through a riddle, King Zhuang of Chu (died 591 BC) responded that King Zhuang of Chu (died 591 BC) had been waiting for three years for someone from King Zhuang of Chu (died 591 BC) court to show some nationalistic pride. didn't work: An unknown error occured: "Search request is longer than the maximum allowed length. (Actual: 311; allowed: 300)". Please report it on GitHub!
The ground truth scores are: 0.724867724867725



07/02/2024 17:25:22 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:25:22 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 9



07/02/2024 17:25:36 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:25:36 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 21

The ground truth scores are: 0.6666666666666666



07/02/2024 17:25:59 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:25:59 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 13



07/02/2024 17:26:04 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:26:04 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 41

Query for identity: Despite the existence of Hendrik van Rheede's over the last three centuries, the correct taxonomic identity of many plants listed in Hortus Malabaricus , many plants listed in Hortus Malabaricus medicinal properties, methods of use, etc., as described and codified by renowned traditional medical authorities of 17th century India remained inaccessible to English language-based scholars, until Professor Kattungal Subramaniam Manilal commenced publication of research papers and books on the Latin botanical treatise Hortus Malabaricus . didn't work: An unknown error occured: "Search request is longer than the maximum allowed length. (Actual: 548; allowed: 300)". Please report it on GitHub!
Query for scope: Whilst the scope of Professor Kattungal Subramaniam Manilal contributions to botany extend far beyond the research and publications around the Latin botanical treatise Hortus Malabaricus , Professor Kattungal Subramaniam Manilal research wo

07/02/2024 17:26:14 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:26:14 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 19



07/02/2024 17:26:28 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:26:28 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 40



07/02/2024 17:26:33 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:26:33 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 13



07/02/2024 17:26:40 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:26:40 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 19

The ground truth scores are: 0.5499999999999999



07/02/2024 17:26:47 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

07/02/2024 17:26:47 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

The number of pairs is: 17



KeyboardInterrupt: 

In [None]:
text = """Empowering Rural Youth through Skill Development:

The partnership between CSC Academy and Oil India aims to empower rural youth through skill development by providing online training for government exams, promoting digital literacy, and enhancing job-oriented education. This initiative is designed to equip rural youth with the necessary skills to thrive in the evolving job market and improve their overall quality of life.

Empowering Rural Youth through Skill Development: Partnership Between CSC Academy and Oil India In the contemporary era of globalization and rapid changes, the imperative for "Kaushal Bharat - Kushal Bharat" has become paramount. Recognizing the need for job-oriented education in villages, CSC Academy has taken a significant step by forging a partnership with Oil India Ltd. This collaboration aims to provide online training for Central and State-level government exams through the Sarkari Pariksha platform. A total of 4,500 candidates from rural areas will be enrolled and trained. The initiative is poised to empower rural youth, aligning their skills with the demands of the job market and thereby enhancing their overall quality of life, as well as access to education and skill development. Ranjan Goswami, CGM of Oil India Ltd., expressed enthusiasm about digitally empowering individuals in the remotest parts of the country, aligning with the company's mission. The partnership with CSC Academy is viewed as a significant step towards reaching more people and providing them with valuable skills, job-oriented education, and knowledge on how to promote computer literacy. Naveen Sharma, Chief Operating Officer of CSC Academy, highlighted the importance of skilled individuals as a foundation for national and societal growth within the digital economy. The collaboration with Oil India aims to promote digital literacy and skill development across India, focusing on enabling every youth in even the remotest areas to become self-reliant. Skills are envisioned to become an integral part of the lives of individuals in rural India. This partnership between CSC Academy and Oil India marks a significant step towards transforming rural communities by providing them with the tools to promote digital literacy in the classroom and resources to thrive in the evolving job market.

How to register for it?"""

In [None]:
coreference_resolution(text)

In [84]:
sum(accurate)/len(accurate)

0.6171428571428571

In [85]:
len(accurate)

235

In [86]:
sum(minor_inaccurate)/len(minor_inaccurate)

0.5171580671580671

In [87]:
len(minor_inaccurate)

37

In [88]:
sum(major_inaccurate)/len(major_inaccurate)

0.5326666666666666

In [89]:
len(major_inaccurate)

50