## Code

In [1]:
%load_ext ipython_sparql_pandas

In [2]:
from SPARQLWrapper import SPARQLWrapper, JSON
from random import randint
# Shared client for the local SPARQL repository endpoint used by the query
# helpers below; results are requested in JSON form.
sparql = SPARQLWrapper('http://localhost:7200/repositories/20Q-Stanford-186')
sparql.setReturnFormat(JSON)

In [3]:
# Module-level game state, read and extended by generateQuestion:
#   PosAttr     - triple patterns for attributes the subject has; seeded with
#                 the popularEntity type constraint
#   NegAttr     - FILTER NOT EXISTS clauses for attributes the subject lacks
#   AttrHistory - "<predicate> <object>" keys of questions already asked
PosAttr=['?s <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.com/popularEntity> .']
NegAttr=[]
AttrHistory = ['<http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.com/popularEntity>']

def extractCountAndAttr(result):
    """Split SPARQL JSON result rows into parallel object and predicate lists.

    Args:
        result: Parsed SPARQL JSON response; every row under
            result["results"]["bindings"] must carry an 'o' and a 'p' binding.

    Returns:
        Tuple (objects, predicates) holding the 'o' and 'p' bindings in row order.
    """
    rows = result["results"]["bindings"]
    objects = [row['o'] for row in rows]
    predicates = [row['p'] for row in rows]
    return (objects, predicates)

def _sparqlObjectTerm(obj):
    """Render one SPARQL JSON object binding as a SPARQL term string."""
    value = str(obj["value"])
    # Bindings that carry a 'datatype' key are typed literals (e.g. gYear);
    # only the fragment after '#' is kept and prefixed with xsd:.
    if 'datatype' in obj:
        return '"' + value + '"^^xsd:' + obj["datatype"].split("#")[-1]
    # Values without any '/' are treated as plain literals, everything else as an IRI.
    if len(obj["value"].split('/')) == 1:
        return '"' + value + '"'
    return '<' + value + '>'


def generateQuestion(listOfPred, listOfObj, index, subject):
    """Ask the next not-yet-asked question and update the query filters.

    Starting at ``index``, the first predicate/object pair whose
    "<predicate> <object>" key is not in the global ``AttrHistory`` is chosen.
    The answer is looked up with ``findAnswer``; a positive answer appends a
    triple pattern to the global ``PosAttr``, a negative one appends a
    FILTER NOT EXISTS clause to the global ``NegAttr``.

    Args:
        listOfPred: Predicate bindings (dicts with a 'value' key).
        listOfObj: Object bindings, parallel to ``listOfPred``.
        index: Position of the first candidate to consider.
        subject: Subject term, already wrapped in angle brackets.

    Returns:
        Tuple (PosFilters, NegFilters): the newline-joined contents of
        ``PosAttr`` and ``NegAttr``. When every candidate from ``index`` on has
        already been asked, the current filters are returned unchanged.
    """
    # The history key must be built from the binding values; using the raw
    # binding dicts here never matches a stored key.
    for position in range(index, min(len(listOfPred), len(listOfObj))):
        predicate = str(listOfPred[position]['value'])
        question = '<' + predicate + '> <' + str(listOfObj[position]['value']) + '>'
        if question not in AttrHistory:
            break
    else:
        # No unasked candidate left: leave the filters as they are.
        return ("\n".join(PosAttr), "\n".join(NegAttr))

    # Stores question
    AttrHistory.append(question)

    # Looks up question in knowledge graph
    term = _sparqlObjectTerm(listOfObj[position])
    answer = findAnswer(subject, '<' + predicate + '>', term)

    if answer:
        # The subject has the attribute: keep only entities that share it.
        PosAttr.append('?s <' + predicate + '> ' + term + '.')
    else:
        # The subject lacks the attribute: exclude entities that have it.
        NegAttr.append('FILTER NOT EXISTS {{ \n ?s <' + predicate + '> ' + term + '. }}')

    # Returns filters
    return ("\n".join(PosAttr), "\n".join(NegAttr))

  
def updateQuery(left, PosFilters, NegFilters):
    """Build the SPARQL query that proposes the next candidate questions.

    Args:
        left: Target count; predicate/object pairs are ranked by how close
            their frequency is to this number.
        PosFilters: Triple patterns the entities must match.
        NegFilters: FILTER NOT EXISTS clauses the entities must satisfy.

    Returns:
        The query text, limited to the 10 best-ranked predicate/object pairs.
    """
    # Rank predicate/object pairs by distance of their count from the target.
    return f"""
            select (count(*) as ?count) ?p ?o  where 
            {{
            
            {PosFilters}
             ?s ?p ?o .
            {NegFilters}
            }} 
            
            group by ?p ?o 
            ORDER BY ABS( {left} - ?count )
            Limit 10
            
            """

In [4]:
def pickSubject(path='~/Desktop/GitHub/THESIS-20Qs-LR/data/abstract data/Abstracts-186.csv'):
    """Load the list of entities to play the game with.

    Args:
        path: CSV file with an 'Entity' column. Defaults to the abstracts file
            this notebook was developed against, so existing calls without
            arguments keep working.

    Returns:
        The values of the 'Entity' column as a list, in file order.
    """
    data = pd.read_csv(path)
    # The saved index column is only present when the CSV was written with its
    # index; tolerate files that do not have it.
    data = data.drop(columns='Unnamed: 0', errors='ignore')
    return list(data['Entity'])
     

def findAnswer(subject,p,o):
    """Check whether the triple (subject, p, o) exists in the repository.

    Args:
        subject: Subject term as it should appear in the query.
        p: Predicate term as it should appear in the query.
        o: Object term as it should appear in the query.

    Returns:
        True when the lookup query returns at least one row, otherwise False.
    """
    lookup = f"""
        SELECT ?s ?p ?o
        WHERE {{
        ?s ?p ?o .
            FILTER(?s = {subject})
            FILTER(?p = {p})
            FILTER(?o = {o})
            }}
            """
    sparql.setQuery(lookup)
    sparql.setReturnFormat(JSON)
    matches = sparql.query().convert()["results"]["bindings"]
    return len(matches) > 0

In [5]:
def numberleft(PosFilters,NegFilters):
    """Count the popular entities that still satisfy the given filters.

    Args:
        PosFilters: Triple patterns the entities must match.
        NegFilters: FILTER NOT EXISTS clauses the entities must satisfy.

    Returns:
        The count from the first result row as an int, or 0 when the query
        returns no rows at all (previously this case fell through and
        returned None, which callers cannot compare or divide).
    """
    sparql = SPARQLWrapper('http://localhost:7200/repositories/20Q-Stanford-186')
    query =  f"""
            select (count(?s) as ?count) where 
            {{
            {PosFilters}            
            ?s <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.com/popularEntity> .
            {NegFilters} 
          }}       
            group by ?p ?o 
            ORDER BY DESC(?count )            
            
            
            """ 
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    rows = sparql.query().convert()["results"]["bindings"]
    if not rows:
        # Nothing matched, so no count row was produced.
        return 0
    return int(rows[0]["count"]["value"])

In [6]:
def popentities(PosFilters,NegFilters):
    """List the popular entities that satisfy the given filters.

    Args:
        PosFilters: Triple patterns the entities must match.
        NegFilters: FILTER NOT EXISTS clauses the entities must satisfy.

    Returns:
        The result rows of the query; each row carries the entity under 's'.
    """
    remaining = f"""
            select ?s where 
            {{
            
            {PosFilters}
            ?s <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.com/popularEntity> .
            {NegFilters} 
            
            }}
            
            """
    sparql.setQuery(remaining)
    sparql.setReturnFormat(JSON)
    response = sparql.query().convert()
    return response["results"]["bindings"]

In [7]:
# Resets variables for game 
def game_reset():
    """Create fresh per-game state and fetch the first batch of candidate questions.

    Returns:
        Tuple (left, qres, PosAttr, NegAttr, AttrHistory):
        left        - number of popular entities in play, from numberleft
        qres        - parsed result of the initial candidate-question query
        PosAttr     - new positive filter list, seeded with the type constraint
        NegAttr     - new negative filter list
        AttrHistory - new question history, seeded with the type question
    """
    PosAttr = ['?s <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.com/popularEntity> .']
    NegAttr = ['']
    AttrHistory = ['<http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.com/popularEntity>']
    left = numberleft("".join(PosAttr), "".join(NegAttr))
    # Candidate attributes are ranked by how close their count is to half of
    # the entities in play.
    FilterQuestion = f"""
        select (count(*) as ?count) ?p ?o  where 
        {{

        ?s <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.com/popularEntity> .
        ?s ?p ?o .
        }} 
        group by ?p ?o 
        ORDER BY ABS( {left / 2} - ?count )
        limit 20
        """
    sparql.setQuery(FilterQuestion)
    sparql.setReturnFormat(JSON)
    qres = sparql.query().convert()
    return (left, qres, PosAttr, NegAttr, AttrHistory)

In [8]:
import random
import pandas as pd
# Fixed seed so the random fallback guess is repeatable between runs
random.seed(10)

# Tournament counters
wincount = gamesplayed = 0
lostbybreak = breakskip = 0

# One game is played per subject
subjects = pickSubject()
#print(subjects)
totalgames = len(subjects)

# Per-game records collected while the tournament runs
subjects_lost = []
results = []
questionhistory = []

In [9]:
# Plays games until the amount of games played equals to the number of subjects
MAX_QUESTIONS = 20  # question budget per game
MAX_RETRIES = 5     # failed attempts allowed per subject before it is skipped

while gamesplayed < totalgames:
    try:
        # Resets all per-game variables
        left, qres, PosAttr, NegAttr, AttrHistory = game_reset()
        # Filters in force before any question is asked, so the guess below
        # is well defined even when no question gets asked.
        PosFilters = "\n".join(PosAttr)
        NegFilters = "\n".join(NegAttr)
        i = 0

        # Pick subject
        subject = "<" + subjects[gamesplayed] + ">"

        # Whilst less than MAX_QUESTIONS questions asked
        while i < MAX_QUESTIONS:
            # List of Pred and Obj for questions
            listOfObj, listOfPred = extractCountAndAttr(qres)

            # If only 1 popular entity is left break
            if left <= 1:
                break

            # Ask question and add filters to query
            PosFilters, NegFilters = generateQuestion(listOfPred, listOfObj, 0, subject)

            # Number of popular entities left; numberleft can come back empty
            # when nothing matches, which is treated as zero.
            left = numberleft(PosFilters, NegFilters) or 0

            # Fetch the next batch of candidate questions
            sparql.setQuery(updateQuery(left / 2, PosFilters, NegFilters))
            sparql.setReturnFormat(JSON)
            qres = sparql.queryAndConvert()

            i += 1

        # Entities that fit every answer given so far
        candidates = [row["s"]["value"].split("/")[-1]
                      for row in popentities(PosFilters, NegFilters)]
        if not candidates:
            # Nothing fits the answers: counts as a wrong guess.
            guess = None
        elif i != MAX_QUESTIONS:
            # Stopped early: the last listed candidate is the guess.
            guess = candidates[-1]
        else:
            # Ran out of questions: randomly guess among the candidates.
            guess = random.choice(candidates)

    # If game is interrupted, stop
    except KeyboardInterrupt:
        break

    # If the game breaks, retry the entity a limited number of times and then
    # skip it, so a persistent failure cannot stall the tournament.
    except Exception as err:
        breakskip += 1
        if breakskip >= MAX_RETRIES:
            print('Skipping %s after %s failed attempts: %r'
                  % (subjects[gamesplayed], breakskip, err))
            subjects_lost.append("<%s>" % subjects[gamesplayed])
            # Placeholder keeps results aligned with subjects.
            results.append(None)
            lostbybreak += 1
            gamesplayed += 1
            breakskip = 0
        continue

    # Record and report the outcome of the game
    target = subject.split("/")[-1]
    guessed = guess is not None and str(guess + ">") == str(target)
    if guessed:
        wincount += 1
    else:
        subjects_lost.append(subject)
    gamesplayed += 1
    results.append(i)
    print(gamesplayed, {'Subject': target.split('>')[0], 'Guessed': 'Yes' if guessed else 'No', 'Questions-B': i})
    print('%s/%s' %(wincount,gamesplayed))
    print()

    breakskip = 0
    #print(AttrHistory)
    questionhistory.append(AttrHistory)

# When tournament ends prints the amount of entities guessed correctly
print("\nTournament ended! I have guessed " + str(wincount) + " out of " + str(gamesplayed) + " things correctly.")

1 {'Subject': 'David_Bowie', 'Guessed': 'Yes', 'Questions-B': 7}
1/1

2 {'Subject': 'Kanye_West', 'Guessed': 'Yes', 'Questions-B': 7}
2/2

3 {'Subject': 'Eminem', 'Guessed': 'Yes', 'Questions-B': 8}
3/3

4 {'Subject': 'Jennifer_Aniston', 'Guessed': 'Yes', 'Questions-B': 7}
4/4

5 {'Subject': 'George_W._Bush', 'Guessed': 'Yes', 'Questions-B': 7}
5/5

6 {'Subject': 'Prince_(musician)', 'Guessed': 'Yes', 'Questions-B': 7}
6/6

7 {'Subject': 'Justin_Bieber', 'Guessed': 'Yes', 'Questions-B': 7}
7/7

8 {'Subject': 'Drake_(musician)', 'Guessed': 'Yes', 'Questions-B': 8}
8/8

9 {'Subject': 'Heath_Ledger', 'Guessed': 'Yes', 'Questions-B': 7}
9/9

10 {'Subject': 'John_Krasinski', 'Guessed': 'Yes', 'Questions-B': 7}
10/10

11 {'Subject': 'Val_Kilmer', 'Guessed': 'Yes', 'Questions-B': 6}
11/11

12 {'Subject': 'American_Horror_Story', 'Guessed': 'Yes', 'Questions-B': 7}
12/12

13 {'Subject': 'Brad_Pitt', 'Guessed': 'Yes', 'Questions-B': 7}
13/13

14 {'Subject': 'Japan', 'Guessed': 'Yes', 'Questions

# Result Analysis

## Top 10 Questions

In [None]:
# Flatten the per-game question histories into one list of asked questions
questions = [question for history in questionhistory for question in history]

In [None]:
from collections import Counter
c = Counter(questions)
#print(c.most_common(11))
# Keep the questions ranked 2nd to 11th; the single most frequent entry is left out.
mostcommon = [question for question, count in c.most_common(11)[1:]]


In [None]:
#topquestions = pd.DataFrame({'None IE': mostcommon})
import pandas as pd 
# Load the saved top-question table and discard the stored index column
topquestions = pd.read_csv('TopQuestions.csv').drop('Unnamed: 0', axis=1)

#topquestions['Chat GPT'] = mostcommon

#topquestions.to_csv('TopQuestions.csv')

In [None]:
# Count, per row, how often the first column's top question equals the
# question in each of the other three columns.
# NOTE(review): assumes the column order is baseline, clcy, stan, gpt - confirm against TopQuestions.csv
gpt = 0
stan = 0
clcy = 0

for base_q, clcy_q, stan_q, gpt_q in topquestions.values:
    if base_q == clcy_q:
        clcy += 1
    if base_q == stan_q:
        stan += 1
    if base_q == gpt_q:
        gpt += 1

# A bare expression in the middle of a cell is not displayed, so print the counts.
print('gpt:', gpt, 'stan:', stan, 'clcy:', clcy)

# Print a short "predicate object" label for every question in the 'Chat GPT' column
for question in topquestions['Chat GPT']:
    parts = question.split(' ')
    # Predicate: last IRI path segment, then the part after '#', without the closing '>'
    pred_label = parts[0].split('<')[-1].split('/')[-1].split('#')[-1].split('>')[0]
    # Object: last IRI path segment without the closing '>'
    obj_label = parts[1].split('<')[-1].split('/')[-1].split('>')[0]
    print(pred_label.replace('_', ' '), obj_label.replace('_', ' '))

## Number of Questions asked

In [None]:
# if literal is less than 2-3 words make a link version
import pandas as pd 
# Load the saved results table and discard the stored index column
thesis_results = pd.read_csv('ThesisResults.csv').drop('Unnamed: 0', axis=1)

thesis_results

In [None]:
# Adds the question counts collected by the tournament loop as a new column.
# NOTE(review): `results` must still be the list built by the tournament loop;
# later cells in this notebook define a function with the same name, so this
# cell fails if it is re-run after them.
#thesis_results = pd.DataFrame(results)
thesis_results['Questions-GPT2'] = results
thesis_results
#thesis_results.to_csv('ThesisResults.csv')

In [None]:
import pandas as pd 
# Reload the results table from disk without the stored index column
data1 = pd.read_csv('ThesisResults.csv').drop('Unnamed: 0', axis=1)
#data1 = data1.drop('Questions-GPT2', axis=1)
#data1.to_csv('betterThesisResults.csv')
data1
#data1.T.to_csv('flippedata.csv')

### Model compared to another

In [None]:
def results( results1, results2, label1, label2 ):
    """Print a pairwise win/tie summary of two result sequences.

    The sequences are compared element by element; a pair is scored for the
    side with the smaller value and counted as a tie when both are equal.

    Args:
        results1: Values of the first model.
        results2: Values of the second model.
        label1: Display name of the first model.
        label2: Display name of the second model.

    Returns:
        None. The scores, their percentages and the overall winner are printed.
        Equal scores are reported as a tie and empty input is reported as such.

    NOTE(review): this name is also used for the list of per-game question
    counts and for a second function further down; whichever ran last wins.
    """
    score1 = 0
    score2 = 0
    tie = 0
    for res1, res2 in zip( results1, results2 ):
        if res1 == res2:
            tie += 1
        elif res1 > res2:
            score2 += 1
        else:
            score1 += 1
    total = score1 + score2 + tie

    print( '%s-%s:' %( label1.upper(), label2.upper() ) )
    print( '%s score:' %( label1 ), score1, '-%s score:' %( label2 ), score2, '-tie score:', tie )
    if total == 0:
        # No pairs at all: percentages are undefined.
        print( 'no results to compare' )
        return
    print( '%s:' %(label1), round( ( score1/total ) *100,2 ) )
    print( '%s:' %(label2), round( ( score2/total ) *100,2 ) )
    print( 'tie %:', round( ( tie/total ) *100,2 ) )
    if score1 > score2:
        print( '%s best' %( label1.upper() ) )
    elif score2 > score1:
        print( '%s best' %( label2.upper() ) )
    else:
        print( '%s and %s tied' %( label1.upper(), label2.upper() ) )
    return

# (first column, second column, first label, second label) for every comparison
comparisons = [
    ('Questions-GPT', 'Questions-Base', 'gpt', 'base'),
    ('Questions-GPT', 'Questions-C', 'gpt', 'clcy'),
    ('Questions-GPT', 'Questions-S', 'gpt', 'stan'),
    ('Questions-C', 'Questions-Base', 'clcy', 'base'),
    ('Questions-S', 'Questions-Base', 'stan', 'base'),
    ('Questions-S', 'Questions-C', 'stan', 'clcy'),
]

for position, (column1, column2, label1, label2) in enumerate(comparisons):
    # Blank line between consecutive reports
    if position > 0:
        print()
    results( thesis_results[column1], thesis_results[column2], label1, label2 )

### IE models compared

In [None]:
# Outright wins per model (smallest value in the row)
s = 0
g = 0
c = 0
# Three-way ties and two-way ties for the smallest value
tie = 0
gs_tie = 0
gc_tie = 0
sc_tie = 0

# NOTE(review): assumes thesis_results has exactly five columns in the order
# (ignored, clcy, base, stan, gpt) - confirm against ThesisResults.csv
for _,clcy,base,stan,gpt in thesis_results.values:
    if stan == gpt and gpt == clcy:
        tie += 1
    elif stan == gpt and stan < clcy:
        gs_tie += 1
    elif stan == clcy and gpt > clcy:
        sc_tie += 1
    elif gpt == clcy and gpt < stan:
        gc_tie += 1
    elif gpt < stan and gpt < clcy:
        g += 1
    elif stan < gpt and stan < clcy:
        s += 1
    elif clcy < gpt and clcy < stan:
        c += 1
print('clcy:',c,'- Stan:',s,'- GPT:',g,'- tie:',tie,'- gptstan-tie:',gs_tie,'- gptclcy-tie:',gc_tie,'- stanclcy:',sc_tie)
# Rows in which each model had, or shared, the smallest value
print(c+tie+gc_tie+sc_tie)
print(s+tie+gs_tie+sc_tie)
print(g+tie+gs_tie+gc_tie)
print()
# Share of outright wins per model (clcy, stan, gpt), computed from the
# counters instead of numbers copied from an earlier run.
# NOTE(review): the previous hard-coded values were 20, 18 and 35 out of 184,
# presumably a snapshot of c, s and g - verify.
total = c + s + g + tie + gs_tie + gc_tie + sc_tie
if total:
    print(round(c/total*100,1))
    print(round(s/total*100,1))
    print(round(g/total*100,1))
else:
    print('no rows counted')

## Difference in the number of questions asked

In [None]:
def results( results1, results2, label1, label2 ):
    """Print the mean signed and mean absolute difference of two result sequences.

    The difference of each pair is taken as ``res2 - res1``.

    Args:
        results1: Values of the first model.
        results2: Values of the second model.
        label1: Display name of the first model.
        label2: Display name of the second model.

    Returns:
        None. Both means, rounded to 2 decimals, are printed; empty input is
        reported instead of dividing by zero.

    NOTE(review): this redefines the `results` comparison function and the
    `results` list from earlier cells.
    """
    print( '%s-%s:' %( label1.upper(), label2.upper() ) )

    # Average over the pairs actually compared; zip stops at the shorter input.
    pairs = list( zip( results1, results2 ) )
    if not pairs:
        print('no results to compare')
        return

    diff = round(sum(res2 - res1 for res1, res2 in pairs)/len(pairs),2)
    abdiff = round(sum(abs(res2 - res1) for res1, res2 in pairs)/len(pairs),2)

    print('diff:',diff)
    print('abs diff:', abdiff)
    return

# Compare every IE model against the baseline column
baseline = thesis_results['Questions-Base']
for position, (column, label) in enumerate([('Questions-GPT', 'gpt'), ('Questions-C', 'clcy'), ('Questions-S', 'stan')]):
    # Blank line between consecutive reports
    if position > 0:
        print()
    results( thesis_results[column], baseline, label, 'base' )