# Finding queries characteristics in the dataset

In order to better understand the dataset, we can try to classify queries into multiple groups:
- single hop / multiple hop
- select / count

In [1]:
import pandas as pd
import re
from multiprocesspandas import applyparallel

In [2]:
df = pd.read_json("../datasets/final_queries_v1.4.json")
df.head()

Unnamed: 0,query,description,context,prompt,num_tokens,start_with_SELECT
0,SELECT ?property ?propertyType ?propertyLabel ...,Wikidata properties in numerical order,Counting stuff on Wikidata\nAll Wikidata prope...,<s>[INST] <<SYS>>This is a conversation betwee...,267,True
1,SELECT ?id ?idLabel ?idDescription ?new{\n?id ...,Wikidata properties excluding external IDs,Counting stuff on Wikidata\nVariation of the a...,<s>[INST] <<SYS>>This is a conversation betwee...,296,True
2,SELECT (COUNT(?article) AS ?count)\nWHERE {\n?...,,Counting stuff on Wikidata\nCount of scientifi...,<s>[INST] <<SYS>>This is a conversation betwee...,197,True
3,SELECT (COUNT(DISTINCT ?article) AS ?count)\nW...,Count of fictional characters,Counting stuff on Wikidata\nCount of fictional...,<s>[INST] <<SYS>>This is a conversation betwee...,203,True
4,SELECT (COUNT(?item) AS ?count)\nWHERE { ?item...,Count of items with coordinate locations,Counting stuff on Wikidata\nCount of items wit...,<s>[INST] <<SYS>>This is a conversation betwee...,186,True


In [3]:
print(df.iloc[9].query)

SELECT ?workLabel ?pubdate ?pubdatePrecision ?image ?typelabel ?english
(URI(CONCAT("https://tools.wmflabs.org/reasonator/?q=", SUBSTR(STR(?work),32) )) as ?reasonator) #build a reasonator link
(IF(BOUND(?english), ?english, ?reasonator) as ?link) # link to English Wikipedia article, if available
WHERE {
?work wdt:P50 wd:Q9068; # author: Voltaire
wdt:P31 ?type; # What is the work? poem, play, historical work etc.?
p:P577/psv:P577 ?pubdateStatementNode
MINUS {?work wdt:P31 wd:Q3331189} # exclude editions
MINUS {?work wdt:P31 wd:Q105420} # exclude anthologies
?pubdateStatementNode wikibase:timeValue ?pubdate ;
wikibase:timePrecision ?pubdatePrecision # get precision of the publication date
OPTIONAL {?work wdt:P18 ?image}
OPTIONAL{ ?english schema:about ?work ; schema:isPartOf <https://en.wikipedia.org/> }
SERVICE wikibase:label {bd:serviceParam wikibase:language "fr,en,de"} # Prefer labels in French but fall back to English and German
?type rdfs:label ?typelabel FILTER(lang(?typelabel) =

In [4]:
def remove_comments(query: str) -> str:
    import re
    return re.sub("#.*", "", query)

def remove_comments_parallel(x: pd.Series) -> str:
    import re
    return re.sub("#.*", "", x['query'])


In [5]:
df["query_no_comments"] = df.apply_parallel(remove_comments_parallel, axis=0)

100%|██████████| 2842/2842 [00:01<00:00, 1881.97it/s]


In [6]:
print(df.iloc[9].query_no_comments)

SELECT ?workLabel ?pubdate ?pubdatePrecision ?image ?typelabel ?english
(URI(CONCAT("https://tools.wmflabs.org/reasonator/?q=", SUBSTR(STR(?work),32) )) as ?reasonator) 
(IF(BOUND(?english), ?english, ?reasonator) as ?link) 
WHERE {
?work wdt:P50 wd:Q9068; 
wdt:P31 ?type; 
p:P577/psv:P577 ?pubdateStatementNode
MINUS {?work wdt:P31 wd:Q3331189} 
MINUS {?work wdt:P31 wd:Q105420} 
?pubdateStatementNode wikibase:timeValue ?pubdate ;
wikibase:timePrecision ?pubdatePrecision 
OPTIONAL {?work wdt:P18 ?image}
OPTIONAL{ ?english schema:about ?work ; schema:isPartOf <https://en.wikipedia.org/> }
SERVICE wikibase:label {bd:serviceParam wikibase:language "fr,en,de"} 
?type rdfs:label ?typelabel FILTER(lang(?typelabel) ="en") 
}


In [15]:
def is_single_hop(x: pd.Series, colname:str = "query_no_comments"):
    import re
    results = re.findall(r"(?:WHERE)\s?{[\s\S](?:\?)?(?:\w|:|\/|\*|\[|\])* (?:\?)?(?:\w|:|\/|\*|\[|\])* (?:\?)?(?:\w|:|\/|\*|\[|\])*\s*}", x[colname])
    if results and len(results) == 1:
        return True
    return False
        

In [16]:
df["is_single_hop"] = df.apply_parallel(is_single_hop, colname="query_no_comments", axis=0)


100%|██████████| 2842/2842 [00:01<00:00, 1726.66it/s]


In [17]:
print("\n-----\n".join(df.loc[df['is_single_hop']].query_no_comments.head()))


SELECT (COUNT(?article) AS ?count)
WHERE {
?article wdt:P31/wdt:P279* wd:Q13442814
}
-----
SELECT (COUNT(DISTINCT ?article) AS ?count)
WHERE {?article wdt:P31/wdt:P279* wd:Q95074}
-----
SELECT (COUNT(?item) AS ?count)
WHERE { ?item wdt:P625 [] }
-----
SELECT ?cbdb ?idlabel ?value WITH {
SELECT DISTINCT ?item ?cbdb WHERE {
?item wdt:P497 ?cbdb
}
} AS %subquery WHERE {
INCLUDE %subquery .
?id wikibase:propertyType wikibase:ExternalId; wikibase:claim ?p; wikibase:statementProperty ?ps; wdt:P31/wdt:P279* wd:Q19595382 FILTER( ?id != wd:P497 ).
?item ?p [ ?ps ?value ] .
?id rdfs:label ?idlabel FILTER (lang(?idlabel) = "en").
} ORDER BY ?idlabel
-----
SELECT * WHERE {
?item wdt:P31 wd:Q125576
}
