# Wikidata enrichment

This notebook converts the IMKG graph to KGTK format and queries relevant entities in Wikidata to enrich IMKG with Wikidata knowledge.

## 0. Setup

In [1]:
import os
import os.path

import pandas

from kgtk.configure_kgtk_notebooks import ConfigureKGTK
from kgtk.functions import kgtk, kypher

In [2]:
# Parameters

# input_path is handed to ConfigureKGTK as the input graph folder; output_path and
# project_name are handed to it as the output folder and project name.
input_path = "wikidata"
output_path = "projects"
project_name = "tutorial-kypher"

In [5]:
# IMKG input files, all located under imkg_dir
imkg_dir = 'imkg02'
instances_file = f'{imkg_dir}/instances.imgflip.nt'
templates_file = f'{imkg_dir}/templates.kym.nt'
instance_entities_file = f'{imkg_dir}/instances.entities.csv'
templates_entities_file = f'{imkg_dir}/templates.entities.csv'

In [3]:
# Passed to ConfigureKGTK; presumably selects which of the large Wikidata files
# to set up -- TODO confirm against the KGTK documentation.
big_files=["label"]

# Extra Wikidata files, keyed by the short name under which each one is exposed
# after configuration (e.g. "items" -> claims.wikibase-item.tsv.gz).
additional_files = {
    "P31": "derived.P31.tsv.gz",
    "items": "claims.wikibase-item.tsv.gz",
    "P1963": "derived.P1963computed.count.star.tsv.gz",
    "external": "claims.external-id.tsv.gz",
    "indegree": "metadata.in_degree.tsv.gz",
    "outdegree": "metadata.out_degree.tsv.gz",
    "pagerank": "metadata.pagerank.directed.tsv.gz"
}

# Configure the session from the folders and project name set in the parameters cell.
ck = ConfigureKGTK(big_files)
ck.configure_kgtk(input_graph_path=input_path, 
                  output_path=output_path, 
                  project_name=project_name,
                  additional_files=additional_files)

User home: /Users/filipilievski
Current dir: /Users/filipilievski/mcs/imkg
KGTK dir: /Users/filipilievski/mcs
Use-cases dir: /Users/filipilievski/mcs/use-cases


In [4]:
# Show the variables ($TEMP, $OUT, $items, ...) that the kgtk commands below refer to.
ck.print_env_variables()

USE_CASES_DIR: /Users/filipilievski/mcs/use-cases
STORE: projects/tutorial-kypher/temp.tutorial-kypher/wikidata.sqlite3.db
kgtk: kgtk
KGTK_GRAPH_CACHE: projects/tutorial-kypher/temp.tutorial-kypher/wikidata.sqlite3.db
KGTK_OPTION_DEBUG: false
OUT: projects/tutorial-kypher
kypher: kgtk query --graph-cache projects/tutorial-kypher/temp.tutorial-kypher/wikidata.sqlite3.db
EXAMPLES_DIR: /Users/filipilievski/mcs/examples
TEMP: projects/tutorial-kypher/temp.tutorial-kypher
GRAPH: wikidata
KGTK_LABEL_FILE: wikidata/labels.en.tsv.gz
label: wikidata/labels.en.tsv.gz
P31: wikidata/derived.P31.tsv.gz
items: wikidata/claims.wikibase-item.tsv.gz
P1963: wikidata/derived.P1963computed.count.star.tsv.gz
external: wikidata/claims.external-id.tsv.gz
indegree: wikidata/metadata.in_degree.tsv.gz
outdegree: wikidata/metadata.out_degree.tsv.gz
pagerank: wikidata/metadata.pagerank.directed.tsv.gz


In [6]:
!tail -1 "imkg02/templates.kym.nt"  

<http://www.wikidata.org/entity/Q4810998> <https://www.wikidata.org/wiki/Property:P646> "/m/0b96rw" .


In [7]:
!wc -l "imkg02/templates.kym.nt"  

 4709548 imkg02/templates.kym.nt


In [9]:
!wc -l "imkg02/instances.imgflip.nt"  

 17011388 imkg02/instances.imgflip.nt


## 1. Import into KGTK

Define namespaces to make the import-ntriples command work:

In [80]:
# Namespace URI (kept inside double quotes, exactly as it is written to the file)
# mapped to the short prefix that should stand for it.
namespaces = {
    '"http://www.wikidata.org/entity/"': 'wde',
    '"https://www.wikidata.org/wiki/"': 'wdp',
    '"https://knowyourmeme.com/memes/"': 'kym',
    '"http://www.w3.org/2000/01/rdf-schema#"': 'rdfs',
    '"http://www.w3.org/1999/02/22-rdf-syntax-ns#"': 'rdf',
    '"http://dbpedia.org/resource/"': 'dbr',
    '"https://meme4.science/"': 'm4s',
    # NOTE(review): capital "H", unlike every other URI here -- confirm it
    # matches the URIs as they appear in the .nt files.
    '"Http://xmlns.com/foaf/0.1/"': 'foaf',
    '"https://knowyourmeme.com/types/"': 'kymt',
    '"https://dbpedia.org/property/"': 'dbp',
    '"https://dbpedia.org/ontology/"': 'dbo',
    '"https://schema.org/"': 'schema',
    '"https://imgflip.com/"': 'imgflip',
}

# Write one "<prefix> prefix_expansion <uri>" edge per namespace under a
# node1/label/node2 header.
prop = 'prefix_expansion'
namespace_rows = [
    '\t'.join((prefix, prop, uri)) + '\n'
    for uri, prefix in namespaces.items()
]
with open('namespaces.tsv', 'w') as w:
    w.write('node1\tlabel\tnode2\n')
    w.writelines(namespace_rows)

In [81]:
# Print the namespace file that was just written.
!cat namespaces.tsv

node1	label	node2
wde	prefix_expansion	"http://www.wikidata.org/entity/"
wdp	prefix_expansion	"https://www.wikidata.org/wiki/"
kym	prefix_expansion	"https://knowyourmeme.com/memes/"
rdfs	prefix_expansion	"http://www.w3.org/2000/01/rdf-schema#"
rdf	prefix_expansion	"http://www.w3.org/1999/02/22-rdf-syntax-ns#"
dbr	prefix_expansion	"http://dbpedia.org/resource/"
m4s	prefix_expansion	"https://meme4.science/"
foaf	prefix_expansion	"Http://xmlns.com/foaf/0.1/"
kymt	prefix_expansion	"https://knowyourmeme.com/types/"
dbp	prefix_expansion	"https://dbpedia.org/property/"
dbo	prefix_expansion	"https://dbpedia.org/ontology/"
schema	prefix_expansion	"https://schema.org/"
imgflip	prefix_expansion	"https://imgflip.com/"


In [25]:
%%time
kgtk("""
    import-ntriples -i "imkg02/templates.kym.nt"  
        --namespace-file "namespaces.tsv"
        -o $TEMP/raw_templates.kgtk.gz
        --namespace-id-use-uuid True 
        --build-new-namespaces False
        --output-only-used-namespaces True 
        --structured-value-label m4s:structured_value 
        --structured-uri-label m4s:structured_uri 
        --newnode-prefix node 
        --newnode-use-uuid True
    """)

CPU times: user 46.3 ms, sys: 49.8 ms, total: 96.1 ms
Wall time: 2min 1s


In [83]:
# Folder holding the intermediate files; must equal the TEMP value printed by
# ck.print_env_variables().
temp_dir = 'projects/tutorial-kypher/temp.tutorial-kypher'

# Raw import-ntriples outputs and the cleaned edge files derived from them.
templates_fn = '%s/raw_templates.kgtk.gz' % temp_dir
templates_out = '%s/clean_templates.tsv' % temp_dir
instances_fn = '%s/raw_instances.kgtk.gz' % temp_dir
instances_out = '%s/clean_instances.tsv' % temp_dir

# dtype=str + keep_default_na=False keep every cell exactly as it is in the file:
# values such as "NA" or "null" are not turned into NaN (which would later be
# written out as the literal text "nan") and numbers are not re-formatted.
templates_df = pandas.read_csv(templates_fn, sep='\t', dtype=str, keep_default_na=False)

# raw_instances.kgtk.gz is produced by the instances import-ntriples cell further
# down, so on a fresh top-to-bottom run it does not exist yet. Load it only when
# it is already there; otherwise leave instances_df as None.
if os.path.exists(instances_fn):
    instances_df = pandas.read_csv(instances_fn, sep='\t', dtype=str, keep_default_na=False)
else:
    instances_df = None

In [28]:
def replace_me(n):
    """Strip the Wikidata namespace markers from a single cell value.

    Every occurrence of 'wdp:Property:', 'wdp:', 'wdt:' and 'wde:' is removed,
    in that order, so 'wdp:Property:P646' becomes 'P646' and 'wde:Q1' becomes 'Q1'.

    Args:
        n: a cell value. Usually a string, but any other object is accepted
            (for example the float NaN pandas uses for missing cells).

    Returns:
        The stripped string, or ``n`` itself, unchanged, when it is not a string.
    """
    # Explicit type check instead of a bare ``except:``, which would also have
    # swallowed KeyboardInterrupt and hidden genuine programming errors.
    if not isinstance(n, str):
        return n
    # NOTE(review): the markers are removed wherever they occur in the value,
    # not only at its start -- confirm that is intended for literal values.
    for marker in ('wdp:Property:', 'wdp:', 'wdt:', 'wde:'):
        n = n.replace(marker, '')
    return n

In [29]:
def clean_df(df, filename):
    """Write the edges in ``df`` to ``filename`` as a tab-separated file with
    the Wikidata namespace markers stripped.

    Args:
        df: DataFrame with exactly three columns, read positionally as
            node1, label and node2.
        filename: path of the file to write; an existing file is overwritten.

    Every value goes through replace_me() and then str(); the rows are written
    below a node1/label/node2 header line.
    """
    # NOTE(review): a missing cell (float NaN) is written as the literal text
    # "nan" because of str() -- confirm whether that is acceptable downstream.
    with open(filename, 'w', encoding='utf-8') as out:
        out.write('node1\tlabel\tnode2\n')
        # Rows are streamed straight to the file as plain tuples: no per-row
        # Series objects and no in-memory copy of the whole table.
        for n1, label, n2 in df.itertuples(index=False, name=None):
            cleaned = (str(replace_me(n1)), str(replace_me(label)), str(replace_me(n2)))
            out.write('\t'.join(cleaned) + '\n')

We do some cleaning of the graph (<font color='red'>Could this be done with KGTK clean or something?</font>)

In [30]:
# Strip the Wikidata prefixes from the raw templates and write them to templates_out.
clean_df(templates_df, templates_out)

In [32]:
# Deduplicate the cleaned templates file into $TEMP/templates.kgtk.gz.
!kgtk deduplicate -i $TEMP/clean_templates.tsv -o $TEMP/templates.kgtk.gz

Now that we have imported and deduplicated the templates, let's run some sanity checks to make sure our graph is reasonable.

In [33]:
# Display the deduplicated templates graph.
# NOTE(review): `cat` returns every edge to the notebook; `head` may be enough for a quick look.
kgtk("""cat -i $TEMP/templates.kgtk.gz""")

Unnamed: 0,node1,label,node2
0,L492950-S6,P646,/m/01j3sz
1,Q1,P646,/m/07v7c
2,Q100,P646,/m/01cx_
3,Q1000,P646,/m/03548
4,Q10000,P646,/m/04s5c9
...,...,...,...
4715174,nodeejVELtbjj8NrLjNkkVJgqh-999,m4s:structured_uri,http://www.w3.org/2001/XMLSchema#timestamp
4715175,nodeejVELtbjj8NrLjNkkVJgqh-999,m4s:structured_value,1311452326
4715176,rdf,prefix_expansion,http://www.w3.org/1999/02/22-rdf-syntax-ns#
4715177,wde,prefix_expansion,http://www.wikidata.org/entity/


How many memes do we have in the graph?

In [34]:
# Count the distinct node1 values that have an rdf:type edge to kym:Meme.
!kgtk query -i $TEMP/templates.kgtk.gz \
    --match '(n1)-[r:`rdf:type`]->(:`kym:Meme`)' \
    --return 'count(distinct n1)'

count(DISTINCT graph_1_c1."node1")
2343


What are the relations?

In [35]:
# List the distinct edge labels used in the templates graph.
!kgtk query -i $TEMP/templates.kgtk.gz \
    --match '()-[r]->()' \
    --return 'distinct r.label as Relation'

Relation
P646
P6760
dbp:confidence
kym:about
kym:added
kym:children
kym:from
kym:last_update_source
kym:origin
kym:parent
kym:sibling
kym:spread
kym:status
kym:tag
kym:title
kym:year
m4s:fromAbout
m4s:fromImage
m4s:fromImageEntities
m4s:fromImageLabels
m4s:fromTags
m4s:structured_uri
m4s:structured_value
prefix_expansion
rdf:type


Now let's import the instances file into KGTK. This command takes around 6 minutes on my laptop, and while it ran I could not tell whether it was still working or had got stuck somewhere.
<font color='red'>A progress bar would really help when working with long-running commands.</font>

In [None]:
%%time
kgtk("""
    import-ntriples -i "imkg02/instances.imgflip.nt"  
        --namespace-file "namespaces.tsv"
        -o $TEMP/raw_instances.kgtk.gz
        --namespace-id-use-uuid True 
        --build-new-namespaces False
        --output-only-used-namespaces True 
        --structured-value-label m4s:structured_value 
        --structured-uri-label m4s:structured_uri 
        --newnode-prefix node 
        --newnode-use-uuid True
    """)

In [None]:
# Load the freshly imported instances right before cleaning them, so this step
# never works on a DataFrame that was read before the import-ntriples cell ran.
# dtype=str + keep_default_na=False keep every cell exactly as written in the
# file (no "NA"/"null" -> NaN conversion, no numeric re-formatting).
instances_df = pandas.read_csv(instances_fn, sep='\t', dtype=str, keep_default_na=False)
clean_df(instances_df, instances_out)

In [None]:
# Deduplicate the cleaned instances file into $TEMP/instances.kgtk.gz.
!kgtk deduplicate -i $TEMP/clean_instances.tsv -o $TEMP/instances.kgtk.gz

In [None]:
# Look at the first rows of the raw (not yet cleaned) instances import; --debug is
# kept because it is part of the error report described below.
kgtk("""head --debug -i $TEMP/raw_instances.kgtk.gz""")

<font color='red'>Error</font>
What I did:
1. Ran import-ntriples on the instances .nt file (17M lines).
2. It worked well and I could `cat` the top 10 lines, but I saw that there was a prefix that I had not specified.
3. Then I added the prefix to the namespaces file.
4. import-ntriples ran successfully again.
5. `cat` now failed, and the error message was not helpful. I tried adding the `--debug` flag, but this did not help me understand the problem better.

Let's validate that the import worked well:

In [None]:
# Count the distinct node1 values in the instances graph that have an rdf:type edge to kym:Meme.
!kgtk query -i $TEMP/instances.kgtk.gz \
    --match '(n1)-[r:`rdf:type`]->(:`kym:Meme`)' \
    --return 'count(distinct n1)'

In [None]:
# List the distinct edge labels used in the instances graph.
!kgtk query -i $TEMP/instances.kgtk.gz \
    --match '()-[r]->()' \
    --return 'distinct r.label as Relation'

## 2. Enrich with Wikidata now

### 2a. Data where memes are subjects or objects

We start with relations where the meme Qnode is a subject in Wikidata:

In [43]:
# For every node that has a P6760 edge in the templates graph, fetch the statements
# from $items in which that node is node1, and store them as $TEMP/wikidata_sub.kgtk.gz.
!kgtk query -i $items -i $TEMP/templates.kgtk.gz \
    --match 'templates: (meme_qid)-[:P6760]->(), \
            item: (meme_qid)-[mrel]->(mval)' \
    --return 'meme_qid as node1, mrel.label as label, mval as node2' \
    -o $TEMP/wikidata_sub.kgtk.gz

In [44]:
# count(n1) counts the matched edges (one per statement), not the distinct subjects.
!kgtk query -i $TEMP/wikidata_sub.kgtk.gz \
    --match '(n1)-[r]->()' \
    --return 'count (n1)'

count(graph_4_c1."node1")
1397


We get 1,397 Wikidata statements in which a meme is the subject (the query counts matching statements, not distinct memes). What about memes as objects?

In [48]:
# Same as the subject query, but with the P6760-bearing node in the node2 position
# of the $items statement; results go to $TEMP/wikidata_obj.kgtk.gz.
!kgtk query -i $items -i $TEMP/templates.kgtk.gz \
    --match 'templates: (meme_qid)-[:P6760]->(), \
            item: (mval)-[mrel]->(meme_qid)' \
    --return 'mval as node1, mrel.label as label, meme_qid as node2' \
    -o $TEMP/wikidata_obj.kgtk.gz

Combine and deduplicate:

In [49]:
# Concatenate the subject and object results and deduplicate them into $TEMP/wikidata_memes.kgtk.gz.
!kgtk cat -i $TEMP/wikidata_sub.kgtk.gz -i $TEMP/wikidata_obj.kgtk.gz / deduplicate -o $TEMP/wikidata_memes.kgtk.gz

In [50]:
# Number of edges in the combined meme statements file.
!kgtk query -i $TEMP/wikidata_memes.kgtk.gz \
    --match '(n1)-[]->()' \
    --return 'count(n1)'

count(graph_5_c1."node1")
1587


In [52]:
# Show ten edges of the combined meme statements file.
!kgtk query -i $TEMP/wikidata_memes.kgtk.gz \
    --match '(n1)-[]->()' \
    --limit 10

node1	label	node2
P1651	P1855	Q5230628
P21	P1855	Q7714263
P7946	P1855	Q7714263
Q101833802	P822	Q98951569
Q102240167	P793	Q17521923
Q104005472	P1080	Q87609688
Q104005472	P1340	Q17122834
Q104005472	P1441	Q83279
Q104005472	P21	Q6581097
Q104005472	P31	Q15711870


We now combine Wikidata information with the original graph:

In [53]:
# Concatenate the Wikidata meme statements with the templates graph and deduplicate
# the result into $TEMP/templates_with_wd.kgtk.gz.
!kgtk cat -i $TEMP/wikidata_memes.kgtk.gz -i $TEMP/templates.kgtk.gz / deduplicate -o $TEMP/templates_with_wd.kgtk.gz

### 2b. Data about Wikidata entities from the combined graph

Next, let's obtain data about the other Qnodes that are not memes. Here, we want to get relations for Qnode pairs in our graph in Wikidata. So, we query Wikidata for statements where both node1 and node2 are in our graph:

In [66]:
# "ss": $items statements whose node1 and node2 both occur as node1 (subject) of some
# edge in the combined graph.
!kgtk query -i $items -i $TEMP/templates_with_wd.kgtk.gz \
    --match 'item: (qnode1)-[mrel]->(qnode2), \
            template: (qnode1)-[]->(), (qnode2)-[]->()' \
    --return 'qnode1 as node1, mrel.label as label, qnode2 as node2' \
    / deduplicate -o $TEMP/ss.tsv.gz

In [67]:
# "so": $items statements whose node1 occurs as a subject and whose node2 occurs as an
# object (node2) in the combined graph.
!kgtk query -i $items -i $TEMP/templates_with_wd.kgtk.gz \
    --match 'item: (qnode1)-[mrel]->(qnode2), \
            template: (qnode1)-[]->(), ()-[]->(qnode2)' \
    --return 'qnode1 as node1, mrel.label as label, qnode2 as node2' \
    / deduplicate -o $TEMP/so.tsv.gz

In [68]:
# "oo": $items statements whose node1 and node2 both occur as an object (node2) in the
# combined graph.
!kgtk query -i $items -i $TEMP/templates_with_wd.kgtk.gz \
    --match 'item: (qnode1)-[mrel]->(qnode2), \
            template: ()-[]->(qnode1), ()-[]->(qnode2)' \
    --return 'qnode1 as node1, mrel.label as label, qnode2 as node2' \
    / deduplicate -o $TEMP/oo.tsv.gz

In [69]:
# "os": $items statements whose node1 occurs as an object and whose node2 occurs as a
# subject in the combined graph.
!kgtk query -i $items -i $TEMP/templates_with_wd.kgtk.gz \
    --match 'item: (qnode1)-[mrel]->(qnode2), \
            template: ()-[]->(qnode1), (qnode2)-[]->()' \
    --return 'qnode1 as node1, mrel.label as label, qnode2 as node2' \
    / deduplicate -o $TEMP/os.tsv.gz

In [70]:
# Number of edges in the subject/subject result.
!kgtk query -i $TEMP/ss.tsv.gz \
    --match '(n1)-[r]->()' \
    --return 'count (n1)'

count(graph_11_c1."node1")
26319266


In [71]:
# Number of edges in the subject/object result.
!kgtk query -i $TEMP/so.tsv.gz \
    --match '(n1)-[r]->()' \
    --return 'count (n1)'

count(graph_12_c1."node1")
11731891


In [72]:
# Number of edges in the object/subject result.
!kgtk query -i $TEMP/os.tsv.gz \
    --match '(n1)-[r]->()' \
    --return 'count (n1)'

count(graph_9_c1."node1")
168219


In [73]:
# Number of edges in the object/object result.
!kgtk query -i $TEMP/oo.tsv.gz \
    --match '(n1)-[r]->()' \
    --return 'count (n1)'

count(graph_10_c1."node1")
60206


In [74]:
# Show ten edges of the object/object result.
!kgtk query -i $TEMP/oo.tsv.gz \
    --match '(n1)-[r]->()' \
    --limit 10

node1	label	node2
P2013	P17	Q30
P2013	P1855	Q383541
P2013	P9073	Q355
Q1	P1552	Q11412
Q1	P2670	Q6999
Q1	P2670	Q79925
Q1	P3113	Q2051667
Q1	P527	Q133327
Q1	P793	Q323
Q1	P828	Q323


In [75]:
# Concatenate the four pair results and deduplicate them into $TEMP/wikidata_ent.kgtk.gz.
!kgtk cat -i $TEMP/ss.tsv.gz $TEMP/so.tsv.gz $TEMP/os.tsv.gz $TEMP/oo.tsv.gz / deduplicate -o $TEMP/wikidata_ent.kgtk.gz

In [78]:
# Number of edges in the combined, deduplicated entity file.
!kgtk query -i $TEMP/wikidata_ent.kgtk.gz \
    --match '(n1)-[]->(n2)' \
    --return 'count(n1)'

count(graph_14_c1."node1")
27892223
