In [5]:
# SPARQL query preparation
from SPARQLWrapper import SPARQLWrapper, JSON
endpoint = SPARQLWrapper("http://114.212.81.217:8890/sparql/")

In [11]:
def get_all_item_with_relations_list(r_list,r_type="temporal fact",result_path="",t_type="period"):
    relation_stat=[]
    for relation in r_list:
        # variables for statistics for each relation
        this_r_stat={}
        this_r_stat["r"]=relation
        cnt=0
        st_cnt=0
        en_cnt=0
        point_cnt=0
        st_en_pair_cnt=0

        # differrent query for different relation types
        if r_type=="temporal fact":
            query = '''
                PREFIX wd: <http://www.wikidata.org/entity/>
                PREFIX wdt: <http://www.wikidata.org/prop/direct/>
                SELECT DISTINCT ?e1 ?e2 ?st ?en ?time
                WHERE
                {{
                    ?e1 p:{} ?s.
                    ?s ps:{} ?e2.
                    ?s a <http://wikiba.se/ontology#Statement>.

                    {{
                        ?s pq:P580 ?st.
                        OPTIONAL {{?s pq:P582 ?en.}}
                        OPTIONAL {{?s pq:P585 ?time.}}
                    }}
                    UNION
                    {{
                        ?s pq:P582 ?en.
                        OPTIONAL {{?s pq:P580 ?st.}}
                        OPTIONAL {{?s pq:P585 ?time.}}
                    }}
                    UNION
                    {{
                        ?s pq:P585 ?time.
                        OPTIONAL {{?s pq:P580 ?st.}}
                        OPTIONAL{{?s pq:P582 ?en.}}
                    }}
                }}
                '''.format(relation,relation)
        elif r_type=="time property":
            query = '''
                PREFIX wd: <http://www.wikidata.org/entity/>
                PREFIX wdt: <http://www.wikidata.org/prop/direct/>
                SELECT DISTINCT ?e1 ?e2 ?st ?en
                WHERE
                {{
                    ?e1 p:{} ?s.
                    ?s ps:{} ?e2.

                    OPTIONAL {{?s pq:P580 ?st.}}
                    OPTIONAL {{?s pq:P582 ?en.}}
                }}
                '''.format(relation,relation)
        # print(query)
        endpoint.setQuery(query)
        endpoint.setReturnFormat(JSON)
        response = endpoint.query().convert()
        results = response['results']['bindings']
        print("%s Query complete!" % relation)

        # process query results
        query_res=[]
        for each_result in results:
            temporal_fact={}

            temporal_fact["s"]=each_result["e1"]["value"][31:]
            temporal_fact["p"]=relation

            # process time info
            # process point in time typed relation with "point in time"
            if ("time" in each_result) and (t_type=="point"):
                if each_result["time"]["type"]=="typed-literal":
                    if each_result["time"]["datatype"]!="http://www.w3.org/2001/XMLSchema#dateTime":
                        print("DatatypeAlert!type:%s",each_result["en"]["datatype"])
                    temporal_fact["t1"]=each_result["time"]["value"].split("T")[0]
                    temporal_fact["t2"]=each_result["time"]["value"].split("T")[0]
                else:
                    continue # unexcepted datatype of time found, drop it
            # process period typed relation
            elif t_type=="period":
                # first check if start time exists
                if "st" in each_result:
                    if each_result["st"]["type"]=="typed-literal":
                        if each_result["st"]["datatype"]!="http://www.w3.org/2001/XMLSchema#dateTime":
                            print("DatatypeAlert!type:%s",each_result["en"]["datatype"])
                        temporal_fact["t1"]=each_result["st"]["value"].split("T")[0]
                    else:
                        continue # unexcepted datatype of time found, drop it
                else:
                    temporal_fact["t1"]="null"

                # then check if end time exists
                if "en" in each_result:
                    if each_result["en"]["type"]=="typed-literal":
                        if each_result["en"]["datatype"]!="http://www.w3.org/2001/XMLSchema#dateTime":
                            print("DatatypeAlert!type:%s",each_result["en"]["datatype"])
                        temporal_fact["t2"]=each_result["en"]["value"].split("T")[0]
                    else:
                        continue
                else:
                    temporal_fact["t2"]="null" # unexcepted datatype of time found, drop it

                # none of start and end time exists, use point of time
                if (temporal_fact["t1"]=="null") and (temporal_fact["t2"]=="null"):
                    if "time" in each_result:
                        if each_result["time"]["type"]=="typed-literal":
                            if each_result["time"]["datatype"]!="http://www.w3.org/2001/XMLSchema#dateTime":
                                print("DatatypeAlert!type:%s",each_result["en"]["datatype"])
                            temporal_fact["t1"]=each_result["time"]["value"].split("T")[0]
                            temporal_fact["t2"]=each_result["time"]["value"].split("T")[0]
                        else:
                            continue # unexcepted datatype of time found, drop it
                    elif r_type=="temporal fact":
                        continue # point in time also not exists

            # process object(tail entity) info
            # for temporal facts
            if each_result["e2"]["type"]=="uri":
                temporal_fact["o"]=each_result["e2"]["value"][31:]
            #for time property
            elif each_result["e2"]["type"]=="typed-literal":
                if each_result["e2"]["datatype"]!="http://www.w3.org/2001/XMLSchema#dateTime":
                    print("DatatypeAlert!type:%s",each_result["en"]["datatype"])
                temporal_fact["o"]=each_result["e2"]["value"].split("T")[0]
                # if temporal_fact["t1"]=="null" and temporal_fact["t2"]=="null":
                temporal_fact["t1"]=temporal_fact["o"]
                temporal_fact["t2"]=temporal_fact["o"]
            else:
                continue # for unexcepted datatype of object

            if "st" in each_result:
                st_cnt+=1
            if "en" in each_result:
                en_cnt+=1
            if ("st" in each_result)and("en" in each_result):
                st_en_pair_cnt+=1
            if "time" in each_result:
                point_cnt+=1
            cnt+=1
            query_res.append(temporal_fact)

        this_r_stat["cnt"]=cnt
        this_r_stat["st_cnt"]=st_cnt
        this_r_stat["en_cnt"]=en_cnt
        this_r_stat["st_en_pair_ent"]=st_en_pair_cnt
        this_r_stat["point_cnt"]=point_cnt
        relation_stat.append(this_r_stat)
        f=open("%s/%sres.csv" %(result_path,relation),"w")
        for tf in query_res:
            f.write("%s,%s,%s,%s,%s\n" %(tf["s"],tf["p"],tf["o"],tf["t1"],tf["t2"]))
        f.close()
    return relation_stat

In [7]:
# import optional relation info
tr_file=open("property final.tsv")
line=tr_file.readline()
line=tr_file.readline()
optional_relation_list=[]
while line:
    relation=line.split('\t')[0][32:-1]
    optional_relation_list.append(relation)
    line=tr_file.readline()
# print(optional_relation_list)

In [8]:
# required relations
common_relation_list=['P26', 'P108', 'P54', 'P286']
timeobj_relation_list=['P569', 'P570']


In [9]:
# phase 1 for temporal relations in required relations (all period)
stat_info=get_all_item_with_relations_list(common_relation_list,"temporal fact","raw/required/common/")
print("Phase 1 done.")

statfilename="PHASE1_RELATION_STAT.txt"
f=open(statfilename,"w")
f.write("?relation\t?count\t?st_count\t?en_count\t?st_en_pairs_count\t?point_count\n")
for this_r_stat in stat_info:
    f.write("%s\t%d\t%d\t%d\t%d\t%d\n" %(this_r_stat["r"],this_r_stat["cnt"],this_r_stat["st_cnt"],this_r_stat["en_cnt"],this_r_stat["st_en_pair_ent"],this_r_stat["point_cnt"]))
f.close()

P26 Query complete!
P108 Query complete!
P54 Query complete!
P286 Query complete!
Phase 1 done.


In [12]:
# phase 2 for time property in required relations
stat_info=get_all_item_with_relations_list(timeobj_relation_list,"time property","raw/required/time_prop")
print("Phase 2 done.")

statfilename="PHASE2_RELATION_STAT.txt"
f=open(statfilename,"w")
f.write("?relation\t?count\t?st_count\t?en_count\t?st_en_pairs_count\t?point_count\n")
for this_r_stat in stat_info:
    f.write("%s\t%d\t%d\t%d\t%d\t%d\n" %(this_r_stat["r"],this_r_stat["cnt"],this_r_stat["st_cnt"],this_r_stat["en_cnt"],this_r_stat["st_en_pair_ent"],this_r_stat["point_cnt"]))
f.close()

P569 Query complete!
P570 Query complete!
Phase 2 done.


In [None]:
# phase 3 for all optional relations (suppose they are all period)
stat_info=get_all_item_with_relations_list(optional_relation_list,"temporal fact","raw/optional/common")
print("Phase 3 done.")

statfilename="PHASE3_RELATION_STAT.txt"
f=open(statfilename,"w")
f.write("?relation\t?count\t?st_count\t?en_count\t?st_en_pairs_count\t?point_count\n")
for this_r_stat in stat_info:
    f.write("%s\t%d\t%d\t%d\t%d\t%d\n" %(this_r_stat["r"],this_r_stat["cnt"],this_r_stat["st_cnt"],this_r_stat["en_cnt"],this_r_stat["st_en_pair_ent"],this_r_stat["point_cnt"]))
f.close()

In [13]:
# merge files into dataset
import os
dir_list=["raw/required/common/","raw/required/time_prop/"]
r_list=[]
dataset_name="required_relations"
dataset_version="alpha-1"
merged_f=open(dataset_name+"_"+dataset_version+".csv","w")
for each_dir in dir_list:
    all_obj=os.listdir(each_dir)
    for each_obj in all_obj:
        if each_obj.endswith("res.csv"):
            read_f=open(each_dir+each_obj,"r")
            file_content=read_f.read()
            merged_f.write(file_content)
            read_f.close()
            print(each_dir+each_obj)
merged_f.close()


raw/required/common/P26res.csv
raw/required/common/P108res.csv
raw/required/common/P286res.csv
raw/required/common/P54res.csv
raw/required/time_prop/P570res.csv
raw/required/time_prop/P569res.csv


In [None]:
#test temporal fact query sparql
relation="P1789"
query = '''
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    SELECT DISTINCT ?e1 ?e2 ?st ?en
    WHERE
    {{
        ?e1 p:{} ?s.
        ?s ps:{} ?e2.

        {{?s pq:P580 ?st.
        OPTIONAL {{?s pq:P582 ?en.}}}}
        UNION
        {{?s pq:P582 ?en.
        OPTIONAL {{?s pq:P580 ?st.}}}}
    }}
    '''.format(relation,relation)
# print(query)
endpoint.setQuery(query)
endpoint.setReturnFormat(JSON)
response = endpoint.query().convert()
results = response['results']['bindings']
print(results)
query_res=[]
print("%s Query complete!" % relation)
for each_result in results:
    temporal_fact={}
    temporal_fact["s"]=each_result["e1"]["value"][31:]
    temporal_fact["p"]=relation
    temporal_fact["o"]=each_result["e2"]["value"][31:]
    
    if "st" in each_result:
        if each_result["st"]["type"]=="typed-literal":
            if each_result["st"]["datatype"]!="http://www.w3.org/2001/XMLSchema#dateTime":
                print("DatatypeAlert!type:%s",each_result["en"]["datatype"])
            temporal_fact["t1"]=each_result["st"]["value"].split("T")[0]
        else:
            continue
    else:
        temporal_fact["t1"]="null"
    if "en" in each_result:
        if each_result["en"]["type"]=="typed-literal":
            if each_result["en"]["datatype"]!="http://www.w3.org/2001/XMLSchema#dateTime":
                print("DatatypeAlert!type:%s",each_result["en"]["datatype"])
            temporal_fact["t2"]=each_result["en"]["value"].split("T")[0]
        else:
            continue
    else:
        temporal_fact["t2"]="null"
    query_res.append(temporal_fact)

# f=open("%sres.csv" %relation,"w")
# for tf in query_res:
#     f.write("%s,%s,%s,%s,%s\n" %(tf["s"],tf["p"],tf["o"],tf["t1"],tf["t2"]))
# f.close()
# print(query_res)


In [None]:
#test time property query sparql
relation="P569"
query = '''
        PREFIX wd: <http://www.wikidata.org/entity/>
        PREFIX wdt: <http://www.wikidata.org/prop/direct/>
        SELECT  ?e1 \"{}\" ?time
        WHERE
        {{
            ?e1 wdt:{} ?time.
        }}
    '''.format(relation,relation)
# print(query)
endpoint.setQuery(query)
endpoint.setReturnFormat(JSON)
response = endpoint.query().convert()
results = response['results']['bindings']
print(results)
query_res=[]
print("%s Query complete!" % relation)
for each_result in results:
    if "literal" in each_result["time"]["type"]:
        temporal_fact={}
        temporal_fact["s"]=each_result["e1"]["value"][31:]
        temporal_fact["p"]=each_result["callret-1"]["value"]
        temporal_fact["o"]=each_result["time"]["value"]
        temporal_fact["t1"]=each_result["time"]["value"]
        temporal_fact["t2"]=each_result["time"]["value"]
        query_res.append(temporal_fact)
# f=open("%sres.csv" %relation,"w")
# for tf in query_res:
#     f.write("%s,%s,%s,%s,%s\n" %(tf["s"],tf["p"],tf["o"],tf["t1"],tf["t2"]))
# f.close()
# print(query_res)