In [9]:
import bz2
import json
import pandas as pd
import pydash
import time

In [10]:
def wikidata(filename):
    with bz2.open(filename, mode='rt') as f:
        f.read(2) # skip first two bytes: "{\n"
        for line in f:
            try:
                yield json.loads(line.rstrip(',\n'))
            except json.decoder.JSONDecodeError:
                continue

In [11]:
languages = ['en', 'cy', 'sco', 'gd', 'ga', 'kw']

In [12]:
comp_start_time = time.time()

df_record_all = pd.DataFrame(columns=['wikidata_id', 'aliases', 'english_label', 'latitude', 'longitude', 'description_set', 'wikititle', 'continents', 'adm_regions', 'countries', 'near_water', 'os_grid_ref', 'located_on_street', 'instance_of', 'coord_bbox', 'capital_of', 'inception_date', 'dissolved_date', 'follows', 'replaces', 'population_dict', 'hcounties', 'vob_placeIDs', 'vob_unitIDs', 'epns', 'geonamesIDs', 'toIDs', 'vchID', 'borders_with', 'demonyms'])

i = 0
for record in wikidata('../gazetteer/latest-all.json.bz2'):
    # only extract items with geographical coordinates (P625)
    if pydash.has(record, 'claims.P625'):
        
        # Wikidata ID:
        wikidata_id = record['id']
        
        # Aliases and labels:
        aliases = pydash.get(record, 'aliases')
        labels = pydash.get(record, 'labels')
        alias_dict = dict()
        for x in aliases:
            if x in languages or x.startswith('en-'):
                for y in aliases[x]:
                    if not y["value"].isupper() and not y["value"].islower() and any(x.isalpha() for x in y["value"]):
                        if x in alias_dict:
                            if not y["value"] in alias_dict[x]:
                                alias_dict[x].append(y["value"])
                        else:
                            alias_dict[x] = [y["value"]]
        for x in labels:
            if x in languages or x.startswith('en-'):
                if not labels[x]["value"].isupper() and not labels[x]["value"].islower() and any(z.isalpha() for z in labels[x]["value"]):
                    if x in alias_dict:
                        if not labels[x]["value"] in alias_dict[x]:
                            alias_dict[x].append(labels[x]["value"])
                    else:
                        alias_dict[x] = [labels[x]["value"]]
        
        # Main label:
        english_label = pydash.get(record, 'labels.en.value')
        
        # Latitude and longitude:
        latitude = pydash.get(record, 'claims.P625[0].mainsnak.datavalue.value.latitude')
        longitude = pydash.get(record, 'claims.P625[0].mainsnak.datavalue.value.longitude')
        
        # Descriptions in English:
        description_set = set()
        descriptions = pydash.get(record, 'descriptions')
        for x in descriptions:
            if x == 'en' or x.startswith('en-'):
                description_set.add(descriptions[x]['value'])
        
        # English Wikipedia title:
        wikititle = pydash.get(record, 'sitelinks.enwiki.title')
        
        # Continents (Wikidata ID)
        continent_dict = pydash.get(record, 'claims.P30')
        continents = None
        if continent_dict:
            continents = [r["mainsnak"]["datavalue"]["value"]["id"] for r in continent_dict]

        # Located in adminitrative territorial entities (Wikidata ID)
        adm_regions_dict = pydash.get(record, 'claims.P131')
        adm_regions = dict()
        if adm_regions_dict:
            for r in adm_regions_dict:
                regname = pydash.get(r, 'mainsnak.datavalue.value.id')
                if regname:
                    entity_start_time = pydash.get(r, 'qualifiers.P580[0].datavalue.value.time')
                    entity_end_time = pydash.get(r, 'qualifiers.P582[0].datavalue.value.time')
                    adm_regions[regname] = (entity_start_time, entity_end_time)

        # Country: sovereign state of this item
        country_dict = pydash.get(record, 'claims.P17')
        countries = dict()
        if country_dict:
            for r in country_dict:
                countryname = pydash.get(r, 'mainsnak.datavalue.value.id')
                if countryname:
                    entity_start_time = pydash.get(r, 'qualifiers.P580[0].datavalue.value.time')
                    entity_end_time = pydash.get(r, 'qualifiers.P582[0].datavalue.value.time')
                    countries[countryname] = (entity_start_time, entity_end_time)
        
        # Nearby waterbodies (Wikidata ID)
        near_water_dict = pydash.get(record, 'claims.206')
        near_water = None
        if near_water_dict:
            near_water = [r["mainsnak"]["datavalue"]["value"]["id"] for r in near_water_dict]
        
        # OS grid reference (Wikidata ID)
        os_grid_ref = pydash.get(record, 'claims.613[0].mainsnak.datavalue.value')
        
        # Located on street
        located_on_street = pydash.get(record, 'claims.669[0].mainsnak.datavalue.value.id')
        
        # Location is instance of
        instance_of_dict = pydash.get(record, 'claims.P31')
        instance_of = None
        if instance_of_dict:
            instance_of = [r["mainsnak"]["datavalue"]["value"]["id"] for r in instance_of_dict]
            
        northernmost_lat = pydash.get(record, 'claims.P1332[0].mainsnak.datavalue.value.latitude')
        northernmost_lon = pydash.get(record, 'claims.P1332[0].mainsnak.datavalue.value.longitude')
            
        southernmost_lat = pydash.get(record, 'claims.P1333[0].mainsnak.datavalue.value.latitude')
        southernmost_lon = pydash.get(record, 'claims.P1333[0].mainsnak.datavalue.value.longitude')
            
        easternmost_lat = pydash.get(record, 'claims.P1334[0].mainsnak.datavalue.value.latitude')
        easternmost_lon = pydash.get(record, 'claims.P1334[0].mainsnak.datavalue.value.longitude')
            
        westernmost_lat = pydash.get(record, 'claims.P1335[0].mainsnak.datavalue.value.latitude')
        westernmost_lon = pydash.get(record, 'claims.P1335[0].mainsnak.datavalue.value.longitude')
        
        # Location is capital of
        capital_of_dict = pydash.get(record, 'claims.P1376')
        capital_of = None
        if capital_of_dict:
            capital_of = [r["mainsnak"]["datavalue"]["value"]["id"] for r in capital_of_dict]
            
        # Inception: date or point in time when the subject came into existence as defined
        inception_date = pydash.get(record, 'claims.P571[0].mainsnak.datavalue.value.time')
            
        # Dissolved, abolished or demolished: point in time at which the subject ceased to exist
        dissolved_date = pydash.get(record, 'claims.P576[0].mainsnak.datavalue.value.time')
        
        # Follows...: immediately prior item in a series of which the subject is a part: e.g. Vanuatu follows New Hebrides
        follows_dict = pydash.get(record, 'claims.P155')
        follows = []
        if follows_dict:
            for f in follows_dict:
                follows.append(pydash.get(f, 'mainsnak.datavalue.value.id'))
        
        # Replaces...: item replaced: e.g. New Hebrides is replaced by 
        replaces_dict = pydash.get(record, 'claims.P1365')
        replaces = []
        if replaces_dict:
            for r in replaces_dict:
                replaces.append(pydash.get(r, 'mainsnak.datavalue.value.id'))
            
        # Population at: dictionary of year-population pairs
        population_dump = pydash.get(record, 'claims.P1082')
        population_dict = dict()
        if population_dump:
            for ppl in population_dump:
                pop_amount = pydash.get(ppl, 'mainsnak.datavalue.value.amount')
                pop_time = pydash.get(ppl, 'qualifiers.P585[0].datavalue.value.time')
                pop_time = "UNKNOWN" if not pop_time else pop_time
                population_dict[pop_time] = pop_amount
        
        # Historical counties
        hcounties_dict = pydash.get(record, 'claims.P7959')
        hcounties = []
        if hcounties_dict:
            hcounties = [pydash.get(hc, 'mainsnak.datavalue.value.id') for hc in hcounties_dict]
        
        # Vision of Britain place ID: identifier of a place
        vob_placeID_dict = pydash.get(record, 'claims.P3616')
        vob_placeIDs = []
        if vob_placeID_dict:
            vob_placeIDs = [pydash.get(vobid, 'mainsnak.datavalue.value') for vobid in vob_placeID_dict]
            
        # Vision of Britain unit ID: identifier of an administrative unit
        vob_unitID_dict = pydash.get(record, 'claims.P3615')
        vob_unitIDs = dict()
        if vob_unitID_dict:
            for vobid in vob_unitID_dict:
                unit_id = pydash.get(vobid, 'mainsnak.datavalue.value')
                parish_name = pydash.get(vobid, 'qualifiers.P1810[0].datavalue.value')
                vob_unitIDs[unit_id] = parish_name
                
        # Identifier for a place in the Historical Gazetteer of England's Place Names website
        epns_dict = pydash.get(record, 'claims.P3627')
        epns = []
        if epns_dict:
            epns = [pydash.get(p, 'mainsnak.datavalue.value') for p in epns_dict]
            
        # Geonames ID
        geonamesID_dict = pydash.get(record, 'claims.P1566')
        geonamesIDs = []
        if geonamesID_dict:
            geonamesIDs = [pydash.get(gn, 'mainsnak.datavalue.value') for gn in geonamesID_dict]
            
        # TOID: TOpographic IDentifier assigned by the Ordnance Survey to identify a feature in Great Britain
        toID_dict = pydash.get(record, 'claims.P3120')
        toIDs = []
        if toID_dict:
            toIDs = [pydash.get(t, 'mainsnak.datavalue.value') for t in toID_dict]
            
        # British History Online VCH ID: identifier of a place, in the British History Online digitisation of the Victoria County History
        vchID_dict = pydash.get(record, 'claims.P3628')
        vchIDs = []
        if vchID_dict:
            vchIDs = [pydash.get(t, 'mainsnak.datavalue.value') for t in vchID_dict]
            
        # Shares border with:
        shares_border_dict = pydash.get(record, 'claims.P47')
        borders = []
        if shares_border_dict:
            borders = [pydash.get(t, 'mainsnak.datavalue.value.id') for t in shares_border_dict]
        
        # Demonyms:
        demonyms_dict = pydash.get(record, 'claims.P1549')
        demonyms = []
        if demonyms_dict:
            for d in demonyms_dict:
                demlang = pydash.get(d, 'mainsnak.datavalue.value.language')
                if demlang == "en" or demlang.startswith("en-"):
                    demtext = pydash.get(d, 'mainsnak.datavalue.value.text')
                    demonyms.append(demtext)
            if not demonyms:
                demtext = pydash.get(record, 'claims.P1549[0].mainsnak.datavalue.value.text')
                demonyms.append(demtext)

        df_record = {'wikidata_id':wikidata_id, 'aliases':alias_dict, 'english_label':english_label, 'latitude':latitude, 'longitude':longitude, 'description_set':description_set, 'wikititle':wikititle, 'continents':continents, 'adm_regions':adm_regions, 'countries':countries, 'near_water':near_water, 'os_grid_ref':os_grid_ref, 'located_on_street':located_on_street, 'instance_of':instance_of, 'coord_bbox': ((northernmost_lat, northernmost_lon), (southernmost_lat, southernmost_lon), (easternmost_lat, easternmost_lon), (westernmost_lat, westernmost_lon)), 'capital_of':capital_of, 'inception_date':inception_date, 'dissolved_date':dissolved_date, 'follows':follows, 'replaces':replaces, 'population_dict':population_dict, 'hcounties':hcounties, 'vob_placeIDs':vob_placeIDs, 'vob_unitIDs':vob_unitIDs, 'epns':epns, 'geonamesIDs':geonamesIDs, 'toIDs':toIDs, 'vchIDs':vchIDs, 'borders_with':borders, 'demonyms':demonyms}
        df_record_all = df_record_all.append(df_record, ignore_index=True)
                       
        i += 1
    if i >= 10000:
        break

print('\nAll items finished, final CSV exported!')
print("--- %s seconds ---" % (time.time() - comp_start_time))


All items finished, final CSV exported!
--- 209.23083400726318 seconds ---


In [22]:
df_record_all.iloc[100:150]

Unnamed: 0,wikidata_id,aliases,english_label,latitude,longitude,description_set,wikititle,continents,adm_regions,countries,...,hcounties,vob_placeIDs,vob_unitIDs,epns,geonamesIDs,toIDs,vchID,borders_with,demonyms,vchIDs
100,Q2032,"{'en': ['District of Luxembourg', 'Luxembourg ...",Luxembourg District,49.616667,6.066667,{one of three districts of the Grand Duchy of ...,Luxembourg District,,"{'Q32': (None, None)}","{'Q32': (None, None)}",...,[],[],{},[],[2960314],[],,"[Q1126, Q12626, Q12652]",[],[]
101,Q2059,"{'en': ['Gare de Lyon-Gorge-de-Loup'], 'en-ca'...",Gare de Lyon-Gorge-de-Loup,45.766089,4.804792,"{railway station Lyon, France}",Lyon-Gorge-de-Loup station,,"{'Q456': (None, None)}","{'Q142': (None, None)}",...,[],[],{},[],[],[],,[],[],[]
102,Q2073,"{'en': ['Republic of Komi', 'Komi', 'Komi Repu...",Komi Republic,64.283333,54.466667,"{federal subject of Russia, republic of Russia}",Komi Republic,,"{'Q159': ('+1991-12-26T00:00:00Z', None), 'Q21...","{'Q159': (None, None)}",...,[],[],{},[],[545854],[],,"[Q1875, Q2164, Q5824, Q6407, Q6320, Q5462, Q54...",[],[]
103,Q2096,"{'en': ['Edmonton, Alberta', 'Edmonton, AB', '...",Edmonton,53.533333,-113.5,"{capital city of province of Alberta, Canada}",Edmonton,,"{'Q1951': ('+1905-09-01T00:00:00Z', None), 'Q2...","{'Q16': (None, None)}",...,[],[],{},[],[5946768],[],,"[Q939127, Q3476253, Q253903, Q7628998, Q100740...",[Edmontonian],[]
104,Q2100,"{'en': ['Duisburg'], 'en-ca': ['Duisburg'], 'e...",Duisburg,51.432222,6.761111,"{city in Germany, city in North Rhine-Westphal...",Duisburg,[Q5401],"{'Q7926': ('+1815-00-00T00:00:00Z', None)}","{'Q183': (None, None), 'Q7318': ('+1933-00-00T...",...,[],[],{},[],[3231901],[],,"[Q1718, Q6245, Q2838, Q2899, Q6253, Q2805, Q41...",[],[]
105,Q2109,"{'sco': ['Arica an Parinacota Region', 'Arica ...",Arica y Parinacota Region,-18.475,-70.314444,{administrative division in Chile},Arica y Parinacota Region,,"{'Q298': (None, None)}","{'Q298': (None, None)}",...,[],[],{},[],[6693562],[],,"[Q2114, Q207413, Q272784, Q1061368]",[ariqueño],[]
106,Q2114,"{'en': ['Tarapaca Region', 'Tarapacá Region'],...",Tarapacá Region,-20.283333,-69.333333,{administrative division in Chile},Tarapacá Region,,"{'Q298': (None, None)}","{'Q298': (None, None)}",...,[],[],{},[],[3870116],[],,"[Q2109, Q2118, Q1061368, Q238079]",[tarapaqueño],[]
107,Q2119,"{'en': ['Mannheim'], 'en-ca': ['Mannheim'], 'e...",Mannheim,49.487778,8.466111,"{city in Baden-Württemberg, Germany}",Mannheim,,"{'Q8165': ('+1973-01-01T00:00:00Z', None), 'Q1...","{'Q183': ('+1990-10-03T00:00:00Z', None), 'Q43...",...,[],[],{},[],[2873891],[],,"[Q2910, Q2966, Q8177, Q7917, Q6905, Q8554]",[],[]
108,Q2132,"{'en': ['Victoria, Canada', 'Victoria, BC', 'V...",Victoria,48.422151,-123.3657,"{capital city of province of British Columbia,...","Victoria, British Columbia",,"{'Q550219': ('+1966-00-00T00:00:00Z', None), '...","{'Q16': (None, None)}",...,[],[],{},[],[6174041],[],,[Q179553],[Victorian],[]
109,Q2137,"{'en': ['Totma'], 'en-ca': ['Totma'], 'en-gb':...",Totma,59.983333,42.766667,"{town in Vologda Oblast, Russia}",Totma,,"{'Q1655824': ('+1929-01-14T00:00:00Z', '+2005-...","{'Q159': (None, None)}",...,[],[],{},[],[481960],[],,[],[],[]


In [21]:
df_record_all.iloc[5]

wikidata_id                                                       Q163
aliases              {'en-gb': ['God's own county', 'Yorkshire'], '...
english_label                                                Yorkshire
latitude                                                       53.9583
longitude                                                     -1.08333
description_set                           {historic county of England}
wikititle                                                    Yorkshire
continents                                                       [Q46]
adm_regions                                      {'Q21': (None, None)}
countries                                       {'Q145': (None, None)}
near_water                                                        None
os_grid_ref                                                       None
located_on_street                                                 None
instance_of                                      [Q1138494, Q67376938]
coord_

In [None]:
# Geographical entities shouldn't have...
# author

In [None]:
### What to do wiht astronomical locations?
### What to do with locations with no label in english: e.g. Q2039165
### I'm on... P4091

### Check source for specific entity on Wikidata

In [65]:
from wikidata.client import Client
client = Client()
entity = client.get('Q23276', load=True)

In [66]:
print(entity.__dict__)

{'id': 'Q23276', 'client': wikidata.client.Client('https://www.wikidata.org/'), 'data': {'pageid': 26680, 'ns': 0, 'title': 'Q23276', 'lastrevid': 1222044747, 'modified': '2020-07-01T14:23:29Z', 'type': 'item', 'id': 'Q23276', 'labels': {'en-gb': {'language': 'en-gb', 'value': 'Surrey'}, 'en': {'language': 'en', 'value': 'Surrey'}, 'en-ca': {'language': 'en-ca', 'value': 'Surrey'}, 'af': {'language': 'af', 'value': 'Surrey'}, 'ang': {'language': 'ang', 'value': 'Sūþrīge'}, 'ar': {'language': 'ar', 'value': 'سري'}, 'ast': {'language': 'ast', 'value': 'Surrey'}, 'bg': {'language': 'bg', 'value': 'Съри'}, 'br': {'language': 'br', 'value': 'Surrey'}, 'ca': {'language': 'ca', 'value': 'Surrey'}, 'cs': {'language': 'cs', 'value': 'Surrey'}, 'cy': {'language': 'cy', 'value': 'Surrey'}, 'da': {'language': 'da', 'value': 'Surrey'}, 'de': {'language': 'de', 'value': 'Surrey'}, 'el': {'language': 'el', 'value': 'Σάρρεϋ'}, 'eo': {'language': 'eo', 'value': 'Surrey'}, 'es': {'language': 'es', 'valu

### Tests

In [None]:
set_q = {"Q31","Q51", "Q45", "Q148", "Q19251485"}

i = 0
for record in wikidata('../gazetteer/latest-all.json.bz2'):
    if record['id'] in set_q:
        print(pydash.get(record, 'claims.P17[0].mainsnak.datavalue.value.id'))
        print(pydash.get(record, 'claims.P30[0].mainsnak.datavalue.value.id'))
        print()
    i += 1
    break

In [None]:
import time
from qwikidata.entity import WikidataItem
from qwikidata.json_dump import WikidataJsonDump
from qwikidata.utils import dump_entities_to_json

# create an instance of WikidataJsonDump
wjd_dump_path = "../gazetteer/latest-all.json.bz2"
wjd = WikidataJsonDump(wjd_dump_path)
# create an iterable of WikidataItem representing politicians
politicians = []
t1 = time.time()
for ii, entity_dict in enumerate(wjd):
    if entity_dict["id"] in set_q:
        print(entity_dict)
#         entity = WikidataItem(entity_dict)
#         if entity['id'] in set_q:
#             print(pydash.get(record, 'labels.en.value'))
#     if ii % 1000 == 0:
#         t2 = time.time()
#         dt = t2 - t1
#         print(entity['id'], ii, ii / dt)
#     if ii > 10000:
#         break

### SPARQLWrapper

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON
import time

start_time = time.time()

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setQuery("""SELECT ?work ?workLabel
WHERE
{
  ?work wdt:P31/wdt:P279* wd:Q838948.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". }
}""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

for result in results["results"]["bindings"]:
    print(result)
    
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# pip install sparqlwrapper
# https://rdflib.github.io/sparqlwrapper/

import sys
from SPARQLWrapper import SPARQLWrapper, JSON

endpoint_url = "https://query.wikidata.org/sparql"

query = """SELECT DISTINCT ?city ?cityLabel ?loc WHERE {
  ?city wdt:P31/wdt:P279* wd:Q486972 .
  ?city wdt:P17 wd:Q145 .
  ?city wdt:P625 ?loc .
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "en" .
  }
}"""


def get_results(endpoint_url, query):
    user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
    # TODO adjust user agent; see https://w.wiki/CX6
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()


results = get_results(endpoint_url, query)

for result in results["results"]["bindings"]:
    print(result)

In [None]:
### All subclasses of human settlement

In [None]:
# pip install sparqlwrapper
# https://rdflib.github.io/sparqlwrapper/

import sys
from SPARQLWrapper import SPARQLWrapper, JSON

endpoint_url = "https://query.wikidata.org/sparql"

query = """SELECT ?item ?itemLabel ?linkTo ?linkToLabel {
  ?item wdt:P279* wd:Q27096213
  OPTIONAL { ?item wdt:P279 ?linkTo }
  SERVICE wikibase:label {bd:serviceParam wikibase:language "en" }
}"""


def get_results(endpoint_url, query):
    user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
    # TODO adjust user agent; see https://w.wiki/CX6
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()

results = get_results(endpoint_url, query)

In [None]:
for result in results["results"]["bindings"]:
    print(result)

In [None]:
# pip install sparqlwrapper
# https://rdflib.github.io/sparqlwrapper/

import sys
from SPARQLWrapper import SPARQLWrapper, JSON

endpoint_url = "https://query.wikidata.org/sparql"

query = """SELECT ?street ?streetLabel ?cityLabel ?loc ?osmid ?countryLabel
WHERE
{
    ?street (p:P31/ps:P31/wdt:P279*) wd:Q34442 .
    ?street wdt:P131 ?city .
    ?city wdt:P17 ?country .
    VALUES ?country { wd:Q145 wd:Q27 } .
    OPTIONAL { ?street wdt:P625 ?loc . }
    OPTIONAL { ?street wdt:P402 ?osmid . }
    SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}"""


def get_results(endpoint_url, query):
    user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
    # TODO adjust user agent; see https://w.wiki/CX6
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()

results = get_results(endpoint_url, query)

In [None]:
for result in results["results"]["bindings"]:
    print(result)

In [None]:
len(results["results"]["bindings"])