### Process WikiGazetteer

WikiGazetteer is a gazetteer based on Wikipedia and enriched with Geonames data.

To build a WikiGazetteer (stored in a MySQL database) for a specific Wikipedia version, follow [these instructions](https://github.com/Living-with-machines/lwm_GIR19_resolving_places/tree/master/gazetteer_construction).

This notebook takes the relevant fields in the WikiGazetteer MySQL database and creates a more manageable pickle file.


In [30]:
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import mysql.connector
from mysql.connector import Error
import pandas as pd
from wikimapper import WikiMapper
import bz2
import json
import pandas as pd
import pydash
import time

In [2]:
def wikigazExtract(language, dbname, *, host='localhost',
                   user='testGazetteer', password='1234'):
    """Export the WikiGazetteer tables of one database to a pickle file.

    Joins the ``altname``, ``location`` and ``inlinks`` tables and writes
    one row per alternate name to ``wikigaz_<language>_inlinks.pkl`` in the
    current working directory, with the columns ``name``, ``wikititle``,
    ``latitude``, ``longitude``, ``source``, ``loctype``, ``pagelen`` and
    ``inlinks``.

    Args:
        language: Language code, used only to build the output file name.
        dbname: Name of the MySQL database holding the gazetteer.
        host: MySQL host; defaults to the previously hard-coded value.
        user: MySQL user; defaults to the previously hard-coded value.
        password: MySQL password; defaults to the previously hard-coded
            value.

    Raises:
        mysql.connector.Error: If the connection cannot be established or
            the query fails. The connection error is printed before being
            re-raised, instead of surfacing later as an unrelated
            ``AttributeError`` on a placeholder cursor.
    """
    try:
        gazDB = mysql.connector.connect(
            host=host,
            database=dbname,
            user=user,
            password=password)
    except Error as e:
        print("Error while connecting to MySQL", e)
        raise

    # (DataFrame column, key in the result row) pairs, in output order.
    columns = [
        ("name", "altname"),
        ("wikititle", "wiki_title"),
        ("latitude", "lat"),
        ("longitude", "lon"),
        ("source", "source"),
        ("loctype", "type"),
        ("pagelen", "page_len"),
        ("inlinks", "inlinks"),
    ]

    try:
        cursorGaz = gazDB.cursor(dictionary=True)
        try:
            # Both joins are inner joins, so locations without a row in
            # `inlinks` are not exported.
            cursorGaz.execute("""
                select altname.altname, location.wiki_title, location.lat, location.lon, altname.source, location.type, location.page_len, inlinks.inlinks from altname
                join location on location.id=altname.main_id
                join inlinks on location.id=inlinks.main_id
            """)
            results = cursorGaz.fetchall()
        finally:
            cursorGaz.close()
    finally:
        # Always release the connection, even when the query fails.
        if gazDB.is_connected():
            gazDB.close()

    # Store the relevant metadata into a pickle, one column per field.
    wg = pd.DataFrame(
        {column: [row[key] for row in results] for column, key in columns})
    wg.to_pickle("wikigaz_" + language + "_inlinks.pkl")

In [3]:
# Export the English gazetteer; the calls for the other languages are kept
# below, commented out.
wikigazExtract("en", "wikiGazetteer")
# wikigazExtract("es", "wikiGazES")
# wikigazExtract("el", "wikiGazEL")

In [5]:
# Load the pickle written by wikigazExtract("en", ...) back into a DataFrame.
wg = pd.read_pickle("wikigaz_en_inlinks.pkl")

In [9]:
# Distinct Wikipedia page titles referenced by the gazetteer rows.
wg_titles = wg["wikititle"].unique()

In [11]:
# Number of distinct Wikipedia titles in the gazetteer.
len(wg_titles)

1144016

In [None]:
# Build two lookup tables between Wikipedia titles and the IDs returned by
# the WikiMapper index: title -> [ids] and id -> [titles].
# NOTE(review): title_to_id presumably returns None for titles missing from
# the index, in which case all such titles are grouped under the None key —
# confirm against the wikimapper documentation.
mapper = WikiMapper("../gazetteer/index_enwiki-latest.db")

pedia_data_dict = {}
data_pedia_dict = {}
i = 0  # ends up holding the number of titles processed
for i, title in enumerate(wg_titles, start=1):
    wikidata_id = mapper.title_to_id(title)
    pedia_data_dict.setdefault(title, []).append(wikidata_id)
    data_pedia_dict.setdefault(wikidata_id, []).append(title)

# for p in pedia_data_dict:
#     print(p, pedia_data_dict[p])
# print()
    
# for p in data_pedia_dict:
#     print(p, data_pedia_dict[p])

In [25]:
# Number of distinct keys returned by the mapper; lower than the number of
# titles whenever several titles share the same key.
len(data_pedia_dict)

1139778

In [31]:
def wikidata(filename):
    """Lazily yield the records of a bz2-compressed JSON dump.

    The file is read as text, line by line. The first two characters are
    skipped, a trailing comma and newline are stripped from every line, and
    each remaining line is parsed as one JSON value. Lines that are not
    valid JSON are skipped silently.

    Args:
        filename: Path to the ``.bz2`` file.

    Yields:
        The decoded JSON value of each parsable line.
    """
    # Decode explicitly as UTF-8 (the encoding of JSON dumps) rather than
    # relying on the platform's default text encoding.
    with bz2.open(filename, mode='rt', encoding='utf-8') as f:
        # Skip the first two characters: the opening bracket of the
        # enclosing array and its newline.
        f.read(2)
        for line in f:
            try:
                yield json.loads(line.rstrip(',\n'))
            except json.decoder.JSONDecodeError:
                # Not a record, e.g. the closing bracket of the array.
                continue

In [32]:
# Set of every key of data_pedia_dict, for constant-time membership tests.
wikidata_ids_set = set(data_pedia_dict)

In [36]:
# IDs still to be found in the dump. Iterating the dict yields its keys, so
# no intermediate list is needed.
wikidata_ids_set = set(data_pedia_dict)
# A None key (titles for which the mapper returned no ID) can never match a
# dump record; left in the set it would keep the set non-empty forever and
# disable the early exit below.
wikidata_ids_set.discard(None)

start_time = time.time()
i = 0  # number of gazetteer entities found in the dump so far
for record in wikidata('../gazetteer/latest-all.json.bz2'):
    if record['id'] in wikidata_ids_set:
        # The per-record extraction is not implemented yet; earlier
        # experiments read these paths with pydash.get:
        #   'claims.P17[0].mainsnak.datavalue.value.id'
        #   'claims.P30[0].mainsnak.datavalue.value.id'
        wikidata_ids_set.remove(record['id'])
        i += 1
        if i % 5000 == 0:
            print(i)  # progress report
    # Stop reading the dump as soon as every wanted ID has been seen.
    if not wikidata_ids_set:
        break

print(i)
print("--- %s seconds ---" % (time.time() - start_time))

5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000
65000
70000
75000
80000
85000
90000
95000
100000
105000
110000
115000
120000
125000
130000
135000
140000
145000
150000
155000
160000
165000
170000
175000
180000
185000
190000
195000
200000
205000
210000
215000
220000
225000
230000
235000
240000
245000
250000
255000
260000
265000
270000
275000
280000
285000
290000
295000
300000
305000
310000
315000
320000
325000
330000
335000
340000
345000
350000
355000
360000
365000
370000
375000
380000
385000
390000
395000
400000
405000
410000
415000
420000
425000
430000
435000
440000
445000
450000
455000
460000
465000
470000
475000
480000
485000
490000
495000
500000
505000
510000
515000
520000
525000
530000
535000
540000
545000
550000
555000
560000
565000
570000
575000
580000
585000
590000
595000
600000
605000
610000
615000
620000
625000
630000
635000
640000
645000
650000
655000
660000
665000
670000
675000
680000
685000
690000
695000
700000
705000
710000
715000
720000
725000
73000