### Process WikiGazetteer

WikiGazetteer is a gazetteer based on Wikipedia and enriched with Geonames data.

To build a WikiGazetteer (stored in a MySQL database) for a specific Wikipedia version, follow [these instructions](https://github.com/Living-with-machines/lwm_GIR19_resolving_places/tree/master/gazetteer_construction).

This notebook extracts the relevant fields from the WikiGazetteer MySQL database and stores them in a more manageable pickle file (one per language).


In [1]:
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import mysql.connector
from mysql.connector import Error
import pandas as pd

In [2]:
def wikigazExtract(language, dbname, host='localhost', user='testGazetteer', password='1234'):
    """Export the basic fields of a WikiGazetteer MySQL database to a pickle.

    Joins the ``altname`` and ``location`` tables, keeps the alternate names
    coming from the selected sources, and writes the result as a pandas
    DataFrame to ``../../../resources/wikiGaz_<language>_basic.pkl`` with the
    columns ``name``, ``wikititle``, ``latitude``, ``longitude``, ``source``,
    ``loctype`` and ``pagelen``.

    Args:
        language: Language code used in the output file name (e.g. ``"en"``).
        dbname: Name of the MySQL database to read from.
        host: MySQL server host.
        user: MySQL user name.
        password: MySQL password. The default is the local test credential;
            pass a real one explicitly rather than editing it in here.

    Returns:
        None. The DataFrame is written to disk.

    Raises:
        mysql.connector.Error: If the connection or the query fails. The
            connection error is also printed before being re-raised.
    """
    # Keys of each result row (the cursor returns dicts) -> pickle column names.
    columns = {
        'altname': 'name',
        'wiki_title': 'wikititle',
        'lat': 'latitude',
        'lon': 'longitude',
        'source': 'source',
        'type': 'loctype',
        'page_len': 'pagelen',
    }

    # Access wikigazetteer database. Re-raise on failure so the real cause is
    # reported instead of an unrelated error further down.
    try:
        gazDB = mysql.connector.connect(
                host=host,
                database=dbname,
                user=user,
                password=password)
    except Error as e:
        print("Error while connecting to MySQL", e)
        raise

    # Query database; cursor and connection are released even if the query fails.
    try:
        cursorGaz = gazDB.cursor(dictionary=True)
        try:
            # NOTE(review): MySQL LENGTH() counts bytes, not characters, so names
            # with multi-byte characters are cut off at fewer than 30 characters.
            # Confirm whether CHAR_LENGTH() was intended.
            cursorGaz.execute("""
            select altname.altname, location.wiki_title, location.lat, location.lon, altname.source, location.type, location.page_len from altname
            join location on location.id=altname.main_id
            where altname.source IN ('wikimain', 'geonamesmain', 'geonamesascii', 'geonamesalt', 'wikiredirect')
            and LENGTH(altname) < 30
        """)
            results = cursorGaz.fetchall()
        finally:
            cursorGaz.close()
    finally:
        if gazDB.is_connected():
            gazDB.close()

    # Store relevant metadata into pkl
    wg = pd.DataFrame({
        new_name: [row[db_key] for row in results]
        for db_key, new_name in columns.items()
    })
    wg.to_pickle("../../../resources/wikiGaz_" + language + "_basic.pkl")

In [3]:
# Export one basic gazetteer pickle per (language code, MySQL database) pair.
for lang_code, db_name in (
    ("en", "wikiGazetteer"),
    ("es", "wikiGazES"),
    ("el", "wikiGazEL"),
):
    wikigazExtract(lang_code, db_name)

#### Explore resulting gazetteers

In [4]:
# NOTE(review): despite the `wges` name, this loads the English ("en")
# gazetteer pickle written by wikigazExtract.
wges = pd.read_pickle("../../../resources/wikiGaz_en_basic.pkl")

In [5]:
# All gazetteer rows whose (alternate) name is exactly "Barcelona".
wges.loc[wges["name"] == "Barcelona"]

Unnamed: 0,name,wikititle,latitude,longitude,source,loctype,pagelen
61435,Barcelona,"Barcelona,_Arkansas",35.6206,-94.4561,wikimain,city,927
152751,Barcelona,"Blooming_Grove,_Ohio",40.7078,-82.7167,geonamesalt,city,2863
391252,Barcelona,"Barcelona,_Rio_Grande_do_Norte",-5.93333,-35.9333,wikimain,city,4941
496148,Barcelona,Barcelona_(Parliament_of_Catalonia_constituency),41.45,2.08333,wikimain,,90111
542288,Barcelona,"Barcelona,_Cornwall",50.3552,-4.5047,wikimain,,2841
617597,Barcelona,Barcelona_(Congress_of_Deputies_constituency),41.45,2.08333,wikimain,,108024
1478146,Barcelona,Barcelona,41.3833,2.18333,wikimain,city,158048
1506863,Barcelona,Province_of_Barcelona,41.45,2.08333,geonamesalt,adm2nd,6523
1578593,Barcelona,"Barcelona,_Venezuela",10.1167,-64.7167,wikimain,city,27014
1591196,Barcelona,"Barcelona,_Sorsogon",12.87,124.13,wikimain,,10035


In [6]:
# All alternate names recorded for the Wikipedia page titled "Barcelona".
wges.loc[wges["wikititle"] == "Barcelona"]

Unnamed: 0,name,wikititle,latitude,longitude,source,loctype,pagelen
1478146,Barcelona,Barcelona,41.3833,2.18333,wikimain,city,158048
1478147,Barcino,Barcelona,41.3833,2.18333,geonamesalt,city,158048
1478148,Bartzelona,Barcelona,41.3833,2.18333,geonamesalt,city,158048
1478149,Barzelona,Barcelona,41.3833,2.18333,geonamesalt,city,158048
1478150,Barcelono,Barcelona,41.3833,2.18333,geonamesalt,city,158048
1478151,Barcelone,Barcelona,41.3833,2.18333,geonamesalt,city,158048
1478152,Barselóna,Barcelona,41.3833,2.18333,geonamesalt,city,158048
1478153,Barcellona,Barcelona,41.3833,2.18333,geonamesalt,city,158048
1478154,Barselona,Barcelona,41.3833,2.18333,geonamesalt,city,158048
1478155,Barcillona,Barcelona,41.3833,2.18333,geonamesalt,city,158048


In [7]:
# Size of the loaded gazetteer as (number of rows, number of columns).
wges.shape

(2401260, 7)

In [8]:
# Number of distinct names in the gazetteer.
len(wges["name"].unique())

2104201

In [9]:
# Number of distinct Wikipedia titles in the gazetteer.
len(wges["wikititle"].unique())

1087620

In [10]:
# Distinct values of the `source` column.
wges["source"].unique()

array(['wikimain', 'geonamesalt', 'geonamesmain', 'geonamesascii',
       'wikiredirect'], dtype=object)