In [1]:
import pandas as pd # Beautiful tool for data wrangling! e.g. '!pip install pandas' from a Notebook
# See https://mariadb.com/blog/how-connect-python-programs-mariadb e.g. '!pip install mysql' from Notebook
import MySQLdb 
import re
from collections import Counter
import os
from collections import OrderedDict
import random

pd.set_option("display.max_rows",35) # Useful when having large Pandas DataFrames like we do here

In [2]:
conn = MySQLdb.connect(user='mos', passwd='', db='monuments_db')
cursor = conn.cursor()

## Monuments_all

In [3]:
# Load full table into memory
sql = "SELECT * FROM monuments_all"
df = pd.io.sql.read_sql(conn.escape_string(sql), conn)

In [4]:
list(df.keys())

['country',
 'lang',
 'project',
 'id',
 'adm0',
 'adm1',
 'adm2',
 'adm3',
 'adm4',
 'name',
 'address',
 'municipality',
 'lat',
 'lon',
 'lat_int',
 'lon_int',
 'image',
 'commonscat',
 'source',
 'changed',
 'monument_article',
 'registrant_url']

In [5]:
country_codes = dict(df.country.value_counts()).keys()
print(len(country_codes))
print(country_codes)

77
dict_keys(['be-wal', 'bo', 'us', 'gb-sct', 'nl-prov', 'de-he', 'tn', 'pa', 'ch', 'aq', 'dk-bygning', 'be-vlg', 'az', 'ca', 'se-arbetsl', 'it-88', 'hk', 'cz', 'no', 'at', 've', 'fr-object', 'in', 'be-bru', 'se-fornmin', 'dk-fortids', 'lu', 'nl-gem', 'ad', 'se-bbr', 'gb-eng', 'de-nrw-bm', 'gh', 'cm', 'it', 'cl', 'us-ca', 'ro', 'pt', 'am', 'nl-aw', 'il', 'pk', 'jo', 'ch-old', 'cn', 'gb-wls', 'ua', 'ru', 'nl', 'th', 'fr', 'uy', 'za', 'sv', 'it-bz', 'ph', 'es', 'de-by', 'dz', 'se-ship', 'ke', 'mt', 'ee', 'co', 'rs', 'ar', 'pl', 'by', 'ie', 'np', 'hu', 'jp-nhs', 'mx', 'de-nrw-k', 'sk', 'gb-nir'])


In [6]:
for code in country_codes:
    country = df[df["country"] == code]
    columns = country.columns
    for column in columns:
        values = country[column]
        sample = values.sample(n=10)
        examples = sample.values
        #print(code, column, values.sample(n=3))
        print(list(examples))

['be-wal', 'be-wal', 'be-wal', 'be-wal', 'be-wal', 'be-wal', 'be-wal', 'be-wal', 'be-wal', 'be-wal']
['nl', 'fr', 'fr', 'fr', 'fr', 'en', 'en', 'nl', 'en', 'nl']
['wikipedia', 'wikipedia', 'wikipedia', 'wikipedia', 'wikipedia', 'wikipedia', 'wikipedia', 'wikipedia', 'wikipedia', 'wikipedia']
['57081-CLT-0146-01', '61031-CLT-0015-01', '92035-CLT-0007-01', '57081-CLT-0291-01', '62063-CLT-0031-01', '63023-CLT-0082-01', '53020-CLT-0001-01', '62009-PEX-0002-01', '25023-PEX-0001-01', '61079-CLT-0003-01']
['be', 'be', 'be', 'be', 'be', 'be', 'be', 'be', 'be', 'be']
['be-wal', 'be-wal', 'be-wal', 'be-wal', 'be-wal', 'be-wal', 'be-wal', 'be-wal', 'be-wal', 'be-wal']
['', '', '', '', 'be-wht', '', '', 'be-wlg', 'be-wlg', 'be-wlg']
['Ath', 'Luik', 'Bel\x9cil', 'Tournai', None, 'Bergen', 'Luik', 'Luik', None, 'Theux']
[None, None, None, None, None, None, None, None, None, None]
['Certaines parties de la ferme du château', 'Terril n° 1', 'Museum van Schone Kunsten van Enclos Saint-Martin', "Divers 

ValueError: Cannot take a larger sample than population when 'replace=False'

In [57]:
def create_column_tables(df):
    """Takes a Pandas DataFrame object and writes wikitables to files named after the table they are
    produced from to {the current Directory}/wikitables/
    """
    country_codes = dict(df.country.value_counts()).keys()
    for code in country_codes:
        h1 = "= Non-standardized fields from country " + code + " in monuments_all =\n"
        column_tables = [] 
        country = df[df["country"] == code]
        columns = country.columns
        page_sections = []
        for column in columns:
            # the listed fields we avoid are assumed to be standardized
            if column not in ["country","lang","project","changed","lat","lon","lat_int","lon_int"]:
                h2 = "== 10 random samples from field " + column + " ==\n"
                values = country[column]
                try:
                    sample = values.sample(n=10)
                    examples = sample.values
                except ValueError as e:
                    print("country: {} cannot be sampled: {}. Total objects is {}. Skipping country.".format(code, len(values), e))
                    break
    
                table_header = '{| class="wikitable" style="width: 675px;\n'
                table_name = '|+ '+code + "-" + column + " 10 random samples\n"
            
                # create table columns
                table_columns = "! scope='col' style='width: 225px;' |" + str(column) + "\n" + \
                "! scope='col' style='width: 225px; height: 20px;'|" + "Conversion \n" + \
                "! scope='col' style='width: 225px;' | Comment\n|-\n"
                
                table_rows = []
            
                for example in examples:
                    row="| style='height: 20px;'| " + str(example) + "\n|\n|\n|-\n"
                    table_rows.append(row)
    
                    table_rows_str = "".join(table_rows)
                    # Fill in examples values from the first record in the table
                    table_footer = "\n|}"
            else:
                continue
            
            column_tables.append(h2)    
            column_table = table_header + table_name + table_columns + table_rows_str[:-1] + table_footer
            column_tables.append(column_table)
            
        
        wikipage = h1 + "\n".join(column_tables)
        
        if os.path.isdir("./files"):
            #print(wikipage)
            out = open("./files/" + code + ".columntables","w")
            out.write(wikipage)
            out.flush()
            print("Directory ./files exists. Wrote file {}".format(out.name))
            out.close()
            
        else:
            os.mkdir("./files")
            with open("./files/" + code + ".columntables","w") as out:
                out.write(wikipage)
                print("./files doesn't exist")

In [58]:
create_column_tables(df)

Directory ./files exists. Wrote file ./files/be-wal.columntables
Directory ./files exists. Wrote file ./files/bo.columntables
Directory ./files exists. Wrote file ./files/us.columntables
Directory ./files exists. Wrote file ./files/gb-sct.columntables
Directory ./files exists. Wrote file ./files/nl-prov.columntables
Directory ./files exists. Wrote file ./files/de-he.columntables
Directory ./files exists. Wrote file ./files/tn.columntables
Directory ./files exists. Wrote file ./files/pa.columntables
Directory ./files exists. Wrote file ./files/ch.columntables
Directory ./files exists. Wrote file ./files/aq.columntables
Directory ./files exists. Wrote file ./files/dk-bygning.columntables
Directory ./files exists. Wrote file ./files/be-vlg.columntables
Directory ./files exists. Wrote file ./files/az.columntables
Directory ./files exists. Wrote file ./files/ca.columntables
Directory ./files exists. Wrote file ./files/se-arbetsl.columntables
Directory ./files exists. Wrote file ./files/it-8