In [None]:
# special IPython command to prepare the notebook for matplotlib
%matplotlib inline 

from collections import defaultdict

import pandas as pd
import matplotlib.pyplot as plt
import requests
from pattern import web



## Fetching population data from Wikipedia

In this example we will fetch data about countries and their population from Wikipedia.

http://en.wikipedia.org/wiki/List_of_countries_by_past_and_future_population has several tables for individual countries and subcontinents, as well as for different years. We will combine the data for all countries and all years in a single pandas DataFrame and visualize the change in population for different countries.

### We will go through the following steps:
* fetching html with embedded data
* parsing html to extract the data
* collecting the data in a pandas DataFrame
* displaying the data

We will also show the different sub-steps that can be taken to reach the presented solution.

## Fetching the Wikipedia site

In [None]:
# The page is read from a local copy of the html.
# To download it instead, use:
#   url = 'http://en.wikipedia.org/wiki/List_of_countries_by_past_and_future_population'
#   website_html = requests.get(url).text
# The with-block closes the file as soon as its content has been read.
with open('countries_by_present_past_population.html', 'r') as f:
    website_html = f.read()
# print(website_html)  # uncomment to inspect the raw html

## Parsing html data

In [None]:
def get_population_html_tables(html):
    """Return the html tables holding the wikipedia population data.

    ``html`` is parsed into a DOM with ``web.Element`` and the elements
    selected by ``by_class('sortable wikitable')`` are returned.
    """
    # Step 0: look at the html source to see how the tables are marked up.
    # Step 1: one could start by collecting all tables ...
    # Step 2: ... but only the tables we care about are selected here,
    #         identified by their class.
    return web.Element(html).by_class('sortable wikitable')

tables = get_population_html_tables(website_html)
print("table length: %d" % len(tables))
# Show the attributes of every table that was found.
for t in tables:
    # print(t)  # uncomment to print the whole table element
    print(t.attributes)


In [None]:
def table_type(tbl):
    """Return the type of a table: the content of its first ``th`` cell."""
    header_cells = tbl('th')
    first_header = header_cells[0]
    return first_header.content
    
#   for tb in tbl('th'):
#       print tb.content
#   print thl   
#   return 'table'

# Group the tables by their type.
#
# A defaultdict(list) inserts an empty list the first time a new key is
# accessed, so a table can be appended without checking for the key first.
# See https://docs.python.org/2/library/collections.html#collections.defaultdict
#
# With a plain dict the same grouping needs an explicit membership test:
# for tbl in tables:
#     typ = table_type(tbl)
#     if typ in tables_by_type:
#         tables_by_type[typ].append(tbl)
#     else:
#         tables_by_type[typ] = [tbl]
tables_by_type = defaultdict(list)

for tbl in tables:
    # print(tbl)
    typ = table_type(tbl)
    tables_by_type[typ].append(tbl)

print(tables_by_type)

In [None]:
## Extracting data and filling it into a dictionary

In [None]:
def _parse_int(text):
    """Return ``text`` as an int (thousands separators removed), or None if it is not an integer."""
    try:
        return int(text.replace(',', '').strip())
    except (AttributeError, ValueError):
        return None


def get_countries_population(tables):
    """Extract population data for countries from all tables and store it in a dictionary.

    Parameters
    ----------
    tables : iterable of table elements
        Each table is called as ``tbl('tr')`` to obtain its rows. The first
        row is read as the header (``th`` cells), the remaining rows as data
        (``td`` cells). The country name is the content of the first link
        (``a``) inside the first ``td`` of a row.

    Returns
    -------
    defaultdict(dict)
        ``{country_name: {year: population}}`` with int years and int
        populations. Data for the same country from several tables is merged.

    Notes
    -----
    Tables without rows, rows without ``td`` cells, rows whose first cell has
    no link, and cells that do not hold an integer are skipped instead of
    raising an error.
    """
    result = defaultdict(dict)

    for tbl in tables:
        table_rows = tbl('tr')
        if not table_rows:
            continue  # nothing to read from a table without rows

        # Header cells whose content is an integer are the year columns.
        # Keep the column position together with the year; the position is
        # used below to pick the matching data cell of every row.
        year_columns = []
        for idx, header_cell in enumerate(table_rows[0]('th')):
            year = _parse_int(header_cell.content)
            if year is not None:
                year_columns.append((idx, year))

        for row in table_rows[1:]:
            tds = row('td')
            if not tds:
                continue  # e.g. a row that only contains header cells
            links = tds[0]('a')
            if not links:
                continue  # no link to take the country name from
            country_name = links[0].content

            populations = result[country_name]
            for idx, year in year_columns:
                if idx >= len(tds):
                    continue  # row is shorter than the header
                value = _parse_int(tds[idx].content)
                if value is not None:
                    populations[year] = value

    return result


# Only the tables whose type is 'Country or territory' are parsed.
country_tables = tables_by_type['Country or territory']
result = get_countries_population(country_tables)
print(result)

In [None]:
# Post-mortem debugging helper, kept disabled (it was added while chasing
# issues in the cell above). To use it, run %debug in a cell right after an
# exception occurred. Keep the magic on a line of its own: the rest of the
# line, including a trailing "# comment", is passed to the magic as its
# argument.
# %debug

In [None]:
## Creating a dataframe from a dictionary

In [None]:
# Create the dataframe.
#
# orient='index' means the top level keys of the dictionary (the country
# names) are used as row labels; in pandas "index" refers to the rows.
# The columns are then sorted by year. sort_index returns a new, sorted frame
# instead of modifying one in place, so this cell can be re-run safely.
df = pd.DataFrame.from_dict(result, orient='index').sort_index(axis=1)

# print(df.to_html()) prints everything in html format.
# See the pandas display options for ways to print the complete frame.
print(df)

## Some data access functions for a pandas DataFrame

In [None]:
# .iloc selects by integer position (starting from 0, end exclusive),
# .loc selects by label. They replace the ambiguous .ix indexer, which
# guessed between the two.

# first two rows and first two columns, by position
subtable = df.iloc[0:2, 0:2]
print("subtable")
print(subtable)
print("")

# a column, selected by its label (the year)
column = df[1955]
print("column")
print(column)
print("")

row = df.iloc[0]  # row 0
print("row")
print(row)
print("")

rows = df.iloc[:2]  # rows 0,1
print("rows")
print(rows)
print("")

# a single element: column by label, then row by position
element = df[1955].iloc[0]
print("element")
print(element)
print("")

# max along column
print("max")
print(df[1950].max())
print("")

# axes
print("axes")
print(df.axes)
print("")

# a row is a Series: its name is the row label, its index the column labels
row = df.iloc[0]
print("row info")
print(row.name)
print(row.index)
print("")

countries = df.index
print("countries")
print(countries)
print("")

# a row, selected by its label
print("Austria")
print(df.loc['Austria'])

## Plotting population of 4 countries

In [None]:
plotCountries = ['Austria', 'Germany', 'United States', 'France']

for country in plotCountries:
    # .loc selects the row by its label (the country name)
    row = df.loc[country]
    # The values are plotted as they are: matplotlib leaves a gap for missing
    # (NaN) values, whereas converting them with int() would raise an error.
    plt.plot(row.index, row.values, label=row.name)

plt.ylim(bottom=0)  # start y axis at 0

# rotate labels 70 degree
plt.xticks(rotation=70)
# put the legend at the best place so it does not interfere with the plot
plt.legend(loc='best')
plt.xlabel("Year")
plt.ylabel("# people (million)")
plt.title("Population of countries")

## Plot 5 most populous countries from 2010 and 2060

In [None]:
def plot_populous(df, year):
    # sort table depending on data value in year column
    df_by_year = df.sort(year, ascending=False)
    #print df_by_year
    for i in range(5):  
        row = df_by_year.ix[i]
        plt.plot(row.index, [int(v) for v in row.values], label=row.name ) 
            
    plt.ylim(ymin=0)
    
    plt.xticks(rotation=70)
    plt.legend(loc='best')
    plt.xlabel("Year")
    plt.ylabel("# people (million)")
    plt.title("Most populous countries in %d" % year)

plot_populous(df, year=2010)
# Other things to try:
# plot_populous(df, 2050)
# df[2010]
# df[2010].order()
