In [8]:
import urllib
from bs4 import BeautifulSoup
import pandas as pd
import re
from functools import partial
from io import StringIO

In [9]:
import urllib.request


list_of_flags_page = urllib.request.urlopen('https://en.wikipedia.org/wiki/List_of_national_flags_of_sovereign_states').read()

In [10]:
soup = BeautifulSoup(list_of_flags_page, 'html.parser')

In [11]:
tables = soup.find_all('table')

In [12]:
def parse_table(table):

    split_ratio = re.compile('[∶:]').split
    def parse_aspect_ratio(s: str) -> float:
        try:
            n, d = map(int, split_ratio(s))
            return n / d
        except ValueError:
            return float('nan')

    strip_citations = partial(re.compile(r'\s*\[.*\]').sub, "")
    
    table = pd.read_html(StringIO(table.prettify()), flavor='bs4')[0]

    table = table.map(strip_citations)
    table.columns = map(strip_citations, table.columns)
    table.drop('Refs.', axis=1, inplace=True)
    table['Aspect ratio'] = table['Aspect ratio'].apply(parse_aspect_ratio)
    table['Date of latest adoption'] = pd.to_datetime(table['Date of latest adoption'], format='mixed', errors='coerce')
    table['Designer(s)'] = table['Designer(s)'].apply(lambda s: None if s == '—' else s)

    return table

In [13]:
all_flags = pd.concat(list(map(parse_table, tables)), axis=0)
all_flags

Unnamed: 0,Flag(s),State,Aspect ratio,Date of latest adoption,Designer(s),Description
0,Afghanistan (Islamic Emirate),Afghanistan,0.500000,2021-08-15,,White with a black Shahada in Thuluth scrip...
1,Afghanistan (Islamic Republic),Afghanistan,0.666667,2013-08-19,,Three equal vertical bands of black ( hoist s...
2,Albania,Albania,0.714286,1992-04-07,,Red with a black double-headed eagle in the ...
3,Algeria,Algeria,0.666667,1962-07-03,Disputed,Two equal vertical bands of green (hoist side)...
4,Andorra,Andorra,0.700000,1993-05-05,,"Three vertical bands of blue (hoist side), yel..."
...,...,...,...,...,...,...
4,Somaliland,Somaliland,,1996-10-14,,"Three equal horizontal bands of green (top), w..."
5,South Ossetia,South Ossetia,,NaT,,"Three equal horizontal bands of white (top), r..."
6,Taiwan,Taiwan,0.666667,1928-10-28,Lu Haodong (canton) Sun Yat-sen (red field),Red field with a dark blue rectangle in the up...
7,Transnistria,Transnistria,0.500000,2000-07-03,,"Three horizontal bands of red (top), green (ha..."


In [14]:
all_flags.to_feather('../data/all_flags.feather')