In [13]:
import urllib
from bs4 import BeautifulSoup
import pandas as pd
import re
from functools import partial
from io import StringIO

In [14]:
import urllib.request


list_of_flags_page = urllib.request.urlopen('https://en.wikipedia.org/wiki/List_of_national_flags_of_sovereign_states').read()

In [15]:
soup = BeautifulSoup(list_of_flags_page, 'html.parser')

In [16]:
tables = soup.find_all('table')

In [17]:
def parse_table(table):

    split_ratio = re.compile('[∶:]').split
    def parse_aspect_ratio(s: str) -> float:
        try:
            n, d = map(int, split_ratio(s))
            return n / d
        except ValueError:
            return float('nan')

    strip_citations = partial(re.compile(r'\s*\[.*\]').sub, "")
    
    table = pd.read_html(StringIO(table.prettify()), flavor='bs4')[0]

    table = table.map(strip_citations)
    table.columns = map(strip_citations, table.columns)
    table.drop('Refs.', axis=1, inplace=True)
    table['Aspect ratio'] = table['Aspect ratio'].apply(parse_aspect_ratio)
    table['Date of latest adoption'] = pd.to_datetime(table['Date of latest adoption'], format='mixed', errors='coerce')
    table['Designer(s)'] = table['Designer(s)'].apply(lambda s: None if s == '—' else s)

    return table

In [18]:
all_flags = pd.concat(list(map(parse_table, tables)), axis=0, ignore_index=True)
all_flags

Unnamed: 0,Flag(s),State,Aspect ratio,Date of latest adoption,Designer(s),Description
0,Afghanistan (Islamic Emirate),Afghanistan,0.500000,2021-08-15,,White with a black Shahada in Thuluth scrip...
1,Afghanistan (Islamic Republic),Afghanistan,0.666667,2013-08-19,,Three equal vertical bands of black ( hoist s...
2,Albania,Albania,0.714286,1992-04-07,,Red with a black double-headed eagle in the ...
3,Algeria,Algeria,0.666667,1962-07-03,Disputed,Two equal vertical bands of green (hoist side)...
4,Andorra,Andorra,0.700000,1993-05-05,,"Three vertical bands of blue (hoist side), yel..."
...,...,...,...,...,...,...
208,Somaliland,Somaliland,,1996-10-14,,"Three equal horizontal bands of green (top), w..."
209,South Ossetia,South Ossetia,,NaT,,"Three equal horizontal bands of white (top), r..."
210,Taiwan,Taiwan,0.666667,1928-10-28,Lu Haodong (canton) Sun Yat-sen (red field),Red field with a dark blue rectangle in the up...
211,Transnistria,Transnistria,0.500000,2000-07-03,,"Three horizontal bands of red (top), green (ha..."


In [22]:
from pathlib import Path

def download_flags(table, folder, idx_offset=0):
    folder  = Path(folder)
    images = table.find_all('img')
    for i, image in enumerate(images):
        src = image['src']
        name = src.split('/')[-1]

        with open(folder / f'{i+idx_offset}-{name}', mode='wb') as f:
            f.write(urllib.request.urlopen('http:' + src).read())

    return len(images) + idx_offset

In [23]:
last = 0
for table in tables:
    last = download_flags(tables[0], '../data/flags', idx_offset=last)

In [34]:
paths = pd.DataFrame.from_records([{'index': int(flag.name.split('-')[0]), 'path': str(flag)} for flag in Path('../data/flags').glob('*')], index='index')
paths

Unnamed: 0_level_0,path
index,Unnamed: 1_level_1
0,..\data\flags\0-220px-Flag_of_the_Taliban.svg.png
1,..\data\flags\1-220px-Flag_of_Afghanistan_%282...
10,..\data\flags\10-220px-Flag_of_Austria.svg.png
100,..\data\flags\100-220px-Flag_of_Liberia.svg.png
101,..\data\flags\101-220px-Flag_of_Libya.svg.png
...,...
95,..\data\flags\95-220px-Flag_of_Kyrgyzstan.svg.png
96,..\data\flags\96-220px-Flag_of_Laos.svg.png
97,..\data\flags\97-220px-Flag_of_Latvia.svg.png
98,..\data\flags\98-220px-Flag_of_Lebanon.svg.png


In [37]:
all_flags = all_flags.join(paths)
all_flags

ValueError: columns overlap but no suffix specified: Index(['path'], dtype='object')

In [38]:
all_flags.to_feather('../data/all_flags.feather')