In [1]:
import urllib
from bs4 import BeautifulSoup
import pandas as pd
import re
from functools import partial
from io import StringIO

In [2]:
import urllib.request


list_of_flags_page = urllib.request.urlopen('https://en.wikipedia.org/wiki/List_of_national_flags_of_sovereign_states').read()

In [3]:
soup = BeautifulSoup(list_of_flags_page, 'html.parser')

In [4]:
tables = soup.find_all('table')

In [5]:
def parse_table(table):

    split_ratio = re.compile('[∶:]').split
    def parse_aspect_ratio(s: str) -> float:
        try:
            n, d = map(int, split_ratio(s))
            return n / d
        except ValueError:
            return float('nan')

    strip_citations = partial(re.compile(r'\s*\[.*\]').sub, "")
    
    table = pd.read_html(StringIO(table.prettify()), flavor='bs4')[0]

    table = table.map(strip_citations)
    table.columns = map(strip_citations, table.columns)
    table.drop('Refs.', axis=1, inplace=True)
    table['Aspect ratio'] = table['Aspect ratio'].apply(parse_aspect_ratio)
    table['Date of latest adoption'] = pd.to_datetime(table['Date of latest adoption'], format='mixed', errors='coerce')
    table['Designer(s)'] = table['Designer(s)'].apply(lambda s: None if s == '—' else s)

    return table

In [6]:
all_flags = pd.concat(list(map(parse_table, tables[:2])), axis=0, ignore_index=True)
all_flags

Unnamed: 0,Flag(s),State,Aspect ratio,Date of latest adoption,Designer(s),Description
0,Afghanistan (Islamic Emirate),Afghanistan,0.500000,2021-08-15,,White with a black Shahada in Thuluth scrip...
1,Afghanistan (Islamic Republic),Afghanistan,0.666667,2013-08-19,,Three equal vertical bands of black ( hoist s...
2,Albania,Albania,0.714286,1992-04-07,,Red with a black double-headed eagle in the ...
3,Algeria,Algeria,0.666667,1962-07-03,Disputed,Two equal vertical bands of green (hoist side)...
4,Andorra,Andorra,0.700000,1993-05-05,,"Three vertical bands of blue (hoist side), yel..."
...,...,...,...,...,...,...
207,Somaliland,Somaliland,0.500000,1996-10-14,,"Three equal horizontal bands of green (top), w..."
208,South Ossetia,South Ossetia,0.500000,NaT,,"Three equal horizontal bands of white (top), r..."
209,Taiwan,Taiwan,0.666667,1928-10-28,Lu Haodong (canton) Sun Yat-sen (red field),Red field with a dark blue rectangle in the up...
210,Transnistria,Transnistria,0.500000,2000-07-03,,"Three horizontal bands of red (top), green (ha..."


In [7]:
from pathlib import Path

def download_flags(table, folder, idx_offset=0):
    folder  = Path(folder)
    images = table.find_all('img')
    for i, image in enumerate(images):
        src = image['src']
        name = src.split('/')[-1]

        with open(folder / f'{i+idx_offset}-{name}', mode='wb') as f:
            f.write(urllib.request.urlopen('http:' + src).read())

    return len(images) + idx_offset

In [8]:
last = 0
for table in tables:
    last = download_flags(tables[0], '../data/flags', idx_offset=last)

In [9]:
paths = pd.DataFrame.from_records([{'index': int(flag.name.split('-')[0]), 'path': str(flag)} for flag in Path('../data/flags').glob('*')], index='index')
paths

Unnamed: 0_level_0,path
index,Unnamed: 1_level_1
0,..\data\flags\0-250px-Flag_of_the_Taliban.svg.png
1,..\data\flags\1-250px-Flag_of_Afghanistan_%282...
10,..\data\flags\10-250px-Flag_of_Austria.svg.png
100,..\data\flags\100-250px-Flag_of_Liberia.svg.png
1000,..\data\flags\1000-250px-Flag_of_Uganda.svg.png
...,...
995,..\data\flags\995-250px-Flag_of_Trinidad_and_T...
996,..\data\flags\996-250px-Flag_of_Tunisia.svg.png
997,..\data\flags\997-250px-Flag_of_Turkey.svg.png
998,..\data\flags\998-250px-Flag_of_Turkmenistan.s...


In [10]:
all_flags = all_flags.join(paths)
all_flags

Unnamed: 0,Flag(s),State,Aspect ratio,Date of latest adoption,Designer(s),Description,path
0,Afghanistan (Islamic Emirate),Afghanistan,0.500000,2021-08-15,,White with a black Shahada in Thuluth scrip...,..\data\flags\0-250px-Flag_of_the_Taliban.svg.png
1,Afghanistan (Islamic Republic),Afghanistan,0.666667,2013-08-19,,Three equal vertical bands of black ( hoist s...,..\data\flags\1-250px-Flag_of_Afghanistan_%282...
2,Albania,Albania,0.714286,1992-04-07,,Red with a black double-headed eagle in the ...,..\data\flags\2-250px-Flag_of_Albania.svg.png
3,Algeria,Algeria,0.666667,1962-07-03,Disputed,Two equal vertical bands of green (hoist side)...,..\data\flags\3-250px-Flag_of_Algeria.svg.png
4,Andorra,Andorra,0.700000,1993-05-05,,"Three vertical bands of blue (hoist side), yel...",..\data\flags\4-250px-Flag_of_Andorra.svg.png
...,...,...,...,...,...,...,...
207,Somaliland,Somaliland,0.500000,1996-10-14,,"Three equal horizontal bands of green (top), w...",..\data\flags\207-250px-Flag_of_Andorra.svg.png
208,South Ossetia,South Ossetia,0.500000,NaT,,"Three equal horizontal bands of white (top), r...",..\data\flags\208-250px-Flag_of_Angola.svg.png
209,Taiwan,Taiwan,0.666667,1928-10-28,Lu Haodong (canton) Sun Yat-sen (red field),Red field with a dark blue rectangle in the up...,..\data\flags\209-250px-Flag_of_Antigua_and_Ba...
210,Transnistria,Transnistria,0.500000,2000-07-03,,"Three horizontal bands of red (top), green (ha...",..\data\flags\210-250px-Flag_of_Argentina.svg.png


In [11]:
all_flags.to_feather('../data/all_flags.feather')