In [1]:
import logging
import os
import pickle
import re

import pandas as pd
import pycountry
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# Raise the level of the HTTP libraries' loggers so only WARNING and above is emitted.
logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
# Browser-like User-Agent sent with every request made by this notebook.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0'}
page_url = 'https://nationalanthems.info/'  # home page
# A timeout keeps the notebook from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
page = requests.get(page_url, headers=headers, timeout=30)
page.raise_for_status()
# NOTE(review): no parser is named, so BeautifulSoup uses whichever one is installed;
# pin one explicitly if results must be identical across machines.
soup = BeautifulSoup(page.text)

## Scraping

In [2]:
# Items from the horizontal dropdown menus at the top of the page.
menu_items = soup.find_all(class_=re.compile("menu-item-"))
# Entries flagged "menu-item-has-children" are skipped: we want links only for single countries.
menu_items = [
    entry.find("a")
    for entry in menu_items
    if "menu-item-has-children" not in entry.attrs["class"]
]
links = [anchor["href"] for anchor in menu_items]
links = links[:-1]  # the last entry is a link to the FAQ
# Remove possible duplicates, then sort by the part of the URL that follows "info".
links = sorted(set(links), key=lambda url: url.split("info")[1])

In [3]:
# Sanity check: number of unique page links collected, plus the first ten of them.
print(len(links))
links[:10]

446


['https://nationalanthems.info/abk.htm',
 'https://nationalanthems.info/aca.htm',
 'https://nationalanthems.info/aco.htm',
 'https://nationalanthems.info/ad.htm',
 'https://nationalanthems.info/ae.htm',
 'https://nationalanthems.info/af-06.htm',
 'https://nationalanthems.info/af-43.htm',
 'https://nationalanthems.info/af-73.htm',
 'https://nationalanthems.info/af-78.htm',
 'https://nationalanthems.info/af-92.htm']

In [4]:

def scrap_anthems(links, pickle_results=True, timeout=30):
    """Download every page in ``links`` and extract its English anthem text(s).

    Different pages use different titles for the div that precedes the
    English text, e.g.:
        https://nationalanthems.info/dk.htm  -------> English versification
        https://nationalanthems.info/fr.htm  -------> English translation
        https://nationalanthems.info/gb.htm  -------> English lyrics
    Some pages carry several of them (e.g. South Africa). Every match is kept,
    always in the order translation, versification, lyrics.

    A report is written to ``scrapping_report.log``; the root logging
    configuration is replaced on every call (``force=True``).

    Parameters
    ----------
    links : iterable of str
        URLs of the pages to scrape.
    pickle_results : bool, default True
        When true, the result is also pickled to ``./data/data.pickle``
        (the ``./data`` directory is created if it does not exist).
    timeout : float or tuple, default 30
        Forwarded to ``requests.get`` so a stalled connection cannot hang
        the whole run.

    Returns
    -------
    dict
        Maps the page title (the part before " –") to ``[code, lyrics]``,
        where ``code`` is the upper-cased file stem of the URL and ``lyrics``
        is the list of English texts found, or ``[""]`` when none was found.
        Pages that cannot be fetched, return an HTTP error, or have no
        ``<title>`` are logged and left out of the result.
    """
    logging.basicConfig(filename="scrapping_report.log",
                        filemode='w', force=True,
                        format='%(asctime)s, %(levelname)s %(message)s',
                        datefmt='%H:%M:%S',
                        level=logging.DEBUG)
    logger = logging.getLogger('Scraping report')

    # (div title to look for, short tag used in the log). The order fixes the
    # position of each text inside the returned lyrics list.
    label_patterns = (
        ("English translation", "transl"),
        ("English versification", "versif"),
        ("English lyrics", "lyrics"),
    )

    data = {}
    for link in tqdm(links):
        try:
            page = requests.get(link, headers=headers, timeout=timeout)
            page.raise_for_status()  # do not parse HTTP error pages as anthem pages
            soup = BeautifulSoup(page.text)
        except Exception as e:
            logger.error(f"Connection error: {e} for {link}")
            continue

        # A page without <title> cannot be keyed; skip it instead of aborting the run.
        title_tag = soup.find("title")
        if title_tag is None:
            logger.error(f"No <title> element found for {link}")
            continue
        country_name = title_tag.text.split(" –")[0]
        page_code = link.split("/")[-1].split(".")[0].upper()

        all_matching_lyrics = []
        all_matching_labels = []
        for title_text, tag in label_patterns:
            label_div = soup.find("div", title=re.compile(title_text))
            if label_div is None:
                continue
            # The text itself lives in the div that follows the label div.
            lyrics_div = label_div.find_next("div")
            if lyrics_div is None:
                logger.warning(f"Label '{title_text}' has no following div | {link} [{country_name}]")
                continue
            all_matching_lyrics.append(lyrics_div.text)
            all_matching_labels.append(tag)

        if all_matching_lyrics:
            if len(all_matching_lyrics) == 1:
                logger.info(f"   Single matching lyrics found   |labels = {str(all_matching_labels):<30}| {link} [{country_name}]")
            else:
                logger.info(f"   Multiple matching lyrics found |labels = {str(all_matching_labels):<30}| {link} [{country_name}]")
            data[country_name] = [page_code, all_matching_lyrics]
        else:
            # Keep an entry with a single empty text so every value has the same shape.
            data[country_name] = [page_code, [""]]
            logger.warning(f"Lyrics not found" + 55*" " + f"| {link} [{country_name}]")
    logging.shutdown()
    if pickle_results:
        # Create the target directory up front so the scrape is not lost to a missing folder.
        os.makedirs('./data', exist_ok=True)
        with open('./data/data.pickle', 'wb') as handle:
            pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return data

In [5]:
def validate_anthems(data):
    """Spot-check the scraped data against well-known opening lines.

    ``data`` maps a country name to ``[code, lyrics]`` where ``lyrics`` is a
    list of strings. For each checked country the texts are joined with a
    space and must contain the expected phrase; otherwise ``AssertionError``
    is raised (``KeyError`` if the country is missing altogether).
    """
    def joined_lyrics(country):
        # All texts stored for the country, concatenated into one string.
        return " ".join(data[country][1])

    assert "Arise children of the fatherland" in joined_lyrics("France")
    assert "Poland has not yet perished" in joined_lyrics("Poland")
    assert "God save our gracious King" in joined_lyrics("United Kingdom")
    assert "Oh, say can you see" in joined_lyrics("United States")
    assert "God Bless Africa" in joined_lyrics("South Africa")

In [6]:
# Run the full scrape. With the default pickle_results=True the result is also
# written to ./data/data.pickle; the trailing ";" keeps the returned dict from being echoed.
scrap_anthems(links);

  0%|          | 0/446 [00:00<?, ?it/s]

100%|██████████| 446/446 [15:04<00:00,  2.03s/it]


In [7]:
# Reload the scrape results from the pickle written by scrap_anthems.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
with open("./data/data.pickle", "rb") as handle:
    data = pickle.load(handle)

# Spot-check a few well-known anthems before going any further.
validate_anthems(data)

In [8]:
# One row per scraped page title, holding the page code and the list of English texts.
df = pd.DataFrame.from_dict(
    data,
    orient="index",
    columns=["iso_2", "lyrics"],
)
# The FAQ page ended up in the scrape as well; remove its row.
df = df.drop("Frequently Asked Questions (FAQ)")
df

Unnamed: 0,iso_2,lyrics
Abkhazia,ABK,"[\nMarch on, march on,\r\nsons of Abkhazia!\r\..."
Acadia,ACA,"[\n1. Hail Thou Star of the Ocean,\r\nPortal o..."
Azores,ACO,[\n1. Faith and firmness gave birth \r\nIn the...
Andorra,AD,"[\nThe great Charlemagne, my Father, liberated..."
United Arab Emirates,AE,"[\nLive my country, the unity of our Emirates ..."
...,...,...
Zanzibar (to 1890),ZNZ-90,[]
Zanzibar,ZNZ,"[\nGod has blessed us all, Unguja and Pemba.\r..."
Zaire,ZRE,"[\nZaireans, in the newfound peace, \r\nUnited..."
Zimbabwe (1980-1994),ZW-94,"[\nGod bless Africa,\r\nLet her fame spread fa..."


## Decide which lyrics to choose for countries with multiple English translations

In [9]:
# Show only the entries for which more than one English text was scraped.
texts_per_entry = df["lyrics"].map(len)
df[texts_per_entry > 1]

Unnamed: 0,iso_2,lyrics
Botswana,BW,"[\n\n1. This land of ours,\r\nIs a gift from G..."
Canada,CA,[\nO Canada! Land of our ancestors\r\nGlorious...
Cameroon,CM,"[\n1. O Cameroon cradle of our ancestors,\r\nG..."
Finland,FI,"[\n1. O our home country, Finland, the land wh..."
Fiji,FJ,[\n1. Let us show pride and honour our nation\...
Malta,MT,"[\n1. To this sweet land, our mother, to which..."
New Zealand,NZ,"[\n1. O Lord, God, \r\nOf all people\r\nListen..."
Olympics,OLY,"[\nO Ancient immortal Spirit, pure father\r\nO..."
South Africa,ZA,[\nGod Bless Africa\r\nRaise high Her glory\r\...


In [10]:
# For each entry that has several English texts: position of the text to discard.
# NOTE: this cell mutates `data` in place and then changes the shape of its values,
# so it cannot be re-run without first reloading the pickle.
discard_position = {
    "Botswana": 1,
    "Canada": 0,
    "Cameroon": 0,
    "Finland": 0,
    "Fiji": 0,
    "Malta": 1,
    "New Zealand": 0,
    "Olympics": 1,
    "South Africa": 1,
}
for country, position in discard_position.items():
    data[country][1].pop(position)

validate_anthems(data)
# Collapse each lyrics list to its first remaining text: {name: [code, text]}.
data = {name: [entry[0], entry[1][0]] for name, entry in data.items()}

In [11]:
# Persist the cleaned mapping (one English text per entry).
# NOTE: this overwrites the raw scrape that scrap_anthems stored at the same path.
with open('./data/data.pickle', 'wb') as handle:
    pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)