Python script by [__Hassan Mojeed__](https://www.linkedin.com/in/hassanmojeed)<br>
Email: mojeed.o.hassan@gmail.com<br>
Website: [https://hassanmojeed.pages.dev](https://hassanmojeed.pages.dev)



# Page 3

## Introduction

##### Continuing the study's progression, this phase involves web scraping data from Wikipedia to extract links to country flags essential for this research. This process is crucial for acquiring the necessary visual assets to enhance the analysis and presentation of findings.

In [3]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
import pandas_gbq
import warnings
warnings.filterwarnings("ignore")

## Scraping Country Flag Links from Wikipedia

In [4]:

# Scrape the Wikipedia gallery of sovereign-state flags into `country_flag`,
# a DataFrame with one row per gallery entry: the caption text ("Country")
# and the thumbnail URL ("Flag Link").

# URL of the Wikipedia page containing flags
url = "https://en.wikipedia.org/wiki/Gallery_of_sovereign_state_flags"

# Fetch the page. The timeout keeps a stalled connection from hanging the
# notebook, and raise_for_status() stops here on an HTTP error instead of
# letting an error page be parsed into an empty or misleading table.
response_ok = requests.get(url, timeout=30)
response_ok.raise_for_status()

# Parse the HTML content using BeautifulSoup
web_data = BeautifulSoup(response_ok.text, "html")

# Every <img class="mw-file-element"> on the page
flaglnk_list = web_data.find_all("img", {"class": "mw-file-element"})

# Prefix each src with "https:" to get a full URL. The [1:-2] slice discards
# the first match and the last two matches.
# NOTE(review): presumably those three images are not flags; re-check the
# slice if the page layout changes.
flaglnk = ["https:" + img["src"] for img in flaglnk_list][1:-2]

# Every <div class="gallerytext">; its text is used as the country name
country_list = web_data.find_all("div", {"class": "gallerytext"})
countryname = [element.text for element in country_list]

# Pair names and links by position. zip() stops at the shorter list, so any
# surplus names or links are dropped without an error.
countryname_flaglnk = list(zip(countryname, flaglnk))

# Create a DataFrame from the list of tuples with columns "Country" and "Flag Link"
country_flag = pd.DataFrame(countryname_flaglnk, columns=["Country", "Flag Link"])

In [5]:
# Removing some flag links that are not needed for this study

# 120px thumbnail URLs of the gallery entries to exclude
remove_flaglnks = [
"https://upload.wikimedia.org/wikipedia/commons/thumb/7/7a/Flag_of_the_Republic_of_Abkhazia.svg/120px-Flag_of_the_Republic_of_Abkhazia.svg.png",
"https://upload.wikimedia.org/wikipedia/commons/thumb/5/5c/Flag_of_the_Taliban.svg/120px-Flag_of_the_Taliban.svg.png",
"https://upload.wikimedia.org/wikipedia/commons/thumb/3/35/Flag_of_the_Cook_Islands.svg/120px-Flag_of_the_Cook_Islands.svg.png",
"https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Flag_of_Kosovo.svg/120px-Flag_of_Kosovo.svg.png",
"https://upload.wikimedia.org/wikipedia/commons/thumb/0/01/Flag_of_Niue.svg/120px-Flag_of_Niue.svg.png",
"https://upload.wikimedia.org/wikipedia/commons/thumb/1/1e/Flag_of_the_Turkish_Republic_of_Northern_Cyprus.svg/120px-Flag_of_the_Turkish_Republic_of_Northern_Cyprus.svg.png",
"https://upload.wikimedia.org/wikipedia/commons/thumb/2/26/Flag_of_the_Sahrawi_Arab_Democratic_Republic.svg/120px-Flag_of_the_Sahrawi_Arab_Democratic_Republic.svg.png",
"https://upload.wikimedia.org/wikipedia/commons/thumb/4/4d/Flag_of_Somaliland.svg/120px-Flag_of_Somaliland.svg.png",
"https://upload.wikimedia.org/wikipedia/commons/thumb/1/12/Flag_of_South_Ossetia.svg/120px-Flag_of_South_Ossetia.svg.png",
"https://upload.wikimedia.org/wikipedia/commons/thumb/b/bc/Flag_of_Transnistria_%28state%29.svg/120px-Flag_of_Transnistria_%28state%29.svg.png"]

# Keep every row whose link is NOT on the removal list. A boolean mask keeps
# the "Country" / "Flag Link" columns even when no rows survive, which a
# DataFrame rebuilt from an empty list of rows would not.
is_unwanted = country_flag["Flag Link"].isin(remove_flaglnks)
filtered_country_flag = country_flag[~is_unwanted].reset_index(drop=True)

# Display the filtered DataFrame

print(filtered_country_flag.shape)

filtered_country_flag

(196, 2)


Unnamed: 0,Country,Flag Link
0,Afghanistan (Islamic Republic of)[a],https://upload.wikimedia.org/wikipedia/commons...
1,Albania,https://upload.wikimedia.org/wikipedia/commons...
2,Algeria,https://upload.wikimedia.org/wikipedia/commons...
3,Andorra,https://upload.wikimedia.org/wikipedia/commons...
4,Angola,https://upload.wikimedia.org/wikipedia/commons...
...,...,...
191,Vietnam,https://upload.wikimedia.org/wikipedia/commons...
192,Yemen,https://upload.wikimedia.org/wikipedia/commons...
193,Zambia,https://upload.wikimedia.org/wikipedia/commons...
194,Zimbabwe,https://upload.wikimedia.org/wikipedia/commons...


In [6]:
# Renaming some of the countries for conformity

# Scraped caption -> name to store instead
country_renames = {
    "Afghanistan (Islamic Republic of)[a]": "Afghanistan",
    "Brunei": "Brunei Darussalam",
    "Cape Verde": "Cape Verde Islands",
    "Comoros": "Comoro Islands",
    "Democratic Republic of the Congo": "Congo (Dem. Rep.)",
    "Republic of the Congo": "Congo (Rep.)",
    # NOTE(review): this key is spelled "Equitorial"; it only has an effect
    # if the scraped caption carries the same misspelling - confirm.
    "Equitorial Guinea": "Equatorial Guinea",
    "East Timor": "Timor-Leste",
    "Ivory Coast": "Cote d'Ivoire",
    "Moldova": "Republic of Moldova",
    "Palau": "Palau Islands",
    "Palestine": "State of Palestine",
    "Russia": "Russian Federation",
    "São Tomé and Príncipe": "Sao Tome and Principe",
    "Vatican City": "Vatican City(Holy See)",
    "Taiwan": "Taiwan (Chinese Taipei)",
}

# Assign the result back to the column. Calling replace(..., inplace=True)
# on df["Country"] operates on a column object selected from the frame, and
# under pandas Copy-on-Write that leaves the DataFrame itself unchanged.
filtered_country_flag["Country"] = filtered_country_flag["Country"].replace(country_renames)


print(filtered_country_flag.shape)

filtered_country_flag.head()

(196, 2)


Unnamed: 0,Country,Flag Link
0,Afghanistan,https://upload.wikimedia.org/wikipedia/commons...
1,Albania,https://upload.wikimedia.org/wikipedia/commons...
2,Algeria,https://upload.wikimedia.org/wikipedia/commons...
3,Andorra,https://upload.wikimedia.org/wikipedia/commons...
4,Angola,https://upload.wikimedia.org/wikipedia/commons...


In [7]:
# Additional flag links are needed for territories that the sovereign-state
# gallery scraped above does not provide. Hong Kong and Macau are taken from
# the gallery of dependent-territory flags below.

# URL of the Wikipedia page containing flags of dependent territories
url_new = "https://en.wikipedia.org/wiki/Gallery_of_flags_of_dependent_territories"

# Fetch the page. The timeout keeps a stalled connection from hanging the
# notebook, and raise_for_status() stops here on an HTTP error instead of
# letting an error page be parsed into an empty table.
response = requests.get(url_new, timeout=30)
response.raise_for_status()

# Parse the HTML content using BeautifulSoup
web_info = BeautifulSoup(response.text, "html")

# Every <img class="mw-file-element"> on the page, with "https:" prefixed to
# its src to form a full URL
flnk_list = web_info.find_all("img", {"class": "mw-file-element"})
flnk = ["https:" + img["src"] for img in flnk_list]

# Every <div class="gallerytext">; its text is used as the entry name
name_list = web_info.find_all("div", {"class": "gallerytext"})
name = [element.text for element in name_list]

# Pair names and links by position. zip() stops at the shorter list, so any
# surplus names or links are dropped without an error.
missing_flaglnk = list(zip(name, flnk))

# Create a DataFrame from the list of tuples with columns "Country" and "Flag Link"
flag_cnty = pd.DataFrame(missing_flaglnk, columns=["Country", "Flag Link"])

# Keep only the rows captioned "Flag of Hong Kong" or "Flag of Macau".
# .copy() makes the result an independent frame, so later edits to it are not
# edits to a slice of flag_cnty.
wanted_captions = ["Flag of Hong Kong", "Flag of Macau"]
filtered_flag_cnty = flag_cnty[flag_cnty["Country"].isin(wanted_captions)].copy()

In [8]:
# Replace the scraped captions with the names to store for these two entries.
# assign() builds a new DataFrame with the rewritten "Country" column; an
# in-place replace on filtered_flag_cnty["Country"] would act on a column
# object selected from a filtered frame and is not guaranteed to reach the
# frame itself (it does not under pandas Copy-on-Write).
filtered_flag_cnty = filtered_flag_cnty.assign(
    Country=filtered_flag_cnty["Country"].replace(
        {
            "Flag of Hong Kong": "Hong Kong (SAR China)",
            "Flag of Macau": "Macao (SAR China)",
        }
    )
)

filtered_flag_cnty.shape

(2, 2)

In [9]:
# Final missing entry: Kosovo, supplied by hand as a one-row DataFrame with
# the same two columns as the scraped tables.

lst_missingflg_dic = {
    "Country": "Kosovo",
    "Flag Link": "https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Flag_of_Kosovo.svg/238px-Flag_of_Kosovo.svg.png",
}

# Wrap each value in a one-element list so every key becomes a column
lst_missingflg_df = pd.DataFrame(
    {column: [value] for column, value in lst_missingflg_dic.items()}
)

lst_missingflg_df.shape

(1, 2)

## Combining all DataFrames into one DataFrame

In [10]:
# Stack the three tables (sovereign states, Hong Kong/Macau, Kosovo) on top
# of each other and renumber the rows from zero.
frames_to_stack = [filtered_country_flag, filtered_flag_cnty, lst_missingflg_df]
country_and_flag = pd.concat(frames_to_stack, ignore_index=True)

# Store the links as strings
flag_links_as_text = country_and_flag["Flag Link"].astype(str)
country_and_flag["Flag Link"] = flag_links_as_text

print(country_and_flag.shape)

# Cast every column to string before exporting
country_and_flag = country_and_flag.astype(str)

#country_and_flag.to_parquet("country_and_flag.parquet")

# Write the combined table to an Excel workbook in the working directory
country_and_flag.to_excel("country_and_flag.xlsx")

(199, 2)


In [11]:

# Writing data to Google BigQuery (currently disabled: the lines below are commented out)

#table = "Country_Rank.Country and Flag"
#project_id = "cool-ship-407420"

#pandas_gbq.to_gbq(country_and_flag,destination_table=table, project_id=project_id, if_exists="replace")

## Downloading all flags to the local drive

In [12]:
# Folder that receives the downloaded flag images; an already existing folder
# is reused rather than treated as an error.
output_dir = "flag_images"
os.makedirs(name=output_dir, exist_ok=True)

# Function to download images
def download_image(url, filename, timeout=30):
    """Download *url* and save the response body to *filename*.

    A one-line status message is printed either way. The file is written only
    when the server answers with HTTP 200; a connection error, timeout or any
    other status code is reported as a failure and the function returns
    without raising, so a loop over many URLs carries on with the next one.

    Args:
        url: Address of the image to fetch.
        filename: Path of the file to write the image bytes to.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # Network-level failure (DNS, refused connection, timeout, ...)
        print(f"Failed to download: {filename} ({error})")
        return

    if response.status_code != 200:
        print(f"Failed to download: {filename}")
        return

    # Binary mode: the body is image data, not text
    with open(filename, 'wb') as f:
        f.write(response.content)
    print(f"Downloaded: {filename}")

In [13]:
# Fetch every flag, saving each one as "<Country>.png" inside output_dir
for country, image_url in zip(country_and_flag["Country"], country_and_flag["Flag Link"]):
    filename = os.path.join(output_dir, f"{country}.png")
    download_image(image_url, filename)

Downloaded: flag_images/Afghanistan.png
Downloaded: flag_images/Albania.png
Downloaded: flag_images/Algeria.png
Downloaded: flag_images/Andorra.png
Downloaded: flag_images/Angola.png
Downloaded: flag_images/Antigua and Barbuda.png
Downloaded: flag_images/Argentina.png
Downloaded: flag_images/Armenia.png
Downloaded: flag_images/Australia.png
Downloaded: flag_images/Austria.png
Downloaded: flag_images/Azerbaijan.png
Downloaded: flag_images/Bahamas.png
Downloaded: flag_images/Bahrain.png
Downloaded: flag_images/Bangladesh.png
Downloaded: flag_images/Barbados.png
Downloaded: flag_images/Belarus.png
Downloaded: flag_images/Belgium.png
Downloaded: flag_images/Belize.png
Downloaded: flag_images/Benin.png
Downloaded: flag_images/Bhutan.png
Downloaded: flag_images/Bolivia.png
Downloaded: flag_images/Bosnia and Herzegovina.png
Downloaded: flag_images/Botswana.png
Downloaded: flag_images/Brazil.png
Downloaded: flag_images/Brunei Darussalam.png
Downloaded: flag_images/Bulgaria.png
Downloaded: flag