In [2]:
pip install requests beautifulsoup4 # Installing the libraries

Note: you may need to restart the kernel to use updated packages.


In [22]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract_data(url):
    # GET request to the webpage
    response = requests.get(url)
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')
        # Find all list items with class 'country_landing_list_item'
        countries = soup.find_all('li', class_='country_landing_list_item')

        # Initialize lists to store data
        country_names = []
        image_urls = []
        populations = []

        for country in countries:
            # Extract and store country name
            country_name = country.find('h3').text.strip()
            country_names.append(country_name)
            
            # Extract and construct the full URL for the flag image
            image_url = country.find('picture').find('img')['src']
            full_image_url = f"https://www.worldatlas.com{image_url}" if image_url.startswith('/') else image_url
            image_urls.append(full_image_url)
            
            # Extract population data and remove commas
            population_data = country.find('table').find('tr').find('td').text.strip().replace(',', '')
            # Try to convert population data to integer, handle exceptions
            try:
                population = int(population_data)
            except ValueError:
                # Assign None if conversion fails
                population = None  
            populations.append(population)

        # Create a DataFrame from the collected data
        data = {
            "Country": country_names,
            "Image": image_urls,
            "Population": populations
        }
        df = pd.DataFrame(data)
        return df
    else:
        # Print an error message if the web page is not successfully retrieved
        print("Failed to retrieve the webpage")
        return None

# URL of the web page to scrape
url = 'https://www.worldatlas.com/countries'
# Execute data extraction function
countries_df = extract_data(url)

# Check and display the resulting DataFrame, then save it to a CSV file
if countries_df is not None:
    print(countries_df)
    # Save the DataFrame to a CSV file
    countries_df.to_csv(r'Z:\Programas\Anaconda\Notebooks\Proyectos propios\Wordl Analysis\countries_data.csv', index=False)
    
# Show the DataFrame
df.head()  



                      Country  \
0                 Afghanistan   
1                     Albania   
2                     Algeria   
3                     Andorra   
4                      Angola   
..                        ...   
220                   Tokelau   
221  Turks and Caicos Islands   
222       U.S. Virgin Islands   
223         Wallis and Futuna   
224            Western Sahara   

                                                 Image  Population  
0    https://www.worldatlas.com/r/w236/img/flag/af-...  33332025.0  
1    https://www.worldatlas.com/r/w236/img/flag/al-...   3038594.0  
2    https://www.worldatlas.com/r/w236/img/flag/dz-...  40263711.0  
3    https://www.worldatlas.com/r/w236/img/flag/ad-...     85660.0  
4    https://www.worldatlas.com/r/w236/img/flag/ao-...  25789024.0  
..                                                 ...         ...  
220  https://www.worldatlas.com/r/w236/img/flag/tk-...      1285.0  
221  https://www.worldatlas.com/r/w236/img/flag/tc-

Unnamed: 0,Country,Image,Population
0,Afghanistan,https://www.worldatlas.com/r/w236/img/flag/af-...,33332025
1,Albania,https://www.worldatlas.com/r/w236/img/flag/al-...,3038594
2,Algeria,https://www.worldatlas.com/r/w236/img/flag/dz-...,40263711
3,Andorra,https://www.worldatlas.com/r/w236/img/flag/ad-...,85660
4,Angola,https://www.worldatlas.com/r/w236/img/flag/ao-...,25789024
