# Extracting one table

In [53]:
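# NOTE: Superseded draft kept for reference only. It fetches just the full-year
# table for each year; the active cell under "Extracting all tables" also
# requests the mid-year and Q1 editions.
# import requests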
# from bs4 import BeautifulSoup
# import pandas as pd

# # Setting a number of columns to display
# pd.set_option('display.max_columns', None)   # Set to None to display all columns
# pd.set_option('display.width', 1000)         # Set width to a high value to display columns next to each other

# # Define the range of years to extract data
# start_year = 2014
# end_year = 2023

# # Create an empty list to store the extracted tables
# dfs = []

# # Loop over the years
# for year in range(start_year, end_year + 1):
#     # Define the URL for the full year table
#     url = f"https://www.numbeo.com/quality-of-life/rankings_by_country.jsp?title={year}"

#     # Send a GET request to the URL
#     response = requests.get(url)

#     # Create a BeautifulSoup object with the response text
#     soup = BeautifulSoup(response.text, 'html.parser')

#     # Find the table with the specified ID
#     table = soup.find('table', {'id': 't2'})

#     # Extract the column names from the table header
#     header_row = table.find('thead').find('tr')
#     headers = [header.text.strip() for header in header_row.find_all('th')]
#     headers = headers[1:]  # Remove the first column

#     # Extract the data rows from the table
#     data = []
#     for row in table.find_all('tbody'):
#         for tr in row.find_all('tr'):
#             row_data = [td.text.strip() for td in tr.find_all('td')]
#             if len(row_data) > 0:
#                 data.append(row_data[1:])  # Remove the first empty column

#     # Combine headers list with item list
#     combined_data = [headers] + data

#     # Create a Pandas DataFrame from the list but exclude the header
#     df = pd.DataFrame(combined_data[1:], columns=combined_data[0])

#     # Add a column with the year
#     df.insert(0, 'Year', year)

#     # Append the DataFrame to the list
#     dfs.append(df)

# # Concatenate all DataFrames into a single DataFrame
# result_df = pd.concat(dfs, ignore_index=True)

# # Print the resulting DataFrame
# print(result_df)

# # Export that pandas dataframe to a csv file
# result_df.to_csv('numbeo.csv', index=False)


# Extracting all tables

In [54]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Define the range of years to extract data (both bounds inclusive)
start_year = 2012
end_year = 2023

# Define the URL pattern; the placeholder receives the edition title
# (for example "2020", "2020-mid" or "2020-Q1")
url_pattern = "https://www.numbeo.com/quality-of-life/rankings_by_country.jsp?title={}"

# Edition titles requested for every year: full year, mid year, and first quarter
edition_patterns = ['{}', '{}-mid', '{}-Q1']

# Seconds to wait for the server before giving up; without a timeout a stalled
# connection would block the kernel indefinitely
REQUEST_TIMEOUT = 30


def _parse_ranking_table(html):
    """Parse the ranking table (id ``t2``) out of a page's HTML.

    Parameters
    ----------
    html : str
        Raw HTML text of a rankings page.

    Returns
    -------
    pandas.DataFrame or None
        One row per non-empty ``<tr>`` found in the table's ``<tbody>``
        elements, with columns named after the ``<th>`` cells of the first
        header row. All values are the stripped cell text (strings). The
        first header cell and the first cell of every row are dropped.
        ``None`` is returned when the table or its header row is missing.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Find the table with the specified ID
    table = soup.find('table', {'id': 't2'})
    if table is None:
        return None

    # Extract the column names from the table header
    thead = table.find('thead')
    header_row = thead.find('tr') if thead is not None else None
    if header_row is None:
        return None
    headers = [th.text.strip() for th in header_row.find_all('th')]
    headers = headers[1:]  # Remove the first column

    # Extract the data rows from the table
    data = []
    for tbody in table.find_all('tbody'):
        for tr in tbody.find_all('tr'):
            row_data = [td.text.strip() for td in tr.find_all('td')]
            if row_data:
                data.append(row_data[1:])  # Remove the first column to match the headers

    return pd.DataFrame(data, columns=headers)


# Create an empty list to store the extracted tables
dfs = []

# URLs that yielded no usable table; kept for inspection instead of failing silently
skipped_urls = []

# Loop over the years
for year in range(start_year, end_year + 1):
    # Loop over the edition titles for the current year
    for url_type in edition_patterns:
        # Create the URL for the current year and edition
        url = url_pattern.format(url_type.format(year))

        # Send a GET request to the URL and fail loudly on HTTP errors
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()

        df = _parse_ranking_table(response.text)

        # Editions without a table or without any rows contribute nothing;
        # leaving them out also keeps empty frames away from pd.concat
        if df is None or df.empty:
            skipped_urls.append(url)
            continue

        # Add a column with the year type: the plain-year edition is labelled
        # 'Preliminary', the '-mid' and '-Q1' editions are labelled 'Revised'
        year_type = 'Preliminary' if url_type == '{}' else 'Revised'
        df.insert(0, 'Year Type', year_type)

        # Add a column with the year
        df.insert(1, 'Year', year)

        # Append the DataFrame to the list
        dfs.append(df)

# Concatenate all DataFrames into a single DataFrame; fall back to an empty
# frame carrying the two key columns so that later cells can still filter on them
if dfs:
    result_df = pd.concat(dfs, ignore_index=True)
else:
    result_df = pd.DataFrame(columns=['Year Type', 'Year'])

# Print the resulting DataFrame
print(result_df)

        Year Type  Year               Country Quality of Life Index Purchasing Power Index Safety Index Health Care Index Cost of Living Index Property Price to Income Ratio Traffic Commute Time Index Pollution Index Climate Index
0         Revised  2012           Switzerland                 194.1                  127.4         74.2              79.5                148.7                            6.5                       30.9            35.5             -
1         Revised  2012               Germany                 184.4                  114.2         78.8              72.0                 92.8                            4.8                       33.1            45.1             -
2         Revised  2012                Norway                 183.4                   94.2         80.9              79.0                166.6                            7.1                       18.5            19.2             -
3         Revised  2012  United Arab Emirates                 177.1         

In [55]:
# Keep the rows labelled 'Revised' for every year, plus every row of the
# final year (end_year) regardless of its label
is_revised = result_df['Year Type'].eq('Revised')
is_last_year = result_df['Year'].eq(end_year)
result_df = result_df.loc[is_revised | is_last_year]

In [56]:
# Show the filtered table as the cell's output (the notebook abbreviates long frames)
result_df

Unnamed: 0,Year Type,Year,Country,Quality of Life Index,Purchasing Power Index,Safety Index,Health Care Index,Cost of Living Index,Property Price to Income Ratio,Traffic Commute Time Index,Pollution Index,Climate Index
0,Revised,2012,Switzerland,194.1,127.4,74.2,79.5,148.7,6.5,30.9,35.5,-
1,Revised,2012,Germany,184.4,114.2,78.8,72.0,92.8,4.8,33.1,45.1,-
2,Revised,2012,Norway,183.4,94.2,80.9,79.0,166.6,7.1,18.5,19.2,-
3,Revised,2012,United Arab Emirates,177.1,134.9,64.1,63.9,80.8,4.5,17.0,69.2,-
4,Revised,2012,New Zealand,174.3,88.6,53.2,86.8,107.4,5.7,25.2,17.1,-
...,...,...,...,...,...,...,...,...,...,...,...,...
1500,Preliminary,2023,Iran,73.8,21.1,50.1,52.3,35.2,24.4,47.3,75.3,63.7
1501,Preliminary,2023,Sri Lanka,72.8,16.0,58.6,72.0,25.3,40.8,56.9,59.4,59.1
1502,Preliminary,2023,Venezuela,72.6,12.4,17.4,39.3,41.6,18.9,33.1,75.7,99.9
1503,Preliminary,2023,Bangladesh,70.1,26.9,37.4,42.3,26.6,12.6,57.4,85.1,72.9


In [57]:
# Write the filtered table to a CSV file, leaving out the DataFrame index column
output_path = 'numbeo.csv'
result_df.to_csv(output_path, index=False)