In [39]:
import helpers as h
import json
import numpy as np
import os
import pandas as pd
import sqlite3
import sys

from tqdm import tqdm

Load population figures for European countries. Data source: https://worldpopulationreview.com/continents/europe/cities

The aim of the project is to compare the standard of living in European cities. Because only a small part of Russia's territory lies within Europe, Russian cities were excluded from further analysis.

In [43]:
# Read the raw population table from the working directory.
population_df = pd.read_csv('table_data.csv')

# Order rows by country (A-Z) and, within each country, by population
# (largest first); afterwards drop every row whose country is Russia.
population_sorted_df = population_df.sort_values(
    by=['country', 'population'],
    ascending=[True, False],
)
is_not_russia = population_sorted_df['country'] != 'Russia'
selected_country_df = population_sorted_df.loc[is_not_russia]

# Persist the filtered table; the DataFrame index is written as the first
# column because to_csv is called with its defaults.
selected_countries_path = os.path.join('data', 'selected_countries.csv')
selected_country_df.to_csv(selected_countries_path)

For every country in the population table from the previous step, the list of cities for which data is available is downloaded from numbeo.com.

In [41]:
# Base URL of Numbeo's cost-of-living section; the country and city pages
# requested below are built from it.
numbeo_url = 'https://www.numbeo.com/cost-of-living'
# Landing page fetched through the project helper. It is not used in this
# cell; kept because later cells may rely on the name.
numbeo = h.get_content_url(page_url=numbeo_url)

# Distinct country names taken from the filtered population table.
europe_country = selected_country_df.country.unique()

# Collect (country, city) pairs from the "city" drop-down on each country page.
country_city = []
# Countries whose result page had no city drop-down, reported after the loop.
missing_countries = []
for country in tqdm(europe_country):

    country_url = f'{numbeo_url}/country_result.jsp?country={country}'
    # NOTE(review): presumably returns a BeautifulSoup document - confirm in helpers.
    country_soup = h.get_content_url(page_url=country_url)

    city_select = country_soup.find("select", {"name": "city"})
    if city_select is None:
        # find() yields None when no <select name="city"> exists (for example
        # when the country name is unknown to the site). Skip the country
        # instead of aborting the whole crawl with an AttributeError.
        missing_countries.append(country)
        continue

    for city in city_select.find_all('option'):
        # The first option is a placeholder prompt, not a real city.
        if city.text != '--- Select city---':
            country_city.append((country, city.text.strip()))

if missing_countries:
    print(f'No city list found for: {missing_countries}')

country_city_df = pd.DataFrame(country_city, columns=('country', 'city'))
country_city_df.head(5)

  0%|          | 0/36 [00:00<?, ?it/s]  3%|▎         | 1/36 [00:00<00:28,  1.25it/s]  6%|▌         | 2/36 [00:01<00:27,  1.24it/s]  8%|▊         | 3/36 [00:02<00:26,  1.25it/s] 11%|█         | 4/36 [00:03<00:28,  1.13it/s] 14%|█▍        | 5/36 [00:04<00:28,  1.11it/s] 17%|█▋        | 6/36 [00:05<00:26,  1.15it/s] 19%|█▉        | 7/36 [00:06<00:24,  1.16it/s] 22%|██▏       | 8/36 [00:06<00:23,  1.19it/s] 25%|██▌       | 9/36 [00:07<00:23,  1.15it/s] 28%|██▊       | 10/36 [00:08<00:24,  1.07it/s] 31%|███       | 11/36 [00:09<00:22,  1.12it/s] 33%|███▎      | 12/36 [00:10<00:20,  1.17it/s] 36%|███▌      | 13/36 [00:11<00:20,  1.12it/s] 39%|███▉      | 14/36 [00:12<00:20,  1.08it/s] 42%|████▏     | 15/36 [00:13<00:18,  1.12it/s] 44%|████▍     | 16/36 [00:14<00:17,  1.14it/s] 47%|████▋     | 17/36 [00:15<00:17,  1.09it/s] 50%|█████     | 18/36 [00:15<00:16,  1.07it/s] 53%|█████▎    | 19/36 [00:16<00:15,  1.07it/s] 56%|█████▌    | 20/36 [00:17<00:14,  1.12it/s] 58%|█████

Unnamed: 0,country,city
0,Albania,Apollonia
1,Albania,Ballsh
2,Albania,Berat
3,Albania,Burrel
4,Albania,Dhermi


In [44]:
# Save the (country, city) table next to the other intermediate files;
# the DataFrame index is written as well (to_csv defaults).
countries_cities_path = os.path.join('data', 'countries_cities.csv')
country_city_df.to_csv(countries_cities_path)

The data for each city is downloaded from numbeo.com and stored in a nested dictionary of the form `{country: {city: data}}`. During data acquisition, a problem with local currencies was noticed, so all prices are downloaded in euros (EUR).

In [None]:
# Nested result: {country: {city: table returned by h.get_data_from_table}}.
country_city_dict = {}
# (country, city) pairs for which no price page could be resolved.
skipped_cities = []

# Select the two columns by name so the unpacking below does not depend on
# the frame's index or on any extra columns.
city_rows = list(
    country_city_df[['country', 'city']].itertuples(index=False, name=None)
)

for country, city in tqdm(city_rows):
    city_url = f'{numbeo_url}/in/{city}'
    city_soup = h.get_content_url(city_url)

    # Reset on every iteration: otherwise a failed lookup below would silently
    # reuse the previous city's URL and store its data under this city.
    city_url_eur = None

    first_paragraph = city_soup.find('p')
    is_disambiguation_page = (
        first_paragraph is not None
        and 'These cities seems to have a name similar' in first_paragraph.text
    )
    if is_disambiguation_page:
        # Several cities share this name: follow the link whose text mentions
        # the current country. 'a[href]' restricts the match to anchors that
        # actually carry an href, so a['href'] cannot raise KeyError.
        # If several links match, the last one wins.
        for a in city_soup.select('p a[href]'):
            if a.text and country in a.text:
                city_url_eur = a['href'] + '?displayCurrency=EUR'
    else:
        city_url_eur = city_url + '?displayCurrency=EUR'

    if city_url_eur is None:
        # Disambiguation page without a link for this country - nothing to fetch.
        tqdm.write(f'Skipping {city} ({country}): no matching city link found')
        skipped_cities.append((country, city))
        continue

    # Second request: same page with prices converted to EUR.
    city_soup = h.get_content_url(city_url_eur)

    # Fetch (or create) the per-country dictionary, then store this city's table.
    city_dict = country_city_dict.setdefault(country, {})

    city_dict[city] = h.get_data_from_table(
        city_soup,
        converters={
            'Restaurants': str,
            'Edit': str,
            'Range': str
        }
    ).rename(
        columns={
            'Restaurants': 'Type',
            'Edit': 'Avg_price',
            'Range': 'Range_price'
        }
    )

Because retrieving the data takes a long time, the result is saved to a .json file. This way we keep a backup of the data and do not have to download it again.

In [None]:
# Build a JSON-serialisable mirror of country_city_dict with the same
# {country: {city: ...}} nesting; every stored table is converted through its
# own to_json() and parsed back into plain Python containers.
city_json = {
    country_name: {
        city_name: json.loads(city_table.to_json())
        for city_name, city_table in cities.items()
    }
    for country_name, cities in country_city_dict.items()
}

# Write the whole structure to a single backup file.
numbeo_json_path = os.path.join('data', 'numbeo.json')
with open(numbeo_json_path, 'w') as json_file:
    json.dump(city_json, json_file)