# Web Scraping - City Data 

In [None]:
# !pip install lat-lon-parser
# !pip install sqlalchemy
# !pip install pymysql
# !pip install python-dotenv

In [None]:
import re
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup
from lat_lon_parser import parse

## 1. Get data for country, latitude, longitude

In [None]:
# Berlin Example
url = 'https://en.wikipedia.org/wiki/Berlin'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Stop on an HTTP error instead of parsing an error page in the cells below.
response.raise_for_status()
berlin_soup = BeautifulSoup(response.content, 'html.parser')

In [None]:
# The city's country is stored in an element with the class "infobox-data".
# List every element with that class to see at which position the country occurs.
berlin_soup.find_all(class_="infobox-data")

In [None]:
# Country of the city: text of the first element carrying the "infobox-data" class
first_infobox_cell = berlin_soup.find(class_="infobox-data")
berlin_country = first_infobox_cell.get_text(strip=True)
berlin_country

In [None]:
# Coordinates are read from the first elements with the classes "latitude" and "longitude"
berlin_latitude, berlin_longitude = (
    berlin_soup.find(class_=css_class).get_text()
    for css_class in ('latitude', 'longitude')
)
berlin_latitude, berlin_longitude

In [None]:
# convert the scraped latitude/longitude text to decimal coordinates
# (only the latitude is converted here as an example)
from lat_lon_parser import parse
parse(berlin_latitude)

In [None]:
# Create a function to extract country, latitude, longitude data from Wikipedia page for a list of cities
import pandas as pd
import requests
from bs4 import BeautifulSoup
from lat_lon_parser import parse    # for decimal coordinates

def cities_dataframe(cities):
    """Scrape country and coordinates for each city from its English Wikipedia page.

    Parameters
    ----------
    cities : iterable of str
        City names as they appear in the Wikipedia article URL (e.g. "Berlin").

    Returns
    -------
    pandas.DataFrame
        One row per city with the columns 'city', 'country', 'latitude' and
        'longitude'. Coordinates are converted to decimal degrees. A value is
        None when the corresponding element is not found on the page. The
        columns are present even when ``cities`` is empty.

    Raises
    ------
    requests.HTTPError
        If Wikipedia answers with an HTTP error status for a city.
    """
    columns = ['city', 'country', 'latitude', 'longitude']
    cities_data = []

    for city in cities:
        # Same host as the single-city examples in this notebook.
        url = f'https://en.wikipedia.org/wiki/{city}'
        response = requests.get(url, timeout=30)
        # Fail loudly on 4xx/5xx instead of scraping an error page.
        response.raise_for_status()
        city_soup = BeautifulSoup(response.content, 'html.parser')

        # The country is taken from the first element with class "infobox-data".
        country_tag = city_soup.find(class_="infobox-data")
        latitude_tag = city_soup.find(class_="latitude")
        longitude_tag = city_soup.find(class_="longitude")

        # find() returns None when nothing matches, so guard each lookup
        # rather than crashing with AttributeError on .get_text().
        cities_data.append({
            'city': city,
            'country': country_tag.get_text(strip=True) if country_tag else None,
            'latitude': parse(latitude_tag.get_text()) if latitude_tag else None,
            'longitude': parse(longitude_tag.get_text()) if longitude_tag else None,
        })

    # Passing the column list keeps the schema stable for empty input.
    return pd.DataFrame(cities_data, columns=columns)

In [None]:
# Build the cities data frame with cities_dataframe (one Wikipedia request per city)
cities = ["Berlin", "Hamburg", "Munich"]

cities_df = cities_dataframe(cities)
cities_df

## 2. Get data for population with time stamp

In [None]:
# Berlin Example
url = 'https://en.wikipedia.org/wiki/Berlin'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Stop on an HTTP error instead of parsing an error page in the cells below.
response.raise_for_status()
berlin_soup = BeautifulSoup(response.content, 'html.parser')

In [None]:
# Locate the text node 'Population'; the figure is read from the first <td> after it.
# berlin_population stays None when no such text node exists.
population_header = berlin_soup.find(string='Population')
berlin_population = (
    population_header.find_next('td').get_text(strip=True)
    if population_header
    else None
)

In [None]:
# convert population into clean integer
# berlin_population is None when no 'Population' header was found; keep None
# in that case instead of failing on None.replace(...).
berlin_population_clean = (
    int(berlin_population.replace(",", "")) if berlin_population is not None else None
)
berlin_population_clean

In [None]:
# Create a function to extract population data with a timestamp from the Wikipedia page of each city in a list

import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime

def populations_dataframe(cities):
    """Scrape the population of each city from its English Wikipedia page.

    Parameters
    ----------
    cities : iterable of str
        City names as they appear in the Wikipedia article URL (e.g. "Berlin").

    Returns
    -------
    pandas.DataFrame
        One row per city with the columns 'city', 'population' and
        'population_timestamp'. 'population' is None/NaN when no population
        figure is found on the page. 'population_timestamp' is today's date at
        midnight as a datetime64 value. The columns are present even when
        ``cities`` is empty.

    Raises
    ------
    requests.HTTPError
        If Wikipedia answers with an HTTP error status for a city.
    """
    columns = ["city", "population", "population_timestamp"]

    # One timestamp for the whole run, kept as a real datetime value.
    # Formatting it as "dd.mm.yyyy" and re-parsing it without a format would
    # swap day and month whenever the day is 12 or lower.
    today = pd.Timestamp(datetime.today().date())

    population_data = []

    for city in cities:
        # Same host as the single-city examples in this notebook.
        url = f"https://en.wikipedia.org/wiki/{city}"
        response = requests.get(url, timeout=30)
        # Fail loudly on 4xx/5xx instead of scraping an error page.
        response.raise_for_status()
        city_soup = BeautifulSoup(response.content, 'html.parser')

        # The number is in the first 'td' tag after the text node 'Population'.
        population_header = city_soup.find(string='Population')
        population_cell = population_header.find_next('td') if population_header else None

        population_clean = None
        if population_cell is not None:
            # Take the first run of digits/commas so trailing footnote markers
            # such as "[1]" do not break the integer conversion.
            match = re.search(r"\d[\d,]*", population_cell.get_text(strip=True))
            if match:
                population_clean = int(match.group().replace(",", ""))

        # append a dictionary of values to the list
        population_data.append({"city": city,
                                "population": population_clean,
                                "population_timestamp": today})

    # Passing the column list keeps the schema stable for empty input.
    population_df = pd.DataFrame(population_data, columns=columns)
    # No-op for real timestamps; gives the empty frame a datetime64 column too.
    population_df['population_timestamp'] = pd.to_datetime(population_df['population_timestamp'])

    return population_df

In [None]:
# Build the population data frame with populations_dataframe (one Wikipedia request per city)
cities = ["Berlin", "Hamburg", "Munich"]

populations_df = populations_dataframe(cities)
populations_df

## 3. Send scraped data to MySQL database

In [None]:
# Setup schema "city_infos" and tables in MySQL first

# Establish a connection with the MySQL database
# import libraries
import os
from urllib.parse import quote

from dotenv import load_dotenv
from sqlalchemy import create_engine, text

load_dotenv()

# Connection setup
schema = "city_infos"
host = "127.0.0.1"
user = "root"
password = os.getenv("MYSQL_PASSWORD")  # set MYSQL_PASSWORD in the environment or a .env file
port = 3306

# os.getenv returns None for an unset variable; without this check the URL
# would silently contain the literal password "None".
if password is None:
    raise RuntimeError("MYSQL_PASSWORD is not set; define it in the environment or in a .env file")

# Create connection string
# Percent-encode the password so characters such as '@', ':' or '/' are not
# misread as URL delimiters.
encoded_password = quote(password, safe="")
connection_string = f'mysql+pymysql://{user}:{encoded_password}@{host}:{port}/{schema}'

In [None]:
# Inspect column dtypes and non-null counts of cities_df
cities_df.info()

In [None]:
# Write the scraped city rows to the MySQL table "cities"
# NOTE: if_exists='append' inserts the rows again on every run of this cell
cities_df.to_sql(
    name='cities',
    con=connection_string,
    if_exists='append',
    index=False,
)

In [None]:
# Read the cities table back so each city comes with its city_id from the database
cities_in_db = pd.read_sql(sql="SELECT * FROM cities", con=connection_string)

# Left-join the scraped populations onto the stored cities by city name
merged_population_df = pd.merge(populations_df, cities_in_db, on="city", how="left")

# Keep only the columns that go into the population table
population_columns = ['city_id', 'population', 'population_timestamp']
final_population_df = merged_population_df[population_columns]
final_population_df

In [None]:
# Inspect column dtypes and non-null counts of final_population_df
final_population_df.info()

In [None]:
# Write the population rows to the MySQL table "population"
# NOTE: if_exists='append' inserts the rows again on every run of this cell
final_population_df.to_sql(
    name='population',
    con=connection_string,
    if_exists='append',
    index=False,
)