In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# 1.&nbsp; Scraping Basic Information



## 1.1.&nbsp; Scraping country, latitude and longitude of each city


### Berlin

In [2]:
# Fetch the Berlin Wikipedia article and parse it into berlin_soup
headers = {'User-Agent': 'Chrome/139.0.0.0'}
url = 'https://www.wikipedia.org/wiki/Berlin'

# Send the request with the User-Agent header above, then parse the raw response bytes
response = requests.get(url=url, headers=headers)
berlin_soup = BeautifulSoup(markup=response.content, features='html.parser')

In [None]:
# Preview only the beginning of the prettified HTML: the full page is very large,
# and printing it (instead of showing the string repr) keeps the indentation readable.
print(berlin_soup.prettify()[:2000])

In [None]:
# List every element on the page whose CSS class is "infobox-data"
berlin_soup.find_all(class_="infobox-data")

In [5]:
# find() returns only the first element with the class "infobox-data"
berlin_soup.find(class_="infobox-data")

<td class="infobox-data">Germany</td>

In [8]:
# Extract only the text content of the first "infobox-data" element (drops the <td> tag)
berlin_soup.find(class_="infobox-data").get_text()

'Germany'

In [None]:
# Keep the country name in a variable so it can be reused later
country_cell = berlin_soup.find(class_="infobox-data")
berlin_country = country_cell.get_text()
berlin_country

'Germany'

In [25]:
# BERLIN

# Fetch and parse the Berlin article in this cell, so the berlin_* values below
# come from a soup built here rather than from one left over by another cell.
url = 'https://www.wikipedia.org/wiki/Berlin'
headers = {'User-Agent': 'Chrome/139.0.0.0'}

response = requests.get(url, headers=headers)
berlin_soup = BeautifulSoup(response.content, 'html.parser')

# Country is the text of the first "infobox-data" element; the coordinates are the
# text of the first elements carrying the "latitude" and "longitude" classes.
berlin_country = berlin_soup.find(class_="infobox-data").get_text()
berlin_latitude = berlin_soup.find(class_="latitude").get_text()
berlin_longitude = berlin_soup.find(class_="longitude").get_text()

berlin_country, berlin_latitude, berlin_longitude

('Germany', '52°31′12″N', '13°24′18″E')

In [9]:
# HAMBURG

headers = {'User-Agent': 'Chrome/139.0.0.0'}
url = 'https://www.wikipedia.org/wiki/Hamburg'

# Download and parse the Hamburg article
response = requests.get(url=url, headers=headers)
hamburg_soup = BeautifulSoup(markup=response.content, features='html.parser')

# Look up the three values in one pass: each is the text of the first element
# carrying the given CSS class (country, then latitude, then longitude)
hamburg_country, hamburg_latitude, hamburg_longitude = (
    hamburg_soup.find(class_=css_class).get_text()
    for css_class in ("infobox-data", "latitude", "longitude")
)

hamburg_country, hamburg_latitude, hamburg_longitude

('Germany', '53°33′N', '10°00′E')

In [11]:
# MUNICH

headers = {'User-Agent': 'Chrome/139.0.0.0'}
url = 'https://www.wikipedia.org/wiki/Munich'

# Download and parse the Munich article
response = requests.get(url=url, headers=headers)
munich_soup = BeautifulSoup(markup=response.content, features='html.parser')

# Look up the three values in one pass: each is the text of the first element
# carrying the given CSS class (country, then latitude, then longitude)
munich_country, munich_latitude, munich_longitude = (
    munich_soup.find(class_=css_class).get_text()
    for css_class in ("infobox-data", "latitude", "longitude")
)

munich_country, munich_latitude, munich_longitude


('Germany', '48°08′15″N', '11°34′30″E')

In [23]:
# Making a loop: the per-city cells above differ only in the city name,
# so the same steps are repeated here for every entry in `cities`.

cities = ["Berlin", "Munich", "Hamburg"]
headers = {'User-Agent': 'Chrome/134.0.0.0'}

# three parallel lists, filled in the same order as `cities`
countries, latitudes, longitudes = [], [], []

for city in cities:
    # get the soup for the city
    response = requests.get(f"https://www.wikipedia.org/wiki/{city}", headers=headers)
    city_soup = BeautifulSoup(response.content, 'html.parser')

    # extract the data: text of the first element with each CSS class
    city_country, city_latitude, city_longitude = (
        city_soup.find(class_=css_class).get_text()
        for css_class in ("infobox-data", "latitude", "longitude")
    )

    # append data to the lists
    countries.append(city_country)
    latitudes.append(city_latitude)
    longitudes.append(city_longitude)


# show what was collected

print(f"The cities are in the following countries: {countries}")
print(f"The cities have the following latitudes: {latitudes}")
print(f"The cities have the following longitudes: {longitudes}")

The cities are in the following countries: ['Germany', 'Germany', 'Germany']
The cities have the following latitudes: ['52°31′12″N', '48°08′15″N', '53°33′N']
The cities have the following longitudes: ['13°24′18″E', '11°34′30″E', '10°00′E']


# 2.&nbsp; Data Organisation


## 2.1.&nbsp; Creating a DataFrame

In [16]:
# Assemble the four parallel lists into one table, one column per list
column_names = ["City", "Country", "Latitude", "Longitude"]
column_values = [cities, countries, latitudes, longitudes]
cities_df = pd.DataFrame(dict(zip(column_names, column_values)))

cities_df

Unnamed: 0,City,Country,Latitude,Longitude
0,Berlin,Germany,52°31′12″N,13°24′18″E
1,Munich,Germany,48°08′15″N,11°34′30″E
2,Hamburg,Germany,53°33′N,10°00′E


In [20]:
cities = ["Berlin", "Munich", "Hamburg"]
headers = {'User-Agent': 'Chrome/134.0.0.0'}

# one list of per-city records replaces the three separate lists used before
city_data = []

for city in cities:
    response = requests.get(f"https://www.wikipedia.org/wiki/{city}", headers=headers)
    city_soup = BeautifulSoup(response.content, 'html.parser')

    # extract the relevant information: text of the first element with each CSS class
    country, city_latitude, city_longitude = (
        city_soup.find(class_=css_class).get_text()
        for css_class in ("infobox-data", "latitude", "longitude")
    )

    # each city contributes one dictionary, which becomes one row of the table
    record = {
        "City": city,
        "Country": country,
        "Latitude": city_latitude,
        "Longitude": city_longitude,
    }
    city_data.append(record)

cities_df = pd.DataFrame(city_data)
cities_df


Unnamed: 0,City,Country,Latitude,Longitude
0,Berlin,Germany,52°31′12″N,13°24′18″E
1,Munich,Germany,48°08′15″N,11°34′30″E
2,Hamburg,Germany,53°33′N,10°00′E


In [None]:
#Here we will use [a python library](https://pypi.org/project/lat-lon-parser/) that converts latitude and longitude to decimal:

!pip install lat-lon-parser




In [26]:
# testing for Berlin: convert the scraped latitude string to a decimal number

from lat_lon_parser import parse

berlin_latitude_decimal = parse(berlin_latitude)
berlin_latitude_decimal

52.519999999999996

## 2.2.&nbsp; Wrapping the code in a function

In [28]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from lat_lon_parser import parse    # for decimal coordinates


def cities_dataframe(cities):
    """Scrape the country and decimal coordinates of each city from Wikipedia.

    Parameters
    ----------
    cities : iterable of str
        City names; each one is used as the article title in
        ``https://www.wikipedia.org/wiki/{city}``.

    Returns
    -------
    pandas.DataFrame
        One row per city with the columns ``City``, ``Country``, ``Latitude``
        and ``Longitude``. Latitude and longitude are the values returned by
        ``lat_lon_parser.parse``. The columns are present even when ``cities``
        is empty.

    Raises
    ------
    requests.HTTPError
        If a city page is answered with an HTTP error status.
    requests.Timeout
        If a request does not complete within the timeout.
    AttributeError
        If a page has no element with the class ``infobox-data``,
        ``latitude`` or ``longitude`` (``find`` then returns ``None``).
    """
    columns = ["City", "Country", "Latitude", "Longitude"]
    # the same header is sent with every request, so build it once
    headers = {'User-Agent': 'Chrome/134.0.0.0'}

    city_data = []

    for city in cities:
        url = f"https://www.wikipedia.org/wiki/{city}"

        # a timeout keeps a stalled connection from hanging the notebook, and
        # raise_for_status reports HTTP errors instead of parsing an error page
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        city_soup = BeautifulSoup(response.content, 'html.parser')

        # extract the relevant information: text of the first element with each class
        city_latitude = city_soup.find(class_="latitude").get_text()
        city_longitude = city_soup.find(class_="longitude").get_text()
        country = city_soup.find(class_="infobox-data").get_text()

        # keep track of data per city
        city_data.append({"City": city,
                          "Country": country,
                          "Latitude": parse(city_latitude),    # latitude in decimal format
                          "Longitude": parse(city_longitude),  # longitude in decimal format
                          })

    # passing the column names keeps the schema stable for an empty `cities`
    return pd.DataFrame(city_data, columns=columns)



# Build the table for a first set of cities by calling the function
list_of_cities = ["Berlin", "Hamburg", "Cologne"]
cities_df = cities_dataframe(cities=list_of_cities)

cities_df


Unnamed: 0,City,Country,Latitude,Longitude
0,Berlin,Germany,52.52,13.405
1,Hamburg,Germany,53.55,10.0
2,Cologne,Germany,50.936389,6.952778


In [29]:
# The same function works unchanged for a different list of cities
new_cities = ["Munich", "Amsterdam", "Paris"]

cities_dataframe(cities=new_cities)

Unnamed: 0,City,Country,Latitude,Longitude
0,Munich,Germany,48.1375,11.575
1,Amsterdam,Netherlands,52.372778,4.893611
2,Paris,France,48.856667,2.352222
