In [None]:
# colab has an older version of beautifulsoup by default
# here we upgrade it
# if you are working on your own computer, you can probably comment this step out and skip it
!pip install --upgrade beautifulsoup4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting beautifulsoup4
  Downloading beautifulsoup4-4.12.0-py3-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.2/132.2 KB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: beautifulsoup4
  Attempting uninstall: beautifulsoup4
    Found existing installation: beautifulsoup4 4.11.2
    Uninstalling beautifulsoup4-4.11.2:
      Successfully uninstalled beautifulsoup4-4.11.2
Successfully installed beautifulsoup4-4.12.0


In [None]:
# 1. import libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

In [None]:
# 2. find url and store it in a variable
url = "https://en.wikipedia.org/wiki/Berlin"

In [None]:
# 3. download html with a get request
response = requests.get(url)
response.status_code # 200 status code means OK!

200

In [None]:
# 4.1. parse html (create the 'soup')
soup = BeautifulSoup(response.content, "html.parser")
# 4.2. check that the html code looks like it should
# soup

In [None]:
# 5. retrieve/extract the desired info (here, you'll paste the "Selector" you copied before to get the element that belongs to the top movie)

# let's first try to get the name of the city
# by copying the selector we can see that it has the id firstHeading (it also has a class by the same name!)
soup.select("#firstHeading")

[<h1 class="firstHeading mw-first-heading" id="firstHeading"><span class="mw-page-title-main">Berlin</span></h1>]

In [None]:
soup.select("#firstHeading")[0].get_text()

'Berlin'

In [None]:
# Let's use this class, infobox-data, to target the information country
soup.select(".infobox-data")[0].get_text()

'Germany'

In [None]:
#soup.select(".infobox-data")[0].get_text()

Now we just carry on exploring the html, finding classes, ids, and selectors to target the information we need. Hopefully these classes and selectors will be universal across all cities on wikipedia, but it is likely that they will change in a few places, and we will have to try to make our code robust to this

In [None]:

cities = pd.DataFrame(data = {'city_id' : [1,2,3,4],'city' : ['Berlin', 'Hamburg', 'London', 'Frankfurt'], 'country_code' : ['DE', 'DE', 'UK', 'DE']})
cities

Unnamed: 0,city_id,city,country_code
0,1,Berlin,DE
1,2,Hamburg,DE
2,3,London,UK
3,4,Frankfurt,DE


In [None]:
def recreate_wiki(cities_list):
  # empty list that will be filled with one dictionary of information per city
  list_for_df = []
 
  
  # begin a for loop to create a dictionary of information for each city
  for city in range(len(cities_list)):
    
    # we can use the universal nature of wikipedias urls to our advantage here
    # all of the urls are the same besides the city name
    url = f'https://en.wikipedia.org/wiki/{cities_list[city]}'

    # here we make our soup for the city
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')

    # here we initialise our empty dictionary for the city
    response_dict = {}

    # here we fill the dictionary with information using the ids, classes, and selectors that we found in the html
    
    response_dict['city_id'] = cities['city_id'][city] 
    response_dict['city'] = soup.select(".firstHeading")[0].get_text()
    response_dict['country'] = soup.select(".infobox-data")[0].get_text()
    response_dict['latitude'] = soup.select(".latitude")[0].get_text()
    response_dict['longitude'] = soup.select(".longitude")[0].get_text()
    # not all of the wikipedia pages contain elevation, look at Hamburg
    # the if clause means that our code can continue and won't stop at this hurdle
    if soup.select_one('.infobox-label:-soup-contains("Elevation")'):
      response_dict['elevation'] = soup.select_one('.infobox-label:-soup-contains("Elevation")').find_next(class_='infobox-data').get_text()
    response_dict['website'] = soup.select_one('.infobox-label:-soup-contains("Website")').find_next(class_='infobox-data').get_text()
    if soup.select_one('th.infobox-header:-soup-contains("Population")'):
      response_dict['population'] = soup.select_one('th.infobox-header:-soup-contains("Population")').parent.find_next_sibling().find(text=re.compile(r'\d+'))
    
    # add our dictionary for the city to list_for_df
    list_for_df.append(response_dict)
  
  # make the DataFrame
  cities_df = pd.DataFrame(list_for_df)

  # fixing latitude
  cities_df['latitude'] = cities_df['latitude'].str.split('″').str[0].str.replace('°', '.', regex=False).str.replace('′', '', regex=False)
  # fixing longitude
  cities_df['longitude'] = cities_df['longitude'].str.split('″').str[0].str.replace('°', '.', regex=False).str.replace('′', '', regex=False)
  # fixing elevation
  cities_df.insert(4, 'elevation_in_meters', cities_df['elevation'].str.split('m').str[0].str.strip())

  # return the DataFrame
  return cities_df

In [None]:
list_of_cities = ['Berlin', 'Hamburg', 'London', 'Frankfurt']
city_population = recreate_wiki(list_of_cities)
city_population

  response_dict['population'] = soup.select_one('th.infobox-header:-soup-contains("Population")').parent.find_next_sibling().find(text=re.compile(r'\d+'))


Unnamed: 0,city_id,city,country,latitude,elevation_in_meters,longitude,elevation,website,population
0,1,Berlin,Germany,52.3112,34,13.2418,34 m (112 ft),berlin.de,3677472
1,2,Hamburg,Germany,53.33N,,10.00E,,hamburg.com,1906411
2,3,London,United Kingdom,51.3026,36 ft (11,0.739,36 ft (11 m),www.london.gov.uk,8799800
3,4,Frankfurt,Germany,50.0638,112,08.4056,112 m (367 ft),frankfurt.de,759224


In [None]:
city_population = city_population.drop(["elevation_in_meters"], axis = 1)


In [None]:
city_population.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   city_id     4 non-null      int64 
 1   city        4 non-null      object
 2   country     4 non-null      object
 3   latitude    4 non-null      object
 4   longitude   4 non-null      object
 5   elevation   3 non-null      object
 6   website     4 non-null      object
 7   population  4 non-null      object
dtypes: int64(1), object(7)
memory usage: 384.0+ bytes


In [None]:
city_population['population'] = city_population['population'].str.replace(',', '', regex=False)
city_population['population'] = pd.to_numeric(city_population['population'])

In [None]:
city_population.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   city_id     4 non-null      int64 
 1   city        4 non-null      object
 2   country     4 non-null      object
 3   latitude    4 non-null      object
 4   longitude   4 non-null      object
 5   elevation   3 non-null      object
 6   website     4 non-null      object
 7   population  4 non-null      int64 
dtypes: int64(2), object(6)
memory usage: 384.0+ bytes


In [None]:
city_population['latitude'] = city_population['latitude'].str.replace(r"[a-zA-Z]",'')
city_population['longitude'] = city_population['longitude'].str.replace(r"[a-zA-Z]",'')

  city_population['latitude'] = city_population['latitude'].str.replace(r"[a-zA-Z]",'')
  city_population['longitude'] = city_population['longitude'].str.replace(r"[a-zA-Z]",'')


In [None]:
city_population

Unnamed: 0,city_id,city,country,latitude,longitude,elevation,website,population
0,1,Berlin,Germany,52.3112,13.2418,34 m (112 ft),berlin.de,3677472
1,2,Hamburg,Germany,53.33,10.0,,hamburg.com,1906411
2,3,London,United Kingdom,51.3026,0.739,36 ft (11 m),www.london.gov.uk,8799800
3,4,Frankfurt,Germany,50.0638,8.4056,112 m (367 ft),frankfurt.de,759224


In [None]:
city_population.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   city_id     4 non-null      int64 
 1   city        4 non-null      object
 2   country     4 non-null      object
 3   latitude    4 non-null      object
 4   longitude   4 non-null      object
 5   elevation   3 non-null      object
 6   website     4 non-null      object
 7   population  4 non-null      int64 
dtypes: int64(2), object(6)
memory usage: 384.0+ bytes


In [None]:
city_population['latitude'] = pd.to_numeric(city_population['latitude'])
city_population['longitude'] = pd.to_numeric(city_population['longitude'])

In [None]:
city_population.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   city_id     4 non-null      int64  
 1   city        4 non-null      object 
 2   country     4 non-null      object 
 3   latitude    4 non-null      float64
 4   longitude   4 non-null      float64
 5   elevation   3 non-null      object 
 6   website     4 non-null      object 
 7   population  4 non-null      int64  
dtypes: float64(2), int64(2), object(4)
memory usage: 384.0+ bytes


In [None]:
city_population

Unnamed: 0,city_id,city,country,latitude,longitude,elevation,website,population
0,1,Berlin,Germany,52.3112,13.2418,34 m (112 ft),berlin.de,3677472
1,2,Hamburg,Germany,53.33,10.0,,hamburg.com,1906411
2,3,London,United Kingdom,51.3026,0.739,36 ft (11 m),www.london.gov.uk,8799800
3,4,Frankfurt,Germany,50.0638,8.4056,112 m (367 ft),frankfurt.de,759224
