In [109]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from lat_lon_parser import parse

# City Data Collection

In [110]:
# All the cities I want
cities = ['Berlin', 'Hamburg', 'Paris', 'Stockholm','Tokyo','Rome','Vienna','Innsbruck','Cape Town','Aix-en-Provence']

# All the columns I want to scrap
lats = []
lons = []
countries = []
languages = []
pops = []
pop_times = []
average_temps = []
timezones = []
sizes = []
mayors = []

In [111]:
# Filling in the lists with the wanted data from Wikipedia
for city in cities:
    # Creation of the soup
    url = f'https://en.wikipedia.org/wiki/{city}'

    response = requests.get(url)
    if response.status_code != 200:
        print('unable to retrieve content', response.status_code)
        break

    soup = BeautifulSoup(response.content, 'html.parser')

    # Finding the data
    try:
      lat = soup.find(class_='latitude').text
      lon = soup.find(class_='longitude').text
      country = soup.find(class_="infobox-label").find_next('td').text
      pop = soup.find(string='Population').find_next(string=re.compile(r'(\d+,)?\d+,\d+')).text
      pop_time = soup.find(string='Population').find_next(string=re.compile(r"(\d+)")).text
      # If there is no date with the population we take the year this code was created
      if("20" not in pop_time):
         pop_time = 2025
      
      size = size = soup.find(string="Area").find_next(class_="infobox-data").text
      timezone = soup.find(string="Time zone").find_next("a").text
      mayor = soup.find(class_="infobox").find_next(string=re.compile(r"Mayor|Governor")).find_next(class_="infobox-data").text
    except:
      lat=None
      lon=None
      country = None
      pop = None
      pop_time = None
      size = None
      timezone = None
      mayor = None
    
    # Fill the lists
    lats += [parse(lat)]
    lons += [parse(lon)]
    countries += [country]
    pops += [pop]
    pop_times += [pop_time]
    sizes += [size]
    timezones += [timezone]
    mayors += [mayor]

## Cleaning Citys

In [112]:
# Create the Dataframe with the newly filled lists
city_data = pd.DataFrame({
    'City_Name': cities,
    'Latitude':lats,
    'Longitude':lons,
    'Population': pops,
    "Population_From": pop_times,
    'Country': countries,
    "Size": sizes,
    "Timezone": timezones,
    "Mayor": mayors
})

In [113]:
city_data

Unnamed: 0,City_Name,Latitude,Longitude,Population,Population_From,Country,Size,Timezone,Mayor
0,Berlin,52.52,13.405,3596999,(2022 census),Germany,891.3 km2 (344.1 sq mi),UTC+01:00,Kai Wegner (CDU)
1,Hamburg,53.55,10.0,1964021,(2023-12-31),Germany,755.22 km2 (291.59 sq mi),UTC+1,Peter Tschentscher (SPD)
2,Paris,48.856667,2.352222,2048472,(Jan 2025,France,105.4 km2 (40.7 sq mi),UTC+01:00,Anne Hidalgo[1] (PS)
3,Stockholm,59.329444,18.068611,984748,(2023),Sweden,188 km2 (73 sq mi),UTC+1,Karin Wanngård (S)
4,Tokyo,35.689722,139.692222,14254039,(May 2025),Japan,"2,194 km2 (847 sq mi)",UTC+09:00,Yuriko Koike (indp.)
5,Rome,41.893333,12.482778,2746984,(2025),Italy[a],"1,285 km2 (496 sq mi)",UTC+01:00,Roberto Gualtieri (PD)
6,Vienna,48.208333,16.3725,2028499,(2025),Austria,414.78 km2 (160.15 sq mi),UTC+01:00,Michael Ludwig (SPÖ)
7,Innsbruck,47.268333,11.393333,132493,(2018-01-01),Austria,104.91 km2 (40.51 sq mi),UTC+1,Johannes Anzengruber
8,Cape Town,-33.925278,18.423889,433688,2025,South Africa,"2,461 km2 (950 sq mi)",UTC+2,Geordin Hill-Lewis (DA)
9,Aix-en-Provence,43.526389,5.445556,147933,(2022),France,186.083 km2 (71.847 sq mi),UTC+01:00,Sophie Joissains[1]


In [114]:
city_data.loc[city_data["Country"]==" Austria", "Country"] = "Austria"
city_data.loc[city_data["Country"]=="Italy[a]", "Country"] = "Italy"

In [115]:
city_df = city_data[["City_Name", "Latitude", "Longitude", "Size", "Timezone", "Mayor"]]

# Country Database

## Cleaning Countrys

Cleaning has to changed depending on the city that you are trying to get

In [116]:
# Cleaning Country table
country_unique = city_data["Country"]
country_df = pd.DataFrame({"Country_Name": country_unique})
country_df = country_df.drop([0, 6])
country_df = country_df.drop(9)
country_df.iloc[[4]] = "Italy"
country_df

Unnamed: 0,Country_Name
1,Germany
2,France
3,Sweden
4,Japan
5,Italy
7,Austria
8,South Africa


## Collecting Languages

In [117]:
# Establishing language per Country gathering
country_list = country_df["Country_Name"]

languages = []

In [118]:
# Gather the languages
for c in country_list:
    # Creation of the soup
    url = f'https://en.wikipedia.org/wiki/{c}'

    response = requests.get(url)
    if response.status_code != 200:
        print('unable to retrieve content', response.status_code)
        break

    soup = BeautifulSoup(response.content, 'html.parser')

    # Finding the data
    try:
      language= soup.find(class_="infobox").find_next(string=re.compile(r"Official")).find_next("a").text
    except:
      language= None
    
    # Fill the list
    languages += [language]

languages

['German',
 'French',
 'Swedish',
 'Japanese',
 'Italian',
 'German',
 '12 languages']

## Cleaning Languages

Cleaning languages has to be done depending on the city that was picked

In [119]:
# Language cleaning
country_df["Country_Language"] = languages
country_df["Country_Language"].iloc[[6]] = "Afrikaans"

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  country_df["Country_Language"].iloc[[6]] = "Afrikaans"


# Connecting to SQL

In [None]:
schema = "cities_database"
host = "localhost"
user = "root"
password = "YOUR PASSWORD"
port = 3306

connection_string = f'mysql+pymysql://{user}:{password}@{host}:{port}/{schema}'

Make sure that the tables in sql are created and empty\
Otherwise you will not be able to generate an id

In [121]:
city_df.to_sql('city_start',
                  if_exists='append',
                  con=connection_string,
                  index=False)

10

In [122]:
country_df.to_sql("country",
                  if_exists="append",
                  con=connection_string,
                  index=False)

7

In [123]:
cities_from_sql = pd.read_sql("city_start", con=connection_string)
cities_from_sql

Unnamed: 0,city_id,city_name,latitude,longitude,size,timezone,mayor
0,1,Berlin,52.52,13.405,891.3 km2 (344.1 sq mi),UTC+01:00,Kai Wegner (CDU)
1,2,Hamburg,53.55,10.0,755.22 km2 (291.59 sq mi),UTC+1,Peter Tschentscher (SPD)
2,3,Paris,48.8567,2.35222,105.4 km2 (40.7 sq mi),UTC+01:00,Anne Hidalgo[1] (PS)
3,4,Stockholm,59.3294,18.0686,188 km2 (73 sq mi),UTC+1,Karin Wanngård (S)
4,5,Tokyo,35.6897,139.692,"2,194 km2 (847 sq mi)",UTC+09:00,Yuriko Koike (indp.)
5,6,Rome,41.8933,12.4828,"1,285 km2 (496 sq mi)",UTC+01:00,Roberto Gualtieri (PD)
6,7,Vienna,48.2083,16.3725,414.78 km2 (160.15 sq mi),UTC+01:00,Michael Ludwig (SPÖ)
7,8,Innsbruck,47.2683,11.3933,104.91 km2 (40.51 sq mi),UTC+1,Johannes Anzengruber
8,9,Cape Town,-33.9253,18.4239,"2,461 km2 (950 sq mi)",UTC+2,Geordin Hill-Lewis (DA)
9,10,Aix-en-Provence,43.5264,5.44556,186.083 km2 (71.847 sq mi),UTC+01:00,Sophie Joissains[1]


In [124]:
countries_from_sql = pd.read_sql("country", con=connection_string)
countries_from_sql

Unnamed: 0,country_id,country_name,country_language
0,1,Germany,German
1,2,France,French
2,3,Sweden,Swedish
3,4,Japan,Japanese
4,5,Italy,Italian
5,6,Austria,German
6,7,South Africa,Afrikaans


## Merging for IDS

In [125]:
city_data

Unnamed: 0,City_Name,Latitude,Longitude,Population,Population_From,Country,Size,Timezone,Mayor
0,Berlin,52.52,13.405,3596999,(2022 census),Germany,891.3 km2 (344.1 sq mi),UTC+01:00,Kai Wegner (CDU)
1,Hamburg,53.55,10.0,1964021,(2023-12-31),Germany,755.22 km2 (291.59 sq mi),UTC+1,Peter Tschentscher (SPD)
2,Paris,48.856667,2.352222,2048472,(Jan 2025,France,105.4 km2 (40.7 sq mi),UTC+01:00,Anne Hidalgo[1] (PS)
3,Stockholm,59.329444,18.068611,984748,(2023),Sweden,188 km2 (73 sq mi),UTC+1,Karin Wanngård (S)
4,Tokyo,35.689722,139.692222,14254039,(May 2025),Japan,"2,194 km2 (847 sq mi)",UTC+09:00,Yuriko Koike (indp.)
5,Rome,41.893333,12.482778,2746984,(2025),Italy,"1,285 km2 (496 sq mi)",UTC+01:00,Roberto Gualtieri (PD)
6,Vienna,48.208333,16.3725,2028499,(2025),Austria,414.78 km2 (160.15 sq mi),UTC+01:00,Michael Ludwig (SPÖ)
7,Innsbruck,47.268333,11.393333,132493,(2018-01-01),Austria,104.91 km2 (40.51 sq mi),UTC+1,Johannes Anzengruber
8,Cape Town,-33.925278,18.423889,433688,2025,South Africa,"2,461 km2 (950 sq mi)",UTC+2,Geordin Hill-Lewis (DA)
9,Aix-en-Provence,43.526389,5.445556,147933,(2022),France,186.083 km2 (71.847 sq mi),UTC+01:00,Sophie Joissains[1]


In [126]:
# Here we join onto our major data the data collected through countries
city_data = city_data.merge(countries_from_sql, left_on="Country", right_on="country_name", how="left")

In [127]:
# Here we give the city Database the country id for the foreign key link
cities_from_sql = cities_from_sql.merge(city_data[["City_Name", "country_id"]], left_on="city_name", right_on="City_Name", how="inner")

In [128]:
cities_from_sql = cities_from_sql.drop(["City_Name"], axis=1)
cities_from_sql

Unnamed: 0,city_id,city_name,latitude,longitude,size,timezone,mayor,country_id
0,1,Berlin,52.52,13.405,891.3 km2 (344.1 sq mi),UTC+01:00,Kai Wegner (CDU),1
1,2,Hamburg,53.55,10.0,755.22 km2 (291.59 sq mi),UTC+1,Peter Tschentscher (SPD),1
2,3,Paris,48.8567,2.35222,105.4 km2 (40.7 sq mi),UTC+01:00,Anne Hidalgo[1] (PS),2
3,4,Stockholm,59.3294,18.0686,188 km2 (73 sq mi),UTC+1,Karin Wanngård (S),3
4,5,Tokyo,35.6897,139.692,"2,194 km2 (847 sq mi)",UTC+09:00,Yuriko Koike (indp.),4
5,6,Rome,41.8933,12.4828,"1,285 km2 (496 sq mi)",UTC+01:00,Roberto Gualtieri (PD),5
6,7,Vienna,48.2083,16.3725,414.78 km2 (160.15 sq mi),UTC+01:00,Michael Ludwig (SPÖ),6
7,8,Innsbruck,47.2683,11.3933,104.91 km2 (40.51 sq mi),UTC+1,Johannes Anzengruber,6
8,9,Cape Town,-33.9253,18.4239,"2,461 km2 (950 sq mi)",UTC+2,Geordin Hill-Lewis (DA),7
9,10,Aix-en-Provence,43.5264,5.44556,186.083 km2 (71.847 sq mi),UTC+01:00,Sophie Joissains[1],2


In [129]:
city_data = city_data.merge(cities_from_sql[["city_id", "city_name"]], left_on="City_Name", right_on="city_name", how="left")
city_data = city_data.drop(["city_name"], axis=1)
city_data

Unnamed: 0,City_Name,Latitude,Longitude,Population,Population_From,Country,Size,Timezone,Mayor,country_id,country_name,country_language,city_id
0,Berlin,52.52,13.405,3596999,(2022 census),Germany,891.3 km2 (344.1 sq mi),UTC+01:00,Kai Wegner (CDU),1,Germany,German,1
1,Hamburg,53.55,10.0,1964021,(2023-12-31),Germany,755.22 km2 (291.59 sq mi),UTC+1,Peter Tschentscher (SPD),1,Germany,German,2
2,Paris,48.856667,2.352222,2048472,(Jan 2025,France,105.4 km2 (40.7 sq mi),UTC+01:00,Anne Hidalgo[1] (PS),2,France,French,3
3,Stockholm,59.329444,18.068611,984748,(2023),Sweden,188 km2 (73 sq mi),UTC+1,Karin Wanngård (S),3,Sweden,Swedish,4
4,Tokyo,35.689722,139.692222,14254039,(May 2025),Japan,"2,194 km2 (847 sq mi)",UTC+09:00,Yuriko Koike (indp.),4,Japan,Japanese,5
5,Rome,41.893333,12.482778,2746984,(2025),Italy,"1,285 km2 (496 sq mi)",UTC+01:00,Roberto Gualtieri (PD),5,Italy,Italian,6
6,Vienna,48.208333,16.3725,2028499,(2025),Austria,414.78 km2 (160.15 sq mi),UTC+01:00,Michael Ludwig (SPÖ),6,Austria,German,7
7,Innsbruck,47.268333,11.393333,132493,(2018-01-01),Austria,104.91 km2 (40.51 sq mi),UTC+1,Johannes Anzengruber,6,Austria,German,8
8,Cape Town,-33.925278,18.423889,433688,2025,South Africa,"2,461 km2 (950 sq mi)",UTC+2,Geordin Hill-Lewis (DA),7,South Africa,Afrikaans,9
9,Aix-en-Provence,43.526389,5.445556,147933,(2022),France,186.083 km2 (71.847 sq mi),UTC+01:00,Sophie Joissains[1],2,France,French,10


# Finish the City Database

In [130]:
cities_from_sql.to_sql('city',
                  if_exists='append',
                  con=connection_string,
                  index=False)

10

# Creating the Population Database

In [131]:
population_df = city_data[["city_id", "Population", "Population_From"]]
population_df["Retrieval_Year"] = 2025
population_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  population_df["Retrieval_Year"] = 2025


Unnamed: 0,city_id,Population,Population_From,Retrieval_Year
0,1,3596999,(2022 census),2025
1,2,1964021,(2023-12-31),2025
2,3,2048472,(Jan 2025,2025
3,4,984748,(2023),2025
4,5,14254039,(May 2025),2025
5,6,2746984,(2025),2025
6,7,2028499,(2025),2025
7,8,132493,(2018-01-01),2025
8,9,433688,2025,2025
9,10,147933,(2022),2025


In [132]:
# Read it to the sql database
population_df.to_sql('population',
                  if_exists='append',
                  con=connection_string,
                  index=False)

10