## 1. Extract

### Scrape contents from the required URL
Fields collected: city, country, population, coordinates (stored as `coordination`)

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

In [2]:
# Wikipedia page listing European cities by population within city limits.
url_wiki = "https://en.wikipedia.org/wiki/List_of_European_cities_by_population_within_city_limits"
# Request English-language content.
headers = {'Accept-Language': 'en-US,en;q=0.8'}
# Without a timeout, requests waits indefinitely if the server stalls.
response = requests.get(url_wiki, headers=headers, timeout=30)

# Fail loudly on HTTP 4xx/5xx instead of parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

In [3]:
# Column-oriented accumulator: one (initially empty) list per scraped field.
city_dict = {field: [] for field in ("city", "country", "population", "coordination")}

In [4]:
table_rows = "table.wikitable tr"
# A data row needs at least 8 <td> cells: the coordinates are read from the
# 8th one (index 7).
min_cells = 8
for i, row in enumerate(soup.select(table_rows)):
    # The first matched row is the table header.
    if (i == 0):
        continue

    cells = row.select("td")
    # Rows without enough <td> cells (e.g. further header rows) cannot be
    # indexed up to the coordinates cell; skip them instead of raising an
    # IndexError.
    if len(cells) < min_cells:
        continue

    # Read every field before appending anything, so that a row that fails
    # to parse cannot leave the four lists with different lengths.
    # city: first link in the 2nd cell
    city_name = cells[1].select("a")[0].get_text()
    # country: first link in the 3rd cell
    country_name = cells[2].select("a")[0].get_text()
    # population: first span in the 4th cell
    population_text = cells[3].select("span")[0].get_text()
    # coordinates: decimal-degree span in the 8th cell
    coordination_text = cells[7].select("span.geo-dec")[0].get_text()

    city_dict["city"].append(city_name)
    city_dict["country"].append(country_name)
    city_dict["population"].append(population_text)
    city_dict["coordination"].append(coordination_text)


In [5]:
# All four columns must have collected the same number of entries.
len({len(city_dict[field]) for field in ("city", "country", "population", "coordination")}) == 1

True

In [6]:
# Build the raw frame: one column per scraped field.
city_df = pd.DataFrame(data=city_dict)

## 2.Transform

In [7]:
# Work on an independent copy so the scraped frame stays untouched.
city_c_df = city_df.copy(deep=True)

###  Rework the Coordination column

In [8]:
def _signed_degrees(text, negative_suffix):
    """Turn one ``<number>°<hemisphere>`` token into a signed float."""
    text = text.replace("°N", "").replace("°E", "")
    if negative_suffix in text:
        text = "-" + text.replace(negative_suffix, "")
    return float(text)


def ConvertCoord(row):
    """Parse ``row.coordination`` into a numeric ``(lat, long)`` tuple.

    The text is expected to hold two whitespace-separated tokens such as
    ``"41.01361°N 28.95500°E"``. Southern latitudes (``°S``) and western
    longitudes (``°W``) are returned as negative numbers.

    Args:
        row: object exposing a ``coordination`` string attribute, for example
            a row handed over by ``DataFrame.apply(..., axis=1)``.

    Returns:
        tuple: ``(lat, long)`` as floats. If a token is not numeric, an error
        message is printed and ``(nan, nan)`` is returned, so the resulting
        columns stay numeric and failures can be found with ``isna()``.

    Raises:
        ValueError: if the text does not split into exactly two tokens.
    """
    # split() without an argument tolerates repeated or non-breaking spaces,
    # which split(" ") does not.
    lat, long = row.coordination.split()
    try:
        return (_signed_degrees(lat, "°S"), _signed_degrees(long, "°W"))
    except ValueError:
        # Only conversion failures are handled here; a bare except would also
        # hide unrelated errors such as KeyboardInterrupt.
        print("ERROR, COORDINATES are not Convertable")
        return (float("nan"), float("nan"))

In [9]:
# Parse the coordinates once and reuse the result for both columns; applying
# ConvertCoord separately for "lat" and "lon" would process every row twice.
coords = city_c_df.apply(ConvertCoord,axis=1,result_type="expand")
city_c_df["lat"] = coords[0]
city_c_df["lon"] = coords[1]

In [10]:
# The raw text column is no longer needed once lat/lon exist.
city_c_df = city_c_df.drop(columns=["coordination"])

In [11]:
# Display the frame after the coordinate rework (city, country, population, lat, lon).
city_c_df

Unnamed: 0,city,country,population,lat,lon
0,Istanbul,Turkey,15840900,41.013611,28.955
1,Moscow,Russia,12632409,55.75,37.616667
2,London,United Kingdom,8799800,51.507222,-0.1275
3,Saint Petersburg,Russia,5376672,59.95,30.3
4,Berlin,Germany,3677472,52.516667,13.383333
5,Madrid,Spain,3305408,40.383333,-3.716667
6,Kyiv,Ukraine,2962180,50.45,30.523333
7,Rome,Italy,2749031,41.9,12.5
8,Baku,Azerbaijan,2303100,40.4,49.9
9,Bucharest,Romania,2161347,44.4325,26.103889


### Convert Population to numeric

In [13]:
# Remove the thousands separators so the values can be parsed as numbers.
city_c_df["population"] = city_c_df["population"].str.replace(",", "")

In [14]:
# Convert the cleaned population strings to a numeric dtype.
city_c_df["population"] = pd.to_numeric(city_c_df["population"])

In [15]:
# Print column dtypes and non-null counts to check the conversions above.
city_c_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   city        36 non-null     object 
 1   country     36 non-null     object 
 2   population  36 non-null     int64  
 3   lat         36 non-null     float64
 4   lon         36 non-null     float64
dtypes: float64(2), int64(1), object(2)
memory usage: 1.5+ KB


## 3.Load

### Connect to the local MySQL server

In [13]:
import os
import sys

import sqlalchemy as db

# Make the sibling "keys" directory importable so that `import key` resolves.
# os.path.join avoids the invalid "\k" escape of a '..\keys' literal and
# also works on non-Windows systems.
sys.path.append(os.path.join(os.pardir, "keys"))
import key

schema="gans_db" 
host="localhost"       
user="root"
password=key.SQL_PASSWORD
port=3306
# SQLAlchemy connection URL for the local MySQL server (pymysql driver).
con = f'mysql+pymysql://{user}:{password}@{host}:{port}/{schema}'

#### Merge with existing db (TBD)

In [14]:
# Load the current contents of the `city` table into a DataFrame.
city_db_df = pd.read_sql_table(table_name="city", con=con)

In [15]:
# Display the rows read back from the `city` table.
city_db_df

Unnamed: 0,city_id,city,country,population,lat,lon
0,1,Istanbul,Turkey,15840900,41.0136,28.955
1,2,Moscow,Russia,12632409,55.75,37.6167
2,3,London,United Kingdom,8799800,51.5072,-0.1275
3,4,Saint Petersburg,Russia,5376672,59.95,30.3
4,5,Berlin,Germany,3677472,52.5167,13.3833
5,6,Madrid,Spain,3305408,40.3833,-3.71667
6,7,Kyiv,Ukraine,2962180,50.45,30.5233
7,8,Rome,Italy,2761632,41.9,12.5
8,9,Baku,Azerbaijan,2303100,40.4,49.9
9,10,Bucharest,Romania,2161347,44.4325,26.1039


In [25]:
# assume population for Moscow is now 13,000,000 .. how to add this to mysql
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

# Connect to the database.
# Use the pymysql driver explicitly, like the other connection URLs in this
# notebook; a bare "mysql://" URL selects SQLAlchemy's default MySQL driver,
# which is a different package.
engine = create_engine(f'mysql+pymysql://root:{key.SQL_PASSWORD}@localhost/{schema}')
# sessionmaker builds a Session factory bound to the engine; calling it opens
# the session used by the following cells.
Session = sessionmaker(bind=engine)
session = Session()

In [26]:
# read from the MySQL via queries
from sqlalchemy import text

# Raw SQL must be wrapped in text(): passing a plain string to
# Session.execute() is deprecated in SQLAlchemy 1.4 and rejected in 2.0.
session.execute(text("select population from city where(city_id = 1)")).fetchall()

# result[0][0]

[(15840900,)]

In [None]:
from sqlalchemy import text

# Raw SQL must be wrapped in text(): passing a plain string to
# Session.execute() is deprecated in SQLAlchemy 1.4 and rejected in 2.0.
session.execute(text("select city,population from city")).all()

In [27]:
from sqlalchemy import Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base

# Define a base model for the city table
# Declarative base class that the ORM models below inherit from.
Base = declarative_base()
class City(Base):
    """ORM mapping for the ``city`` table.

    Only the ``city_id``, ``city`` and ``population`` columns are mapped here.
    """
    __tablename__ = 'city'
    # Primary key of the table.
    city_id = Column(Integer, primary_key=True)
    city = Column(String)
    population = Column(Integer)

In [34]:
# Example ORM update, currently disabled; while it stays commented out this
# cell issues no changes before the commit below.
# NOTE(review): the scenario described earlier is about Moscow, but in the
# table output shown above city_id 1 is Istanbul and Moscow is city_id 2.
# Confirm the filter before enabling this line.
# session.query(City).filter(City.city_id == 1).update({'population': 13000000})
session.commit()

#### Load Final df

In [23]:
# Append the transformed rows to the `city` table (the DataFrame index is not written).
city_c_df.to_sql(name="city", con=con, if_exists="append", index=False)

36

### Connect to MySQL on AWS

In [12]:
import os
import sys

import sqlalchemy as db

# Make the sibling "keys" directory importable so that `import key` resolves.
# os.path.join avoids the invalid "\k" escape of a '..\keys' literal and
# also works on non-Windows systems.
sys.path.append(os.path.join(os.pardir, "keys"))
import key

schema="gans_db"   # name of the database you want to use here
host="wbs-gans-db.c87binzvwbjx.eu-central-1.rds.amazonaws.com"        # AWS RDS endpoint (not a local server)
user="admin"
password=key.SQL_PASSWORD # read from the key module instead of being hard-coded
port=3306
# SQLAlchemy connection URL for the AWS-hosted MySQL instance (pymysql driver).
con = f'mysql+pymysql://{user}:{password}@{host}:{port}/{schema}'

#### Load Final df

In [13]:
# Append the transformed rows to the `city` table (the DataFrame index is not written).
city_c_df.to_sql(name="city", con=con, if_exists="append", index=False)

36