# GANS Case study - web scraping and APIs

Gans is a startup developing an e-scooter-sharing system. It aspires to operate in the most populous cities all around the world. In each city, the company will have hundreds of e-scooters parked in the streets and allow users to rent them by the minute.

Gans has seen that its operational success depends on something more mundane: having its scooters parked where users need them.
The goal of this case study is to scrape the relevant information and store it in a SQL database, to help Gans gather the data it needs.

Step 1 - Create static databases with city and competitor information

In [385]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
import pymysql
import os

In [440]:
def static_dbs_setup():
    """
    Build the static tables (cities, city facts, geodata, airports, competitors)
    and append them to the SQL database.

    Steps:
    1. Builds the connection string using the connection_setup() function.
    2. Retrieves the city table and the city names using the city_collection() function.
    3. Scrapes city facts using the city_facts_scraping() function.
    4. Calls an API to retrieve geodata using the geo_data_api_call() function.
    5. Calls an API to retrieve airport data using the airports_api_call() function.
    6. Scrapes competitor data for Tier and Lime using the scrape_comp_cities() function.
    7. Replaces city names by city ids in the geodata, airport and competitor tables.
    8. Sends the transformed data to SQL using the send_* functions.
    9. Returns a success message.

    Returns:
    - str: A message indicating the successful upload of data.
    """
    connection_string = connection_setup()

    # cities
    # Unpack one call: calling city_collection() twice would download and parse
    # the Wikipedia page twice.
    city_df, city_name_lst = city_collection()
    city_facts_df = city_facts_scraping(city_name_lst)

    # geodata
    geo_data_df = geo_data_api_call(city_name_lst)

    # airports
    latitudes = geo_data_df.latitude.to_list()
    longitudes = geo_data_df.longitude.to_list()
    airports_df = airports_api_call(city_name_lst, latitudes, longitudes)

    # competitors
    comp = {"comp_name": ["Tier", "Rent-a-bike", "Lime"], "competitor_id": [1, 2, 3]}
    competitor_names = pd.DataFrame(data=comp)
    # Look the ids up from the table above so both stay in sync.
    competitor_ids = dict(zip(comp["comp_name"], comp["competitor_id"]))
    url_tier = "https://www.tier.app/en/where-to-find-us"
    url_lime = "https://www.li.me/de-de/locations"
    scraped = {
        "Tier": scrape_comp_cities("Tier", url_tier, city_name_lst),
        "Lime": scrape_comp_cities("Lime", url_lime, city_name_lst),
    }

    # Transformation of data

    geo_data_city_id_df = geo_data_df.merge(city_df, on="city_name")
    geo_data_city_id_df = geo_data_city_id_df.drop(columns=["city_name"])

    airports_city_id_df = airports_df.merge(city_df, left_on="city_sql", right_on="city_name")
    airports_city_id_df = airports_city_id_df.drop(columns=["city_sql", "city_name"])

    comp_frames = []
    for comp_name, frame in scraped.items():
        if frame is None:
            # A failed scrape must not abort the whole upload; treat it as
            # "no cities found" for that competitor.
            frame = pd.DataFrame(columns=["comp_city"])
        comp_frames.append(frame.assign(competitor_id=competitor_ids[comp_name]))
    comp_df = pd.concat(comp_frames)
    comp_df = comp_df.merge(city_df, how="inner", left_on="comp_city", right_on="city_name")
    comp_df = comp_df.drop(columns=["comp_city", "city_name"])

    # city_id is only needed for the merges above; the cities table is sent without it.
    city_df = city_df.drop(columns=["city_id"])

    # Sending Data to SQL
    send_cities(city_df, connection_string)
    send_city_facts(city_facts_df, connection_string)
    send_competitors(comp_df, competitor_names, connection_string)
    send_geodata(geo_data_city_id_df, connection_string)
    send_airports(airports_city_id_df, connection_string)

    return "Data successfully uploaded"

## Extraction

Use the top 5 cities in Germany as a starting point.

### Cities

In [392]:
# Wikipedia list of top 5 cities by population
def city_collection(top_n=5):
    """
    Retrieves the largest cities in Germany from a Wikipedia page.

    Each scraped entry is expected to look like "<number> <city name>"; the number
    becomes the city_id and the remainder the city_name.

    Args:
    - top_n (int): How many cities to keep from the top of the table. Defaults to 5.

    Returns:
    - city_df (pandas DataFrame): A DataFrame with the columns city_id and city_name.
    - names (list): The city names in the same order as city_df.

    Raises:
    - requests.exceptions.RequestException: If the page cannot be downloaded.
    - ValueError: If the city table is missing or an entry has no "<number> <name>" form.
    """
    url = "https://en.wikipedia.org/wiki/List_of_cities_in_Germany_by_population"
    cities_requ = requests.get(url, timeout=30)
    cities_requ.raise_for_status()
    city_soup = BeautifulSoup(cities_requ.content, "html.parser")

    table = city_soup.find("table", class_="sortable")
    if table is None:
        raise ValueError(f"No sortable table found on {url}")

    city_id = []
    names = []
    # Only the requested rows are parsed, so a malformed entry further down the
    # table cannot break the call.
    for cell in table.find_all(class_="fn org")[:top_n]:
        number, name = cell.get_text().split(" ", 1)
        city_id.append(int(number))
        names.append(name)

    city_df = pd.DataFrame({"city_id": city_id, "city_name": names})
    return city_df, names

### city_facts

In [394]:
def city_facts_scraping(cities):
    """
    Scrapes city facts such as country and population from Wikipedia pages for a given list of cities.

    The city_id is the 1-based position of the city in the given list.

    Args:
    - cities (list): A list of city names.

    Returns:
    - results_df (pandas DataFrame): A DataFrame containing the city facts including city_id, country, and population.

    Raises:
    - requests.exceptions.RequestException: If a page cannot be downloaded.
    - ValueError: If a page has no settlement infobox or no readable population figure.
    """
    results = {"city_id": [], "country": [], "population": []}
    # First run of digits (with thousands separators); anything after it, such
    # as a footnote marker, is ignored.
    number_pattern = re.compile(r"\d[\d,]*")

    for idx, city in enumerate(cities, start=1):
        city_url = f"https://en.wikipedia.org/wiki/{city}"
        response = requests.get(city_url, timeout=30)
        response.raise_for_status()
        city_soup = BeautifulSoup(response.content, "html.parser")

        # Both facts come from the same infobox, so select it once
        infobox = city_soup.select_one("table.infobox.ib-settlement.vcard")
        if infobox is None:
            raise ValueError(f"No settlement infobox found on {city_url}")

        # Extract country
        country = infobox.find(string="Country").find_next("td").get_text().strip()

        # Extract population: value cell following the first "Population" label
        population_text = infobox.find(string=re.compile(r"Population")).find_next("td").get_text()
        number = number_pattern.search(population_text)
        if number is None:
            raise ValueError(f"No population figure found on {city_url}")
        population = int(number.group().replace(",", ""))

        results["city_id"].append(idx)
        results["country"].append(country)
        results["population"].append(population)

    results_df = pd.DataFrame(results)

    return results_df

### Geodata 

In [396]:
def geo_data_api_call(city_lst):
    """
    Fetch latitude and longitude for a list of German cities using the OpenWeatherMap geocoding API.

    Args:
        city_lst (list): A list of city names.

    Returns:
        pandas DataFrame: One row per city with the columns 'city_name', 'latitude' and 'longitude'.

    Raises:
        requests.exceptions.RequestException: If a request fails or returns an HTTP error status.
        IndexError: If the API returns no match for a city.
    """
    # NOTE(review): an API key is committed in this file. The environment variable
    # takes precedence; the literal is kept only as a fallback so existing runs
    # keep working. It should be rotated and removed.
    API_key = os.environ.get("OPENWEATHER_API_KEY", "91d16ed59eecaf4ad6eb1e9d19482549")
    country_code = "DE"
    limit = 1
    url = "https://api.openweathermap.org/geo/1.0/direct"
    geo_data_dict = {"city_name": [], "latitude": [], "longitude": []}

    for city in city_lst:
        # Let requests build the query string so city names with spaces or
        # umlauts are encoded correctly.
        params = {"q": f"{city},{country_code}", "limit": limit, "appid": API_key}
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        geocode = response.json()
        if not geocode:
            # Every city needs coordinates for the later airport lookup, so fail
            # with a message naming the city instead of a bare index error.
            raise IndexError(f"No geocoding result for {city!r}")

        geo_data_dict["city_name"].append(city)
        geo_data_dict["latitude"].append(geocode[0].get("lat", 0))
        geo_data_dict["longitude"].append(geocode[0].get("lon", 0))

    geo_data_df = pd.DataFrame(geo_data_dict)
    return geo_data_df

### Airports

In [398]:
# API CALL 
def airports_api_call(city_names, latitudes, longitudes):
    """
    Look up airports near each city using the AeroDataBox location search (via RapidAPI).

    For every city the search covers a 50 km radius around the given coordinates,
    is limited to 3 results and only includes airports with flight information.

    Args:
    - city_names (list): The city names.
    - latitudes (list): The latitude of each city, in the same order as city_names.
    - longitudes (list): The longitude of each city, in the same order as city_names.

    Returns:
    - airports_df (pandas DataFrame): One row per airport with the columns
      city_sql, number_airports, iata and airport_name.
    """
    url = "https://aerodatabox.p.rapidapi.com/airports/search/location"
    airports = {"city_sql": [], "number_airports": [], "iata": [], "airport_name": []}

    headers = {
        # NOTE(review): an API key is committed in this file. The environment
        # variable takes precedence; the literal is kept only as a fallback so
        # existing runs keep working. It should be rotated and removed.
        "X-RapidAPI-Key": os.environ.get(
            "RAPIDAPI_KEY", "4b54815ac4mshe11ca88893efc1ep170ae5jsn49ac4b2d04a5"
        ),
        "X-RapidAPI-Host": "aerodatabox.p.rapidapi.com",
    }

    for city, latitude, longitude in zip(city_names, latitudes, longitudes):
        querystring = {
            "lat": str(latitude),
            "lon": str(longitude),
            "radiusKm": "50",
            "limit": "3",
            "withFlightInfoOnly": "true",
        }
        response = requests.get(url, headers=headers, params=querystring, timeout=30)
        if not response.ok:
            # Stay best-effort (the city simply gets no airports) but say so,
            # instead of silently producing an empty result.
            print(f"Airport lookup for {city} failed with HTTP status {response.status_code}")
            continue
        json_data = response.json()

        count = json_data.get("count", 0)
        # Slice rather than index by count so a mismatch between "count" and
        # the length of "items" cannot raise.
        for item in json_data.get("items", [])[:count]:
            airports["city_sql"].append(city)
            airports["number_airports"].append(count)
            airports["iata"].append(item.get("iata", None))
            airports["airport_name"].append(item.get("name", None))

    airports_df = pd.DataFrame(airports)
    return airports_df


### Competitors

In [400]:
def scrape_comp_cities(comp_name, url, city_name_lst):
    """
    Scrapes the cities a competitor operates in and keeps those that are in a given list of city names.
    Args:
        comp_name (str): The competitor whose page layout is parsed, either "Tier" or "Lime".
        url (str): The URL of the competitor's locations page.
        city_name_lst (list): A list of city names to filter the scraped cities.
    Returns:
        pd.DataFrame: A DataFrame with the single column "comp_city" containing the filtered
        city names. It is empty if the page could not be downloaded.
    Raises:
        ValueError: If comp_name is not a supported competitor.
    """
    if comp_name not in ("Tier", "Lime"):
        # Without this check the function would fail later on an unbound variable.
        raise ValueError(f"Unsupported competitor: {comp_name!r} (expected 'Tier' or 'Lime')")

    try:
        # Get the data from the competitor website
        response_comp = requests.get(url, timeout=30)
        response_comp.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Report the problem and hand back an empty frame with the expected
        # column, so callers can keep working with the result.
        print(f"An error occurred: {e}")
        return pd.DataFrame(columns=["comp_city"])

    comp_soup = BeautifulSoup(response_comp.content, "html.parser")

    if comp_name == "Tier":
        # Scrape city names from TIER
        city_elements = comp_soup.select('.KPICard__Card-sc-1wpi9bh-0 p')
        comp_cities = [element.text.strip() for element in city_elements]
    else:
        # Scrape city names from Lime
        lime_cities = comp_soup.find_all("div", class_= "inline-block mb-1 w-full text-xs text-gray-300 md:mb-4 md:text-lg md:border-gray-300 md:border-opacity-40")
        comp_cities = [item.get_text().strip().replace('\n', '') for item in lime_cities]

    # Filter cities that are also in the city_name_lst
    wanted_cities = set(city_name_lst)
    comp_cities_lst = [comp_city for comp_city in comp_cities if comp_city in wanted_cities]

    # Create a DataFrame with the filtered city names
    comp_cities_df = pd.DataFrame(comp_cities_lst, columns=["comp_city"])

    return comp_cities_df

## Loading to database

In [None]:
def connection_setup():
    """
    Sets up the connection string for connecting to a MySQL database.

    The password is read from the environment variable "Password" and URL-escaped,
    so characters such as "@", ":" or "/" cannot break the connection string.

    Returns:
    - connection_string (str): The connection string for connecting to the MySQL database.

    Raises:
    - RuntimeError: If the environment variable "Password" is not set.
    """
    from urllib.parse import quote_plus

    schema = "gans_locations"
    host = "host"
    user = "root"
    port = 3306

    password = os.environ.get("Password")
    if password is None:
        # Otherwise the literal text "None" would be used as the password.
        raise RuntimeError('Environment variable "Password" is not set')

    connection_string = f'mysql+pymysql://{user}:{quote_plus(password)}@{host}:{port}/{schema}'
    return connection_string

In [402]:
def send_cities(city_df,connection_string):
    """
    Appends the rows of the city DataFrame to the 'cities' table.

    Rows already in the table are kept (if_exists='append') and the DataFrame
    index is not written.

    Args:
    - city_df (pandas DataFrame): The DataFrame containing the city names.
    - connection_string (str): The connection string for connecting to the database.

    Returns:
    - None
    """
    write_options = {"if_exists": "append", "index": False}
    city_df.to_sql("cities", con=connection_string, **write_options)

In [403]:
def send_city_facts(city_facts_df,connection_string):
    """
    Appends the rows of the city facts DataFrame to the 'city_facts' table.

    Rows already in the table are kept (if_exists='append') and the DataFrame
    index is not written.

    Args:
    - city_facts_df (pandas DataFrame): The DataFrame containing the city facts.
    - connection_string (str): The connection string for connecting to the database.

    Returns:
    - None
    """
    target_table = "city_facts"
    city_facts_df.to_sql(target_table, con=connection_string, index=False, if_exists="append")

In [404]:
def send_competitors(comp_df,competitor_names,connection_string):
    """
    Appends the competitor data to two database tables.

    competitor_names is written to the 'competitors' table first, then comp_df
    to the 'comp_cities' table. In both cases rows already in the table are kept
    (if_exists='append') and the DataFrame index is not written.

    Args:
    - comp_df (pandas DataFrame): The cities per competitor.
    - competitor_names (pandas DataFrame): The competitor names with their ids.
    - connection_string (str): The connection string for connecting to the database.

    Returns:
    - None
    """
    uploads = (
        ("competitors", competitor_names),
        ("comp_cities", comp_df),
    )
    for table_name, frame in uploads:
        frame.to_sql(table_name, con=connection_string, if_exists="append", index=False)

In [405]:
def send_geodata(geo_data,connection_string):
    """
    Appends the rows of the geodata DataFrame to the 'geo_data' table.

    Rows already in the table are kept (if_exists='append') and the DataFrame
    index is not written.

    Args:
    - geo_data (pandas DataFrame): The DataFrame containing the latitudes and longitudes.
    - connection_string (str): The connection string for connecting to the database.

    Returns:
    - None
    """
    append_without_index = dict(if_exists="append", index=False)
    geo_data.to_sql("geo_data", con=connection_string, **append_without_index)

In [406]:
def send_airports(airports,connection_string):
    """
    Appends the rows of the airports DataFrame to the 'airports' table.

    Rows already in the table are kept (if_exists='append') and the DataFrame
    index is not written.

    Args:
    - airports (pandas DataFrame): The DataFrame containing the airport data per city.
    - connection_string (str): The connection string for connecting to the database.

    Returns:
    - None
    """
    destination = "airports"
    airports.to_sql(destination, index=False, con=connection_string, if_exists="append")

## Final

In [444]:
# Run the whole static-data pipeline once; the returned status message is shown as the cell output.
static_dbs_setup()

'Data successfully uploaded'

In [446]:
# Read the uploaded airports table back as a quick check. The connection string
# is built here because none is defined at notebook level.
pd.read_sql("airports", con=connection_setup())

Unnamed: 0,number_airports,iata,airport_name,city_id
0,1,BER,Berlin Brandenburg,1
1,1,HAM,Hamburg,2
2,1,MUC,Munich,3
3,1,CGN,Cologne Bonn,4
4,1,FRA,Frankfurt-am-Main,5
