## <center>SerpApi Scraping Notebook</center>

### Necessary Installs

In [1]:
# for using the api itself
#!pip install serpapi

In [2]:
# for using the module to webscrap from api
#!pip install google-search-results

In [4]:
# to be able to get your location gps on a machine
#!pip install geocoder

### Import Statements

In [6]:
# use api 
from serpapi import GoogleSearch

# get location on machine
import geocoder

# write to and manipulate excel files
from openpyxl import load_workbook

# standard imports
import pandas as pd
import numpy as np

### <center>Google Maps Scrap</center>

In [7]:
# get current gps location
# need for start point using google maps / directions
def get_my_loc():
    '''Takes no arguments, and outputs the device's current location in lat
    and long.'''
    
    # me is the reference to your machine based on IP address
    myloc = geocoder.ip('me')
    
    # tuple of location
    return (myloc.latlng[0],myloc.latlng[1])

In [8]:
# scrapes google maps to find the three closest airports
# documentation: https://serpapi.com/google-maps-api
# looks for international airports but search will provide the
## "larger" airports and you can filter by airport tags later
def map_scrap():
    '''Takes no arguments and outputs the closest major airports.'''
    
    # unique key for David Allen, please use your own key
    # limited to 100 scraps per month
    my_key = "7668a9aae9862cea0185da8e75acee37de62aa9e31acf0b7415c341c97aba1f1"
    
    # use function to get current gps loc
    lat,long = get_my_loc()
    
    # api search parameters, all inputs required to be str
    params = {
        "engine": "google_maps",                     # scraps google maps
        "q": "International airport",                # looks for international aiports
        "ll": "@"+str(lat)+","+str(long)+","+"12z",  # "@lat,long,zoom level
        "hl": "en",                                  # language of results
        "type": "search",                            # look for close matches 
        "api_key": my_key}
    
    # get search results in JSON form
    search = GoogleSearch(params)
    results = search.get_dict()
    
    return results

In [9]:
# takes results from google maps search and returns a human
## readable dictionary of airport names to location
# also cuts down JSON dictionary so only get three 
def near_port_locs(res_obj):
    '''Takes a results object from a google maps scrap and outputs a  dict of the 
    three closest and major airports by name and lat,long.'''
    
    # initalize empty dict
    port_loc_dict = dict()
    
    for i in range(0,len(res_obj['local_results'])):
        
        # airport types include: airport, regional airport and international airport
        # only want ones with commerical flights, so filter out airport
        if res_obj['local_results'][i]['type'] != 'Airport':
            # format results to be strings that are easy to read
            lat = float(res_obj['local_results'][i]['gps_coordinates']['latitude'])
            lat = "{:.4f}".format(lat)
            long = float(res_obj['local_results'][i]['gps_coordinates']['longitude'])
            long = "{:.4f}".format(long)
            port_loc_dict[res_obj['local_results'][i]['title']] = [lat,long]
        
        # stop once we get the closest major airports
        # maps scrap is in order of distance, so we don't need to sort for that
        # just filter out non-major ones in order presented
        if len(port_loc_dict) == 3:
            return port_loc_dict

In [10]:
# scrapes google directions to get distances and travel time to airports
# documentation: https://serpapi.com/google-maps-directions-api
def direction_scrap(port_loc):
    '''Takes the lat and long location of an airport and outputs the
    google maps direction results object related to the airport from
    the device's current location.'''
    
    # unique key for David Allen, please use your own key
    # limited to 100 scraps per month
    my_key = "7668a9aae9862cea0185da8e75acee37de62aa9e31acf0b7415c341c97aba1f1"
    
    # use function to get current gps loc
    lat,long = get_my_loc()
    
    # api search parameters, all inputs required to be str
    params = {
        "engine": "google_maps_directions",         # google directions api
        "hl": "en",                                 # results in english
        "travel_mode": "0",                         # means "by car"
        "start_coords": str(lat)+","+str(long),     # your location in str form
        "end_coords": port_loc[0]+","+port_loc[1],  # location from maps of airport
        "distance_unit": "1",                       # use miles as measurement of dist
        "api_key": my_key}
    
    # get search results in JSON form
    search = GoogleSearch(params)
    results = search.get_dict()

    return results

In [11]:
# intermediate function to help convert string time of the form
## "1 hr 20 min" to "80".
# gets direction scrap times in same format as flight scrap times
def conv_to_mins(time_str):
    ''''Takes a string of hours and minutes and outputs the number of 
    total minutes.'''
    
    # how to split string
    parts = time_str.split(' ')
    
    # initialize counter
    total_min = 0
    
    # iterate through parts of string and convert each part
    for i in range(0, len(parts), 2):
        number = int(parts[i])
        if 'hr' in parts[i+1]:
            total_min += number * 60
        elif 'min' in parts[i+1]:
            total_min += number
    
    return str(total_min)

In [12]:
# thats right! you thought it was gone, but its not
# calculates great circle distance between a set of lat and long coords
def great_cdist(lat1,lon1,lat2,lon2):
    """Calculate the great circle distance in kilometers between two points 
    on the (specified in decimal degrees)."""
    # units doesn't matter because we don't keep the distance
    # only used to find closest match in a list of airports
    
    # convert decimal degrees to radians 
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # haversine formula 
    dlat = lat2 - lat1 
    dlon = lon2 - lon1 
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a)) 
    
    # radius of earth in kilometers
    r = 6371 
    
    # return distance
    return c * r

In [13]:
# used to get the matching three letter iata code for each airport
# needed to translate a maps scrap -> flight scrap
# originally tried string matching but multiple airports in US with
## similiar spelling / names, but loc is unique enough
def find_close_iata(lat,long):
    """Find the closest point to a given latitude and longitude in the dataframe
    and return the value in the corresponding column labelled 'iata_code'."""
    
    # read in csv file containing all US iata codes
    # found at: https://datahub.io/core/airport-codes#pandas
    ## cols = name, state, iata_code, lat, long
    df = pd.read_csv("US_iata_codes.csv")
    lat = float(lat)
    long = float(long)
    
    # calculate great circle dist for each coord pair
    distances = df.apply(lambda row: great_cdist(lat, long, row['lat'], row['long']), axis=1)
    
    # find index of min dist 
    min_distance_index = distances.idxmin()
    
    # return the 'iata_code' for the location with the minimum distance
    return df.loc[min_distance_index, 'iata_code']

In [14]:
# adds the calculated distance and iata code to the dict made
## with near_port_loc function
def get_port_dist(port_dict):
    '''Takes a dictionary of airport names to lat long coords and outputs
    a modified dictionary with iata code and distance added to the values.'''

    # iterate over key and values of dict
    for name,coord in port_dict.items():
        temp_results = direction_scrap(coord)
        dist = temp_results['directions'][0]['formatted_distance']
        # has consistent str char in front that we don't need
        # slices last 6 elements from it
        dist = dist[:-6]
        dur = temp_results['directions'][0]['formatted_duration']
        dur = conv_to_mins(dur)
        iata = find_close_iata(coord[0],coord[1])
        port_dict[name] = [iata,dist,dur,coord[0],coord[1]]
    
    return port_dict

In [15]:
# build a df from the dict created in get_port_dst
def build_dist_frame():
    '''Takes no arguments and builds a dataframe of the three closests
    major airports with the name as the index and the iata code, travel
    distance, travel time and lat long coords as columns.'''
    
    # execute map scrap -> build temp dict -> build final dict
    ## -> turn into df
    res_obj = map_scrap()
    port_dict = near_port_locs(res_obj)
    modified_dict = get_port_dist(port_dict)
    
    port_df = pd.DataFrame.from_dict(modified_dict, orient = 'index', 
                columns = ['iata','travel_miles','travel_time','lat','long'])
    
    return port_df

### <center>Google Flights Scrap</center>

In [17]:
# function 1 to get a human readable airport names
# scrap returns the arrival airport as a dictionary and
## it has an id key that is what we want
def get_IATA_id(row):
    '''Checks if row contains a dict and the id key, and returns 
    the values paired with the id key.'''
    
    # is it a dict, does it have id key
    if isinstance(row, dict) and 'id' in row:
        return row['id'] # return value of k:v pair
    
    return row

In [18]:
# apply the get_IATA_id function to every row in the df
def chg_arr_dep(df):
    '''Takes a dataframe and returns the dataframe with human readable 
    arrival and departure airport names.'''
    
    # does not need to return, as it changes df in place
    for row in df:
        df[row] = df[row].apply(get_IATA_id)

In [19]:
# define function to create a multi-level index for flight dfs
def make_multi(len_df,start_num):
    '''Takes a the length of a df and the trip start num, builds a multi-level 
    index for it. The outer index is the Trip and the inner is the Legs.'''

    # create Trip index based on a starting point and a df len
    # needs to be variable based because we concat two dfs into
    ## one later and it needs unique trip numbers
    # constructs an array
    outer = ["Trip " + str(start_num)]*len_df
    
    # each trip has a certain number of legs based on its length
    # constructs an array
    inner = ["Leg " + str(i) for i in range(1,len_df+1)]

    # name indexes and use arrays constructed in function
    multi_index = pd.MultiIndex.from_arrays([outer,inner], names = ['Trip', 'Leg'])

    return multi_index

In [20]:
# create a function to concat and clean dfs built from google flights scrap
def clean_concat(df1,df2):
    '''Takes two dataframes consisting of google flights api data, concats them 
    and cleans them before return a single dataframe.'''

    # drops values not needed in optimization model for two input dfs
    # logo is a png image
    # legroom is a measurement in inches
    # extensions are trip "upgrade" packages
    # airplane is the make and model of the aircraft
    # travel_class is the type of seat; first, business etc
    df1.drop(['airline_logo','legroom','extensions','airplane','travel_class'],
            axis = 1, inplace = True)
    
    # need two inputs because google flights scrap puts the flight info in two
    ## spots of the JSON dictionary, and the flight have variable structures
    df2.drop(['airline_logo','legroom','extensions','airplane','travel_class'],
            axis = 1, inplace = True)

    # drop overnight col from both dfs, col of bool values if flight time
    ## overlaps with midnight 
    if 'overnight' in df1.columns:
        df1.drop('overnight', axis = 1, inplace = True)

    elif 'overnight' in df2.columns:
        df2.drop('overnight', axis = 1, inplace = True)

    # concat both frames vertically and keep the custom indexes we built
    clean_df = pd.concat([df1,df2], axis = 0, ignore_index = False)

    return clean_df

In [85]:
# function to scrap google flights and build dfs from it
# function to build dataframe from "best_flights" and "other_flights"
def results_to_frame(res_obj):
    '''Takes a results object from the google flights api and builds a df
    for each best flight, each other flight and combines them. The resulting 
    df is cleaned and concatenated.'''

    # google flights puts in two spots of the JSON dict (res_obj)
    ## best_flights and other_flights are they keys
    
    # abbreviate key names in res_obj
    bfs = 'best_flights'
    ofs = 'other_flights'
    fs = 'flights'
    pr = 'price'
    lo = 'layovers'
    tdr = 'total_duration'
    dr = 'duration'

    # build df from "best flights"
    # len gives number of flights in dict
    num_fly1 = len(res_obj[bfs])
    for i in range(0,num_fly1):
        
        # if first entry in dict
        if i == 0:
            # grab first element
            best_df = pd.DataFrame(res_obj[bfs][0][fs])
            # name cols
            best_df['Trip_Price'] = res_obj[bfs][0][pr]
            best_df['Trip_Duration'] = res_obj[bfs][0][tdr]
            # fix name of airports
            chg_arr_dep(best_df)
            
            # initialize empty dict for airport name: layover duration
            lay_dict = {}
            # len of this tells us the number of layovers, aka legs of the trip
            for j in range(0,len(res_obj[bfs][0][lo])):
                # associate the layover with the departure airport, that way you cant
                ## have a layover before you take your first flight
                lay_dict[res_obj[bfs][0][lo][j]['id']] = res_obj[bfs][0][lo][j][dr]
            
            # map the dict by airport name to the df
            best_df['Layover_duration'] = best_df['departure_airport'].map(lay_dict)
            # turn into multi-index df
            best_df.index = make_multi(len(best_df),i+1)   
        
        # if not first entry
        else:
            # built the same as above with minor differences, only commenting
            ## where it is different
            temp_df = pd.DataFrame(res_obj[bfs][i][fs])
            temp_df['Trip_Price'] = res_obj[bfs][i][pr]
            temp_df['Trip_Duration'] = res_obj[bfs][i][tdr]
            chg_arr_dep(temp_df)
            
            lay_dict = {}
            for j in range(0,len(res_obj[bfs][i][lo])):
                lay_dict[res_obj[bfs][i][lo][j]['id']] = res_obj[bfs][i][lo][j][dr]
                
            temp_df['Layover_duration'] = temp_df['departure_airport'].map(lay_dict)
            temp_df.index = make_multi(len(temp_df),i+1)
            # concat each temp df to the original one, keep the multi-index
            best_df = pd.concat([best_df,temp_df], axis = 0, ignore_index = False)

    # build df from "other flights"
    # trip count is the "start_num" in the multi-index building function
    # keeps track of where best_df left off
    trip_count = len(res_obj[bfs])
    
    # very similiar to above, only commenting where different
    num_fly2 = len(res_obj[ofs])
    for i in range(0,num_fly2):
        if i == 0:
            othr_df = pd.DataFrame(res_obj[ofs][0][fs])
            othr_df['Trip_Price'] = res_obj[ofs][0][pr]
            othr_df['Trip_Duration'] = res_obj[ofs][0][tdr]
            chg_arr_dep(othr_df)
            
            lay_dict = {}
            for j in range(0,len(res_obj[ofs][0][lo])):
                lay_dict[res_obj[ofs][0][lo][j]['id']] = res_obj[ofs][0][lo][j][dr]
    
            othr_df['Layover_duration'] = othr_df['departure_airport'].map(lay_dict)
            othr_df.index = make_multi(len(othr_df),trip_count+1)   
        
        else:
            temp_df = pd.DataFrame(res_obj[ofs][i][fs])
            # some flights don't have prices after a certain point in the res_obj
            # tested and its not by a constant index
            # believe that google flights doesn't scrape for flights that are 
            ## far down its list (like the 100th flight down when manually searching)
            if 'price' in list(res_obj[ofs][i].keys()):
                temp_df['Trip_Price'] = res_obj[ofs][i][pr]
                temp_df['Trip_Duration'] = res_obj[ofs][i][tdr]
                chg_arr_dep(temp_df)
            
                lay_dict = {}
                for j in range(0,len(res_obj[ofs][i][lo])):
                    lay_dict[res_obj[ofs][i][lo][j]['id']] = res_obj[ofs][i][lo][j][dr]
                
                temp_df['Layover_duration'] = temp_df['departure_airport'].map(lay_dict)
                temp_df.index = make_multi(len(temp_df),trip_count+i+1)   
                othr_df = pd.concat([othr_df,temp_df], axis = 0, ignore_index = False)
            
            # if there isnt a price associated, we don't care and increase the counter
            else:
                i += 1

    # use cleaning function with output            
    return clean_concat(best_df,othr_df)

In [22]:
# function that actually does the flight scraping
# alot of smaller functions are necessary to make this useful
def flight_scrap(dep_port,arr_port,dep_date,pass_num,bag_num):
    '''Takes a series of input parameters and produces a dataframe of flight options.
    Inputs (all of str type)
    dep_port: departure airport IATA code ("MRY")
    arr_port: of arrival airport IATA code ("BDL")
    dep_date: date of departure in YYYY-MM-DD format ("2024-03-27")
    pass_num: number of people flying, assumed all adults ("3")
    bag_num: number of checked bags ("2")'''
    
    # specific to a user (Dave Allen) in this case
    # please use your own when scraping as there is a 100 scrap monthly limit
    my_key = "7668a9aae9862cea0185da8e75acee37de62aa9e31acf0b7415c341c97aba1f1"
    
    # specified by api, see documentation for more detail on what each means
    # source: https://serpapi.com/google-flights-api
    # commented out params elements are optional
    params = {
        "engine": "google_flights",
        "departure_id": dep_port,
        "arrival_id": arr_port,
        "hl": "en",
        "currency": "USD",
        "type": "2",
        "outbound_date": dep_date,
       #"return_date": None,
        "travel_class": "1",
        "show_hidden": "true",
        "adults": pass_num,
       #"children": "0",
       #"stops": "0",
       #"exclude_airlines": "UA",
        "bags": bag_num,
       #"outbound_times": "4,8"
       #"return_times": "2,3"
        "api_key": my_key}
    
    # uses func and method to make JSON dict
    search = GoogleSearch(params)
    results = search.get_dict()
    
    # feeds into function to build dict to df
    # which cleans and concats, which uses all other funcs
    return results_to_frame(results)

### <center> Integrating Both Scraps</center>

In [107]:
# short helper function to save space in next function
# needed because some elements of the flight dfs will come
## back as lists, when they are more human readable as strs
def convert_lists_to_strings(x):
    return ', '.join(map(str, x)) if isinstance(x, list) else x

In [109]:
# builds an excel file to feed into the optimization program
# creates file in the cwd
# important: counts as 7x scraps for each call!!
# 1x maps, 3x directions, 3x flights
# can quickly hit monthly 100x limit (30 day period, not calendar)
def flight_options(arr_port,dep_date,pass_num,bag_num):
    '''Takes the same input parameters as the flight_scrap function, and
    executes that function before creating / exporting the resulting dfs
    to an excel file.'''
    
    # call google maps overarching function
    port_df = build_dist_frame()
    
    # get list of iata codes for easy slicing below
    port_list = list(port_df['iata'])
    
    # iteratively call flight_scrap for each distinct arrival airport generated
    ## by closest airports from map / direction scrap
    # build as seperate dfs because they will go on diff excel sheets
    flight_df1 = flight_scrap(port_list[0],arr_port,dep_date,pass_num,bag_num)
    flight_df2 = flight_scrap(port_list[1],arr_port,dep_date,pass_num,bag_num)
    flight_df3 = flight_scrap(port_list[2],arr_port,dep_date,pass_num,bag_num)
    
    # name output file
    file_name = "air_LP_data.xlsx"
    
    # add airport dist df as first sheet
    port_df.to_excel(file_name, index = True)

    # map function to flight dfs
    flight_df1 = flight_df1.applymap(convert_lists_to_strings)
    flight_df2 = flight_df2.applymap(convert_lists_to_strings)
    flight_df3 = flight_df3.applymap(convert_lists_to_strings)

    # use openpyxl to create workbook
    with pd.ExcelWriter(file_name, engine = 'openpyxl', mode = 'a', if_sheet_exists = 'replace') as writer:
        # rename default sheet
        writer.book.active.title = "port_info"
        # write flight dfs to new sheets, name is the iata code
        flight_df1.to_excel(writer, sheet_name = port_list[0], index = True)
        flight_df2.to_excel(writer, sheet_name = port_list[1], index = True)
        flight_df3.to_excel(writer, sheet_name = port_list[2], index = True)
        
    print("Excel File Built!")

In [111]:
# testing the code
arr_port1 = "BDL"
dep_date1 = "2024-03-03"
pass_num1 = "1"
bag_num1 = "1"

# works! can see the new excel file in the cwd
flight_options(arr_port1,dep_date1,pass_num1,bag_num1)

  flight_df1 = flight_df1.applymap(convert_lists_to_strings)
  flight_df2 = flight_df2.applymap(convert_lists_to_strings)
  flight_df3 = flight_df3.applymap(convert_lists_to_strings)


Excel File Built!
