In [2]:
# Import necessary libraries
from time import sleep  # Allows the program to pause for a specified amount of time
import pandas as pd  # Provides data manipulation and analysis tools
from selenium import webdriver  # Allows automated web browsing
from bs4 import BeautifulSoup  # Parses HTML and XML documents
from selenium.webdriver.common.by import By  # Provides a way to locate elements on a webpage
from selenium.webdriver.support.ui import WebDriverWait  # Allows the program to wait for an element to load before continuing
from selenium.webdriver.support import expected_conditions as EC  # Allows the program to specify expected conditions for an element to load
from datetime import date  # Provides tools for working with dates
from unidecode import unidecode  # Allows the program to remove accents and other diacritical marks from characters
import random  # Provides tools for generating random numbers and sequences
import datetime  # Provides tools for working with dates and times

### *Decolar Function*

In [None]:
def scrape_decolar_data(from_location, to_location, departure_date, arrival_date, adult_qty):
    """Scrape round-trip flight offers from decolar.com into a DataFrame.

    Opens the Decolar results page in headless Chrome, tries to dismiss the
    popups and to sort by cheapest (all best effort), keeps clicking the
    "load more" link while it is clickable, then parses every flight card.

    Parameters
    ----------
    from_location, to_location : str
        Origin / destination codes, inserted verbatim into the search URL.
    departure_date, arrival_date : str
        Date strings, inserted verbatim into the search URL and later cast to
        ``datetime64[ns]``.
    adult_qty : int
        Number of adults, inserted into the ``di=`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per airline of each flight card (cards listing several
        airlines are exploded into several rows), sorted by ``value``.
        ``tarif`` and ``taxes`` are always 0 for this source. When the cards
        cannot be collected or parsed, an empty frame with the same columns is
        returned instead.

    Raises
    ------
    Exception
        Errors from starting the driver or loading the page are not handled
        here, nor is the timeout raised when no flight container appears
        within the wait window.
    """
    # Search metadata stamped on every scraped row
    search_day = date.today()
    search_id = str(search_day) + from_location + to_location + departure_date + arrival_date
    company = 'Decolar'

    # Output schema: column order and dtype of the returned frame
    dataTypeDict = {"searchID" : 'object', 'searchDay' : 'datetime64[ns]', "originAirport" : 'object', "destinyAirport" : 'object',
                    "searchUrl": 'object', "departureDate" : 'datetime64[ns]', "arrivalDate" : 'datetime64[ns]', "adultQty" : 'int64',
                    "company" : 'object', "cia" : 'object', 'cia_abv' : 'object', "currency" : 'object', "tarif" : 'float32', "taxes" : 'float32',
                    "value" : 'float32'}
    cols = list(dataTypeDict.keys())

    # Fallback result: guarantees `return df` never hits an unbound name when
    # collecting or parsing the cards fails.
    df = pd.DataFrame(columns=cols)

    # Set up web driver options and open the search URL
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run Chrome without a visible browser window
    with webdriver.Chrome(options=options) as driver:
        driver.maximize_window()
        search_Url = (
            'https://www.decolar.com/shop/flights/results/roundtrip/{fromLocation}/{toLocation}/{departureDate}/{arrivalDate}/1/0/0/NA/NA/NA/NA/NA?from=SB&di={adultQty}-0&reSearch=true'
        ).format(
            fromLocation=from_location,
            toLocation=to_location,
            departureDate=departure_date,
            arrivalDate=arrival_date,
            adultQty=adult_qty,
        )
        driver.get(search_Url)

        # Shared explicit wait (10 s timeout) for every lookup below
        wait = WebDriverWait(driver, 10)

        # Closing the "wrong country" popup (best effort: it is not always shown)
        popup_window = '//*[@id="dreck-wrongcountry-modal"]/div[1]/i'
        try:
            wait.until(EC.element_to_be_clickable((By.XPATH, popup_window))).click()
        except Exception:
            print('No popup window')

        # Closing the discount popup (best effort)
        popup_discount = '//*[@id="header"]/nav/div[6]/div[1]/i'
        try:
            wait.until(EC.element_to_be_clickable((By.XPATH, popup_discount))).click()
        except Exception:
            print("No discount popup")

        # Sorting by cheapest (best effort)
        relXpathSortByCheapestButton = '//*[@id="flights-container"]/div/div[3]/div/div[2]/div/div[4]/app-root/app-common/new-sorting-tabs/div/tab-component[2]'
        try:
            wait.until(EC.element_to_be_clickable((By.XPATH, relXpathSortByCheapestButton))).click()
        except Exception:
            print("No sort by Cheapest Button")

        # Waiting until flight containers are present on the page before proceeding.
        # Deliberately unguarded: a timeout here propagates to the caller.
        flight_container_xpath = '//div[@class="cluster-container COMMON"]'
        wait.until(EC.presence_of_element_located((By.XPATH, flight_container_xpath)))

        # Scrolling down the page so that more results (and the "load more" link) get rendered
        for _ in range(5):
            driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
            sleep(1)

        # Checking whether the "load more" link has appeared
        more_results_button = '//body/div/div/div/div/div/div/div/div/div/app-root/app-common/items/div/div/a[1]'
        try:
            status_search_see_more_flights_button = wait.until(EC.element_to_be_clickable((By.XPATH, more_results_button))).is_displayed()
        except Exception:
            status_search_see_more_flights_button = False

        # Keep loading more flights until the "load more" link is gone
        while status_search_see_more_flights_button:
            try:
                print("Loading more flights")
                driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")  # Scroll down the page
                wait.until(EC.element_to_be_clickable((By.XPATH, more_results_button))).click()  # Click the link once it is clickable
                status_search_see_more_flights_button = wait.until(EC.element_to_be_clickable((By.XPATH, more_results_button))).is_displayed()
            except Exception:
                print('No more flights to load')
                break

        # Collecting data
        try:
            flight_containers = driver.find_elements(By.XPATH, flight_container_xpath)

            flight_list = []

            for container in flight_containers:
                elementHTML = container.get_attribute('outerHTML')
                elementSoup = BeautifulSoup(elementHTML, 'html.parser')

                # Dictionary to store the flight card data
                flight_data = {}

                # Search metadata
                flight_data['searchID'] = search_id
                flight_data['searchDay'] = search_day
                flight_data['searchUrl'] = search_Url
                flight_data['departureDate'] = departure_date
                flight_data['arrivalDate'] = arrival_date
                flight_data['adultQty'] = adult_qty
                flight_data['company'] = company

                # Origin and destination airports: first space-separated token of the route labels
                flight_data['originAirport'] = elementSoup.find('span', class_='route-location route-departure-location').text.strip().split(' ', 1)[0]
                flight_data['destinyAirport'] = elementSoup.find('span', class_='route-location route-arrival-location').text.strip().split(' ', 1)[0]

                # Tariff and taxes are not scraped for this source
                flight_data['tarif'] = 0
                flight_data['taxes'] = 0

                # Currency and price ('.' is removed before the numeric cast)
                flight_data['currency'] = elementSoup.find('span', class_='currency price-mask -eva-3-mr-xsm').text
                flight_data['value'] = elementSoup.find('span', class_='amount price-amount').text.replace('.', '').strip()

                # Airline names and their 4-character abbreviations; a card may list more than one airline
                airline_img_container = elementSoup.find('span', class_='container-img-airlines')
                list_cia_flight_container = []
                list_cia_abv_flight_container = []
                for img in airline_img_container.find_all('img'):
                    airline_name = img['alt'].strip()
                    list_cia_flight_container.append(unidecode(airline_name))
                    list_cia_abv_flight_container.append(unidecode(airline_name[:4]))
                flight_data['cia'] = list_cia_flight_container
                flight_data['cia_abv'] = list_cia_abv_flight_container

                flight_list.append(flight_data)

            # Build the frame in a separate name and publish it to `df` only once every
            # step succeeded, so a half-transformed frame is never returned.
            scraped_df = pd.DataFrame(flight_list)
            # Reordering the columns
            scraped_df = scraped_df[cols]
            # Changing the column types
            scraped_df = scraped_df.astype(dtype=dataTypeDict).sort_values('value', ascending=True)
            # Exploding cia / cia_abv so each airline of a card gets its own row
            scraped_df = scraped_df.explode(['cia', 'cia_abv'], ignore_index=True).sort_values('value', ascending=True)
            df = scraped_df
            print(f'{df.shape[0]} flights were scrapped from decolar')

        except Exception as error:
            print('No flight containers found')
            print(f'Reason: {error!r}')

    return df

### *Passagens Promo Function*

In [None]:
def scrape_passagens_promo_data(from_location, to_location, departure_date, arrival_date, adult_qty):
    """Scrape round-trip flight offers from passagenspromo.com.br into a DataFrame.

    Opens the Passagens Promo results page in headless Chrome, waits for the
    flight cards (best effort), keeps clicking "Mais resultados" while it is
    clickable, then parses every element with class ``flightcard``.

    Parameters
    ----------
    from_location, to_location : str
        Origin / destination codes, concatenated into the search URL.
    departure_date, arrival_date : str
        Date strings. For the URL the hyphens are stripped and the first two
        characters dropped (so 'YYYY-MM-DD' becomes 'YYMMDD'); the original
        strings are stored in the frame and cast to ``datetime64[ns]``.
    adult_qty : int
        Number of adults, inserted into the search URL.

    Returns
    -------
    pandas.DataFrame
        One row per flight card, sorted by ``value``. When the cards cannot be
        collected or parsed, an empty frame with the same columns is returned
        instead.

    Raises
    ------
    Exception
        Errors from starting the driver or loading the page are not handled
        here.
    """
    # Search metadata stamped on every scraped row
    search_day = date.today()
    search_id = str(search_day) + from_location + to_location + departure_date + arrival_date
    company = 'PP'

    # Output schema: column order and dtype of the returned frame
    dataTypeDict = {"searchID" : 'object', 'searchDay' : 'datetime64[ns]', "originAirport" : 'object', "destinyAirport" : 'object',
                    "searchUrl": 'object', "departureDate" : 'datetime64[ns]', "arrivalDate" : 'datetime64[ns]', "adultQty" : 'int64',
                    "company" : 'object', "cia" : 'object', 'cia_abv' : 'object', "currency" : 'object', "tarif" : 'float32', "taxes" : 'float32',
                    "value" : 'float32'}
    cols = list(dataTypeDict.keys())

    # Fallback result: guarantees `return df` never hits an unbound name when
    # collecting or parsing the cards fails.
    df = pd.DataFrame(columns=cols)

    # Set up web driver options and open the search URL
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run Chrome without a visible browser window
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--disable-gpu")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36")
    with webdriver.Chrome(options=options) as driver:
        search_Url = (
            'https://www.passagenspromo.com.br/air/search/{fromLocation}{toLocation}{departureDate}-{toLocation}{fromLocation}{arrivalDate}/{adultQty}/0/0/Y/?'
        ).format(
            fromLocation=from_location,
            toLocation=to_location,
            departureDate=departure_date.replace('-', '')[2:],
            arrivalDate=arrival_date.replace('-', '')[2:],
            adultQty=adult_qty,
        )
        driver.get(search_Url)

        # Shared explicit wait (15 s timeout) for every lookup below
        wait = WebDriverWait(driver, 15)

        # Wait for flight cards to load (best effort)
        flight_card = '//div[@class="flightgroupcard"]'
        try:
            wait.until(EC.visibility_of_all_elements_located((By.XPATH, flight_card)))
        except Exception:
            print('Not all flight cards were visible')

        # Check if the "load more" button is visible
        more_results_button = '//button[normalize-space()="Mais resultados"]'
        try:
            status_search_see_more_flights_button = wait.until(EC.element_to_be_clickable((By.XPATH, more_results_button))).is_displayed()
        except Exception:
            print('No load more flights button was found')
            status_search_see_more_flights_button = False

        # Load more flights while the "load more" button is displayed
        while status_search_see_more_flights_button:
            try:
                print("Loading more flights")
                driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
                wait.until(EC.element_to_be_clickable((By.XPATH, more_results_button))).click()
                status_search_see_more_flights_button = wait.until(EC.element_to_be_clickable((By.XPATH, more_results_button))).is_displayed()
            except Exception:
                print('The load more flights button has dissapeared. No more flights to load')
                break

        # Collecting the data
        try:
            flight_rows = driver.find_elements(By.CLASS_NAME, "flightcard")
            flight_list = []

            for card in flight_rows:
                elementHTML = card.get_attribute('outerHTML')
                elementSoup = BeautifulSoup(elementHTML, 'html.parser')

                # Dictionary to store the flight card data
                flight_data = {}

                # Search metadata
                flight_data['searchID'] = search_id
                flight_data['searchDay'] = search_day
                flight_data['searchUrl'] = search_Url
                flight_data['departureDate'] = departure_date
                flight_data['arrivalDate'] = arrival_date
                flight_data['adultQty'] = adult_qty
                flight_data['company'] = company

                # Origin and destination airports
                flight_data['originAirport'] = elementSoup.find('label', class_='origin_iata').text
                flight_data['destinyAirport'] = elementSoup.find('label', class_='destiny_iata').text

                # Tariff and taxes: text after the first space, with '.' removed
                flight_data['tarif'] = elementSoup.find('span', 'value_passenger').text.split(' ', 1)[1].replace('.', '')
                flight_data['taxes'] = elementSoup.find('span', 'value_tax').text.split(' ', 1)[1].replace('.', '')

                # Currency and price come from the same "<currency> <amount>" text
                total_price_parts = elementSoup.find('div', class_='total_price').text.split(' ', 1)
                flight_data['currency'] = total_price_parts[0]
                flight_data['value'] = total_price_parts[1].replace('.', '')

                # Airline name and its 4-character abbreviation
                airline_name = elementSoup.find('div', class_='logo_cia').text.strip()
                flight_data['cia'] = airline_name
                flight_data['cia_abv'] = airline_name[:4]

                flight_list.append(flight_data)

            # Build the frame in a separate name and publish it to `df` only once every
            # step succeeded, so a half-transformed frame is never returned.
            scraped_df = pd.DataFrame(flight_list)
            # Reordering the columns
            scraped_df = scraped_df[cols]
            # Changing the column types
            scraped_df = scraped_df.astype(dtype=dataTypeDict).sort_values('value', ascending=True)
            df = scraped_df
            print(f'{df.shape[0]} flights were scrapped from Passagens Promo')

        except Exception as error:
            print('No flight cards found')
            print(f'Reason: {error!r}')

    return df

### *Merging both Functions*

In [None]:
def merged_scraper(from_location, to_location, departure_date, arrival_date, adult_qty):
    """Run both scrapers for the same search.

    Decolar is scraped first, then Passagens Promo; the two results are
    returned as a ``(decolar, passagens_promo)`` tuple.
    """
    search_args = (from_location, to_location, departure_date, arrival_date, adult_qty)
    decolar_result = scrape_decolar_data(*search_args)
    promo_result = scrape_passagens_promo_data(*search_args)
    return decolar_result, promo_result

### *Creating a function to transform the data*

In [None]:
def _best_price_per_cia(flights_df):
    """Collapse a scraped frame to one row per airline and flag the cheapest one(s)."""
    # NOTE(review): groupby().min() takes the minimum of every column independently,
    # so e.g. 'tarif' and 'taxes' may come from a different offer than 'value' —
    # confirm this is intended.
    best = flights_df.groupby('cia').min().sort_values('value', ascending=True).reset_index()
    # 1 for every airline whose best price equals the overall minimum, else 0
    best['bestCia'] = (best['value'] == best['value'].min()).astype('int64')
    return best


def transform_data(passagens_promo_df, decolar_df):
    """Reduce both scraped frames to the best price per airline and compare them.

    Parameters
    ----------
    passagens_promo_df, decolar_df : pandas.DataFrame
        Scraped flights; the columns ``cia``, ``cia_abv`` and ``value`` are
        the ones read here.

    Returns
    -------
    pandas.DataFrame
        Passagens Promo rows followed by Decolar rows, one per airline, with
        two extra columns:

        * ``bestCia`` - 1 when the airline has the lowest price of its source.
        * ``comparisonWithPP`` - ``'PP X PP'`` on Passagens Promo rows; on
          Decolar rows ``'Win / Cheaper'`` when the Decolar price is higher
          than the Passagens Promo price for the same ``cia_abv``,
          ``'Loss / Expensiver'`` when it is not, and
          ``'Loss / CIA not returned'`` when Passagens Promo has no row for
          that ``cia_abv``.
    """
    # Transforming the passagens_promo_df
    bestPricesPerCiaPP = _best_price_per_cia(passagens_promo_df)
    bestPricesPerCiaPP['comparisonWithPP'] = 'PP X PP'

    # Transforming the decolar_df
    bestPricesPerCiaDecolar = _best_price_per_cia(decolar_df)

    # Price lookup by abbreviated airline name. If several Passagens Promo rows
    # share an abbreviation the last one wins, which is what the previous nested
    # loop produced by overwriting the label on every match.
    pp_value_by_abv = dict(zip(bestPricesPerCiaPP['cia_abv'], bestPricesPerCiaPP['value']))

    # Build the label for every Decolar row up front so the column always exists,
    # even when no airline is shared between the two sources.
    comparison_labels = []
    for cia_abv, decolar_value in zip(bestPricesPerCiaDecolar['cia_abv'], bestPricesPerCiaDecolar['value']):
        if cia_abv not in pp_value_by_abv:
            comparison_labels.append('Loss / CIA not returned')
        elif decolar_value > pp_value_by_abv[cia_abv]:
            comparison_labels.append('Win / Cheaper')
        else:
            comparison_labels.append('Loss / Expensiver')
    bestPricesPerCiaDecolar['comparisonWithPP'] = pd.Series(
        comparison_labels, index=bestPricesPerCiaDecolar.index, dtype='object'
    )

    # Concatenating both DataFrames under a fresh 0..n-1 index
    final_df = pd.concat([bestPricesPerCiaPP, bestPricesPerCiaDecolar], axis=0, ignore_index=True)

    return final_df

### *Creating a function to store the data*

**ATTENTION:** You must create an Excel file to store the data, and replace the placeholder ``YOUR EXCEL FILE NAME HERE`` in the code below with its name:

In [None]:
def load_data_to_spreadsheet(final_df, spreadsheet_path='YOUR EXCEL FILE NAME HERE.xlsx'):
    """Append new searches from ``final_df`` to the Excel workbook that stores the data.

    Parameters
    ----------
    final_df : pandas.DataFrame
        Transformed flight data; must contain a ``searchID`` column.
    spreadsheet_path : str, optional
        Path of the workbook. It is both read and rewritten, so it must
        already exist and contain a ``searchID`` column. Replace the
        placeholder default with your own file name, or pass the path
        explicitly.

    Notes
    -----
    Rows whose ``searchID`` is already present in the workbook are skipped.
    Nothing is written when no new row remains.
    """
    # Loading the spreadsheet that stores the scraped flight data
    fligh_data_worksheet = pd.read_excel(spreadsheet_path)

    # Keep only the rows whose searchID is not stored yet
    already_stored = final_df['searchID'].isin(fligh_data_worksheet['searchID'].unique())
    new_rows = final_df[~already_stored]

    if new_rows.empty:
        print("Data not appended into the spredsheet")
        return

    updated_fligh_data_worksheet = pd.concat([fligh_data_worksheet, new_rows], axis=0).reset_index(drop=True)
    # Write back to the same path that was read: pandas picks the Excel writer
    # engine from the file extension, so the '.xlsx' suffix must be kept.
    updated_fligh_data_worksheet.to_excel(spreadsheet_path, index=False)
    print("Data appended into the spredsheet")

## **Executing the functions and Loading results in an Excel File**

In [None]:
# Read the search parameters and keep only the calendar date of both date columns
search_parameters_df = pd.read_excel('search_parameters.xlsx')
for date_column in ('Departure Dates', 'Arrival Dates'):
    search_parameters_df[date_column] = pd.to_datetime(search_parameters_df[date_column]).dt.date
print(f'There is {len(search_parameters_df)} rows of search parameters')

In [None]:
# Scrape -> transform -> store, once per row of 'search_parameters_df'
for i in range(search_parameters_df.shape[0]):

    # Search parameters of the current row (dates are passed on as strings)
    from_location = search_parameters_df.at[i, 'Departure Iata']
    to_location = search_parameters_df.at[i, 'Arrival Iata']
    departure_date = str(search_parameters_df.at[i, 'Departure Dates'])
    arrival_date = str(search_parameters_df.at[i, 'Arrival Dates'])
    adult_qty = search_parameters_df.at[i, 'Adult Quantity']

    # Scrape both sites for this search
    decolar_df, passagens_promo_df = merged_scraper(
        from_location=from_location,
        to_location=to_location,
        departure_date=departure_date,
        arrival_date=arrival_date,
        adult_qty=adult_qty,
    )

    # Best price per airline plus the Decolar vs Passagens Promo comparison
    final_df = transform_data(passagens_promo_df=passagens_promo_df, decolar_df=decolar_df)

    # Store the result in the spreadsheet
    load_data_to_spreadsheet(final_df)