In [4]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import smtplib
import time
from datetime import datetime 
import uuid



In [5]:
class FlightScraper:

    """
    
    Instance of this class allows for scraping the basic fight data from kiwi.com website.
    
    ***

    Attributes:

    date_from : date (%Y-%m-%d)
        starting date for the scraping process
    date_to : date (%Y-%m-%d)
        ending date for the scraping process
    origin : str
        place of origin for the flights searched, format - city-country
    destination : str
        desired destination for the flights searched, format - city-country
    trip_duration_days : int
        desired duration of the trip (in days), ex. for Fri-Sun trip, this attribute should be equal to 3
    path_to_driver : str
        valid path to the chrome webdriver
    weekends_only : bool
        optional, allows searching only for flights on the weekends (3 days, Fri - Sun)
    
    
    """

    def __init__(self, date_from, date_to, origin, destination, trip_duration_days, path_to_driver, weekends_only = True):

        """
        Class initialization method. 

        Attributes:

        date_from : date (%Y-%m-%d)
            starting date for the scraping process
        date_to : date (%Y-%m-%d)
            ending date for the scraping process
        origin : str
            place of origin for the flights searched, format - city-country
        destination : str
            desired destination for the flights searched, format - city-country
        trip_duration_days : int
            desired duration of the trip (in days), ex. for Fri-Sun trip, this attribute should be equal to 3
        path_to_driver : str
            valid path to the chrome webdriver
        weekends_only : bool
            optional, allows searching only for flights on the weekends (3 days, Fri - Sun)




        _driver : Object
            Instance of the Chrome Webdriver from Selenium package corresponding to the particular class instance


        Protected Attributes responsible for collecting scraped data:

        self._flight_ids = []
        self._start_location = []
        self._end_location = []
        self._flight_start_dates = []
        self._flight_end_dates = []
        self._flight_start_times = []
        self._flight_end_times = []
        self._flight_durations = []
        self._flight_prices = []
        self._flight_origin_airports = []
        self._flight_destination_airports = []
        self._flight_grounds = []
        self._is_return = []
        
        """


        self.date_from = date_from
        self.date_to = date_to
        self.origin = origin
        self.destination = destination
        self.trip_duration_days = trip_duration_days # TODO: Add functionality to make trip duration adjustable
        self.path_to_driver = path_to_driver
        self.weekends_only = weekends_only # TODO: Add functionality for weekend only trips 

        self._driver = webdriver.Chrome(executable_path=path_to_driver)
        self._flight_ids = []
        self._start_location = []
        self._end_location = []
        self._flight_start_dates = []
        self._flight_end_dates = []
        self._flight_start_times = []
        self._flight_end_times = []
        self._flight_durations = []
        self._flight_prices = []
        self._flight_origin_airports = []
        self._flight_destination_airports = []
        self._flight_grounds = []
        self._is_return = []


    def get_page(self, start_date, end_date):

        """
        
        Method used to load the required page (kiwi.com) with specific parameters before scraping.

        Parameters:

            start_date : date (%Y-%m-%d)
                starting date for the page to search flight for
            end_date : date (%Y-%m-%d)
                ending date for the page to search flight for

        Returns: 
            None
        
        """

        # Declare url to the scraped results page following the convention 'https://www.kiwi.com/pl/search/results/origin/destination/start_date/end_date/?cabinClass=ECONOMY-false'
        page_url = 'https://www.kiwi.com/pl/search/results/' + self.origin + '/' + self.destination + '/' + start_date + '/' + end_date + '?cabinClass=ECONOMY-false'

        self._driver.get(page_url)

        # Each new page can open a pop-up window about cookies. Wait for 10 seconds for it to show up then try to close it.
        time.sleep(10)
        try:
            self._driver.find_element_by_xpath('//*[@id="cookies_accept"]/div').click()
        except:
            pass


    def find_grounds(self):

        """
        
        Helper method used to scrape information about the number of grounds during the flight.

        Parameters: 
            None

        Returns:
            grounds_count : int (non-zero)
            Number of grounds for each particular flight. Separate for outbound and inbound flights. 

            
        """

        # In case of no grounds the counter may not be properly displayed in the page structure
        # To combat this issue, first a parent div containing flight infomration is selected, then a grounds count is located within this parent div
        parent_div_xpath = '//div[contains(@data-test, "ResultCardSectorWrapper")]'
        child_div_xpath = ".//div[contains(@data-test, 'StopCountBadge')]"

        parent_divs = self._driver.find_elements_by_xpath(parent_div_xpath)

        child_divs = [parent_div.find_element_by_xpath(child_div_xpath) for parent_div in parent_divs] # For each flight result card find a grounds counter

        ground_details = [div.text for div in child_divs]


        def find_grounds_count(landing_string):

            """Helper function to transform the raw grounds data into numerical values"""

            return int('0'+''.join(d for d in landing_string if d.isdigit())) # In case of no grounds, no number is displayed on the page, so the '0' must be added manually
        
        grounds_count = [find_grounds_count(s) for s in ground_details] # Use helper function to turn raw strings from the page into corresponing numbers

        return grounds_count
    

    def get_weekends(self):

        """
        
        Helper method used to get all the weekends between date_from and date_to.

        Parameters: 
            None

        Returns:
            weekends_zipped : List[(date (%Y-%m-%d), date (%Y-%m-%d))]
            Pairs of dates representing the begining of a weekend (Friday) and the end of a weekend (Sunday) within the provided time window. 


        """


        # Create list of dates for all Fridays and Sundays between two given dates
        weekends_list = list(pd.bdate_range(start=self.date_from, end=self.date_to, freq='C', weekmask="Fri Sun"))

        # If the time window begins after a Friday or ends before a Sunday remove those dates from the list (we are only looking for full weekends)
        if datetime.strptime(self.date_from, '%Y-%m-%d').weekday() in (5,6):

            weekends_list = weekends_list[1:] # Remove unmached Friday

        if datetime.strptime(self.date_to, '%Y-%m-%d').weekday() in (4,5):

            weekends_list = weekends_list[:-1] # Remove unmached Sunday

        # Divide dates into sepearate lists for Fridays (start) and Sundays (end)
        weekend_start = weekends_list[::2]
        weekend_end = weekends_list[1::2]

        weekends_zipped = [*zip(weekend_start,weekend_end)] # zip and unpack into a list the dates to obtain pairs of start-end for each weekend

        return weekends_zipped
    

        
    def set_flight_ids(self, count):

        """
        
        Helper method used to get unique id identifiers for the scraped flights.

        Parameters: 
            count : int
            Number of flights to generate ids for

        Returns:
            ids : List[UUID]
            List containing pairs of unique identifiers 


        """

        ids = []

        for i in range(count):
            ids += [uuid.uuid4()]*2 # For each trip 2 of the same id are generated, one for the outbound and one for the inbound flight
        return ids
    




    def scrape_data(self):

        """ 
        
        Main class method used for scraping data and collecting it in the list attributes. 
        Only the first 3 result (6 flights) of each page are scraped.

        Parameters: 
            None
        
        Returns:
            None


        """ 

    
        weekends_list = self.get_weekends() # First the helper function get_weekends is used to obtain the weekends betweeen given dates

        for start, end in weekends_list: # For each weekend attempt to scrape the corresponding website




            for i in range(5): # To avoid problems with website loading incorectly, attempt to scrape each website up to 5 times


                try:
                    print(f'Scraping data for dates: {start.strftime("%Y-%m-%d")} - {end.strftime("%Y-%m-%d")} and locations {self.origin} -> {self.destination}.\n Scraping progress: {int(weekends_list.index((start,end)))/len(weekends_list)*100}%')

                    self.get_page(start_date=start.strftime('%Y-%m-%d'), end_date=end.strftime('%Y-%m-%d')) # Load the page using helper method get_page
                    
                    
                    time.sleep(1) # After each operation a wait time of 1s is added to ensure all data is scraped properly

                    flight_times =  [timestamp.text for timestamp in self._driver.find_elements_by_xpath('//time[@datetime]')][0:18] # For 6 flights we need total of 18 datetime objects

                    flight_start_times = flight_times[::3]
                    flight_end_times = flight_times[2::3]
                    flight_durations = flight_times[1::3]
                    
                    
                    time.sleep(1)

                    flight_prices = [price.text for price in 
                                     self._driver.find_elements_by_xpath('//span[contains(@class, "length-6") or contains(@class, "length-7")]') 
                                     for _ in (0,1)][0:6] # Since each price appears only once in each search result it is needed to duplicate them for data integrity purposes
                    

                    time.sleep(1)
                    
                    # For 6 flights we need total of 12 airports
                    flight_destinations = [dest.text for dest in self._driver.find_elements_by_xpath('//div[contains(@data-test, "stationName")]')][0:12] 

                    flight_origin_airports = flight_destinations[::2]
                    flight_destination_airports = flight_destinations[1::2]


                    time.sleep(1)

                    flight_grounds = self.find_grounds()[0:6] # Find the grounds for scraped flights, 6 in total

                    time.sleep(1)


                    # If the scraping was successful, the following need to be true: No information is missing, and every piece of information was scraped for exactly 6 flights
                    if all([flight_start_times,flight_end_times,flight_durations,flight_prices,flight_origin_airports,flight_destination_airports]):
                        if all(6==len(x) for x in [flight_start_times,flight_end_times,flight_durations,flight_prices,flight_origin_airports,flight_destination_airports]):


                            # In case the scraping was successful, the scraped data can be added to the list attributes of the scraper
                            print(f"Scraping for date {start} - {end} successful!")


                            self._start_location += [self.origin] * 6
                            self._end_location += [self.destination] * 6
                            
                            self._flight_start_times += flight_start_times
                            self._flight_end_times += flight_end_times
                            self._flight_durations += flight_durations

                            self._flight_prices += flight_prices

                            self._flight_origin_airports += flight_origin_airports
                            self._flight_destination_airports += flight_destination_airports

                            self._flight_grounds += flight_grounds


                            # The folowing data is not scraped from the webiste, it is added manually for convenience purposes
                            count = len(flight_times[::3])//2 # Number of scraped trips

                            self._flight_ids += self.set_flight_ids(count) # Generate unique ids for each flight (pairwise matching)

                            self._flight_start_dates += [start.strftime('%Y-%m-%d')]  * 2 * count # Start and end dates are set based on the initial conditions
                            self._flight_end_dates += [end.strftime('%Y-%m-%d')]  * 2 * count

                            self._is_return += [False,True] * count # The return flights are marked to allow for later distinction

                            
                            break # In case the scraping was successful break out of the inner loop and continue scraping for next dates 
                        else:
                            print(f"There was an error in attempt {i+1}/5, retrying...")
                    else:
                        print(f"There was an error in attempt {i+1}/5, retrying...")


                except:
                    print(f"Error occured for this date {start} - {end}")
                    continue # In case all 5 of the scraping attempts failed, continue scraping for the next dates and pring the message below


                if not all([flight_start_times,flight_end_times,flight_durations,flight_prices,flight_origin_airports,flight_destination_airports]):
                        if not all(6==len(x) for x in [flight_start_times,flight_end_times,flight_durations,flight_prices,flight_origin_airports,flight_destination_airports]):
                            print(f"Scraping for date {start} - {end} unsuccesfull :()")

          



        
    def get_dataframe(self):


        """
        
        Helper method used to compose scraped data into a pandas DataFrame.

        Parameters: 
            None

        Returns:
            flight_data : pd.DataFrame()
            pandas DataFrame object containing raw data scraped for each of the list attributes


        """


        flight_data = pd.DataFrame({
            'Flight_id': self._flight_ids,
            'Start_location': self._start_location,
            'End_location': self._end_location,
            'Start_date': self._flight_start_dates,
            'End_date': self._flight_end_dates,
            "Start_time": self._flight_start_times,
            "End_time": self._flight_end_times,
            "Duration": self._flight_durations,
            "Price": self._flight_prices,
            "Origin_airport": self._flight_origin_airports,
            "Destinantion_airport": self._flight_destination_airports,
            'Number_of_grounds': self._flight_grounds,
            "Is_return": self._is_return
        })

        return flight_data
    

In [6]:
origin_list = ['katowice-polska','wroclaw-polska','krakow-polska','warszawa-polska']
destination_list = ['londyn-wielka-brytania', 'barcelona-hiszpania', 'nowy-jork-nowy-jork-stany-zjednoczone']


def scrape_data(origin_list, destination_list, date_from, date_to):

    trip_duration_days = 3

    path_to_driver = r'C://Users//mrceb//Desktop//chromedriver-win64//chromedriver.exe'

    df_list = []

    
    for origin in origin_list:
        for destination in destination_list:
            
            Scraper = FlightScraper(date_from=date_from, date_to=date_to, origin=origin, destination=destination, trip_duration_days=trip_duration_days, path_to_driver=path_to_driver)
            Scraper.scrape_data()
            print([len(x) for x in Scraper.__dict__.values() if isinstance(x,list)])
            df_list.append(Scraper.get_dataframe())


    return (pd.concat(df_list))
    




In [7]:
df = scrape_data(origin_list=origin_list, destination_list=destination_list, date_from='2024-01-28', date_to='2024-08-31')


  self._driver = webdriver.Chrome(executable_path=path_to_driver)


Scraping data for dates: 2024-02-02 - 2024-02-04 and locations katowice-polska -> londyn-wielka-brytania.
 Scraping progress: 0.0%


  self._driver.find_element_by_xpath('//*[@id="cookies_accept"]/div').click()
  flight_times =  [timestamp.text for timestamp in self._driver.find_elements_by_xpath('//time[@datetime]')][0:18] # For 6 flights we need total of 18 datetime objects
  self._driver.find_elements_by_xpath('//span[contains(@class, "length-6") or contains(@class, "length-7")]')
  flight_destinations = [dest.text for dest in self._driver.find_elements_by_xpath('//div[contains(@data-test, "stationName")]')][0:12]
  parent_divs = self._driver.find_elements_by_xpath(parent_div_xpath)
  child_divs = [parent_div.find_element_by_xpath(child_div_xpath) for parent_div in parent_divs] # For each flight result card find a grounds counter


Scraping for date 2024-02-02 00:00:00 - 2024-02-04 00:00:00 successful!
Scraping data for dates: 2024-02-09 - 2024-02-11 and locations katowice-polska -> londyn-wielka-brytania.
 Scraping progress: 3.3333333333333335%
Error occured for this date 2024-02-09 00:00:00 - 2024-02-11 00:00:00
Scraping data for dates: 2024-02-09 - 2024-02-11 and locations katowice-polska -> londyn-wielka-brytania.
 Scraping progress: 3.3333333333333335%
Scraping for date 2024-02-09 00:00:00 - 2024-02-11 00:00:00 successful!
Scraping data for dates: 2024-02-16 - 2024-02-18 and locations katowice-polska -> londyn-wielka-brytania.
 Scraping progress: 6.666666666666667%
Scraping for date 2024-02-16 00:00:00 - 2024-02-18 00:00:00 successful!
Scraping data for dates: 2024-02-23 - 2024-02-25 and locations katowice-polska -> londyn-wielka-brytania.
 Scraping progress: 10.0%
Scraping for date 2024-02-23 00:00:00 - 2024-02-25 00:00:00 successful!
Scraping data for dates: 2024-03-01 - 2024-03-03 and locations katowice-p

In [10]:
df.isna().sum()

Flight_id               0
Start_location          0
End_location            0
Start_date              0
End_date                0
Start_time              0
End_time                0
Duration                0
Price                   0
Origin_airport          0
Destinantion_airport    0
Number_of_grounds       0
Is_return               0
dtype: int64