# Scenario
Prediction of flight prices for different dates and times
- Flights from 10th Oct 2024 - 10th Jan 2025
- Adult 1 Passenger
- Economy
- One way

# Extraction Variables
Extracting:
1. Date -> categorical
2. Flight Name -> categorical
3. Stops -> numerical
4. Price -> numerical
5. Duration -> numerical
6. Departure-Time -> categorical
7. Arrival-Time -> categorical

## Issues Faced
1. Multiple elements share the same class, which makes it hard to extract the relevant information,
eg.  class="vmXl vmXl-mod-variant-default" is being shared with start time of flight, duration and number of stops.
**Solved by checking further, if they have span tags or div tags as their children**
2. While running the loop to fetch data from 10/10/2024 to 10/01/2025, the website blocked the requests and asked for a CAPTCHA.
**Solved this by using sleep (to sleep for some time)**

<h5> Comments </h5>
- +1 in flight arrival time indicates that the flight lands on the next day.


# Importing required libraries

In [19]:
from selenium import webdriver
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import time
import csv

# Defining a function to extract flights for a date range (one month at a time)

- Used sleep() for a few seconds after each request in order not to be detected as a bot

In [20]:

# Function to scrape flight data for every day in a given date range
def scrape_flights_for_month(start_date, end_date, *, page_load_wait=10):
    """Scrape Kayak DUB-LHR flight listings for every day in a date range.

    Opens a Chrome browser through Selenium, loads the Kayak results page for
    each date from ``start_date`` to ``end_date`` (both inclusive), and parses
    every result card on the page with BeautifulSoup.

    Args:
        start_date: First departure date to scrape (``datetime`` or ``date``).
        end_date: Last departure date to scrape, inclusive.
        page_load_wait: Seconds to sleep after requesting each page so the
            results can render (the notes in this notebook also rely on this
            pause to avoid being flagged as a bot). Defaults to 10.

    Returns:
        A list of rows, one per successfully parsed result card, of the form
        ``[date, flight_name, stops, price, duration, departure_time,
        arrival_time]``. ``date`` is formatted ``YYYY-MM-DD``; ``duration`` is
        ``None`` when no matching element is found; the time fields fall back
        to ``"Departure not found"`` / ``"Arrival not found"``.
    """
    driver = webdriver.Chrome()

    flight_details = []
    current_date = start_date

    # try/finally guarantees the browser is closed even when a page load
    # raises or the run is interrupted; otherwise Chrome would be left running.
    try:
        while current_date <= end_date:
            formatted_date = current_date.strftime('%Y-%m-%d')
            url = f'https://www.kayak.ie/flights/DUB-LHR/{formatted_date}?sort=bestflight_a'

            driver.get(url)

            # Pause to allow the page to load before reading its source.
            time.sleep(page_load_wait)

            soup = BeautifulSoup(driver.page_source, 'html.parser')

            flight_containers = soup.find_all('div', class_="yuAt yuAt-pres-rounded yuAt-mod-box-shadow")
            for item in flight_containers:
                try:
                    # .find() returns None for a missing element, so the
                    # chained .get_text() raises and the card is skipped below.
                    flight_name = item.find('div', class_="J0g6-operator-text").get_text(strip=True)
                    stops = item.find('span', class_="JWEO-stops-text").get_text(strip=True)
                    price = item.find('div', class_="f8F1-price-text").get_text(strip=True)

                    # Several elements share this class; the first one that
                    # has no <span> child is taken to be the duration.
                    duration = None
                    for div in item.find_all('div', class_="vmXl vmXl-mod-variant-default"):
                        if not div.find('span'):
                            duration = div.get_text(strip=True)
                            break

                    times_div = item.find('div', class_="vmXl vmXl-mod-variant-large")
                    if times_div:
                        # Text is split on the en dash separating the two times.
                        times = times_div.get_text(strip=True).split("–")
                        # str.split always yields at least one element.
                        departure_time = times[0].strip()
                        arrival_time = times[1].strip() if len(times) > 1 else "Arrival not found"
                    else:
                        departure_time, arrival_time = "Departure not found", "Arrival not found"

                    flight_details.append([formatted_date, flight_name, stops, price, duration, departure_time, arrival_time])
                except Exception as e:
                    # Deliberately best-effort: one malformed card must not
                    # abort a long scraping run, so report it and move on.
                    print(f"Error extracting data: {e}")

            current_date += timedelta(days=1)
    finally:
        driver.quit()

    return flight_details


In [None]:

# Walk the whole scraping period in consecutive 31-day windows, collecting the
# rows returned for each window into a single list.
final_date = datetime(2025, 1, 10)
window_span = timedelta(days=30)   # window covers start_date .. start_date + 30 days
one_day = timedelta(days=1)

start_date = datetime(2024, 10, 10)
end_date = start_date + window_span   # 2024-11-09

all_flight_details = []

while not start_date > final_date:
    all_flight_details += scrape_flights_for_month(start_date, end_date)

    # The next window begins the day after the current one ends.
    start_date = end_date + one_day
    end_date = start_date + window_span

    # Pause for 5 seconds between windows to avoid detection.
    time.sleep(5)



# Writing the extracted data to a CSV file

In [22]:
# Write every scraped row to a CSV file, preceded by a header row.
csv_file_name = 'flight_details.csv'

header = ['Date', 'Flight Name', 'Stops', 'Price', 'Duration', 'Departure-Time', 'Arrival-Time']

# newline='' leaves line-ending handling to the csv module. The encoding is
# explicit because scraped text may contain non-ASCII characters (for example
# currency symbols) that the platform's default encoding might not represent.
with open(csv_file_name, mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(header)
    writer.writerows(all_flight_details)

print(f"Flight details have been written to {csv_file_name}")

Flight details have been written to flight_details.csv
