# We are going to write a script that will web scrape flight information for multiple flights from the official Berlin Brandenburg Airport website:

---

### Import the necessary libraries

In [1]:
# --- Standard library ---
# this is the library that we will use to create break times in order to mimic human behaviour
import time
from getpass import getpass

# library for interacting with the operating system
import os

# library for directory location:
import pathlib
from os.path import join

import re

#Ignore warning -- Some methods are going to be deprecated and I didn't change all (mainly in the function scrapper)
import warnings

# --- Third-party ---
# you know pandas it's your best buddy
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

# Juicy stuff- these are the Classes we will use for interaction with a webpage:
from selenium import webdriver
from selenium.common.exceptions import (
    ElementNotInteractableException,
    NoSuchElementException,
)
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager

warnings.filterwarnings('ignore')

### Load a driver and the website

In [2]:
# Open a new Chrome browser window controlled by Selenium.
# webdriver_manager resolves (downloading if needed) a chromedriver binary and
# returns its path, so the notebook does not depend on an absolute path that
# only exists on one machine.
chrome_driver_path = ChromeDriverManager().install()

driver = webdriver.Chrome(service=Service(chrome_driver_path))

In [3]:
# Landing page of the airport website; the data collection starts from here
ber_home_url = 'https://ber.berlin-airport.de/en.html'
driver.get(ber_home_url)

### Click cookies

In [4]:
# The site prompts for cookie consent, so we click "OK".
# Only "button not found" / "button not clickable" are treated as
# "already accepted"; any other failure (or a manual interrupt) is no longer
# silently swallowed as it was with a bare `except:`.
try:
    cookie_button = driver.find_element(By.ID, "CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll")
    cookie_button.click()
    time.sleep(2.5)  # pause after the click before continuing
except (NoSuchElementException, ElementNotInteractableException):
    print("Cookies already accepted.")

### Click to select a date

In [5]:
# Locate the date input of the flight search and click it so a date can be selected
date_input_selector = "div[class^='flight-search__date--input-wrapper']"
date_button = driver.find_element(By.CSS_SELECTOR, date_input_selector)
date_button.click()

# Short human-like pause after the click
time.sleep(2.5)

---

## Once you have found your desired flights, let the scraping begin:

### Get the html page source

In [9]:
# Take the HTML currently loaded in the browser and parse it with BeautifulSoup.
# The parser is named explicitly: without it bs4 picks whichever parser happens
# to be installed (and warns about guessing), so results could differ between
# machines. 'html.parser' matches the parser used for the flight list below.
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

### Get all the flight links and save to a list

In [10]:
# Pull out the flight-list containers and re-parse just that fragment of the page
flights = soup.find_all('div', attrs={'class': 'cmp-flightlist__list__items'})
result_set_html = ''.join(map(str, flights))
new_flights = BeautifulSoup(result_set_html, 'html.parser')

# Base URL that each link's href is appended to
base_url = "https://ber.berlin-airport.de"

# All anchor tags inside the flight list
a_tags = new_flights.find_all('a')

# Build the full links in the list variable "href_list", skipping anchors without an href
href_list = [
    base_url + anchor.attrs['href']
    for anchor in a_tags
    if anchor.has_attr('href')
]

### Save the links to a file for safekeeping

In [11]:
# Input the date you chose (only used to label the output file)
date_of_scrape = input("What Day: ").strip()

# Input arrival or departure. The answer is lower-cased and stripped because the
# later cells open "arrival_links_<day>.txt" / "departure_links_<day>.txt";
# typing "Arrival" would otherwise create a file those cells cannot find.
arr_dep = input("Arrival or Departure: ").strip().lower()

# Make sure the target folder exists; open(..., 'w') does not create directories
links_dir = "Href Links"
os.makedirs(links_dir, exist_ok=True)

# Path of the new text file inside that folder
file_path = f"{links_dir}/{arr_dep}_links_{date_of_scrape}.txt"

# Save the links as a .txt file, one link per line
with open(file_path, 'w') as file:
    file.writelines(f"{href}\n" for href in href_list)

What Day: 02
Arrival or Departure: arrival


---

# Testing for Elements

### Open the links file into a list

#### Departures

In [12]:
# Ask which day's saved departure links to load
date_of_scrape = input("What Day: ")

# Read the link file back: one link per line, surrounding whitespace removed
departure_links_path = f"Href Links/departure_links_{date_of_scrape}.txt"
with open(departure_links_path, 'r') as file:
    href_list_dep = list(map(str.strip, file))

What Day: 02


#### Arrivals

In [13]:
# Ask which day's saved arrival links to load
date_of_scrape = input("What Day: ")

# Read the link file back: one link per line, surrounding whitespace removed
arrival_links_path = f"Href Links/arrival_links_{date_of_scrape}.txt"
with open(arrival_links_path, 'r') as file:
    href_list_arr = list(map(str.strip, file))

What Day: 02


### Test for elements

#### Departures

In [23]:
# Display the departure link at index 3 as an example flight
href_list_dep[3]

'https://ber.berlin-airport.de/en/flying/departures-arrivals/flugdetails.html?flightId=1320422'

In [25]:
# Load the example departure link (index 3) in the browser
driver.get(href_list_dep[3])

In [30]:
# Get the scheduled time of the departure flight (text of the dep_scheduled_time element)
new_element = driver.find_element(By.CSS_SELECTOR,"strong[data-flight-data^='dep_scheduled_time']").text

In [31]:
# Show the text that was read from the departure page
new_element

'02/08/2024 20:25'

#### Arrivals

In [14]:
# Display the arrival link at index 3 as an example flight
href_list_arr[3]

'https://ber.berlin-airport.de/en/flying/departures-arrivals/flugdetails.html?flightId=1320470'

In [15]:
# Open one of the arrival links
# NOTE(review): this loads index 2, while the cell before displays index 3 -- confirm which flight is intended
driver.get(href_list_arr[2])

In [21]:
# Get the expected time of the arrival flight (the selector targets arr_expected_time, not the scheduled time)
new_element = driver.find_element(By.CSS_SELECTOR,"strong[data-flight-data^='arr_expected_time']").text

In [22]:
# Show the text that was read from the arrival page
new_element

'02/08/2024 22:07'

## Use the cells below to scrape all the flights in the list of links and save the results to an Excel file

### Departures

In [35]:
# Number of departure links currently loaded
len(href_list_dep)

29

In [33]:
# Scrape the detail page of every saved departure link and write the result to
# "Data/departure_info_<day>.xlsx".

# Column name -> CSS selector of the page element whose text fills that column.
# The dict order is the column order of the resulting sheet.
departure_selectors = {
    "Departure": "h2[data-flight-data^='dep_airport_name']",
    "Destination": "h2[data-flight-data^='arr_airport_name']",
    "Date and Time": "strong[data-flight-data^='dep_scheduled_time']",
    "Actual Departure": "strong[data-flight-data^='dep_expected_time']",
    "Terminal": "i[data-flight-data^='terminal']",
    "Check In": "strong[data-flight-data^='checkin_counter']",
    "Gate": "strong[data-flight-data^='gate']",
    "Flight Number": "strong[data-flight-data^='flight_number_details']",
    "Aircraft Type": "strong[data-flight-data^='aircraft_type']",
    "Plane Reg": "strong[data-flight-data^='aircraft_reg']",
    "Flight Status": "u[data-flight-data^='flight_status_label']",
    "Airline": "span[data-flight-data^='airline_name']",
}

# Upper bound (seconds) for waiting on a page that still shows placeholders
max_wait_seconds = 10

flight_data_dep = []
date_of_scrape = input("What Day: ")
for_loading = 0

# Reload the links from disk; blank lines are skipped so driver.get() never
# receives an empty URL
with open(f"Href Links/departure_links_{date_of_scrape}.txt", 'r') as file:
    href_list_dep = [line.strip() for line in file if line.strip()]

# Fixed: the original `href_list_dep[]` is a SyntaxError; iterate the whole list
for link in href_list_dep:

    driver.get(link)
    time.sleep(1)

    # Accept the cookie dialog if it is shown; its absence is expected and ignored
    try:
        cookie_button = driver.find_element(By.ID, "CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll")
        cookie_button.click()
        time.sleep(1.5)
    except (NoSuchElementException, ElementNotInteractableException):
        pass

    driver.execute_script("document.body.style.zoom='100%'")

    driver.execute_script("document.body.style.zoom='67%'")
    time.sleep(1.3)

    # The saved data contains rows made only of "#####" placeholders, although the
    # same kind of page yields real values when read interactively -- presumably
    # the page was read before its data had loaded (TODO confirm). Poll for a
    # bounded time until the first field no longer holds a placeholder.
    deadline = time.monotonic() + max_wait_seconds
    while time.monotonic() < deadline:
        first_field = driver.find_element(By.CSS_SELECTOR, departure_selectors["Departure"]).text
        if "#" not in first_field:
            break
        time.sleep(0.5)

    # One row per flight, values in the same order as the selector table
    flight_idx_list_dep = [
        driver.find_element(By.CSS_SELECTOR, selector).text
        for selector in departure_selectors.values()
    ]

    flight_data_dep.append(flight_idx_list_dep)

    for_loading = for_loading + 1
    print(f"{for_loading} out of {len(href_list_dep)} scraped")

departure_dataframe = pd.DataFrame(flight_data_dep,
                                   columns=list(departure_selectors))

# to_excel does not create missing folders
os.makedirs("Data", exist_ok=True)

departure_dataframe.to_excel(f"Data/departure_info_{date_of_scrape}.xlsx",
                             sheet_name='Flights',
                             index=False)

What Day: 02
1 out of 29 scraped
2 out of 29 scraped
3 out of 29 scraped
4 out of 29 scraped
5 out of 29 scraped


### Arrivals

In [36]:
# Scrape the detail page of every saved arrival link and write the result to
# "Data/arrival_info_<day>.xlsx".

# Column name -> CSS selector of the page element whose text fills that column.
# The dict order is the column order of the resulting sheet.
arrival_selectors = {
    "Departure": "h2[data-flight-data^='dep_airport_name']",
    "Destination": "h2[data-flight-data^='arr_airport_name']",
    "Date and Time": "strong[data-flight-data^='arr_scheduled_time']",
    "Actual Arrival": "strong[data-flight-data^='arr_expected_time']",
    "Terminal": "i[data-flight-data^='terminal_arr']",
    "Baggage Claim": "strong[data-flight-data^='arr_belt']",
    "Gate": "strong[data-flight-data^='gate_arr']",
    "Flight Number": "strong[data-flight-data^='flight_number_details']",
    "Aircraft Type": "strong[data-flight-data^='aircraft_type']",
    "Plane Reg": "strong[data-flight-data^='aircraft_reg']",
    "Flight Status": "u[data-flight-data^='flight_status_label']",
    "Airline": "span[data-flight-data^='airline_name']",
}

# Upper bound (seconds) for waiting on a page that still shows placeholders
max_wait_seconds = 10

flight_data_arr = []
date_of_scrape = input("What Date: ")
for_loading = 0

# Reload the links from disk; blank lines are skipped so driver.get() never
# receives an empty URL
with open(f"Href Links/arrival_links_{date_of_scrape}.txt", 'r') as file:
    href_list_arr = [line.strip() for line in file if line.strip()]

# Fixed: the original `href_list_arr[]` is a SyntaxError; iterate the whole list
for link in href_list_arr:

    driver.get(link)
    time.sleep(1)

    # Accept the cookie dialog if it is shown; its absence is expected and ignored
    try:
        cookie_button = driver.find_element(By.ID, "CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll")
        cookie_button.click()
        time.sleep(1.5)
    except (NoSuchElementException, ElementNotInteractableException):
        pass

    driver.execute_script("document.body.style.zoom='100%'")

    driver.execute_script("document.body.style.zoom='67%'")
    time.sleep(1.3)

    # The saved data contains rows made only of "#####" placeholders, although the
    # same kind of page yields real values when read interactively -- presumably
    # the page was read before its data had loaded (TODO confirm). Poll for a
    # bounded time until the first field no longer holds a placeholder.
    deadline = time.monotonic() + max_wait_seconds
    while time.monotonic() < deadline:
        first_field = driver.find_element(By.CSS_SELECTOR, arrival_selectors["Departure"]).text
        if "#" not in first_field:
            break
        time.sleep(0.5)

    # One row per flight, values in the same order as the selector table
    flight_idx_list_arr = [
        driver.find_element(By.CSS_SELECTOR, selector).text
        for selector in arrival_selectors.values()
    ]

    flight_data_arr.append(flight_idx_list_arr)

    for_loading = for_loading + 1
    print(f"{for_loading} out of {len(href_list_arr)} scraped")

arrival_dataframe = pd.DataFrame(flight_data_arr,
                                 columns=list(arrival_selectors))

# to_excel does not create missing folders
os.makedirs("Data", exist_ok=True)

arrival_dataframe.to_excel(f"Data/arrival_info_{date_of_scrape}.xlsx",
                           sheet_name='Flights',
                           index=False)

What Date: 02
1 out of 56 scraped
2 out of 56 scraped
3 out of 56 scraped
4 out of 56 scraped
5 out of 56 scraped


### Close Driver

In [None]:
# End the whole WebDriver session. close() only closes the current window and
# can leave the browser/chromedriver process running; quit() shuts both down.
driver.quit()

---

## Check the data

In [37]:
# Read the saved departures sheet for day "02" back in and preview the first rows
departures_path = "Data/departure_info_02.xlsx"
departures = pd.read_excel(departures_path)
departures.head()

Unnamed: 0,Departure,Destination,Date and Time,Actual Departure,Terminal,Check In,Gate,Flight Number,Aircraft Type,Plane Reg,Flight Status,Airline
0,Berlin Brandenburg Airport,Copenhagen,02/08/2024 19:40,02/08/2024 21:00,T1,611-616,A31,EJU5267,A319,OELQI,Departed,easyJet Europe (EJU)
1,Berlin Brandenburg Airport,Saarbrücken,02/08/2024 19:50,02/08/2024 21:00,T1,115-116,B36,DX 126,ATR 42-500,OYRUO,Departed,Danish Air Transport (DX)
2,Berlin Brandenburg Airport,Belgrade,02/08/2024 19:55,02/08/2024 21:05,T1,714-716,C07,JU 357,A319,YUAPN,Departed,Air Serbia (JU)
3,#####,#####,##.##.#### ##:##,##.##.#### ##:##,##,##,##,##,##,##,- - -,### (##)
4,Berlin Brandenburg Airport,Palma de Mallorca,02/08/2024 20:30,02/08/2024 22:00,T1,611-616,A34,EJU7336,A320-200 Sharklets,OEIZG,Departed,easyJet Europe (EJU)


In [39]:
# Read the saved arrivals sheet for day "02" back in and preview the first rows
arrivals_path = "Data/arrival_info_02.xlsx"
arrivals = pd.read_excel(arrivals_path)
arrivals.head()

Unnamed: 0,Departure,Destination,Date and Time,Actual Arrival,Terminal,Baggage Claim,Gate,Flight Number,Aircraft Type,Plane Reg,Flight Status,Airline
0,Naples,Berlin Brandenburg Airport,02/08/2024 18:00,02/08/2024 21:19,T1,A3,A32,EJU5054,A320-200,OEIDD,Arrived,easyJet Europe (EJU)
1,Palma de Mallorca,Berlin Brandenburg Airport,02/08/2024 19:35,02/08/2024 21:32,T1,A1,A34,EJU7335,A320-200 Sharklets,OEIZG,Arrived,easyJet Europe (EJU)
2,Antalya,Berlin Brandenburg Airport,02/08/2024 20:25,02/08/2024 22:07,T1,B2,Y03,XQ 662,737-800 Winglets,TCSUU,Arrived,Sun Express (XQ)
3,Nice,Berlin Brandenburg Airport,02/08/2024 20:30,02/08/2024 21:40,T1,A3,A33,EJU5146,A320-200 Sharklets,OEIBF,Arrived,easyJet Europe (EJU)
4,#####,#####,##.##.#### ##:##,##.##.#### ##:##,##,##,##,##,##,##,- - -,### (##)
