# We are going to write a script that scrapes flight information for multiple flights from the official Berlin Brandenburg Airport website

---

### Import the necessary libraries

In [2]:
# this is the library that we will use to create break times in order to mimic human behaviour
import time
from getpass import getpass


# Juicy stuff- these are the Classes we will use for interaction with a webpage:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager


# library for interacting with the operating system
import os

# you know pandas it's your best buddy
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re

# library for directory location:

import pathlib
from os.path import join

# Ignore warnings -- some methods are going to be deprecated and not all of them have been updated yet (mainly in the scraper code)
import warnings
warnings.filterwarnings('ignore')

### Load a driver and the website

In [3]:
# Resolve the chromedriver binary through webdriver_manager, then start a new Chrome browser/driver with it
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service)

In [6]:
# Address of the airport website that the data is collected from
ber_landing_url = 'https://ber.berlin-airport.de/en.html'
driver.get(ber_landing_url)

### Click cookies

In [5]:
# The site prompts for cookie consent, so we click the "allow all" button.
# If the button cannot be found or clicked, the cell prints "Cookies already accepted."
try:
    cookie_button = driver.find_element(By.ID, "CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll")
    cookie_button.click()
    time.sleep(2.5)  # short pause after the click before continuing
except Exception:
    # `except Exception` keeps the best-effort behaviour but, unlike a bare `except:`,
    # does not swallow KeyboardInterrupt / SystemExit when the cell is interrupted.
    print("Cookies already accepted.")

---

## Once you have found your desired flights, let the scraping begin:

### Get the html page source

In [11]:
# Get the source code of the page currently loaded in the browser and parse it.
html = driver.page_source
# Name the parser explicitly (the same one used for the flight-list fragment below):
# without it BeautifulSoup picks whichever parser happens to be installed, so the
# resulting tree can differ between machines.
soup = BeautifulSoup(html, 'html.parser')

### Get all the flight links and save to a list

In [17]:
# Pull out the <div> container(s) of the flight list and re-parse only that fragment
flight_list_attrs = {'class': 'cmp-flightlist__list__items'}
flights = soup.find_all('div', attrs=flight_list_attrs)
result_set_html = ''.join(map(str, flights))
new_flights = BeautifulSoup(result_set_html, 'html.parser')

# The hrefs are appended to this base URL to form the full flight links
base_url = "https://ber.berlin-airport.de"

# All anchor tags inside the flight list
a_tags = new_flights.find_all('a')

# Collect the full links in the list variable "href_list" (anchors without an href are skipped)
href_list = []
for anchor in a_tags:
    if anchor.has_attr('href'):
        href_list.append(base_url + anchor['href'])

### Save the links to a file for safekeeping

In [19]:
# Day the links were collected for (only used in the file name)
date_of_scrape = input("What Day: ").strip()

# Flight direction. Normalised to lower case and validated because the loading cells
# further down open exactly "departure_links_<day>.txt" / "arrival_links_<day>.txt";
# any other spelling would create a file those cells can never find.
arr_dep = input("Arrival or Departure: ").strip().lower()
if arr_dep not in ("arrival", "departure"):
    raise ValueError(f"Expected 'arrival' or 'departure', got {arr_dep!r}")

# Make sure the target folder exists; open(..., 'w') raises FileNotFoundError otherwise
links_dir = "Href Links"
os.makedirs(links_dir, exist_ok=True)

# Path of the new text file inside that folder
file_path = f"{links_dir}/{arr_dep}_links_{date_of_scrape}.txt"

# Save the links as a .txt file, one link per line
with open(file_path, 'w', encoding='utf-8') as file:
    file.writelines(f"{href}\n" for href in href_list)

What Day: 28
Arrival or Departure: departure


---

# Testing for Elements

### Open the links file into a list

#### Departures

In [21]:
# Ask which day's departure link file should be loaded
date_of_scrape = input("What Day: ")

# Read the text file into a list: one link per line, surrounding whitespace removed
departure_links_path = f"Href Links/departure_links_{date_of_scrape}.txt"
with open(departure_links_path, 'r') as file:
    href_list_dep = list(map(str.strip, file))

What Day: 28


#### Arrivals

In [22]:
# Ask which day's arrival link file should be loaded
date_of_scrape = input("What Day: ")

# Read the text file into a list: one link per line, surrounding whitespace removed
arrival_links_path = f"Href Links/arrival_links_{date_of_scrape}.txt"
with open(arrival_links_path, 'r') as file:
    href_list_arr = list(map(str.strip, file))

What Day: 28


### Test for elements

In [26]:
# Display the arrival link at index 2 of the loaded list; it serves as the example flight
# for trying out the element selectors in the next cells
href_list_arr[2]

'https://ber.berlin-airport.de/en/flying/departures-arrivals/flugdetails.html?flightId=1312634'

In [27]:
# Load the example flight's page (arrival link at index 2) in the browser
example_link = href_list_arr[2]
driver.get(example_link)

In [28]:
# Read the text of the <strong> element whose data-flight-data attribute starts with
# 'arr_expected_time' (the expected arrival time, not the scheduled one)
expected_arrival_tag = driver.find_element(By.CSS_SELECTOR, "strong[data-flight-data^='arr_expected_time']")
new_element = expected_arrival_tag.text

In [29]:
# Display the text that was read from the example flight page
new_element

'28/07/2024 21:11'

## Use the code below to scrape all the flights in the list of links and save them to an Excel file

### Departures

In [12]:
# Scrape every departure page listed in "Href Links/departure_links_<day>.txt" and
# export the collected rows to "Data/departure_info_<day>.xlsx".

# Output column -> CSS selector of the element holding that value on a flight page.
# The order of this dict is the column order of the exported sheet.
departure_selectors = {
    "Departure": "h2[data-flight-data^='dep_airport_name']",
    "Destination": "h2[data-flight-data^='arr_airport_name']",
    "Date and Time": "strong[data-flight-data^='dep_scheduled_time']",
    "Actual Departure": "strong[data-flight-data^='dep_expected_time']",
    "Terminal": "i[data-flight-data^='terminal']",
    "Check In": "strong[data-flight-data^='checkin_counter']",
    "Gate": "strong[data-flight-data^='gate']",
    "Flight Number": "strong[data-flight-data^='flight_number_details']",
    "Aircraft Type": "strong[data-flight-data^='aircraft_type']",
    "Plane Reg": "strong[data-flight-data^='aircraft_reg']",
    "Flight Status": "u[data-flight-data^='flight_status_label']",
    "Airline": "span[data-flight-data^='airline_name']",
}

flight_data_dep = []
date_of_scrape = input("What Day: ").strip()
for_loading = 0

# Create the output folder up front so that a missing folder cannot make the export
# fail after the whole (slow) scrape has already run.
os.makedirs("Data", exist_ok=True)

with open(f"Href Links/departure_links_{date_of_scrape}.txt", 'r') as file:
    # Blank lines are skipped so driver.get() is never called with an empty URL
    href_list_dep = [line.strip() for line in file if line.strip()]

for for_loading, link in enumerate(href_list_dep, start=1):

    driver.get(link)
    time.sleep(1)

    # Best effort: click the cookie button if the dialog shows up again.
    # `except Exception` (not a bare `except`) so Ctrl-C can still stop the loop.
    try:
        cookie_button = driver.find_element(By.ID, "CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll")
        cookie_button.click()
        time.sleep(1.5)
    except Exception:
        pass

    # Zoom is reset and then reduced exactly as before.
    # NOTE(review): presumably this makes all fields visible so `.text` is populated -- confirm.
    driver.execute_script("document.body.style.zoom='100%'")
    driver.execute_script("document.body.style.zoom='67%'")
    time.sleep(1.3)

    flight_idx_list_dep = []
    for column, selector in departure_selectors.items():
        # find_elements returns an empty list when nothing matches, so one missing
        # field yields an empty cell instead of an exception that would discard
        # every row scraped so far.
        matches = driver.find_elements(By.CSS_SELECTOR, selector)
        if not matches:
            print(f"Missing '{column}' on {link}")
        flight_idx_list_dep.append(matches[0].text if matches else "")

    flight_data_dep.append(flight_idx_list_dep)

    print(f"{for_loading} out of {len(href_list_dep)} scraped")

departure_dataframe = pd.DataFrame(flight_data_dep,
                                   columns=list(departure_selectors))

departure_dataframe.to_excel(f"Data/departure_info_{date_of_scrape}.xlsx",
                             sheet_name='Flights',
                             index=False)

What Day: 28
1 out of 24 scraped
2 out of 24 scraped
3 out of 24 scraped
4 out of 24 scraped
5 out of 24 scraped
6 out of 24 scraped
7 out of 24 scraped
8 out of 24 scraped
9 out of 24 scraped
10 out of 24 scraped
11 out of 24 scraped
12 out of 24 scraped
13 out of 24 scraped
14 out of 24 scraped
15 out of 24 scraped
16 out of 24 scraped
17 out of 24 scraped
18 out of 24 scraped
19 out of 24 scraped
20 out of 24 scraped
21 out of 24 scraped
22 out of 24 scraped
23 out of 24 scraped
24 out of 24 scraped


### Arrivals

In [17]:
# Scrape every arrival page listed in "Href Links/arrival_links_<day>.txt" and
# export the collected rows to "Data/arrival_info_<day>.xlsx".

# Output column -> CSS selector of the element holding that value on a flight page.
# The order of this dict is the column order of the exported sheet.
arrival_selectors = {
    "Departure": "h2[data-flight-data^='dep_airport_name']",
    "Destination": "h2[data-flight-data^='arr_airport_name']",
    "Date and Time": "strong[data-flight-data^='arr_scheduled_time']",
    "Actual Arrival": "strong[data-flight-data^='arr_expected_time']",
    "Terminal": "i[data-flight-data^='terminal_arr']",
    "Baggage Claim": "strong[data-flight-data^='arr_belt']",
    "Gate": "strong[data-flight-data^='gate_arr']",
    "Flight Number": "strong[data-flight-data^='flight_number_details']",
    "Aircraft Type": "strong[data-flight-data^='aircraft_type']",
    "Plane Reg": "strong[data-flight-data^='aircraft_reg']",
    "Flight Status": "u[data-flight-data^='flight_status_label']",
    "Airline": "span[data-flight-data^='airline_name']",
}

flight_data_arr = []
date_of_scrape = input("What Date: ").strip()
for_loading = 0

# Create the output folder up front so that a missing folder cannot make the export
# fail after the whole (slow) scrape has already run.
os.makedirs("Data", exist_ok=True)

with open(f"Href Links/arrival_links_{date_of_scrape}.txt", 'r') as file:
    # Blank lines are skipped so driver.get() is never called with an empty URL
    href_list_arr = [line.strip() for line in file if line.strip()]

for for_loading, link in enumerate(href_list_arr, start=1):

    driver.get(link)
    time.sleep(1)

    # Best effort: click the cookie button if the dialog shows up again.
    # `except Exception` (not a bare `except`) so Ctrl-C can still stop the loop.
    try:
        cookie_button = driver.find_element(By.ID, "CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll")
        cookie_button.click()
        time.sleep(1.5)
    except Exception:
        pass

    # Zoom is reset and then reduced exactly as before.
    # NOTE(review): presumably this makes all fields visible so `.text` is populated -- confirm.
    driver.execute_script("document.body.style.zoom='100%'")
    driver.execute_script("document.body.style.zoom='67%'")
    time.sleep(1.3)

    flight_idx_list_arr = []
    for column, selector in arrival_selectors.items():
        # find_elements returns an empty list when nothing matches, so one missing
        # field yields an empty cell instead of an exception that would discard
        # every row scraped so far.
        matches = driver.find_elements(By.CSS_SELECTOR, selector)
        if not matches:
            print(f"Missing '{column}' on {link}")
        flight_idx_list_arr.append(matches[0].text if matches else "")

    flight_data_arr.append(flight_idx_list_arr)

    print(f"{for_loading} out of {len(href_list_arr)} scraped")

arrival_dataframe = pd.DataFrame(flight_data_arr,
                                 columns=list(arrival_selectors))

arrival_dataframe.to_excel(f"Data/arrival_info_{date_of_scrape}.xlsx",
                           sheet_name='Flights',
                           index=False)

What Date: 28
1 out of 48 scraped
2 out of 48 scraped
3 out of 48 scraped
4 out of 48 scraped
5 out of 48 scraped
6 out of 48 scraped
7 out of 48 scraped
8 out of 48 scraped
9 out of 48 scraped
10 out of 48 scraped
11 out of 48 scraped
12 out of 48 scraped
13 out of 48 scraped
14 out of 48 scraped
15 out of 48 scraped
16 out of 48 scraped
17 out of 48 scraped
18 out of 48 scraped
19 out of 48 scraped
20 out of 48 scraped
21 out of 48 scraped
22 out of 48 scraped
23 out of 48 scraped
24 out of 48 scraped
25 out of 48 scraped
26 out of 48 scraped
27 out of 48 scraped
28 out of 48 scraped
29 out of 48 scraped
30 out of 48 scraped
31 out of 48 scraped
32 out of 48 scraped
33 out of 48 scraped
34 out of 48 scraped
35 out of 48 scraped
36 out of 48 scraped
37 out of 48 scraped
38 out of 48 scraped
39 out of 48 scraped
40 out of 48 scraped
41 out of 48 scraped
42 out of 48 scraped
43 out of 48 scraped
44 out of 48 scraped
45 out of 48 scraped
46 out of 48 scraped
47 out of 48 scraped
48 out o

### Close Driver

In [7]:
# quit() ends the whole WebDriver session: it closes every window and stops the
# chromedriver process. close() would only close the current window and can leave
# the driver process running in the background.
driver.quit()

---

## Check the data

In [15]:
# Load the exported departures workbook and preview its first ten rows
departures_path = "Data/departure_info_28.xlsx"
departures = pd.read_excel(departures_path)
departures.head(10)

Unnamed: 0,Departure,Destination,Date and Time,Actual Departure,Terminal,Check In,Gate,Flight Number,Aircraft Type,Plane Reg,Flight Status,Airline
0,Berlin Brandenburg Airport,Antalya,28/07/2024 18:50,28/07/2024 21:00,T1,521-524,C10,PC 5008,A320 Neo,TCNCZ,Departed,Pegasus Airlines (PC)
1,Berlin Brandenburg Airport,Paris CDG,28/07/2024 19:05,28/07/2024 21:00,T1,811-816,A07,AF 1535,A321-100,FGMZA,Departed,Air France (AF)
2,Berlin Brandenburg Airport,Krakow,28/07/2024 20:05,28/07/2024 21:00,T2,158-165,B38,FR 2535,737 MAX 8,9HVUH,Departed,Ryanair (FR)
3,Berlin Brandenburg Airport,London LHR,28/07/2024 21:00,,T1,221-225,C01,BA 995,A319,GDBCA,Departed,British Airways (BA)
4,Berlin Brandenburg Airport,Vienna,28/07/2024 21:00,,T1,421-426,B06,OS 238,A320-200,OELBX,Departed,Austrian Airlines (OS)
5,Berlin Brandenburg Airport,Kaunas,28/07/2024 21:05,28/07/2024 21:45,T2,158-165,B32,FR 709,737-800 Winglets,SPRNK,Departed,Ryanair (FR)
6,Berlin Brandenburg Airport,Copenhagen,28/07/2024 21:05,,T1,511-516,B24,SK 1680,ATR 72-600,ESATI,Departed,SAS-Scandinavian Airlines System (SK)
7,Berlin Brandenburg Airport,Milan LIN,28/07/2024 21:10,,T1,611-616,A33,EJU5073,A320-200 Sharklets,OEIBF,Departed,easyJet Europe (EJU)
8,Berlin Brandenburg Airport,Luxembourg,28/07/2024 21:10,,T1,121-122,B25,LG 9474,DHC-8-400 / Q400,LXLGE,Departed,Luxair (LG)
9,Berlin Brandenburg Airport,Antalya,28/07/2024 21:15,,T1,724-726,C19,XQ 663,737 MAX 8,TCSMI,Departed,Sun Express (XQ)


In [18]:
# Load the exported arrivals workbook and preview its first ten rows
arrivals_path = "Data/arrival_info_28.xlsx"
arrivals = pd.read_excel(arrivals_path)
arrivals.head(10)

Unnamed: 0,Departure,Destination,Date and Time,Actual Arrival,Terminal,Baggage Claim,Gate,Flight Number,Aircraft Type,Plane Reg,Flight Status,Airline
0,Barcelona,Berlin Brandenburg Airport,28/07/2024 17:25,28/07/2024 21:02,T1,A4,A35,EJU5124,A320-200,OEIDO,Arrived,easyJet Europe (EJU)
1,Kos,Berlin Brandenburg Airport,28/07/2024 19:10,28/07/2024 23:01,T1,A3,A37,EW 8671,A320-200,9HAMK,Arrived,Eurowings (EW)
2,Kaunas,Berlin Brandenburg Airport,28/07/2024 20:40,28/07/2024 21:11,T2,C1,Z32,FR 710,737-800 Winglets,SPRNK,Arrived,Ryanair (FR)
3,Reggio Calabria,Berlin Brandenburg Airport,28/07/2024 20:55,28/07/2024 21:46,T2,C3,Z34,FR 8603,737-800 Winglets,9HQEJ,Arrived,Ryanair (FR)
4,Malta,Berlin Brandenburg Airport,28/07/2024 20:55,28/07/2024 21:17,T1,B3,B02,KM 376,A320-200,9HAEP,Arrived,Air Malta (KM)
5,Frankfurt,Berlin Brandenburg Airport,28/07/2024 20:55,28/07/2024 21:19,T1,B1,S01,LH 200,A321-100,DAIRO,Arrived,Lufthansa (LH)
6,Edinburgh,Berlin Brandenburg Airport,28/07/2024 21:05,28/07/2024 21:27,T1,A2,X10,EZY3261,A320-200 Sharklets,GEZPB,Arrived,easyJet UK (U2)
7,Bologna,Berlin Brandenburg Airport,28/07/2024 21:05,28/07/2024 22:16,T2,C2,Z33,FR 137,737 MAX 8,9HVUL,Arrived,Ryanair (FR)
8,Palma de Mallorca,Berlin Brandenburg Airport,28/07/2024 21:05,28/07/2024 21:32,T2,C1,Z29,FR 263,737-800 Winglets,9HQCH,Arrived,Ryanair (FR)
9,Istanbul IST,Berlin Brandenburg Airport,28/07/2024 21:05,28/07/2024 21:13,T1,A3,X03,TK 1727,A321-200 Sharklets,TCJSN,Arrived,Turkish Airlines (TK)
