# This notebook scrapes real-time flight information (departures and arrivals) from the official Lisbon Airport website, cleans it, and loads it into MySQL

---

### Import the necessary libraries

In [18]:
# this is the library that we will use to create break times in order to mimic human behaviour
import time
from getpass import getpass


# Juicy stuff- these are the Classes we will use for interaction with a webpage:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import ElementNotInteractableException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


# library for interacting with the operating system
import os

# you know pandas it's your best buddy
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re

# library for directory location:

import pathlib
from os.path import join

#Ignore warning -- Some methods are going to be deprecated and I didn't change all (mainly in the function scrapper)
import warnings
warnings.filterwarnings('ignore')

from playsound import playsound
from datetime import datetime
import mysql.connector

### Load a driver and the website

In [2]:
# Resolve the ChromeDriver executable through webdriver_manager, then start Chrome with it.
chromedriver_path = ChromeDriverManager().install()
chrome_service = Service(chromedriver_path)
driver = webdriver.Chrome(service=chrome_service)

In [3]:
# Open the real-time departures page in the browser session created above.
# NOTE(review): the arrivals cell further down reuses this same session without
# navigating, so the arrivals page presumably has to be opened by hand first - confirm.
departures_url = "https://www.lisbonairport.pt/en/lis/flights-destinations/find-flights/real-time-departures"
driver.get(departures_url)

## Run to create a database

### Departures

In [19]:
# Day of the month being scraped. It is zero-padded so the export name matches the
# 'departures_september_{day:02}.xlsx' pattern used when the daily files are loaded
# back; an unpadded name such as '..._7.xlsx' would be silently skipped there.
date_of_scrape = input("date: ").strip().zfill(2)

data_list = []
sound_path = '/Users/martynas/Desktop/Ironhack/Project - Final/ding.mp3'

# Output column -> CSS class of the <td> that holds its value inside a flight row.
departure_fields = {
    'Date': 'td-day',
    'Scheduled Time': 'td-time',
    'Terminal': 'td-terminal',
    'Flight Numbers': 'td-flight',
    'Destination': 'td-destination',
    'Airline': 'td-airline',
    'Status and Actual Time': 'td-arrive',
}

# Walk the result pages until the 'next' button can no longer be clicked.
while True:
    try:
        # Scroll to the end of the page (if needed for loading more content)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Give the page a moment to render before reading its HTML
        time.sleep(1)  # Adjust if needed

        # Parse the current page with BeautifulSoup
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        # Only rows whose class attribute marks the flight as departed are collected
        rows = soup.find_all('tr', attrs={'class': 'list_alt animated fadeIn color-departed'})

        for row in rows:
            record = {}
            for column, css_class in departure_fields.items():
                cell = row.find('td', attrs={'class': css_class})
                # A row without this cell yields None instead of raising
                # AttributeError and aborting the whole scrape.
                record[column] = cell.text.strip() if cell is not None else None
            data_list.append(record)

        # Wait until the 'next' button is clickable
        try:
            next_page = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.ID, 'next_pag'))
            )

            # Scroll to the 'next' button to make sure it's visible
            driver.execute_script("arguments[0].scrollIntoView(true);", next_page)

            # Click through JavaScript to avoid interactability issues
            driver.execute_script("arguments[0].click();", next_page)

            # Add a delay to avoid issues with page loading
            time.sleep(1)

        except TimeoutException:
            # No clickable 'next' button within 10 seconds: treated as the last page
            print("Next page button not found or not clickable.")
            break

    except (NoSuchElementException, ElementNotInteractableException):
        print("No more pages or button not interactable.")
        break

# Convert the list of dictionaries into a pandas DataFrame
df = pd.DataFrame(data_list)

print(len(df))

# Save the DataFrame to an Excel file
df.to_excel(f"/Users/martynas/Desktop/Ironhack/Project - Final/Data Collection/Lisbon Data/departures_september_{date_of_scrape}.xlsx", index=False)

# Display the first 10 rows of the DataFrame
display(df.head(10))

# Audible signal that the scrape has finished
playsound(sound_path)

date: 11
Next page button not found or not clickable.
331


Unnamed: 0,Date,Scheduled Time,Terminal,Flight Numbers,Destination,Airline,Status and Actual Time
0,11/09/2024,00:00,T1TT1,TP 1531,Sao Tome,TAP AIR PORTUGAL,Departed 00:14
1,11/09/2024,00:05,T1TT1,TP 087 +33AD 7123LY 9091EK 4814,"São Paulo, Guarulhos",TAP AIR PORTUGAL,Departed 00:21
2,11/09/2024,05:00,T1TT1,KL 1578 +55AM 6404DL 9596G3 5563KQ 1694RO 9313,Amsterdam,KLM,Departed 05:21
3,11/09/2024,05:05,T1TT1,LH 1497 +44AC 9357OS 7220TP 6705UA 9159,Frankfurt,LUFTHANSA,Departed 05:37
4,11/09/2024,05:40,T1TT1,AF 1125 +88DL 8439EY 5926G3 5122KE 6476KQ 3791...,"Paris, Ch. de Gaulle",AIR FRANCE,Departed 05:49
5,11/09/2024,05:55,T1TT1,TP 2357,Madeira,TAP AIR PORTUGAL,Departed 06:08
6,11/09/2024,05:55,T1TT1,LX 2083 +11TP 8205,Zurich,SWISS INTERNATIONAL,Departed 06:10
7,11/09/2024,05:55,T2TT2,FR 1143,"Berlin, Brandenburg",RYANAIR,Departed 06:19
8,11/09/2024,06:00,T1TT1,EJU6731,Nice,EASYJET EUROPE,Departed 07:11
9,11/09/2024,06:00,T1TT1,S4 121 +11TP 6617,Ponta Delgada,AZORES AIRLINES,Departed 06:12


### Arrivals

In [20]:
# Day of the month being scraped. It is zero-padded so the export name matches the
# 'arrivals_september_{day:02}.xlsx' pattern used when the daily files are loaded
# back; an unpadded name such as '..._7.xlsx' would be silently skipped there.
date_of_scrape = input("date: ").strip().zfill(2)

data_list = []
sound_path = '/Users/martynas/Desktop/Ironhack/Project - Final/ding.mp3'

# Output column -> CSS class of the <td> that holds its value inside a flight row.
# The page reuses the 'td-destination' cell, which is stored here as 'Origin'.
arrival_fields = {
    'Date': 'td-day',
    'Scheduled Time': 'td-time',
    'Terminal': 'td-terminal',
    'Flight Numbers': 'td-flight',
    'Origin': 'td-destination',
    'Airline': 'td-airline',
    'Status and Actual Time': 'td-arrive',
}

# Walk the result pages until the 'next' button can no longer be clicked.
# NOTE(review): this cell does not navigate; it reads whatever page the shared
# `driver` session currently shows - presumably the arrivals page opened by hand.
while True:
    try:
        # Scroll to the end of the page (if needed for loading more content)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Give the page a moment to render before reading its HTML
        time.sleep(1)  # Adjust if needed

        # Parse the current page with BeautifulSoup
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        # Only rows whose class attribute marks the flight as arrived are collected
        rows = soup.find_all('tr', attrs={'class': 'list_alt animated fadeIn color-arrived'})

        for row in rows:
            record = {}
            for column, css_class in arrival_fields.items():
                cell = row.find('td', attrs={'class': css_class})
                # A row without this cell yields None instead of raising
                # AttributeError and aborting the whole scrape.
                record[column] = cell.text.strip() if cell is not None else None
            data_list.append(record)

        # Wait until the 'next' button is clickable
        try:
            next_page = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.ID, 'next_pag'))
            )

            # Scroll to the 'next' button to make sure it's visible
            driver.execute_script("arguments[0].scrollIntoView(true);", next_page)

            # Click through JavaScript to avoid interactability issues
            driver.execute_script("arguments[0].click();", next_page)

            # Add a delay to avoid issues with page loading
            time.sleep(1)

        except TimeoutException:
            # No clickable 'next' button within 10 seconds: treated as the last page
            print("Next page button not found or not clickable.")
            break

    except (NoSuchElementException, ElementNotInteractableException):
        print("No more pages or button not interactable.")
        break

# Convert the list of dictionaries into a pandas DataFrame
df = pd.DataFrame(data_list)

print(len(df))

# Save the DataFrame to an Excel file
df.to_excel(f"/Users/martynas/Desktop/Ironhack/Project - Final/Data Collection/Lisbon Data/arrivals_september_{date_of_scrape}.xlsx", index=False)

# Display the first 10 rows of the DataFrame
display(df.head(10))

# Audible signal that the scrape has finished
playsound(sound_path)

date: 11
Next page button not found or not clickable.
317


Unnamed: 0,Date,Scheduled Time,Terminal,Flight Numbers,Origin,Airline,Status and Actual Time
0,11/09/2024,00:00,T1TT1,BA 504 +55AA 6758CX 7203AY 5984JL 7733QR 5912,"London, Heathrow",BRITISH AIRWAYS,Arrived 23:51 10/09
1,11/09/2024,00:00,T1TT1,FR 1884,"London, Stansted",RYANAIR,Arrived 00:02
2,11/09/2024,01:05,T1TT1,S4 128 +11TP 6626,Ponta Delgada,AZORES AIRLINES,Arrived 01:13
3,11/09/2024,04:45,T1TT1,DT 650,Luanda,TAAG,Arrived 04:26
4,11/09/2024,05:20,T1TT1,TP 074 +44AD 7114JD 5438EY 2720EK 4813,Rio de Janeiro,TAP AIR PORTUGAL,Arrived 05:14
5,11/09/2024,05:20,T1TT1,TP 082 +33AD 7120JD 5456EY 2723,"São Paulo, Guarulhos",TAP AIR PORTUGAL,Arrived 05:28
6,11/09/2024,05:30,T1TT1,TP 202 +11AC 2704,Newark,TAP AIR PORTUGAL,Arrived 06:20
7,11/09/2024,05:30,T1TT1,TP 232,"Washington, Dulles",TAP AIR PORTUGAL,Arrived 05:19
8,11/09/2024,05:55,T1TT1,TP 224 +11EY 2730,Miami,TAP AIR PORTUGAL,Arrived 05:56
9,11/09/2024,06:05,T1TT1,TP 218,Boston,TAP AIR PORTUGAL,Arrived 06:07


### Close Driver

In [21]:
# quit() ends the whole WebDriver session (all windows plus the chromedriver
# process); close() would only close the current window and leave the driver running.
driver.quit()

# Organising Unstructured Data

### Open and Concat departures

In [52]:
folder_path = '/Users/martynas/Desktop/Ironhack/Project - Final/Data Collection/Lisbon Data'

# One dataframe per daily export that can be read from disk
dfs = []

# Try every possible day number; days that were never scraped simply have no file.
for day in range(1, 32):
    file_name = f'departures_september_{day:02}.xlsx'  # Formatted to have leading zero
    file_path = os.path.join(folder_path, file_name)  # Get full file path
    try:
        df = pd.read_excel(file_path)
    except FileNotFoundError:
        # Expected for days without an export - skip quietly
        continue
    except Exception as err:
        # Still best-effort, but an unreadable file is reported instead of hidden
        print(f"Skipping {file_name}: {err}")
        continue
    dfs.append(df)

if not dfs:
    # pd.concat would raise a less helpful ValueError on an empty list
    raise ValueError(f"No departures_september_*.xlsx file could be loaded from {folder_path}")

# Concatenate all dataframes into one
lisbon_dep = pd.concat(dfs, ignore_index=True)

# Display the final dataframe (or use it for further analysis)
display(lisbon_dep.head())
display(lisbon_dep.shape)

Unnamed: 0,Date,Scheduled Time,Terminal,Flight Numbers,Destination,Airline,Status and Actual Time
0,07/09/2024,00:00,T1TT1,TP 015 +22AD 7132LY 9096,Recife,TAP AIR PORTUGAL,Departed 00:22
1,07/09/2024,00:05,T1TT1,8F 507,Sao Tome,STP AIRWAYS,Departed 00:34
2,07/09/2024,01:15,T1TT1,W6 1594,Warsaw,WIZZ AIR HUNGARY,Departed 03:21
3,07/09/2024,05:00,T1TT1,KL 1578 +55AM 6404DL 9596G3 5563KQ 1694RO 9313,Amsterdam,KLM,Departed 05:27
4,07/09/2024,05:05,T1TT1,LH 1497 +44AC 9357OS 7220TP 6705UA 9159,Frankfurt,LUFTHANSA,Departed 05:29


(1275, 7)

In [22]:
# Load one day's departures export; the typed value must match the file-name suffix exactly.
date_new = input("What date: ")
departures_file = f'/Users/martynas/Desktop/Ironhack/Project - Final/Data Collection/Lisbon Data/departures_september_{date_new}.xlsx'
lisbon_dep = pd.read_excel(departures_file)

What date: 11


In [23]:
# Derive status, actual time/date and the delay in minutes from the scraped
# 'Status and Actual Time' text (e.g. "Departed 00:14", optionally followed by "dd/mm").
status_text = lisbon_dep['Status and Actual Time']

# Status word immediately before the HH:MM time, and the time itself.
# Rows whose text has no HH:MM time get NaN in both columns.
lisbon_dep[['Flight Status', 'Time Departed']] = status_text.str.extract(r'(\w+)\s+(\d{2}:\d{2})')

# Optional explicit "dd/mm" in the status text
explicit_day = status_text.str.extract(r'(\d{2}/\d{2})')[0]
has_explicit_day = explicit_day.notna()

# Scheduled calendar date; its year completes the explicit "dd/mm" instead of a
# hard-coded 2024.
# NOTE(review): a movement across New Year would need the year adjusted.
scheduled_date = pd.to_datetime(lisbon_dep['Date'], format='%d/%m/%Y', errors='coerce')
explicit_date = pd.to_datetime(
    explicit_day.str.cat(scheduled_date.dt.strftime('%Y'), sep='/'),
    format='%d/%m/%Y',
    errors='coerce',
)
# Without an explicit date the actual date falls back to the scheduled one
actual_date = explicit_date.where(has_explicit_day, scheduled_date)

# The scheduled timestamp uses the SCHEDULED date and the actual timestamp the ACTUAL
# date, so an explicit date gives the exact difference without any day wrap-around.
scheduled_datetime = scheduled_date + pd.to_timedelta(lisbon_dep['Scheduled Time'] + ':00', errors='coerce')
actual_datetime = actual_date + pd.to_timedelta(lisbon_dep['Time Departed'] + ':00', errors='coerce')

# NOTE(review): rows without a parsed time end up with 0 here and are therefore
# classified as 'On Time' by classify_departure_time_difference.
delay_minutes = (actual_datetime - scheduled_datetime).dt.total_seconds() / 60
delay_minutes = delay_minutes.fillna(0).round().astype(int)

# Midnight wrap-around heuristic, applied only when the status carried no explicit
# date: a gap beyond +/-1000 minutes is read as the neighbouring day.
wrapped_forward = ~has_explicit_day & (delay_minutes > 1000)
wrapped_back = ~has_explicit_day & (delay_minutes < -1000)
delay_minutes.loc[wrapped_forward] -= 1440
delay_minutes.loc[wrapped_back] += 1440

lisbon_dep['Actual Departure Date'] = actual_date.astype(str)
lisbon_dep['Departure Time Difference'] = delay_minutes
lisbon_dep = lisbon_dep.drop(columns=['Status and Actual Time', 'Date'])

In [24]:
def classify_departure_time_difference(diff):
    """Label a departure delay given in minutes.

    Returns 'Delayed' for a positive value, 'On Time' for exactly zero and
    'Early' for anything else.
    """
    if diff > 0:
        return 'Delayed'
    if diff == 0:
        return 'On Time'
    return 'Early'

# Classify every delay, then override the label where the flight status is 'CANCELLED'
departure_labels = lisbon_dep['Departure Time Difference'].map(classify_departure_time_difference)
is_cancelled = lisbon_dep['Flight Status'].eq('CANCELLED')
lisbon_dep['Departure Status'] = departure_labels.mask(is_cancelled, 'Cancelled')

In [25]:
# Left-join the city lookup on the destination name, then discard the duplicate key column.
# Presumably this contributes the 'Country' column used by the SQL export - confirm against the sheet.
city_country_df = pd.read_excel('/Users/martynas/Desktop/Ironhack/Project - Final/city_country_mapping.xlsx')
lisbon_dep = pd.merge(
    lisbon_dep,
    city_country_df,
    how='left',
    left_on='Destination',
    right_on='Destination City',
).drop(columns='Destination City')

# Shorten the scraped terminal text to its code
lisbon_dep['Terminal'] = lisbon_dep['Terminal'].replace({'T1TT1': 'T1', 'T2TT2': 'T2'})

# Keep only the leading flight number; any code-share numbers after it are dropped
lisbon_dep['Flight Numbers'] = lisbon_dep['Flight Numbers'].str.extract(r'(^\w+\s*\d+)', expand=False)

# Left-join the airline lookup on the airline name
# (presumably the source of the 'Type' column used by the SQL export - confirm)
airline_filter = pd.read_csv('/Users/martynas/Desktop/Ironhack/Project - Final/airlines_list.csv')
lisbon_dep = lisbon_dep.merge(airline_filter, how='left', on='Airline')

# Every departure in this dataset leaves from Lisbon
lisbon_dep['Origin'] = "Lisbon"

In [26]:
# Show the shape before and after removing fully identical rows
display(lisbon_dep.shape)
lisbon_dep = lisbon_dep.drop_duplicates()
lisbon_dep.shape

(331, 13)

(331, 13)

### Open and Concat arrivals

In [57]:
folder_path = '/Users/martynas/Desktop/Ironhack/Project - Final/Data Collection/Lisbon Data'

# One dataframe per daily export that can be read from disk
dfs = []

# Try every possible day number; days that were never scraped simply have no file.
for day in range(1, 32):
    file_name = f'arrivals_september_{day:02}.xlsx'  # Formatted to have leading zero
    file_path = os.path.join(folder_path, file_name)  # Get full file path
    try:
        df = pd.read_excel(file_path)
    except FileNotFoundError:
        # Expected for days without an export - skip quietly
        continue
    except Exception as err:
        # Still best-effort, but an unreadable file is reported instead of hidden
        print(f"Skipping {file_name}: {err}")
        continue
    dfs.append(df)

if not dfs:
    # pd.concat would raise a less helpful ValueError on an empty list
    raise ValueError(f"No arrivals_september_*.xlsx file could be loaded from {folder_path}")

# Concatenate all dataframes into one
lisbon_arr = pd.concat(dfs, ignore_index=True)

# Display the final dataframe (or use it for further analysis)
display(lisbon_arr.head())
display(lisbon_arr.shape)

Unnamed: 0,Date,Scheduled Time,Terminal,Flight Numbers,Origin,Airline,Status and Actual Time
0,07/09/2024,00:10,T1TT1,TP 1125 +22AD 7171S4 8368,Alicante,TAP AIR PORTUGAL,Arrived 00:01
1,07/09/2024,00:30,T1TT1,TP 1273 +88AI 7903OS 8153BT 5317LY 9007ET 1734...,Vienna,TAP AIR PORTUGAL,Arrived 00:25
2,07/09/2024,00:35,T1TT1,W6 1593,Warsaw,WIZZ AIR HUNGARY,Arrived 02:13
3,07/09/2024,01:05,T1TT1,S4 128 +11TP 6626,Ponta Delgada,AZORES AIRLINES,Arrived 01:17
4,07/09/2024,05:20,T1TT1,TP 074 +44AD 7114JD 5438EY 2720EK 4813,Rio de Janeiro,TAP AIR PORTUGAL,Arrived 04:28


(1320, 7)

In [27]:
# Load one day's arrivals export; the typed value must match the file-name suffix exactly.
date_new = input("What date: ")
arrivals_file = f'/Users/martynas/Desktop/Ironhack/Project - Final/Data Collection/Lisbon Data/arrivals_september_{date_new}.xlsx'
lisbon_arr = pd.read_excel(arrivals_file)

What date: 11


In [28]:
# Derive status, actual time/date and the delay in minutes from the scraped
# 'Status and Actual Time' text (e.g. "Arrived 00:02" or "Arrived 23:51 10/09").
status_text = lisbon_arr['Status and Actual Time']

# Status word immediately before the HH:MM time, and the time itself.
# Rows whose text has no HH:MM time get NaN in both columns.
lisbon_arr[['Flight Status', 'Time Arrived']] = status_text.str.extract(r'(\w+)\s+(\d{2}:\d{2})')

# Optional explicit "dd/mm" in the status text
explicit_day = status_text.str.extract(r'(\d{2}/\d{2})')[0]
has_explicit_day = explicit_day.notna()

# Scheduled calendar date; its year completes the explicit "dd/mm" instead of a
# hard-coded 2024.
# NOTE(review): a movement across New Year would need the year adjusted.
scheduled_date = pd.to_datetime(lisbon_arr['Date'], format='%d/%m/%Y', errors='coerce')
explicit_date = pd.to_datetime(
    explicit_day.str.cat(scheduled_date.dt.strftime('%Y'), sep='/'),
    format='%d/%m/%Y',
    errors='coerce',
)
# Without an explicit date the actual date falls back to the scheduled one
actual_date = explicit_date.where(has_explicit_day, scheduled_date)

# The scheduled timestamp uses the SCHEDULED date and the actual timestamp the ACTUAL
# date, so an explicit date gives the exact difference without any day wrap-around.
scheduled_datetime = scheduled_date + pd.to_timedelta(lisbon_arr['Scheduled Time'] + ':00', errors='coerce')
actual_datetime = actual_date + pd.to_timedelta(lisbon_arr['Time Arrived'] + ':00', errors='coerce')

# NOTE(review): rows without a parsed time end up with 0 here and are therefore
# classified as 'On Time' by classify_arrival_time_difference.
delay_minutes = (actual_datetime - scheduled_datetime).dt.total_seconds() / 60
delay_minutes = delay_minutes.fillna(0).round().astype(int)

# Midnight wrap-around heuristic, applied only when the status carried no explicit
# date: a gap beyond +/-1000 minutes is read as the neighbouring day.
wrapped_forward = ~has_explicit_day & (delay_minutes > 1000)
wrapped_back = ~has_explicit_day & (delay_minutes < -1000)
delay_minutes.loc[wrapped_forward] -= 1440
delay_minutes.loc[wrapped_back] += 1440

lisbon_arr['Actual Arrival Date'] = actual_date.astype(str)
lisbon_arr['Arrival Time Difference'] = delay_minutes
lisbon_arr = lisbon_arr.drop(columns=['Status and Actual Time', 'Date'])

In [29]:
def classify_arrival_time_difference(diff):
    """Label an arrival delay given in minutes.

    Returns 'Delayed' for a positive value, 'On Time' for exactly zero and
    'Early' for anything else.
    """
    if diff > 0:
        return 'Delayed'
    if diff == 0:
        return 'On Time'
    return 'Early'

# Classify every delay, then override the label where the flight status is 'CANCELLED'
arrival_labels = lisbon_arr['Arrival Time Difference'].map(classify_arrival_time_difference)
is_cancelled = lisbon_arr['Flight Status'].eq('CANCELLED')
lisbon_arr['Arrival Status'] = arrival_labels.mask(is_cancelled, 'Cancelled')

In [30]:
# Left-join the city lookup on the origin name (the lookup's key column is called
# 'Destination City'), then discard that duplicate key column.
# Presumably this contributes the 'Country' column used by the SQL export - confirm against the sheet.
city_country_df = pd.read_excel('/Users/martynas/Desktop/Ironhack/Project - Final/city_country_mapping.xlsx')
lisbon_arr = lisbon_arr.merge(
    city_country_df,
    how='left',
    left_on='Origin',
    right_on='Destination City',
)
drop_cols = ['Destination City']
lisbon_arr = lisbon_arr.drop(columns=drop_cols)

# Shorten the scraped terminal text to its code
lisbon_arr['Terminal'] = lisbon_arr['Terminal'].replace({'T1TT1': 'T1', 'T2TT2': 'T2'})

# Keep only the leading flight number; any code-share numbers after it are dropped
lisbon_arr['Flight Numbers'] = lisbon_arr['Flight Numbers'].str.extract(r'(^\w+\s*\d+)', expand=False)

# Left-join the airline lookup on the airline name
# (presumably the source of the 'Type' column used by the SQL export - confirm)
airline_filter = pd.read_csv('/Users/martynas/Desktop/Ironhack/Project - Final/airlines_list.csv')
lisbon_arr = lisbon_arr.merge(airline_filter, how='left', on='Airline')

# Every arrival in this dataset lands in Lisbon
lisbon_arr['Destination'] = "Lisbon"

In [31]:
# Show the shape before and after removing fully identical rows
display(lisbon_arr.shape)
lisbon_arr = lisbon_arr.drop_duplicates()
lisbon_arr.shape

(317, 13)

(317, 13)

## Export to SQL

### Departures

In [32]:
# Step 1: Connect to the MySQL Database
# The password is taken from the MYSQL_PASSWORD environment variable, or asked for
# interactively, so that it is never stored in the notebook source.
mysql_password = os.environ.get("MYSQL_PASSWORD") or getpass("MySQL password: ")

connection = mysql.connector.connect(
    host="127.0.0.1",         # Database host (use "localhost" if running locally)
    user="root",              # Your MySQL username
    password=mysql_password,  # Supplied at run time, see above
    database="Flights"         # Database name
)

cursor = connection.cursor()

# Step 2: Double-check the connection
if connection.is_connected():
    print("Connection to the database is successful!")

    # Step 3: Run a simple query to verify the connection
    try:
        cursor.execute("SHOW TABLES;")
        tables = cursor.fetchall()
        print("Tables in the database:", tables)
    except mysql.connector.Error as err:
        print(f"Error: {err}")
else:
    print("Connection failed.")

Connection to the database is successful!
Tables in the database: [('airlines_list',), ('arrivals',), ('berlin_arrivals',), ('berlin_departures',), ('departures',), ('heathrow_arrivals',), ('heathrow_departures',), ('lisbon_arrivals',), ('lisbon_departures',)]


In [33]:
# Turn missing values into None for these columns ahead of the SQL insert.
# An explicit element-wise mapping is used for all three columns because the result
# of Series.replace(np.nan, None) depends on the pandas version.
for nullable_column in ('Time Departed', 'Scheduled Time', 'Terminal'):
    lisbon_dep[nullable_column] = lisbon_dep[nullable_column].apply(lambda x: None if pd.isna(x) else x)

In [34]:
# Step 3: Create the table (if it doesn't exist) with additional columns adjusted for SQL types
create_table_query = """
CREATE TABLE IF NOT EXISTS lisbon_departures (
    id INT AUTO_INCREMENT PRIMARY KEY,
    flight_number VARCHAR(10),                  -- Flight number as VARCHAR
    airline VARCHAR(100),                       -- Airline as VARCHAR
    status VARCHAR(50),                         -- Flight status as VARCHAR
    time_departed TIME,                         -- Time of departure as TIME
    departure_time_scheduled TIME,              -- Scheduled departure time as TIME
    terminal VARCHAR(8),                        -- Terminal as VARCHAR
    date_depart DATE,                           -- Departure date as DATE
    departure_city VARCHAR(100),                -- Departure city as VARCHAR
    destination_city VARCHAR(100),              -- Destination city as VARCHAR
    country VARCHAR(100),                       -- Country as VARCHAR
    departure_time_difference INT,              -- Difference in departure time as INT
    airline_type VARCHAR(8),                    -- Budget or Premium as VARCHAR
    departure_status VARCHAR(20)                -- Departure status as VARCHAR
);
"""

# Step 4: Prepare SQL query to insert data
insert_query = """
INSERT INTO lisbon_departures (
    flight_number, airline, status, time_departed,
    departure_time_scheduled, terminal, date_depart,
    departure_city, destination_city, country,
    departure_time_difference, airline_type, departure_status
)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""

# DataFrame columns in the same order as the column list of the INSERT statement
departure_insert_columns = [
    'Flight Numbers',                # flight_number
    'Airline',                       # airline
    'Flight Status',                 # status
    'Time Departed',                 # time_departed
    'Scheduled Time',                # departure_time_scheduled
    'Terminal',                      # terminal
    'Actual Departure Date',         # date_depart
    'Origin',                        # departure_city
    'Destination',                   # destination_city
    'Country',                       # country
    'Departure Time Difference',     # departure_time_difference
    'Type',                          # airline_type (budget/premium)
    'Departure Status',              # departure_status
]

rejected_rows = 0
try:
    cursor.execute(create_table_query)

    # Step 5: Insert row by row so one bad row is reported without stopping the rest
    for i, row in lisbon_dep.iterrows():
        try:
            cursor.execute(insert_query, tuple(row[column] for column in departure_insert_columns))
        except mysql.connector.Error as err:
            rejected_rows += 1
            print(f"Error at row {i}: {err}")

    # Step 6: Commit the transaction
    connection.commit()
finally:
    # Release the cursor and connection even when something unexpected went wrong
    cursor.close()
    connection.close()

# Only claim success when every row was accepted
if rejected_rows:
    print(f"Data uploaded, but {rejected_rows} of {len(lisbon_dep)} rows were rejected (see errors above).")
else:
    print("Data uploaded successfully!")

Data uploaded successfully!


### Arrivals

In [35]:
# Step 1: Connect to the MySQL Database
# The password is taken from the MYSQL_PASSWORD environment variable, or asked for
# interactively, so that it is never stored in the notebook source.
mysql_password = os.environ.get("MYSQL_PASSWORD") or getpass("MySQL password: ")

connection = mysql.connector.connect(
    host="127.0.0.1",         # Database host (use "localhost" if running locally)
    user="root",              # Your MySQL username
    password=mysql_password,  # Supplied at run time, see above
    database="Flights"         # Database name
)

cursor = connection.cursor()

# Step 2: Double-check the connection
if connection.is_connected():
    print("Connection to the database is successful!")

    # Step 3: Run a simple query to verify the connection
    try:
        cursor.execute("SHOW TABLES;")
        tables = cursor.fetchall()
        print("Tables in the database:", tables)
    except mysql.connector.Error as err:
        print(f"Error: {err}")
else:
    print("Connection failed.")

Connection to the database is successful!
Tables in the database: [('airlines_list',), ('arrivals',), ('berlin_arrivals',), ('berlin_departures',), ('departures',), ('heathrow_arrivals',), ('heathrow_departures',), ('lisbon_arrivals',), ('lisbon_departures',)]


In [36]:
# Turn missing values into None for these columns ahead of the SQL insert.
# An explicit element-wise mapping is used for all three columns because the result
# of Series.replace(np.nan, None) depends on the pandas version.
for nullable_column in ('Time Arrived', 'Scheduled Time', 'Terminal'):
    lisbon_arr[nullable_column] = lisbon_arr[nullable_column].apply(lambda x: None if pd.isna(x) else x)

In [37]:
# Step 3: Create the table (if it doesn't exist) with additional columns adjusted for SQL types
create_table_query = """
CREATE TABLE IF NOT EXISTS lisbon_arrivals (
    id INT AUTO_INCREMENT PRIMARY KEY,
    flight_number VARCHAR(10),                  -- Flight number as VARCHAR
    airline VARCHAR(100),                       -- Airline as VARCHAR
    status VARCHAR(50),                         -- Flight status as VARCHAR
    time_arrived TIME,                          -- Time of arrival as TIME
    arrive_time_scheduled TIME,                 -- Scheduled arrival time as TIME
    terminal VARCHAR(8),                        -- Terminal as VARCHAR
    date_arrive DATE,                           -- Arrival date as DATE
    departure_city VARCHAR(100),                -- Departure city as VARCHAR
    destination_city VARCHAR(100),              -- Destination city as VARCHAR
    country VARCHAR(100),                       -- Country as VARCHAR
    arrive_time_difference INT,                 -- Difference in arrival time as INT
    airline_type VARCHAR(8),                    -- Budget or Premium as VARCHAR
    arrival_status VARCHAR(20)                  -- Arrival status as VARCHAR
);
"""

# Step 4: Prepare SQL query to insert data
insert_query = """
INSERT INTO lisbon_arrivals (
    flight_number, airline, status, time_arrived,
    arrive_time_scheduled, terminal, date_arrive,
    departure_city, destination_city, country,
    arrive_time_difference, airline_type, arrival_status
)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""

# DataFrame columns in the same order as the column list of the INSERT statement
arrival_insert_columns = [
    'Flight Numbers',                # flight_number
    'Airline',                       # airline
    'Flight Status',                 # status
    'Time Arrived',                  # time_arrived
    'Scheduled Time',                # arrive_time_scheduled
    'Terminal',                      # terminal
    'Actual Arrival Date',           # date_arrive
    'Origin',                        # departure_city
    'Destination',                   # destination_city
    'Country',                       # country
    'Arrival Time Difference',       # arrive_time_difference
    'Type',                          # airline_type (budget/premium)
    'Arrival Status',                # arrival_status
]

rejected_rows = 0
try:
    cursor.execute(create_table_query)

    # Step 5: Insert row by row so one bad row is reported without stopping the rest
    for i, row in lisbon_arr.iterrows():
        try:
            cursor.execute(insert_query, tuple(row[column] for column in arrival_insert_columns))
        except mysql.connector.Error as err:
            rejected_rows += 1
            print(f"Error at row {i}: {err}")

    # Step 6: Commit the transaction
    connection.commit()
finally:
    # Release the cursor and connection even when something unexpected went wrong
    cursor.close()
    connection.close()

# Only claim success when every row was accepted
if rejected_rows:
    print(f"Data uploaded, but {rejected_rows} of {len(lisbon_arr)} rows were rejected (see errors above).")
else:
    print("Data uploaded successfully!")

Data uploaded successfully!
