# 02 — Data Integration Notebook  
This notebook integrates the cleaned CAA flight delay data with MIDAS weather data.  
It aggregates meteorological observations to a monthly level and merges them with airport–station records  
to produce the final integrated dataset for feature engineering.

In [None]:
# 02 — Data Integration Notebook
# Integrate CAA flight delay data (with src_id) and MIDAS weather data.

import pandas as pd
from pathlib import Path

base_path = Path(r"C:\Users\NEOWIN AUTOS\Documents\Github\weather_delay_project")
processed = base_path / "data" / "processed"

# Load processed data from Notebook 01
flights_df  = pd.read_csv(processed / "flights_with_stations.csv")
stations_df = pd.read_csv(processed / "midas_stations_clean.csv")

print(f" Flights: {flights_df.shape} Stations: {stations_df.shape}")
display(flights_df.head(5))
display(stations_df.head(5))

 Flights: (5354, 27) Stations: (1489, 10)


Unnamed: 0,run_date,reporting_period,reporting_airport,origin_destination_country,origin_destination,airline_name,arrival_departure,scheduled_charter,number_flights_matched,actual_flights_unmatched,...,flights_between_121_and_180_minutes_late_percent,flights_between_181_and_360_minutes_late_percent,flights_more_than_360_minutes_late_percent,flights_unmatched_percent,flights_cancelled_percent,average_delay_mins,previous_year_month_flights_matched,previous_year_month_early_to_15_mins_late_percent,previous_year_month_average_delay,src_id
0,14/03/2025 09:52,2025-01-01,ABERDEEN,POLAND,GDANSK,WIZZ AIR,A,S,13,0,...,0.0,0.0,0.0,0.0,0.0,4.0,9,44.444444,67.0,161.0
1,14/03/2025 09:52,2025-01-01,ABERDEEN,POLAND,GDANSK,WIZZ AIR,D,S,13,0,...,0.0,0.0,0.0,0.0,0.0,14.0,9,44.444444,70.0,161.0
2,14/03/2025 09:52,2025-01-01,ABERDEEN,UNITED KINGDOM,BELFAST CITY (GEORGE BEST),LOGANAIR LTD,A,S,16,0,...,0.0,5.882353,5.882353,0.0,5.882353,52.0,15,38.888889,49.0,161.0
3,14/03/2025 09:52,2025-01-01,ABERDEEN,UNITED KINGDOM,BELFAST CITY (GEORGE BEST),LOGANAIR LTD,D,S,16,0,...,0.0,5.882353,5.882353,0.0,5.882353,53.0,16,44.444444,47.0,161.0
4,14/03/2025 09:52,2025-01-01,ABERDEEN,UNITED KINGDOM,BIRMINGHAM,LOGANAIR LTD,A,S,50,0,...,3.773585,3.773585,0.0,0.0,5.660377,23.0,53,75.438596,15.0,161.0


Unnamed: 0,src_id,station_name,station_file_name,historic_county,authority,station_latitude,station_longitude,station_elevation,first_year,last_year
0,63,INVERPOLLY,inverpolly,ross-and-cromarty,Met Office,58.068,-5.267,14.0,1972.0,1997.0
1,64,PLOCKTON,plockton,ross-and-cromarty,Met Office,57.337,-5.653,12.0,1979.0,2019.0
2,65,ACHNASHELLACH,achnashellach,ross-and-cromarty,Met Office,57.49,-5.275,67.0,1926.0,1982.0
3,66,KINLOCHEWE,kinlochewe,ross-and-cromarty,Met Office,57.613,-5.306,25.0,1953.0,2024.0
4,67,LOCH GLASCARNOCH,loch-glascarnoch,ross-and-cromarty,Met Office,57.725,-4.895,269.0,1992.0,2024.0


In [4]:
import pandas as pd
from pathlib import Path

# --- File path (already loaded earlier) ---
file_path = r"C:\Users\NEOWIN AUTOS\Documents\Github\weather_delay_project\data\raw\midas_hourly\midas-open_uk-hourly-weather-obs_dv-202507_avon_00676_filton_qcv-1_2018.csv"

# --- Load the data (skip metadata rows) ---
bristol_df = pd.read_csv(file_path, skiprows=283, engine="python", on_bad_lines="skip")

# --- Show basic info ---
print(f" Loaded Bristol hourly weather data: {bristol_df.shape[0]} rows × {bristol_df.shape[1]} columns")
display(bristol_df.head())

# --- Save a cleaned copy to processed folder ---
processed_path = Path(r"C:\Users\NEOWIN AUTOS\Documents\Github\weather_delay_project\data\processed")
processed_path.mkdir(parents=True, exist_ok=True)

# Keep only essential weather variables (no rainfall)
cols_to_keep = [
    "src_id", "ob_time", "wind_direction", "wind_speed",
    "air_temperature", "dewpoint", "rltv_hum",
    "visibility", "msl_pressure"
]

bristol_df = bristol_df[cols_to_keep].copy()

# Convert to datetime and numeric
bristol_df["ob_time"] = pd.to_datetime(bristol_df["ob_time"], dayfirst=True, errors="coerce")
for col in bristol_df.columns:
    if col not in ["src_id", "ob_time"]:
        bristol_df[col] = pd.to_numeric(bristol_df[col], errors="coerce")

print("Cleaned Filton hourly weather data.")
display(bristol_df.head())


# Save as a CSV for reuse
bristol_df.to_csv(processed_path / "bristol_hourly.csv", index=False)

print(f" Saved Bristol hourly data to: {processed_path / 'bristol_hourly.csv'}")


 Loaded Bristol hourly weather data: 6899 rows × 104 columns


Unnamed: 0,ob_time,id,id_type,met_domain_name,version_num,src_id,rec_st_ind,wind_speed_unit_id,src_opr_type,wind_direction,...,wetb_temp_j,rltv_hum_j,vert_vsby_j,stn_pres_j,alt_pres_j,q10mnt_mxgst_spd_j,meto_stmp_time,midas_stmp_etime,drv_hr_sun_dur,drv_hr_sun_dur_q
0,2018-01-01 00:00:00,3628.0,WMO,SYNOP,1.0,676.0,1011.0,4.0,5.0,250.0,...,K,A,,,,,2017-12-31 23:52:00,0.0,0.0,1.0
1,2018-01-01 01:00:00,3628.0,WMO,SYNOP,1.0,676.0,1011.0,4.0,7.0,250.0,...,K,A,,,,,2018-01-01 00:52:00,0.0,0.0,1.0
2,2018-01-01 02:00:00,3628.0,WMO,SYNOP,1.0,676.0,1011.0,4.0,7.0,230.0,...,K,A,,,,,2018-01-01 01:51:00,0.0,0.0,1.0
3,2018-01-01 03:00:00,3628.0,WMO,SYNOP,1.0,676.0,1011.0,4.0,7.0,240.0,...,K,A,,,,,2018-01-01 02:51:00,0.0,0.0,1.0
4,2018-01-01 04:00:00,3628.0,WMO,SYNOP,1.0,676.0,1011.0,4.0,7.0,220.0,...,K,A,,,,,2018-01-01 03:54:00,0.0,0.0,1.0


Cleaned Filton hourly weather data.


Unnamed: 0,src_id,ob_time,wind_direction,wind_speed,air_temperature,dewpoint,rltv_hum,visibility,msl_pressure
0,676.0,2018-01-01 00:00:00,250.0,9.0,5.5,3.2,84.5,1600.0,998.3
1,676.0,2018-01-01 01:00:00,250.0,15.0,5.9,3.3,83.1,3500.0,998.9
2,676.0,2018-01-01 02:00:00,230.0,18.0,6.4,3.8,83.5,1900.0,998.6
3,676.0,2018-01-01 03:00:00,240.0,12.0,6.0,3.7,84.8,1800.0,999.0
4,676.0,2018-01-01 04:00:00,220.0,15.0,6.3,3.9,84.9,1500.0,998.3


 Saved Bristol hourly data to: C:\Users\NEOWIN AUTOS\Documents\Github\weather_delay_project\data\processed\bristol_hourly.csv


In [None]:
import pandas as pd
from pathlib import Path

file_path = r"C:\Users\NEOWIN AUTOS\Documents\Github\weather_delay_project\data\raw\midas_hourly\midas-open_uk-hourly-weather-obs_dv-202507_lanarkshire_00987_drumalbin_qcv-1_2024.csv"

# Skip the first 283 metadata lines
glasgow_df = pd.read_csv(file_path, skiprows=283, engine="python", on_bad_lines="skip")

print(f" Loaded Glasgow (Drumalbin) hourly data: {glasgow_df.shape[0]} rows × {glasgow_df.shape[1]} columns")
display(glasgow_df.head())

# Keep only the essential weather variables
cols_to_keep = [
    "src_id", "ob_time", "wind_direction", "wind_speed",
    "air_temperature", "dewpoint", "rltv_hum",
    "visibility", "msl_pressure"
]
glasgow_df = glasgow_df[cols_to_keep].copy()

# Convert datatypes
glasgow_df["ob_time"] = pd.to_datetime(glasgow_df["ob_time"], dayfirst=True, errors="coerce")
for c in glasgow_df.columns:
    if c not in ["src_id", "ob_time"]:
        glasgow_df[c] = pd.to_numeric(glasgow_df[c], errors="coerce")

# Save to processed folder
processed_path = Path(r"C:\Users\NEOWIN AUTOS\Documents\Github\weather_delay_project\data\processed")
processed_path.mkdir(parents=True, exist_ok=True)
glasgow_df.to_csv(processed_path / "glasgow_hourly.csv", index=False)

print(" Cleaned Glasgow hourly weather data.")    
display(glasgow_df.head())
print(f" Saved Glasgow (Drumalbin) hourly data to: {processed_path / 'glasgow_hourly.csv'}")


✅ Loaded Glasgow (Drumalbin) hourly data: 8764 rows × 104 columns


Unnamed: 0,ob_time,id,id_type,met_domain_name,version_num,src_id,rec_st_ind,wind_speed_unit_id,src_opr_type,wind_direction,...,wetb_temp_j,rltv_hum_j,vert_vsby_j,stn_pres_j,alt_pres_j,q10mnt_mxgst_spd_j,meto_stmp_time,midas_stmp_etime,drv_hr_sun_dur,drv_hr_sun_dur_q
0,2024-01-01 00:00:00,3155.0,WMO,SYNOP,1.0,987.0,1011.0,4.0,7.0,230.0,...,,,,,,,2024-01-01 00:02:13,,,
1,2024-01-01 01:00:00,3155.0,WMO,SYNOP,1.0,987.0,1011.0,4.0,7.0,220.0,...,,,,,,,2024-01-01 01:01:12,,,
2,2024-01-01 02:00:00,3155.0,WMO,SYNOP,1.0,987.0,1011.0,4.0,7.0,230.0,...,,,,,,,2024-01-01 02:01:12,,,
3,2024-01-01 03:00:00,3155.0,WMO,SYNOP,1.0,987.0,1011.0,4.0,6.0,240.0,...,,,,,,,2024-01-01 03:01:11,,,
4,2024-01-01 04:00:00,3155.0,WMO,SYNOP,1.0,987.0,1011.0,4.0,7.0,240.0,...,,,,,,,2024-01-01 04:01:08,,,


 Cleaned Glasgow hourly weather data.


Unnamed: 0,src_id,ob_time,wind_direction,wind_speed,air_temperature,dewpoint,rltv_hum,visibility,msl_pressure
0,987.0,2024-01-01 00:00:00,230.0,3.0,3.0,2.8,98.6,160.0,987.2
1,987.0,2024-01-01 01:00:00,220.0,3.0,3.2,3.0,98.7,340.0,988.0
2,987.0,2024-01-01 02:00:00,230.0,3.0,3.1,2.9,98.5,600.0,988.5
3,987.0,2024-01-01 03:00:00,240.0,5.0,3.5,3.1,97.3,1700.0,989.2
4,987.0,2024-01-01 04:00:00,240.0,8.0,3.6,2.9,95.4,1200.0,990.1


 Saved Glasgow (Drumalbin) hourly data to: C:\Users\NEOWIN AUTOS\Documents\Github\weather_delay_project\data\processed\glasgow_hourly.csv


In [28]:
import pandas as pd
from pathlib import Path

# --- File path ---
file_path = r"C:\Users\DELL\Documents\GitHub\NElson Air\weather_delay_project\data\raw\midas_hourly\midas-open_uk-hourly-weather-obs_dv-202507_greater-london_00708_heathrow_qcv-1_2024.csv"

# --- Read the file, skipping the first 283 comment lines ---
heathrow_df = pd.read_csv(file_path, skiprows=283, engine="python", on_bad_lines="skip")

print(f" Loaded Heathrow hourly weather data: {heathrow_df.shape[0]} rows × {heathrow_df.shape[1]} columns")
display(heathrow_df.head())

# --- Keep only essential weather variables (consistent with other airports) ---
cols_to_keep = [
    "src_id", "ob_time", "wind_direction", "wind_speed",
    "air_temperature", "dewpoint", "rltv_hum",
    "visibility", "msl_pressure"
]

heathrow_df = heathrow_df[cols_to_keep].copy()

# --- Convert datetime and numeric columns ---
heathrow_df["ob_time"] = pd.to_datetime(heathrow_df["ob_time"], dayfirst=True, errors="coerce")
for col in heathrow_df.columns:
    if col not in ["src_id", "ob_time"]:
        heathrow_df[col] = pd.to_numeric(heathrow_df[col], errors="coerce")

print(" Cleaned Heathrow hourly weather data.")
display(heathrow_df.head())

# --- Save to processed folder ---
processed_path = Path(r"C:\Users\DELL\Documents\GitHub\NElson Air\weather_delay_project\data\processed")
processed_path.mkdir(parents=True, exist_ok=True)

heathrow_df.to_csv(processed_path / "heathrow_hourly.csv", index=False)

print(f"Saved Heathrow hourly data to: {processed_path / 'heathrow_hourly.csv'}")


 Loaded Heathrow hourly weather data: 8785 rows × 104 columns


Unnamed: 0,ob_time,id,id_type,met_domain_name,version_num,src_id,rec_st_ind,wind_speed_unit_id,src_opr_type,wind_direction,...,wetb_temp_j,rltv_hum_j,vert_vsby_j,stn_pres_j,alt_pres_j,q10mnt_mxgst_spd_j,meto_stmp_time,midas_stmp_etime,drv_hr_sun_dur,drv_hr_sun_dur_q
0,2024-01-01 00:00:00,3772.0,WMO,SYNOP,1.0,708.0,1011.0,4.0,5.0,230.0,...,,,,,,,2024-01-01 00:02:16,,0.0,1.0
1,2024-01-01 01:00:00,3772.0,WMO,SYNOP,1.0,708.0,1011.0,4.0,7.0,240.0,...,,,,,,,2024-01-01 01:01:12,,0.0,1.0
2,2024-01-01 02:00:00,3772.0,WMO,SYNOP,1.0,708.0,1011.0,4.0,6.0,240.0,...,,,,,,,2024-01-01 02:01:11,,0.0,1.0
3,2024-01-01 03:00:00,3772.0,WMO,SYNOP,1.0,708.0,1011.0,4.0,5.0,240.0,...,,,,,,,2024-01-01 03:01:13,,0.0,1.0
4,2024-01-01 04:00:00,3772.0,WMO,SYNOP,1.0,708.0,1011.0,4.0,5.0,250.0,...,,,,,,,2024-01-01 04:01:12,,0.0,1.0


 Cleaned Heathrow hourly weather data.


Unnamed: 0,src_id,ob_time,wind_direction,wind_speed,air_temperature,dewpoint,rltv_hum,visibility,msl_pressure
0,708.0,2024-01-01 00:00:00,230.0,17.0,8.1,5.3,82.7,2800.0,995.6
1,708.0,2024-01-01 01:00:00,240.0,17.0,7.8,4.8,81.8,3000.0,996.3
2,708.0,2024-01-01 02:00:00,240.0,17.0,8.0,4.5,78.8,3000.0,997.2
3,708.0,2024-01-01 03:00:00,240.0,17.0,8.2,4.5,77.3,3000.0,998.1
4,708.0,2024-01-01 04:00:00,250.0,15.0,8.0,3.9,75.6,3500.0,999.2


Saved Heathrow hourly data to: C:\Users\DELL\Documents\GitHub\NElson Air\weather_delay_project\data\processed\heathrow_hourly.csv
