In [1]:
from pathlib import Path

import pandas as pd
from tqdm.auto import tqdm
from ydata_profiling import ProfileReport
tqdm.pandas()

In [2]:
# All four CSV files live in the same dataset folder, so each path is built from it.
dataset_dir = "../data/US 2023 Civil Flights  delays meteo and aircrafts"

airports = pd.read_csv(f"{dataset_dir}/airports_geolocation.csv")
cancelled_diverted = pd.read_csv(f"{dataset_dir}/Cancelled_Diverted_2023.csv")
flights = pd.read_csv(filepath_or_buffer=f"{dataset_dir}/US_flights_2023.csv")
weather = pd.read_csv(f"{dataset_dir}/weather_meteo_by_airport.csv")

# TO DO
* Get the info and datatype for each column.
* Fix the typing in each column.
* Describe each dataset.
* Rename columns properly (normalize them to be lower case and with _ instead of spaces).
* Check for duplicates.
* Count the nulls in each column.
* Fill/remove corrupt data (missing values).

# Flights info

- **index:** Unique flight ID (primary key)
- **Day_Of_Week:** Contains the days of the week from 1 to 7
- **Dep_Delay:** Departure delay ranging from -99 minutes to 4413 minutes with an average of 12.20 minutes
- **Dep_Delay_Tag:** Delay tag where 1 represents a delay of more than 5 minutes
- **Arr_Delay:** Arrival delay ranging from -119 minutes to 4405 minutes with an average of 6.62 minutes
- **Flight_Duration:** Flight duration ranging from 0 to 795 minutes with an average of 140 minutes
- **Delay_Carrier:** Delays caused by the airline ranging from 0 minutes to 3957 minutes
- **Delay_Weather:** Weather-related delays ranging from 0 to 1860 minutes
- **Delay_NAS:** Delays related to air traffic control (National Aviation System) ranging from 0 to 1708 minutes
- **Delay_Security:** Delays related to security checks ranging from 0 to 1460 minutes
- **Delay_LastAircraft:** Delays due to waiting for the aircraft ranging from 0 to 3581 minutes
- **Aicraft_age:** (the column name is misspelled in the source data) Age of the aircraft ranging from 1 to 57 years with an average of 13.48 years
- **Airline:** Contains the names of the **15 airlines** present in the dataset.
- **Tail_Number:** Unique identifier of the aircraft. **5963 aircraft** are present.
- **Dep_Airport:** Unique identifier of each departure airport. **350 airports** are represented.
- **Dep_CityName:** Names of the departure cities. **344 cities** are represented.
- **DepTime_label:** Division of the day into 6-hour periods (Night - Morning - Afternoon - Evening).
- **Dep_Delay_Type:** 3 categories of departure delays (Low < 5min - Medium > 15min - High > 60min).
- **Arr_CityName:** Names of the arrival cities. 344 cities are represented.
- **Arr_Delay_Type:** 3 categories of arrival delays (Low < 5min - Medium > 15min - High > 60min).
- **Distance_type:** 3 categories of flight distances in miles (Short Haul < 1500Mi, Medium Haul < 3000Mi, Long Haul < 6000Mi).
- **Manufacturer:** Aircraft manufacturer. **5 aircraft manufacturers** are present.
- **Model:** Model name given by the manufacturer. 21 aircraft models are present.


In [3]:
# Day_Of_Week is mapped from an integer code to a name: 1 -> Monday ... 7 -> Sunday.
day_of_week_mapping = {
    1: "Monday",
    2: "Tuesday",
    3: "Wednesday",
    4: "Thursday",
    5: "Friday",
    6: "Saturday",
    7: "Sunday",
}

# Translate the codes only while the column is still numeric. Series.map turns
# every value that is not a key of the mapping into NaN, so re-running this cell
# on already-translated names would otherwise wipe the whole column.
for frame in (flights, cancelled_diverted):
    if pd.api.types.is_numeric_dtype(frame["Day_Of_Week"]):
        frame["Day_Of_Week"] = frame["Day_Of_Week"].map(day_of_week_mapping)

In [4]:
# Print a label, then the column names and dtypes of the flights table
# before any dtype conversion is applied.
print("flights: ")
flights.info()

flights: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6743404 entries, 0 to 6743403
Data columns (total 24 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   FlightDate          object
 1   Day_Of_Week         object
 2   Airline             object
 3   Tail_Number         object
 4   Dep_Airport         object
 5   Dep_CityName        object
 6   DepTime_label       object
 7   Dep_Delay           int64 
 8   Dep_Delay_Tag       int64 
 9   Dep_Delay_Type      object
 10  Arr_Airport         object
 11  Arr_CityName        object
 12  Arr_Delay           int64 
 13  Arr_Delay_Type      object
 14  Flight_Duration     int64 
 15  Distance_type       object
 16  Delay_Carrier       int64 
 17  Delay_Weather       int64 
 18  Delay_NAS           int64 
 19  Delay_Security      int64 
 20  Delay_LastAircraft  int64 
 21  Manufacturer        object
 22  Model               object
 23  Aicraft_age         int64 
dtypes: int64(10), object(14)
memory usage: 1

In [5]:
# Per-column memory footprint of flights, converted from bytes by dividing by
# 1024**2. Stored in `old` so it can be compared after the dtype conversions.
bytes_per_unit = 1024**2
old = flights.memory_usage(deep=True).div(bytes_per_unit)
old

KeyboardInterrupt: 

In [None]:
# Both flight tables carry FlightDate; convert it to a datetime column in each.
for frame in (flights, cancelled_diverted):
    frame["FlightDate"] = frame["FlightDate"].astype("datetime64[ns]")

In [None]:
# Columns of flights that are stored with the pandas "category" dtype.
categorical_columns = [
    "Airline",
    "Day_Of_Week",
    "Dep_Airport",
    "Arr_Airport",
    "Dep_CityName",
    "Arr_CityName",
    "DepTime_label",
    "Dep_Delay_Type",
    "Arr_Delay_Type",
    "Distance_type",
    "Manufacturer",
    "Model",
]

# Convert one column at a time, replacing each column in place.
target_dtype = "category"
for name in categorical_columns:
    flights[name] = flights[name].astype(target_dtype)

In [None]:
# Print the flights dtypes again, this time after the datetime and
# category conversions from the cells above.
print("flights: ")
flights.info()

In [None]:
# Per-column memory footprint after the conversions, in the same unit as `old`
# (bytes divided by 1024**2).
new = flights.memory_usage(deep=True).div(1024**2)

In [None]:
# Walk the before/after footprints side by side and report only the entries
# whose size changed.
for label, before, after in zip(old.index, old, new):
    if before == after:
        continue
    print(f"{label}: {before:0.2f} MB => {after:0.2f} MB. a {before/after:0.2f} times reduction in size")

In [None]:
# Peek at five random rows; the fixed seed keeps the displayed rows the same
# across "Restart & Run All".
flights.sample(5, random_state=42)

In [None]:
# Describe only the object/category columns of flights, one row per column.
(
    flights
    .select_dtypes(include=["object", "category"])
    .describe(include="all")
    .transpose()
)

In [None]:
# Default describe() summary of flights, transposed to one row per column.
flights.describe().transpose()

# Cancelled diverted info

- **index:** Unique flight ID (primary key)
- **Day_Of_Week:** Contains the days of the week from 1 to 7
- **Dep_Delay:** Departure delay ranging from -99 minutes to 4413 minutes with an average of 12.20 minutes
- **Dep_Delay_Tag:** Delay tag where 1 represents a delay of more than 5 minutes
- **Arr_Delay:** Arrival delay ranging from -119 minutes to 4405 minutes with an average of 6.62 minutes
- **Flight_Duration:** Flight duration ranging from 0 to 795 minutes with an average of 140 minutes
- **Delay_Carrier:** Delays caused by the airline ranging from 0 minutes to 3957 minutes
- **Delay_Weather:** Weather-related delays ranging from 0 to 1860 minutes
- **Delay_NAS:** Delays related to air traffic control (National Aviation System) ranging from 0 to 1708 minutes
- **Delay_Security:** Delays related to security checks ranging from 0 to 1460 minutes
- **Delay_LastAircraft:** Delays due to waiting for the aircraft ranging from 0 to 3581 minutes
- **Aircraft_age:** Age of the aircraft ranging from 1 to 57 years with an average of 13.48 years
- **Airline:** Contains the names of the **15 airlines** present in the dataset.
- **Tail_Number:** Unique identifier of the aircraft. **5963 aircraft** are present.
- **Dep_Airport:** Unique identifier of each departure airport. **350 airports** are represented.
- **Dep_CityName:** Names of the departure cities. **344 cities** are represented.
- **DepTime_label:** Division of the day into 6-hour periods (Night - Morning - Afternoon - Evening).
- **Dep_Delay_Type:** 3 categories of departure delays (Low < 5min - Medium > 15min - High > 60min).
- **Arr_CityName:** Names of the arrival cities. 344 cities are represented.
- **Arr_Delay_Type:** 3 categories of arrival delays (Low < 5min - Medium > 15min - High > 60min).
- **Distance_type:** 3 categories of flight distances in miles (Short Haul < 1500Mi, Medium Haul < 3000Mi, Long Haul < 6000Mi).
- **Manufacturer:** Aircraft manufacturer. **5 aircraft manufacturers** are present.
- **Model:** Model name given by the manufacturer. 21 aircraft models are present.
- **Cancelled:** Flag indicating whether the flight was cancelled (bool).
- **Diverted:** Flag indicating whether the flight was diverted (bool).

In [None]:
# Print a label, then the column names and dtypes of the
# cancelled/diverted flights table.
print("cancelled_diverted: ")
cancelled_diverted.info()

In [None]:
# Columns of cancelled_diverted that are stored with the pandas "category" dtype.
categorical_columns = [
    "Day_Of_Week",
    "Airline",
    "Cancelled",
    "Diverted",
    "Dep_Airport",
    "Dep_CityName",
    "DepTime_label",
    "Dep_Delay_Tag",
    "Dep_Delay_Type",
    "Arr_Airport",
    "Arr_CityName",
    "Arr_Delay_Type",
    "Distance_type",
]

# Convert one column at a time, replacing each column in place.
target_dtype = "category"
for name in categorical_columns:
    cancelled_diverted[name] = cancelled_diverted[name].astype(target_dtype)

In [None]:
# Peek at five random rows; the fixed seed keeps the displayed rows the same
# across "Restart & Run All".
cancelled_diverted.sample(5, random_state=42)

In [None]:
# Check whether every row with Tail_Number == "0" has Cancelled == 1.0.
tail_is_zero = cancelled_diverted["Tail_Number"] == "0"
cancelled_diverted.loc[tail_is_zero, "Cancelled"].eq(1.0).all()

In [None]:
# Among rows with Cancelled == 1.0, count how many have Tail_Number == "0"
# and how many do not.
is_cancelled = cancelled_diverted["Cancelled"] == 1.0
cancelled_diverted.loc[is_cancelled, "Tail_Number"].eq("0").value_counts()

In [None]:
# Describe only the object/category columns of cancelled_diverted, one row per column.
(
    cancelled_diverted
    .select_dtypes(["object", "category"])
    .describe(include="all")
    .transpose()
)

In [None]:
# Default describe() summary of cancelled_diverted, transposed to one row per column.
cancelled_diverted.describe().transpose()

# Weather Info

- **time** Timestamps associated with the weather data (yyyy-mm-dd). 
- **tavg** Average Temperature (°C). 
- **tmin** Minimum Temperature (°C).
- **tmax** Maximum Temperature (°C).
- **prcp** Total precipitation amount in mm. 
- **snow** Snow Depth.
- **wdir** Wind (From) Direction (Degrees).
- **wspd** Average Wind Speed (km/h). 
- **pres** Sea-Level Air Pressure (hPa)
- **airport_id** Unique identifier for the airport where the weather data was collected. 


In [None]:
# Print a label, then the column names and dtypes of the weather table.
print("weather: ")
weather.info()

In [None]:
# Peek at five random rows; the fixed seed keeps the displayed rows the same
# across "Restart & Run All".
weather.sample(5, random_state=42)

In [None]:
# Default describe() summary of weather, transposed to one row per column.
weather.describe().transpose()

In [None]:
# Convert the `time` column to a datetime dtype.
weather = weather.assign(time=weather["time"].astype("datetime64[ns]"))

# Airports info

- **IATA_CODE**	Unique identifier for the airport.
- **AIRPORT**	Full name of the airport.
- **CITY**	City where the airport is located.
- **STATE**	State (US) where the airport is located.
- **COUNTRY**	Country where the airport is located (Always US).
- **LATITUDE**	Geographic latitude of the airport in decimal degrees.
- **LONGITUDE**	Geographic longitude of the airport in decimal degrees.

In [None]:
# Print a label, then the column names and dtypes of the airports table.
print("airports: ")
airports.info()

In [None]:
# Columns of airports that are stored with the pandas "category" dtype.
categorical_columns = [
    "AIRPORT",
    "CITY",
    "STATE",
    "COUNTRY",
]

# Convert one column at a time, replacing each column in place.
target_dtype = "category"
for name in categorical_columns:
    airports[name] = airports[name].astype(target_dtype)

In [None]:
# Peek at five random rows; the fixed seed keeps the displayed rows the same
# across "Restart & Run All".
airports.sample(5, random_state=42)

In [None]:
# Default describe() summary of airports, transposed to one row per column.
airports.describe().transpose()

In [None]:
# Describe only the object/category columns of airports, one row per column.
(
    airports
    .select_dtypes(["object", "category"])
    .describe(include="all")
    .transpose()
)

# Normalizing column names

In [None]:
def normalize_columns(df):
    """Lower-case the column names of ``df`` and replace spaces with underscores.

    The column index of ``df`` is reassigned in place, and the same DataFrame
    object is returned so the call can also be used in an assignment.
    """
    lowered = df.columns.str.lower()
    # Every single space becomes one underscore; runs of spaces are not collapsed.
    df.columns = lowered.str.replace(" ", "_", regex=False)
    return df

In [None]:
# Run every table through normalize_columns and rebind each name to the result.
airports, cancelled_diverted, flights, weather = (
    normalize_columns(frame)
    for frame in (airports, cancelled_diverted, flights, weather)
)

In [None]:
# Show the column index of every table, each followed by a separator line.
tables = {
    "airports": airports,
    "cancelled_diverted": cancelled_diverted,
    "flights": flights,
    "weather": weather,
}
for label, frame in tables.items():
    print(f"{label}: ", frame.columns)
    print(50 * "*")

airports:  Index(['iata_code', 'airport', 'city', 'state', 'country', 'latitude',
       'longitude'],
      dtype='object')
**************************************************
cancelled_diverted:  Index(['flightdate', 'day_of_week', 'airline', 'tail_number', 'cancelled',
       'diverted', 'dep_airport', 'dep_cityname', 'deptime_label', 'dep_delay',
       'dep_delay_tag', 'dep_delay_type', 'arr_airport', 'arr_cityname',
       'arr_delay', 'arr_delay_type', 'flight_duration', 'distance_type',
       'delay_carrier', 'delay_weather', 'delay_nas', 'delay_security',
       'delay_lastaircraft'],
      dtype='object')
**************************************************
flights:  Index(['flightdate', 'day_of_week', 'airline', 'tail_number', 'dep_airport',
       'dep_cityname', 'deptime_label', 'dep_delay', 'dep_delay_tag',
       'dep_delay_type', 'arr_airport', 'arr_cityname', 'arr_delay',
       'arr_delay_type', 'flight_duration', 'distance_type', 'delay_carrier',
       'delay_weather'

# Checking for duplicates

In [None]:
# Count fully duplicated rows in each table (nothing is removed here).
tables = (
    ("airports", airports),
    ("cancelled_diverted", cancelled_diverted),
    ("flights", flights),
    ("weather", weather),
)
for label, frame in tables:
    print(label, frame.duplicated().sum())

airports 0
cancelled_diverted 945
flights 31
weather 0


In [None]:
# Remove fully duplicated rows from the two tables that contain them.
# ignore_index=True renumbers the rows 0..n-1, so the result does not keep
# gaps in its index where the duplicates used to be.
cancelled_diverted = cancelled_diverted.drop_duplicates(ignore_index=True)
flights = flights.drop_duplicates(ignore_index=True)

In [None]:
# Re-count fully duplicated rows in each table to confirm the removal worked.
tables = (
    ("airports", airports),
    ("cancelled_diverted", cancelled_diverted),
    ("flights", flights),
    ("weather", weather),
)
for label, frame in tables:
    print(label, frame.duplicated().sum())

airports 0
cancelled_diverted 0
flights 0
weather 0


# Find missing values

In [None]:
# Number of missing values per column, one Series per table.
airports_nas = airports.isna().sum()
cancelled_diverted_nas = cancelled_diverted.isna().sum()
flights_nas = flights.isna().sum()
weather_nas = weather.isna().sum()

# Print each Series under its own label, with a separator line between tables.
na_counts = {
    "airports_nas": airports_nas,
    "cancelled_diverted_nas": cancelled_diverted_nas,
    "flights_nas": flights_nas,
    "weather_nas": weather_nas,
}
for position, (label, counts) in enumerate(na_counts.items()):
    if position > 0:
        print(50 * "*")
        print()
    print(f"{label}: ")
    print(counts)

airports_nas: 
iata_code    0
airport      0
city         0
state        0
country      0
latitude     0
longitude    0
dtype: int64
**************************************************
cancelled_diverted_nas: 
flightdate            0
day_of_week           0
airline               0
tail_number           0
cancelled             0
diverted              0
dep_airport           0
dep_cityname          0
deptime_label         0
dep_delay             0
dep_delay_tag         0
dep_delay_type        0
arr_airport           0
arr_cityname          0
arr_delay             0
arr_delay_type        0
flight_duration       0
distance_type         0
delay_carrier         0
delay_weather         0
delay_nas             0
delay_security        0
delay_lastaircraft    0
dtype: int64
**************************************************
flights_nas: 
flightdate            0
day_of_week           0
airline               0
tail_number           0
dep_airport           0
dep_cityname          0
deptime_label    

In [None]:
# Print the info() summary of every table, each followed by a separator line.
tables = (
    ("airports", airports),
    ("cancelled_diverted", cancelled_diverted),
    ("flights", flights),
    ("weather", weather),
)
for label, frame in tables:
    print(f"{label} df info:")
    frame.info()
    print(50 * "=")

# Data profiling

In [None]:
# Folder that receives the HTML reports. It is created up front so a missing
# folder cannot make the export fail after the expensive profiling step.
REPORT_DIR = Path("./profiling reports")
REPORT_DIR.mkdir(parents=True, exist_ok=True)


def write_profile_report(df, title):
    """Build an explorative profiling report for ``df`` and save it as HTML.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to profile.
    title : str
        Report title; also used as the file name, ``<REPORT_DIR>/<title>.html``.

    Returns
    -------
    ProfileReport
        The generated report, so it can still be inspected in the notebook.
    """
    report = ProfileReport(df=df, title=title, explorative=True)
    report.to_file(output_file=REPORT_DIR / f"{title}.html")
    return report


# ProfileReport is already imported in the first cell of the notebook.
profile_airports = write_profile_report(airports, "profile_airports")
profile_cancelled_diverted = write_profile_report(cancelled_diverted, "profile_cancelled_diverted")
profile_flights = write_profile_report(flights, "profile_flights")
profile_weather = write_profile_report(weather, "profile_weather")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'could not convert string to float: 'Endeavor Air'')


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'could not convert string to float: 'Endeavor Air'')


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]