In [34]:
import pandas as pd
from tqdm.auto import tqdm
from ydata_profiling import ProfileReport
# Registers tqdm's progress-bar variants (e.g. `progress_apply`) on pandas objects.
# NOTE(review): no cell visible in this part of the notebook uses them — confirm it is still needed.
tqdm.pandas()

In [35]:
# All four CSVs live in the same dataset folder, so the folder name is kept in one place.
# The double space in "Flights  delays" is intentional: it matches the paths used originally.
DATA_DIR = "../data/US 2023 Civil Flights  delays meteo and aircrafts"

# One raw frame per source file; column names and dtypes are whatever read_csv infers.
airports = pd.read_csv(f"{DATA_DIR}/airports_geolocation.csv")
cancelled_diverted = pd.read_csv(f"{DATA_DIR}/Cancelled_Diverted_2023.csv")
flights = pd.read_csv(f"{DATA_DIR}/US_flights_2023.csv")
weather = pd.read_csv(f"{DATA_DIR}/weather_meteo_by_airport.csv")

# TO DO
* get the info and datatype for each column
* fix the typing in each column
* describe each dataset
* count the nulls in each column
* rename columns properly (normalize them to be lower case and with _ instead of spaces)
* check for duplicates 
* fill/remove corrupt data (missing values)

# Flights info

- **index:** Unique flight ID (primary key)
- **Day_Of_Week:** Contains the days of the week from 1 to 7
- **Dep_Delay:** Departure delay ranging from -99 minutes to 4413 minutes with an average of 12.20 minutes
- **Dep_Delay_Tag:** Delay tag where 1 represents a delay of more than 5 minutes
- **Arr_Delay:** Arrival delay ranging from -119 minutes to 4405 minutes with an average of 6.62 minutes
- **Flight_Duration:** Flight duration ranging from 0 to 795 minutes with an average of 140 minutes
- **Delay_Carrier:** Delays caused by the airline ranging from 0 minutes to 3957 minutes
- **Delay_Weather:** Weather-related delays ranging from 0 to 1860 minutes
- **Delay_NAS:** Delays related to air traffic control (National Aviation System) ranging from 0 to 1708 minutes
- **Delay_Security:** Delays related to security checks ranging from 0 to 1460 minutes
- **Delay_LastAircraft:** Delays due to waiting for the aircraft ranging from 0 to 3581 minutes
- **Aircraft_age:** Age of the aircraft ranging from 1 to 57 years with an average of 13.48 years (note: the column is spelled `aicraft_age` in the loaded data, without the first "r")
- **Airline:** Contains the names of the **15 airlines** present in the dataset.
- **Tail_Number:** Unique identifier of the aircraft. **5963 aircraft** are present.
- **Dep_Airport:** Unique identifier of each departure airport. **350 airports** are represented.
- **Dep_CityName:** Names of the departure cities. **344 cities** are represented.
- **DepTime_label:** Division of the day into 6-hour periods (Night - Morning - Afternoon - Evening).
- **Dep_Delay_Type:** 3 categories of departure delays (Low < 5min - Medium > 15min - High > 60min).
- **Arr_CityName:** Names of the arrival cities. 344 cities are represented.
- **Arr_Delay_Type:** 3 categories of arrival delays (Low < 5min - Medium > 15min - High > 60min).
- **Distance_type:** 3 categories of flight distances in miles (Short Haul < 1500Mi, Medium Haul < 3000Mi, Long Haul < 6000Mi).
- **Manufacturer:** Aircraft manufacturer. **5 aircraft manufacturers** are present.
- **Model:** Model name given by the manufacturer. 21 aircraft models are present.


In [63]:
# Schema overview of the flights table: row count, column dtypes and memory footprint.
# NOTE(review): the stored output shows lower-cased column names and a non-contiguous
# index (6743373 entries labelled 0 to 6743403), so it appears to have been produced
# after the normalization / row-removal cells that sit further down in the notebook.
# On a fresh Restart & Run All this cell would presumably show the raw frame — confirm.
print("flights: ")
flights.info()

flights: 
<class 'pandas.core.frame.DataFrame'>
Index: 6743373 entries, 0 to 6743403
Data columns (total 24 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   flightdate          object
 1   day_of_week         int64 
 2   airline             object
 3   tail_number         object
 4   dep_airport         object
 5   dep_cityname        object
 6   deptime_label       object
 7   dep_delay           int64 
 8   dep_delay_tag       int64 
 9   dep_delay_type      object
 10  arr_airport         object
 11  arr_cityname        object
 12  arr_delay           int64 
 13  arr_delay_type      object
 14  flight_duration     int64 
 15  distance_type       object
 16  delay_carrier       int64 
 17  delay_weather       int64 
 18  delay_nas           int64 
 19  delay_security      int64 
 20  delay_lastaircraft  int64 
 21  manufacturer        object
 22  model               object
 23  aicraft_age         int64 
dtypes: int64(11), object(13)
memory usage: 1.3+ G

In [51]:
# Peek at five random flights; the fixed seed makes the displayed rows reproducible on re-run.
flights.sample(5, random_state=42)

Unnamed: 0,flightdate,day_of_week,airline,tail_number,dep_airport,dep_cityname,deptime_label,dep_delay,dep_delay_tag,dep_delay_type,...,flight_duration,distance_type,delay_carrier,delay_weather,delay_nas,delay_security,delay_lastaircraft,manufacturer,model,aicraft_age
4583476,2023-09-19,2,JetBlue Airways,N655JB,AUS,"Austin, TX",Afternoon,-5,0,Low <5min,...,229,Medium Haul <3000Mi,0,0,0,0,0,AIRBUS,A320,17
174537,2023-01-26,4,Delta Air Lines Inc,N127DN,PHX,"Phoenix, AZ",Evening,16,1,Medium >15min,...,213,Short Haul >1500Mi,16,0,31,0,0,AIRBUS,A321,3
3066756,2023-06-26,1,Skywest Airlines Inc.,N602UX,PSP,"Palm Springs, CA",Morning,11,1,Low <5min,...,112,Short Haul >1500Mi,0,0,25,0,0,EMBRAER,170/175,5
4036211,2023-08-17,4,Delta Air Lines Inc,N307DX,ATL,"Atlanta, GA",Morning,-2,0,Low <5min,...,150,Short Haul >1500Mi,0,0,0,0,0,AIRBUS,A321,8
5718151,2023-11-18,6,Alaska Airlines Inc.,N590AS,SEA,"Seattle, WA",Afternoon,-3,0,Low <5min,...,384,Medium Haul <3000Mi,0,0,0,0,0,BOEING,737 NG,17


In [68]:
# Count / unique / top / freq for the text columns of flights, one row per column.
flights.select_dtypes(include="object").describe(include="all").transpose()

Unnamed: 0,count,unique,top,freq
flightdate,6743373,365,2023-11-26,20511
airline,6743373,15,Southwest Airlines Co.,1421229
tail_number,6743373,5963,N488HA,3327
dep_airport,6743373,350,ATL,332934
dep_cityname,6743373,344,"Chicago, IL",338766
deptime_label,6743373,4,Morning,2611546
dep_delay_type,6743373,3,Low <5min,5409706
arr_airport,6743373,350,ATL,332939
arr_cityname,6743373,344,"Chicago, IL",338319
arr_delay_type,6743373,3,Low <5min,5403696


In [69]:
# Summary statistics for the numeric columns of flights, one row per column.
flights.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
day_of_week,6743373.0,3.982794,2.001763,1.0,2.0,4.0,6.0,7.0
dep_delay,6743373.0,12.201062,55.079476,-99.0,-5.0,-2.0,9.0,4413.0
dep_delay_tag,6743373.0,0.379003,0.485139,0.0,0.0,0.0,1.0,1.0
arr_delay,6743373.0,6.627235,57.079037,-119.0,-15.0,-6.0,9.0,4405.0
flight_duration,6743373.0,140.298096,72.872159,0.0,87.0,124.0,171.0,795.0
delay_carrier,6743373.0,5.169827,36.457406,0.0,0.0,0.0,0.0,3957.0
delay_weather,6743373.0,0.742854,14.353961,0.0,0.0,0.0,0.0,1860.0
delay_nas,6743373.0,2.566969,15.004876,0.0,0.0,0.0,0.0,1708.0
delay_security,6743373.0,0.030649,1.628927,0.0,0.0,0.0,0.0,1460.0
delay_lastaircraft,6743373.0,5.681134,30.446536,0.0,0.0,0.0,0.0,3581.0


# Cancelled diverted info

- **index:** Unique flight ID (primary key)
- **Day_Of_Week:** Contains the days of the week from 1 to 7
- **Dep_Delay:** Departure delay ranging from -31 minutes to 2414 minutes with an average of 7.87 minutes
- **Dep_Delay_Tag:** Delay tag where 1 represents a delay of more than 5 minutes
- **Arr_Delay:** Arrival delay; always 0 in this dataset
- **Flight_Duration:** Flight duration; always 0 in this dataset
- **Delay_Carrier:** Delays caused by the airline; always 0 in this dataset
- **Delay_Weather:** Weather-related delays; always 0 in this dataset
- **Delay_NAS:** Delays related to air traffic control (National Aviation System); always 0 in this dataset
- **Delay_Security:** Delays related to security checks; always 0 in this dataset
- **Delay_LastAircraft:** Delays due to waiting for the aircraft; always 0 in this dataset
- **Aircraft_age:** Age of the aircraft; this column is not present in the cancelled/diverted dataset (it has 23 columns, versus 24 in flights)
- **Airline:** Contains the names of the **15 airlines** present in the dataset.
- **Tail_Number:** Unique identifier of the aircraft. **5438 distinct values** are present; the most frequent is `0` (14,651 rows), which looks like a placeholder for a missing tail number.
- **Dep_Airport:** Unique identifier of each departure airport. **345 airports** are represented.
- **Dep_CityName:** Names of the departure cities. **339 cities** are represented.
- **DepTime_label:** Division of the day into 6-hour periods (Night - Morning - Afternoon - Evening).
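- **Dep_Delay_Type:** 3 categories of departure delays (described for the flights table as Low < 5min - Medium > 15min - High > 60min); in this dataset the most frequent label is `No Departure Delay` (95,798 rows).
- **Arr_CityName:** Names of the arrival cities. 340 cities are represented.
- **Arr_Delay_Type:** Categories of arrival delays (Low < 5min - Medium > 15min - High > 60min); in this dataset every row carries the single value `No Arrival Delay`.
- **Distance_type:** 3 categories of flight distances in miles (Short Haul < 1500Mi, Medium Haul < 3000Mi, Long Haul < 6000Mi); in this dataset the labels appear without the thresholds (e.g. `Short Haul`, `Medium Haul`).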
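- **Manufacturer:** Aircraft manufacturer; this column is not present in the cancelled/diverted dataset.
- **Model:** Model name given by the manufacturer; this column is not present in the cancelled/diverted dataset.
- **Cancelled:** 1.0 if the flight was cancelled, 0.0 otherwise (stored as float64); about 84% of the rows.
- **Diverted:** 1.0 if the flight was diverted, 0.0 otherwise (stored as float64); about 16% of the rows.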

In [66]:
# Schema overview of the cancelled/diverted table: row count, non-null counts and dtypes.
# NOTE(review): the stored output shows lower-cased column names and a non-contiguous
# index (103543 entries labelled 0 to 104487), so it appears to have been produced after
# the normalization / row-removal cells further down; a fresh Restart & Run All would
# presumably show the raw frame here — confirm.
print("cancelled_diverted: ")
cancelled_diverted.info()

cancelled_diverted: 
<class 'pandas.core.frame.DataFrame'>
Index: 103543 entries, 0 to 104487
Data columns (total 23 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   flightdate          103543 non-null  object 
 1   day_of_week         103543 non-null  int64  
 2   airline             103543 non-null  object 
 3   tail_number         103543 non-null  object 
 4   cancelled           103543 non-null  float64
 5   diverted            103543 non-null  float64
 6   dep_airport         103543 non-null  object 
 7   dep_cityname        103543 non-null  object 
 8   deptime_label       103543 non-null  object 
 9   dep_delay           103543 non-null  float64
 10  dep_delay_tag       103543 non-null  int64  
 11  dep_delay_type      103543 non-null  object 
 12  arr_airport         103543 non-null  object 
 13  arr_cityname        103543 non-null  object 
 14  arr_delay           103543 non-null  float64
 15  arr_delay_type    

In [67]:
# Peek at five random cancelled/diverted flights; the fixed seed keeps the rows stable across re-runs.
cancelled_diverted.sample(5, random_state=42)

Unnamed: 0,flightdate,day_of_week,airline,tail_number,cancelled,diverted,dep_airport,dep_cityname,deptime_label,dep_delay,...,arr_cityname,arr_delay,arr_delay_type,flight_duration,distance_type,delay_carrier,delay_weather,delay_nas,delay_security,delay_lastaircraft
3886,2023-01-12,4,Hawaiian Airlines Inc.,0,1.0,0.0,HNL,"Honolulu, HI",Morning,0.0,...,"Lihue, HI",0.0,No Arrival Delay,0.0,Short Haul,0.0,0.0,0.0,0.0,0.0
102798,2023-12-24,7,Southwest Airlines Co.,N8617E,1.0,0.0,BDL,"Hartford, CT",Afternoon,0.0,...,"Tampa, FL",0.0,No Arrival Delay,0.0,Short Haul,0.0,0.0,0.0,0.0,0.0
21806,2023-03-11,6,Skywest Airlines Inc.,N746SK,0.0,1.0,DFW,"Dallas/Fort Worth, TX",Afternoon,-1.0,...,"Aspen, CO",0.0,No Arrival Delay,0.0,Short Haul,0.0,0.0,0.0,0.0,0.0
85413,2023-08-07,1,United Air Lines Inc.,0,1.0,0.0,EWR,"Newark, NJ",Morning,0.0,...,"Los Angeles, CA",0.0,No Arrival Delay,0.0,Medium Haul,0.0,0.0,0.0,0.0,0.0
104422,2023-12-21,4,JetBlue Airways,N957JB,0.0,1.0,SAN,"San Diego, CA",Evening,1.0,...,"Boston, MA",0.0,No Arrival Delay,0.0,Medium Haul,0.0,0.0,0.0,0.0,0.0


In [70]:
# Count / unique / top / freq for the text columns of cancelled_diverted, one row per column.
cancelled_diverted.select_dtypes(include="object").describe(include="all").transpose()

Unnamed: 0,count,unique,top,freq
flightdate,103543,365,2023-02-01,2127
airline,103543,15,Southwest Airlines Co.,17212
tail_number,103543,5438,0,14651
dep_airport,103543,345,DFW,5189
dep_cityname,103543,339,"New York, NY",8302
deptime_label,103543,4,Afternoon,37530
dep_delay_type,103543,3,No Departure Delay,95798
arr_airport,103543,346,DFW,5464
arr_cityname,103543,340,"New York, NY",8563
arr_delay_type,103543,1,No Arrival Delay,103543


In [71]:
# Summary statistics for the numeric columns of cancelled_diverted, one row per column.
cancelled_diverted.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
day_of_week,103543.0,3.969993,2.021613,1.0,2.0,4.0,6.0,7.0
cancelled,103543.0,0.840144,0.366474,0.0,1.0,1.0,1.0,1.0
diverted,103543.0,0.159856,0.366474,0.0,0.0,0.0,0.0,1.0
dep_delay,103543.0,7.870025,49.841918,-31.0,0.0,0.0,0.0,2414.0
dep_delay_tag,103543.0,0.094598,0.292661,0.0,0.0,0.0,0.0,1.0
arr_delay,103543.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
flight_duration,103543.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
delay_carrier,103543.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
delay_weather,103543.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
delay_nas,103543.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Weather Info

- **time:** Timestamps associated with the weather data (yyyy-mm-dd).
- **tavg:** Average Temperature (°C).
- **tmin:** Minimum Temperature (°C).
- **tmax:** Maximum Temperature (°C).
- **prcp:** Total precipitation amount in mm.
- **snow:** Snow Depth.
- **wdir:** Wind (From) Direction (Degrees).
- **wspd:** Average Wind Speed (km/h).
- **pres:** Sea-Level Air Pressure (hPa).
- **airport_id:** Unique identifier for the airport where the weather data was collected.


In [52]:
# Schema overview of the weather table: row count, non-null counts and dtypes.
print("weather: ")
weather.info()

weather: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132860 entries, 0 to 132859
Data columns (total 10 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   time        132860 non-null  object 
 1   tavg        132860 non-null  float64
 2   tmin        132860 non-null  float64
 3   tmax        132860 non-null  float64
 4   prcp        132860 non-null  float64
 5   snow        132860 non-null  float64
 6   wdir        132860 non-null  float64
 7   wspd        132860 non-null  float64
 8   pres        132860 non-null  float64
 9   airport_id  132860 non-null  object 
dtypes: float64(8), object(2)
memory usage: 10.1+ MB


In [53]:
# Peek at five random weather records; the fixed seed keeps the rows stable across re-runs.
weather.sample(5, random_state=42)

Unnamed: 0,time,tavg,tmin,tmax,prcp,snow,wdir,wspd,pres,airport_id
20432,2023-12-24,13.2,10.0,18.9,0.0,0.0,301.0,4.7,1019.5,BUR
69086,2023-04-12,13.6,3.9,22.0,0.0,0.0,212.0,27.2,1006.3,LAR
17162,2023-01-08,23.6,21.0,28.0,2.5,0.0,99.0,13.7,1018.4,BQN
89494,2023-03-11,-0.4,-1.0,1.7,6.6,0.0,87.0,15.8,1012.3,OMA
20953,2023-05-29,16.2,9.4,22.8,0.0,0.0,8.0,8.0,1011.1,BZN


In [73]:
# Summary statistics for the numeric weather measurements, one row per column.
weather.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
tavg,132860.0,14.042641,10.672859,-39.1,6.3,15.4,22.5,42.2
tmin,132860.0,8.699833,10.680441,-76.0,1.1,9.4,17.2,37.2
tmax,132860.0,19.617149,11.392223,-35.0,11.7,21.2,28.9,50.0
prcp,132860.0,2.520604,7.915755,0.0,0.0,0.0,1.0,571.5
snow,132860.0,6.810131,50.935283,0.0,0.0,0.0,0.0,1780.0
wdir,132860.0,188.686042,114.629099,0.0,79.0,197.0,297.0,360.0
wspd,132860.0,12.424556,6.211261,0.0,7.9,11.3,15.7,78.3
pres,132860.0,1015.650368,6.944599,964.3,1011.6,1015.4,1019.7,1051.0


# Airports info

- **IATA_CODE:** Unique identifier for the airport.
- **AIRPORT:** Full name of the airport.
- **CITY:** City where the airport is located.
- **STATE:** State (US) where the airport is located. 55 distinct codes appear in the data, so this covers more than the 50 states.
- **COUNTRY:** Country where the airport is located (always `USA`).
- **LATITUDE:** Geographic latitude of the airport in decimal degrees.
- **LONGITUDE:** Geographic longitude of the airport in decimal degrees.

In [54]:
# Schema overview of the airports table: row count, non-null counts and dtypes.
# NOTE(review): the stored output already shows lower-cased column names (iata_code, ...),
# i.e. it was produced after the normalization cell that sits further down in the notebook.
print("airports: ")
airports.info()

airports: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 364 entries, 0 to 363
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   iata_code  364 non-null    object 
 1   airport    364 non-null    object 
 2   city       364 non-null    object 
 3   state      364 non-null    object 
 4   country    364 non-null    object 
 5   latitude   364 non-null    float64
 6   longitude  364 non-null    float64
dtypes: float64(2), object(5)
memory usage: 20.0+ KB


In [55]:
# Peek at five random airports; the fixed seed keeps the rows stable across re-runs.
airports.sample(5, random_state=42)

Unnamed: 0,iata_code,airport,city,state,country,latitude,longitude
314,SHR,Sheridan Country Airport,Sheridan,WY,USA,44.7972,-106.9562
235,MSO,Missoula International Airport,Missoula,MT,USA,46.91631,-114.09056
62,CEC,Del Norte County Airport (Jack McNamara Field),Crescent City,CA,USA,41.78016,-124.23653
13,AKN,King Salmon Airport,King Salmon,AK,USA,58.6768,-156.64922
293,ROA,Roanoke Regional Airport (Woodrum Field),Roanoke,VA,USA,37.32547,-79.97543


In [75]:
# Summary statistics for latitude / longitude, one row per column.
airports.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
latitude,364.0,38.81097,8.757505,-14.2781,33.672655,39.25193,43.11311,71.28545
longitude,364.0,-96.809033,27.471654,-176.64603,-110.788562,-93.74314,-82.501792,145.7333


In [76]:
# Count / unique / top / freq for the text columns of airports, one row per column.
airports.select_dtypes(include="object").describe(include="all").transpose()

Unnamed: 0,count,unique,top,freq
iata_code,364,364,ABE,1
airport,364,363,Port Columbus International Airport,2
city,364,346,Columbus,3
state,364,55,CA,26
country,364,1,USA,364


# Normalizing columns

In [42]:
# Function to normalize column names
def normalize_columns(df):
    """Return ``df`` with lower-cased, underscore-separated column names.

    Every string column label is lower-cased and its spaces are replaced by
    underscores. Labels that are not strings (e.g. integers) are kept as they are.

    The input frame is left untouched: the result is a shallow copy, so the
    underlying data is shared rather than duplicated and only the column labels
    differ. Re-running a cell that calls this function is therefore safe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels should be normalized.

    Returns
    -------
    pandas.DataFrame
        A new frame sharing ``df``'s data, with normalized column labels.
    """
    normalized = df.copy(deep=False)  # new frame object, same underlying data
    normalized.columns = [
        label.lower().replace(" ", "_") if isinstance(label, str) else label
        for label in df.columns
    ]
    return normalized

In [43]:
# Run the shared column-name normalization over all four tables in one pass.
airports, cancelled_diverted, flights, weather = map(
    normalize_columns,
    (airports, cancelled_diverted, flights, weather),
)

# Data profiling

In [44]:
# NOTE(review): profiling is currently disabled — every line below is commented out, so this
# cell does nothing when run. All four reports share the same title ("Profiling Report").
# Presumably switched off because profiling the full tables is slow — confirm, then either
# re-enable it or remove the cell.
# profile_airports = ProfileReport(airports, title="Profiling Report",explorative=True)
# profile_airports.to_file("profile_airports.html")
# profile_cancelled_diverted = ProfileReport(cancelled_diverted, title="Profiling Report",explorative=True)
# profile_cancelled_diverted.to_file("profile_cancelled_diverted.html")
# profile_flights = ProfileReport(flights, title="Profiling Report",explorative=True)
# profile_flights.to_file("profile_flights.html")
# profile_weather = ProfileReport(weather, title="Profiling Report",explorative=True)
# profile_weather.to_file("profile_weather.html")


# Getting to know the duplicates

In [56]:
# Report how many fully duplicated rows each table holds (nothing is removed in this cell).
for table_name, table in (
    ("airports", airports),
    ("cancelled_diverted", cancelled_diverted),
    ("flights", flights),
    ("weather", weather),
):
    print(table_name, table.duplicated().sum())

airports 0
cancelled_diverted 0
flights 0
weather 0


In [57]:
# Remove fully duplicated rows from every table; each name is rebound to the de-duplicated frame.
airports, cancelled_diverted, flights, weather = (
    table.drop_duplicates()
    for table in (airports, cancelled_diverted, flights, weather)
)

In [58]:
# Sanity check after de-duplication: every count printed here should be 0.
for table_name, table in (
    ("airports", airports),
    ("cancelled_diverted", cancelled_diverted),
    ("flights", flights),
    ("weather", weather),
):
    print(table_name, table.duplicated().sum())

airports 0
cancelled_diverted 0
flights 0
weather 0


In [None]:
# Describe each dataset
# NOTE(review): the stored output under this cell lists weather columns (tavg, tmin, ..., pres)
# rather than the airports' latitude / longitude, so it is stale — re-run the cell to refresh it.
airports.describe()

Unnamed: 0,tavg,tmin,tmax,prcp,snow,wdir,wspd,pres
count,132860.0,132860.0,132860.0,132860.0,132860.0,132860.0,132860.0,132860.0
mean,14.042641,8.699833,19.617149,2.520604,6.810131,188.686042,12.424556,1015.650368
std,10.672859,10.680441,11.392223,7.915755,50.935283,114.629099,6.211261,6.944599
min,-39.1,-76.0,-35.0,0.0,0.0,0.0,0.0,964.3
25%,6.3,1.1,11.7,0.0,0.0,79.0,7.9,1011.6
50%,15.4,9.4,21.2,0.0,0.0,197.0,11.3,1015.4
75%,22.5,17.2,28.9,1.0,0.0,297.0,15.7,1019.7
max,42.2,37.2,50.0,571.5,1780.0,360.0,78.3,1051.0


In [None]:
# Summary statistics for the numeric columns of cancelled_diverted (statistics as rows).
cancelled_diverted.describe()

Unnamed: 0,day_of_week,cancelled,diverted,dep_delay,dep_delay_tag,arr_delay,flight_duration,delay_carrier,delay_weather,delay_nas,delay_security,delay_lastaircraft
count,103543.0,103543.0,103543.0,103543.0,103543.0,103543.0,103543.0,103543.0,103543.0,103543.0,103543.0,103543.0
mean,3.969993,0.840144,0.159856,7.870025,0.094598,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,2.021613,0.366474,0.366474,49.841918,0.292661,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,1.0,0.0,0.0,-31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,6.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,7.0,1.0,1.0,2414.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Summary statistics for the numeric columns of flights (statistics as rows).
flights.describe()

Unnamed: 0,day_of_week,dep_delay,dep_delay_tag,arr_delay,flight_duration,delay_carrier,delay_weather,delay_nas,delay_security,delay_lastaircraft,aicraft_age
count,6743373.0,6743373.0,6743373.0,6743373.0,6743373.0,6743373.0,6743373.0,6743373.0,6743373.0,6743373.0,6743373.0
mean,3.982794,12.20106,0.3790028,6.627235,140.2981,5.169827,0.7428539,2.566969,0.03064891,5.681134,13.48064
std,2.001763,55.07948,0.4851388,57.07904,72.87216,36.45741,14.35396,15.00488,1.628927,30.44654,7.891499
min,1.0,-99.0,0.0,-119.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,2.0,-5.0,0.0,-15.0,87.0,0.0,0.0,0.0,0.0,0.0,7.0
50%,4.0,-2.0,0.0,-6.0,124.0,0.0,0.0,0.0,0.0,0.0,12.0
75%,6.0,9.0,1.0,9.0,171.0,0.0,0.0,0.0,0.0,0.0,20.0
max,7.0,4413.0,1.0,4405.0,795.0,3957.0,1860.0,1708.0,1460.0,3581.0,57.0


In [None]:
# Summary statistics for the numeric columns of weather (statistics as rows).
weather.describe()

Unnamed: 0,tavg,tmin,tmax,prcp,snow,wdir,wspd,pres
count,132860.0,132860.0,132860.0,132860.0,132860.0,132860.0,132860.0,132860.0
mean,14.042641,8.699833,19.617149,2.520604,6.810131,188.686042,12.424556,1015.650368
std,10.672859,10.680441,11.392223,7.915755,50.935283,114.629099,6.211261,6.944599
min,-39.1,-76.0,-35.0,0.0,0.0,0.0,0.0,964.3
25%,6.3,1.1,11.7,0.0,0.0,79.0,7.9,1011.6
50%,15.4,9.4,21.2,0.0,0.0,197.0,11.3,1015.4
75%,22.5,17.2,28.9,1.0,0.0,297.0,15.7,1019.7
max,42.2,37.2,50.0,571.5,1780.0,360.0,78.3,1051.0


# Find missing values

In [None]:
# Per-column number of missing values for each of the four tables.
airports_nas, cancelled_diverted_nas, flights_nas, weather_nas = (
    table.isna().sum()
    for table in (airports, cancelled_diverted, flights, weather)
)

# Print each report under its heading, with a row of asterisks between consecutive reports.
na_reports = (
    ("airports_nas: ", airports_nas),
    ("cancelled_diverted_nas: ", cancelled_diverted_nas),
    ("flights_nas: ", flights_nas),
    ("weather_nas: ", weather_nas),
)
for position, (heading, na_counts) in enumerate(na_reports):
    if position > 0:
        print(50 * "*")
    print(heading)
    print(na_counts)

airports_nas: 
iata_code    0
airport      0
city         0
state        0
country      0
latitude     0
longitude    0
dtype: int64
**************************************************
cancelled_diverted_nas: 
flightdate            0
day_of_week           0
airline               0
tail_number           0
cancelled             0
diverted              0
dep_airport           0
dep_cityname          0
deptime_label         0
dep_delay             0
dep_delay_tag         0
dep_delay_type        0
arr_airport           0
arr_cityname          0
arr_delay             0
arr_delay_type        0
flight_duration       0
distance_type         0
delay_carrier         0
delay_weather         0
delay_nas             0
delay_security        0
delay_lastaircraft    0
dtype: int64
**************************************************
flights_nas: 
flightdate            0
day_of_week           0
airline               0
tail_number           0
dep_airport           0
dep_cityname          0
deptime_label    