In [6]:
from pathlib import Path

import pandas as pd
from tqdm.auto import tqdm
from ydata_profiling import ProfileReport
tqdm.pandas()

In [7]:
# Folder holding the raw CSV exports. The name (including its double space)
# is copied verbatim from the original hard-coded paths.
DATA_DIR = Path("../data/US 2023 Civil Flights  delays meteo and aircrafts")

# Load the four raw tables used throughout the notebook.
airports = pd.read_csv(DATA_DIR / "airports_geolocation.csv")
cancelled_diverted = pd.read_csv(DATA_DIR / "Cancelled_Diverted_2023.csv")
flights = pd.read_csv(DATA_DIR / "US_flights_2023.csv")
weather = pd.read_csv(DATA_DIR / "weather_meteo_by_airport.csv")

# TO DO
* Get the info and datatype for each column.
* Fix the typing in each column.
* Describe each dataset.
* Rename columns properly (normalize them to be lower case and with _ instead of spaces).
* Check for duplicates.
* Count the nulls in each column.
* Fill/remove corrupt data (missing values).

# Flights info

- **index:** Unique flight ID (primary key)
- **Day_Of_Week:** Contains the days of the week from 1 to 7
- **Dep_Delay:** Departure delay ranging from -99 minutes to 4413 minutes with an average of 12.20 minutes
- **Dep_Delay_Tag:** Delay tag where 1 represents a delay of more than 5 minutes
- **Arr_Delay:** Arrival delay ranging from -119 minutes to 4405 minutes with an average of 6.62 minutes
- **Flight_Duration:** Flight duration ranging from 0 to 795 minutes with an average of 140 minutes
- **Delay_Carrier:** Delays caused by the airline ranging from 0 minutes to 3957 minutes
- **Delay_Weather:** Weather-related delays ranging from 0 to 1860 minutes
- **Delay_NAS:** Delays attributed to the National Airspace System (NAS), e.g. air traffic control, ranging from 0 to 1708 minutes
- **Delay_Security:** Delays related to security checks ranging from 0 to 1460 minutes
- **Delay_LastAircraft:** Delays due to waiting for the aircraft ranging from 0 to 3581 minutes
- **Aicraft_age:** Age of the aircraft ranging from 1 to 57 years with an average of 13.48 years (the column name is misspelled in the source data; it is not `Aircraft_age`)
- **Airline:** Contains the names of the **15 airlines** present in the dataset.
- **Tail_Number:** Unique identifier of the aircraft. **5963 aircraft** are present.
- **Dep_Airport:** Unique identifier of each departure airport. **350 airports** are represented.
- **Dep_CityName:** Names of the departure cities. **344 cities** are represented.
- **DepTime_label:** Division of the day into 6-hour periods (Night - Morning - Afternoon - Evening).
- **Dep_Delay_Type:** 3 categories of departure delays (Low < 5min - Medium > 15min - High > 60min).
- **Arr_CityName:** Names of the arrival cities. **344 cities** are represented.
- **Arr_Delay_Type:** 3 categories of arrival delays (Low < 5min - Medium > 15min - High > 60min).
- **Distance_type:** 3 categories of flight distances in miles (Short Haul < 1500Mi, Medium Haul < 3000Mi, Long Haul < 6000Mi).
- **Manufacturer:** Aircraft manufacturer. **5 aircraft manufacturers** are present.
- **Model:** Model name given by the manufacturer. 21 aircraft models are present.


In [8]:
# Translate the numeric weekday codes (1 = Monday ... 7 = Sunday) into day names.
day_names = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
day_of_week_mapping = dict(enumerate(day_names, start=1))

# Series.map yields NaN for any value that is not a key of the mapping.
for frame in (flights, cancelled_diverted):
    frame["Day_Of_Week"] = frame["Day_Of_Week"].map(day_of_week_mapping)

In [9]:
# Print a label, then the schema summary (columns, dtypes, memory usage) of `flights`.
print("flights: ")
flights.info()

flights: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6743404 entries, 0 to 6743403
Data columns (total 24 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   FlightDate          object
 1   Day_Of_Week         object
 2   Airline             object
 3   Tail_Number         object
 4   Dep_Airport         object
 5   Dep_CityName        object
 6   DepTime_label       object
 7   Dep_Delay           int64 
 8   Dep_Delay_Tag       int64 
 9   Dep_Delay_Type      object
 10  Arr_Airport         object
 11  Arr_CityName        object
 12  Arr_Delay           int64 
 13  Arr_Delay_Type      object
 14  Flight_Duration     int64 
 15  Distance_type       object
 16  Delay_Carrier       int64 
 17  Delay_Weather       int64 
 18  Delay_NAS           int64 
 19  Delay_Security      int64 
 20  Delay_LastAircraft  int64 
 21  Manufacturer        object
 22  Model               object
 23  Aicraft_age         int64 
dtypes: int64(10), object(14)
memory usage: 1

In [10]:
# Per-column memory footprint of `flights` in MiB at this point of the notebook;
# deep=True asks pandas to also measure the contents of object columns.
bytes_per_mib = 1024**2
old = flights.memory_usage(deep=True).div(bytes_per_mib)
old

Index                   0.000122
FlightDate            430.877750
Day_Of_Week           412.335052
Airline               495.177408
Tail_Number           405.137172
Dep_Airport           385.860672
Dep_CityName          450.460842
DepTime_label         415.689714
Dep_Delay              51.448090
Dep_Delay_Tag          51.448090
Dep_Delay_Type        429.099715
Arr_Airport           385.860672
Arr_CityName          450.466407
Arr_Delay              51.448090
Arr_Delay_Type        429.124532
Flight_Duration        51.448090
Distance_type         483.129905
Delay_Carrier          51.448090
Delay_Weather          51.448090
Delay_NAS              51.448090
Delay_Security         51.448090
Delay_LastAircraft     51.448090
Manufacturer          415.923480
Model                 399.199087
Aicraft_age            51.448090
dtype: float64

In [11]:
# Convert the FlightDate column of both flight tables to datetime64[ns].
for frame in (flights, cancelled_diverted):
    frame["FlightDate"] = frame["FlightDate"].astype("datetime64[ns]")

In [12]:
# Columns of `flights` to store as pandas categoricals.
categorical_columns = [
    "Airline", "Day_Of_Week",
    "Dep_Airport", "Arr_Airport",
    "Dep_CityName", "Arr_CityName",
    "DepTime_label",
    "Dep_Delay_Type", "Arr_Delay_Type",
    "Distance_type",
    "Manufacturer", "Model",
]

# A single astype call converts every listed column; all other columns keep their dtype.
flights = flights.astype(dict.fromkeys(categorical_columns, "category"))

In [13]:
# Print the schema summary of `flights` again to inspect the dtypes after the conversions above.
print("flights: ")
flights.info()

flights: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6743404 entries, 0 to 6743403
Data columns (total 24 columns):
 #   Column              Dtype         
---  ------              -----         
 0   FlightDate          datetime64[ns]
 1   Day_Of_Week         category      
 2   Airline             category      
 3   Tail_Number         object        
 4   Dep_Airport         category      
 5   Dep_CityName        category      
 6   DepTime_label       category      
 7   Dep_Delay           int64         
 8   Dep_Delay_Tag       int64         
 9   Dep_Delay_Type      category      
 10  Arr_Airport         category      
 11  Arr_CityName        category      
 12  Arr_Delay           int64         
 13  Arr_Delay_Type      category      
 14  Flight_Duration     int64         
 15  Distance_type       category      
 16  Delay_Carrier       int64         
 17  Delay_Weather       int64         
 18  Delay_NAS           int64         
 19  Delay_Security      int64       

In [14]:
# Per-column memory footprint of `flights` in MiB, measured again for comparison with `old`.
new = flights.memory_usage(deep=True).div(1024**2)

In [15]:
# For every column whose footprint changed, print the before/after size and the shrink factor.
for before_mb, after_mb, column_name in zip(old, new, old.index):
    if before_mb == after_mb:
        continue
    shrink_factor = before_mb / after_mb
    print(f"{column_name}: {before_mb:0.2f} MB => {after_mb:0.2f} MB. a {shrink_factor:0.2f} times reduction in size")

FlightDate: 430.88 MB => 51.45 MB. a 8.38 times reduction in size
Day_Of_Week: 412.34 MB => 6.43 MB. a 64.11 times reduction in size
Airline: 495.18 MB => 6.43 MB. a 76.98 times reduction in size
Dep_Airport: 385.86 MB => 12.89 MB. a 29.93 times reduction in size
Dep_CityName: 450.46 MB => 12.89 MB. a 34.94 times reduction in size
DepTime_label: 415.69 MB => 6.43 MB. a 64.63 times reduction in size
Dep_Delay_Type: 429.10 MB => 6.43 MB. a 66.72 times reduction in size
Arr_Airport: 385.86 MB => 12.89 MB. a 29.93 times reduction in size
Arr_CityName: 450.47 MB => 12.89 MB. a 34.94 times reduction in size
Arr_Delay_Type: 429.12 MB => 6.43 MB. a 66.72 times reduction in size
Distance_type: 483.13 MB => 6.43 MB. a 75.12 times reduction in size
Manufacturer: 415.92 MB => 6.43 MB. a 64.67 times reduction in size
Model: 399.20 MB => 6.43 MB. a 62.06 times reduction in size


In [16]:
# Peek at five random rows; the fixed seed keeps the displayed rows the same on every run.
flights.sample(n=5, random_state=42)

Unnamed: 0,FlightDate,Day_Of_Week,Airline,Tail_Number,Dep_Airport,Dep_CityName,DepTime_label,Dep_Delay,Dep_Delay_Tag,Dep_Delay_Type,...,Flight_Duration,Distance_type,Delay_Carrier,Delay_Weather,Delay_NAS,Delay_Security,Delay_LastAircraft,Manufacturer,Model,Aicraft_age
779819,2023-02-02,Thursday,Southwest Airlines Co.,N461WN,MDW,"Chicago, IL",Morning,-6,0,Low <5min,...,161,Short Haul >1500Mi,0,0,0,0,0,BOEING,737 NG,20
3614148,2023-07-10,Monday,Southwest Airlines Co.,N7732A,PDX,"Portland, OR",Night,6,1,Low <5min,...,166,Short Haul >1500Mi,6,0,11,0,0,BOEING,737 NG,18
6574944,2023-12-21,Thursday,Southwest Airlines Co.,N8708Q,OKC,"Oklahoma City, OK",Night,-1,0,Low <5min,...,134,Short Haul >1500Mi,0,0,0,0,0,BOEING,737 NG,7
4362271,2023-08-19,Saturday,Southwest Airlines Co.,N205WN,LAS,"Las Vegas, NV",Afternoon,23,1,Medium >15min,...,135,Short Haul >1500Mi,0,0,0,0,0,BOEING,737 NG,19
4955051,2023-09-20,Wednesday,Southwest Airlines Co.,N461WN,RIC,"Richmond, VA",Afternoon,-5,0,Low <5min,...,88,Short Haul >1500Mi,0,0,0,0,0,BOEING,737 NG,20


In [17]:
# count / unique / top / freq for the object and category columns, one row per column.
(
    flights
    .select_dtypes(include=["object", "category"])
    .describe(include="all")
    .transpose()
)

Unnamed: 0,count,unique,top,freq
Day_Of_Week,6743404,7,Friday,1003622
Airline,6743404,15,Southwest Airlines Co.,1421238
Tail_Number,6743404,5963,N488HA,3327
Dep_Airport,6743404,350,ATL,332935
Dep_CityName,6743404,344,"Chicago, IL",338766
DepTime_label,6743404,4,Morning,2611567
Dep_Delay_Type,6743404,3,Low <5min,5409737
Arr_Airport,6743404,350,ATL,332941
Arr_CityName,6743404,344,"Chicago, IL",338319
Arr_Delay_Type,6743404,3,Low <5min,5403727


In [18]:
# Default describe() statistics for `flights`, transposed so each column becomes a row.
flights.describe().transpose()

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
FlightDate,6743404.0,2023-07-04 07:06:16.929396480,2023-01-01 00:00:00,2023-04-06 00:00:00,2023-07-06 00:00:00,2023-10-02 00:00:00,2023-12-31 00:00:00,
Dep_Delay,6743404.0,12.200987,-99.0,-5.0,-2.0,9.0,4413.0,55.079361
Dep_Delay_Tag,6743404.0,0.379001,0.0,0.0,0.0,1.0,1.0,0.485138
Arr_Delay,6743404.0,6.627152,-119.0,-15.0,-6.0,9.0,4405.0,57.078921
Flight_Duration,6743404.0,140.297779,0.0,87.0,124.0,171.0,795.0,72.872157
Delay_Carrier,6743404.0,5.169804,0.0,0.0,0.0,0.0,3957.0,36.457324
Delay_Weather,6743404.0,0.74285,0.0,0.0,0.0,0.0,1860.0,14.353928
Delay_NAS,6743404.0,2.566957,0.0,0.0,0.0,0.0,1708.0,15.004842
Delay_Security,6743404.0,0.030649,0.0,0.0,0.0,0.0,1460.0,1.628923
Delay_LastAircraft,6743404.0,5.681108,0.0,0.0,0.0,0.0,3581.0,30.446469


# Cancelled diverted info

- **index:** Unique flight ID (primary key)
- **Day_Of_Week:** Contains the days of the week from 1 to 7
- **Dep_Delay:** Departure delay ranging from -31 minutes to 2414 minutes with an average of 7.80 minutes
- **Dep_Delay_Tag:** Delay tag where 1 represents a delay of more than 5 minutes
- **Arr_Delay:** Arrival delay; always 0 in this dataset
- **Flight_Duration:** Flight duration; always 0 in this dataset
- **Delay_Carrier:** Delays caused by the airline; always 0 in this dataset
- **Delay_Weather:** Weather-related delays; always 0 in this dataset
- **Delay_NAS:** Delays attributed to the National Airspace System (NAS), e.g. air traffic control; always 0 in this dataset
- **Delay_Security:** Delays related to security checks; always 0 in this dataset
- **Delay_LastAircraft:** Delays due to waiting for the aircraft; always 0 in this dataset
- **Airline:** Contains the names of the **15 airlines** present in the dataset.
- **Tail_Number:** Identifier of the aircraft. **5438 distinct values** are present, including the value `0`, which appears on 15486 rows (all of them cancelled flights).
- **Dep_Airport:** Unique identifier of each departure airport. **345 airports** are represented.
- **Dep_CityName:** Names of the departure cities. **339 cities** are represented.
- **DepTime_label:** Division of the day into 6-hour periods (Night - Morning - Afternoon - Evening).
- **Dep_Delay_Type:** 3 categories of departure delays; the most frequent one is `No Departure Delay`.
- **Arr_CityName:** Names of the arrival cities.
- **Arr_Delay_Type:** Category of arrival delay (e.g. `No Arrival Delay`).
- **Distance_type:** Category of flight distance (e.g. `Short Haul`).
- **Cancelled:** Cancelled-flight flag, stored as a float (1.0 = cancelled, 0.0 = not cancelled). 87936 rows are cancelled.
- **Diverted:** Diverted-flight flag, stored as a float (1.0 = diverted, 0.0 = not diverted). 16552 rows are diverted.

Unlike `flights`, this table has no `Manufacturer`, `Model` or aircraft-age column. The figures above come from the summary tables shown below.

In [19]:
# Print a label, then the schema summary (columns, non-null counts, dtypes) of `cancelled_diverted`.
print("cancelled_diverted: ")
cancelled_diverted.info()

cancelled_diverted: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104488 entries, 0 to 104487
Data columns (total 23 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   FlightDate          104488 non-null  datetime64[ns]
 1   Day_Of_Week         104488 non-null  object        
 2   Airline             104488 non-null  object        
 3   Tail_Number         104488 non-null  object        
 4   Cancelled           104488 non-null  float64       
 5   Diverted            104488 non-null  float64       
 6   Dep_Airport         104488 non-null  object        
 7   Dep_CityName        104488 non-null  object        
 8   DepTime_label       104488 non-null  object        
 9   Dep_Delay           104488 non-null  float64       
 10  Dep_Delay_Tag       104488 non-null  int64         
 11  Dep_Delay_Type      104488 non-null  object        
 12  Arr_Airport         104488 non-null  object        
 13  Arr_City

In [20]:
# Columns of `cancelled_diverted` to store as pandas categoricals
# (this list also includes the Cancelled / Diverted / Dep_Delay_Tag columns).
categorical_columns = [
    "Day_Of_Week", "Airline",
    "Cancelled", "Diverted",
    "Dep_Airport", "Dep_CityName", "DepTime_label",
    "Dep_Delay_Tag", "Dep_Delay_Type",
    "Arr_Airport", "Arr_CityName", "Arr_Delay_Type",
    "Distance_type",
]

# A single astype call converts every listed column; all other columns keep their dtype.
cancelled_diverted = cancelled_diverted.astype(dict.fromkeys(categorical_columns, "category"))

In [21]:
# Peek at five random rows; the fixed seed keeps the displayed rows the same on every run.
cancelled_diverted.sample(n=5, random_state=42)

Unnamed: 0,FlightDate,Day_Of_Week,Airline,Tail_Number,Cancelled,Diverted,Dep_Airport,Dep_CityName,DepTime_label,Dep_Delay,...,Arr_CityName,Arr_Delay,Arr_Delay_Type,Flight_Duration,Distance_type,Delay_Carrier,Delay_Weather,Delay_NAS,Delay_Security,Delay_LastAircraft
21364,2023-02-22,Wednesday,United Air Lines Inc.,0,1.0,0.0,HDN,"Hayden, CO",Afternoon,0.0,...,"Denver, CO",0.0,No Arrival Delay,0.0,Short Haul,0.0,0.0,0.0,0.0,0.0
80915,2023-08-07,Monday,Delta Air Lines Inc,N101DU,1.0,0.0,ORD,"Chicago, IL",Evening,0.0,...,"New York, NY",0.0,No Arrival Delay,0.0,Short Haul,0.0,0.0,0.0,0.0,0.0
72660,2023-07-02,Sunday,Republic Airways,N645RW,1.0,0.0,EWR,"Newark, NJ",Morning,0.0,...,"Columbus, OH",0.0,No Arrival Delay,0.0,Short Haul,0.0,0.0,0.0,0.0,0.0
21792,2023-03-14,Tuesday,PSA Airlines,N543EA,1.0,0.0,DCA,"Washington, DC",Morning,0.0,...,"Burlington, VT",0.0,No Arrival Delay,0.0,Short Haul,0.0,0.0,0.0,0.0,0.0
17752,2023-02-21,Tuesday,Southwest Airlines Co.,N8549Z,1.0,0.0,DEN,"Denver, CO",Afternoon,0.0,...,"Minneapolis, MN",0.0,No Arrival Delay,0.0,Short Haul,0.0,0.0,0.0,0.0,0.0


In [22]:
# Sanity check: is every row whose tail number is "0" a cancelled flight?
has_zero_tail = cancelled_diverted["Tail_Number"] == "0"
cancelled_diverted.loc[has_zero_tail, "Cancelled"].eq(1.0).all()

True

In [23]:
# Among cancelled flights, count how many have the tail number "0" (True) and how many do not (False).
is_cancelled = cancelled_diverted["Cancelled"] == 1.0
cancelled_diverted.loc[is_cancelled, "Tail_Number"].eq("0").value_counts()

Tail_Number
False    72450
True     15486
Name: count, dtype: int64

In [24]:
# count / unique / top / freq for the object and category columns, one row per column.
(
    cancelled_diverted
    .select_dtypes(include=["object", "category"])
    .describe(include="all")
    .transpose()
)

Unnamed: 0,count,unique,top,freq
Day_Of_Week,104488.0,7.0,Wednesday,17461.0
Airline,104488.0,15.0,Southwest Airlines Co.,17227.0
Tail_Number,104488.0,5438.0,0,15486.0
Cancelled,104488.0,2.0,1.0,87936.0
Diverted,104488.0,2.0,0.0,87936.0
Dep_Airport,104488.0,345.0,DFW,5223.0
Dep_CityName,104488.0,339.0,"New York, NY",8352.0
DepTime_label,104488.0,4.0,Afternoon,37942.0
Dep_Delay_Tag,104488.0,2.0,0,94693.0
Dep_Delay_Type,104488.0,3.0,No Departure Delay,96743.0


In [25]:
# Default describe() statistics for `cancelled_diverted`, transposed so each column becomes a row.
cancelled_diverted.describe().transpose()

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
FlightDate,104488.0,2023-06-02 12:06:03.004364288,2023-01-01 00:00:00,2023-03-14 00:00:00,2023-06-23 00:00:00,2023-08-05 00:00:00,2023-12-31 00:00:00,
Dep_Delay,104488.0,7.798848,-31.0,0.0,0.0,0.0,2414.0,49.62161
Arr_Delay,104488.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Flight_Duration,104488.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Delay_Carrier,104488.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Delay_Weather,104488.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Delay_NAS,104488.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Delay_Security,104488.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Delay_LastAircraft,104488.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Weather Info

- **time:** Timestamps associated with the weather data (yyyy-mm-dd).
- **tavg:** Average Temperature (°C).
- **tmin:** Minimum Temperature (°C).
- **tmax:** Maximum Temperature (°C).
- **prcp:** Total precipitation amount in mm.
- **snow:** Snow Depth.
- **wdir:** Wind (From) Direction (Degrees).
- **wspd:** Average Wind Speed (km/h).
- **pres:** Sea-Level Air Pressure (hPa).
- **airport_id:** Unique identifier for the airport where the weather data was collected.


In [26]:
# Print a label, then the schema summary (columns, non-null counts, dtypes, memory usage) of `weather`.
print("weather: ")
weather.info()

weather: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132860 entries, 0 to 132859
Data columns (total 10 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   time        132860 non-null  object 
 1   tavg        132860 non-null  float64
 2   tmin        132860 non-null  float64
 3   tmax        132860 non-null  float64
 4   prcp        132860 non-null  float64
 5   snow        132860 non-null  float64
 6   wdir        132860 non-null  float64
 7   wspd        132860 non-null  float64
 8   pres        132860 non-null  float64
 9   airport_id  132860 non-null  object 
dtypes: float64(8), object(2)
memory usage: 10.1+ MB


In [27]:
# Peek at five random rows; the fixed seed keeps the displayed rows the same on every run.
weather.sample(n=5, random_state=42)

Unnamed: 0,time,tavg,tmin,tmax,prcp,snow,wdir,wspd,pres,airport_id
20175,2023-04-11,15.7,11.1,22.8,0.0,0.0,139.0,10.8,1014.3,BUR
46537,2023-07-02,23.7,17.2,29.4,0.0,0.0,335.0,7.4,1012.1,FOD
78488,2023-01-14,-2.7,-4.4,-0.6,0.0,0.0,238.0,11.4,1026.0,MDW
124387,2023-10-15,10.4,7.8,13.3,0.0,0.0,331.0,16.2,1012.9,TOL
125997,2023-03-14,17.5,9.4,26.7,0.0,0.0,158.0,10.8,1015.3,TUS


In [28]:
# Default describe() statistics for `weather`, transposed so each column becomes a row.
weather.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
tavg,132860.0,14.042641,10.672859,-39.1,6.3,15.4,22.5,42.2
tmin,132860.0,8.699833,10.680441,-76.0,1.1,9.4,17.2,37.2
tmax,132860.0,19.617149,11.392223,-35.0,11.7,21.2,28.9,50.0
prcp,132860.0,2.520604,7.915755,0.0,0.0,0.0,1.0,571.5
snow,132860.0,6.810131,50.935283,0.0,0.0,0.0,0.0,1780.0
wdir,132860.0,188.686042,114.629099,0.0,79.0,197.0,297.0,360.0
wspd,132860.0,12.424556,6.211261,0.0,7.9,11.3,15.7,78.3
pres,132860.0,1015.650368,6.944599,964.3,1011.6,1015.4,1019.7,1051.0


In [29]:
# Convert the `time` column to datetime64[ns]; every other column is left as is.
weather = weather.assign(time=weather["time"].astype("datetime64[ns]"))

# Airports info

- **IATA_CODE:** Unique identifier for the airport.
- **AIRPORT:** Full name of the airport.
- **CITY:** City where the airport is located.
- **STATE:** State (US) where the airport is located.
- **COUNTRY:** Country where the airport is located (always `USA`).
- **LATITUDE:** Geographic latitude of the airport in decimal degrees.
- **LONGITUDE:** Geographic longitude of the airport in decimal degrees.

In [30]:
# Print a label, then the schema summary (columns, non-null counts, dtypes, memory usage) of `airports`.
print("airports: ")
airports.info()

airports: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 364 entries, 0 to 363
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   IATA_CODE  364 non-null    object 
 1   AIRPORT    364 non-null    object 
 2   CITY       364 non-null    object 
 3   STATE      364 non-null    object 
 4   COUNTRY    364 non-null    object 
 5   LATITUDE   364 non-null    float64
 6   LONGITUDE  364 non-null    float64
dtypes: float64(2), object(5)
memory usage: 20.0+ KB


In [31]:
# Columns of `airports` to store as pandas categoricals.
categorical_columns = ["AIRPORT", "CITY", "STATE", "COUNTRY"]

# A single astype call converts every listed column; all other columns keep their dtype.
airports = airports.astype(dict.fromkeys(categorical_columns, "category"))

In [32]:
# Peek at five random rows; the fixed seed keeps the displayed rows the same on every run.
airports.sample(n=5, random_state=42)

Unnamed: 0,IATA_CODE,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE
167,ICT,Wichita Dwight D. Eisenhower National Airport ...,Wichita,KS,USA,37.64996,-97.43305
187,KTN,Ketchikan International Airport,Ketchikan,AK,USA,55.35557,-131.71374
271,PPG,Pago Pago International Airport (Tafuna Airport),Pago Pago,AS,USA,-14.2781,-170.7025
168,IDA,Idaho Falls Regional Airport,Idaho Falls,ID,USA,43.51456,-112.07017
63,CHA,Chattanooga Metropolitan Airport (Lovell Field),Chattanooga,TN,USA,35.03527,-85.20379


In [33]:
# Default describe() statistics for `airports`, transposed so each column becomes a row.
airports.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
LATITUDE,364.0,38.81097,8.757505,-14.2781,33.672655,39.25193,43.11311,71.28545
LONGITUDE,364.0,-96.809033,27.471654,-176.64603,-110.788562,-93.74314,-82.501792,145.7333


In [34]:
# count / unique / top / freq for the object and category columns, one row per column.
(
    airports
    .select_dtypes(include=["object", "category"])
    .describe(include="all")
    .transpose()
)

Unnamed: 0,count,unique,top,freq
IATA_CODE,364,364,ABE,1
AIRPORT,364,363,Port Columbus International Airport,2
CITY,364,346,Columbus,3
STATE,364,55,TX,26
COUNTRY,364,1,USA,364


# Normalizing column names

In [35]:
# Function to normalize column names
def normalize_columns(df):
    """Lower-case the column labels of ``df`` and replace spaces with underscores.

    The frame is modified in place (its ``columns`` attribute is reassigned)
    and the same object is returned, so both ``normalize_columns(df)`` and
    ``df = normalize_columns(df)`` work.

    Args:
        df: A pandas DataFrame.

    Returns:
        The same DataFrame object, now carrying the normalized column labels.
    """

    def _normalize(label):
        # Only string labels are rewritten; anything else (e.g. an integer
        # column label) is passed through unchanged instead of raising, which
        # is what the ``.str`` accessor does on a non-string index.
        if isinstance(label, str):
            return label.lower().replace(" ", "_")
        return label

    df.columns = df.columns.map(_normalize)
    return df

In [36]:
# Apply the shared column-naming convention to all four tables.
airports, cancelled_diverted, flights, weather = (
    normalize_columns(frame)
    for frame in (airports, cancelled_diverted, flights, weather)
)

In [37]:
# Print each table's column index, followed by a separator line.
separator = 50 * "*"
for label, frame in (
    ("airports", airports),
    ("cancelled_diverted", cancelled_diverted),
    ("flights", flights),
    ("weather", weather),
):
    print(f"{label}: ", frame.columns)
    print(separator)

airports:  Index(['iata_code', 'airport', 'city', 'state', 'country', 'latitude',
       'longitude'],
      dtype='object')
**************************************************
cancelled_diverted:  Index(['flightdate', 'day_of_week', 'airline', 'tail_number', 'cancelled',
       'diverted', 'dep_airport', 'dep_cityname', 'deptime_label', 'dep_delay',
       'dep_delay_tag', 'dep_delay_type', 'arr_airport', 'arr_cityname',
       'arr_delay', 'arr_delay_type', 'flight_duration', 'distance_type',
       'delay_carrier', 'delay_weather', 'delay_nas', 'delay_security',
       'delay_lastaircraft'],
      dtype='object')
**************************************************
flights:  Index(['flightdate', 'day_of_week', 'airline', 'tail_number', 'dep_airport',
       'dep_cityname', 'deptime_label', 'dep_delay', 'dep_delay_tag',
       'dep_delay_type', 'arr_airport', 'arr_cityname', 'arr_delay',
       'arr_delay_type', 'flight_duration', 'distance_type', 'delay_carrier',
       'delay_weather'

# Getting to know the duplicates

In [38]:
# Count fully duplicated rows in each table (nothing is removed in this cell).
for label, frame in (
    ("airports", airports),
    ("cancelled_diverted", cancelled_diverted),
    ("flights", flights),
    ("weather", weather),
):
    print(label, frame.duplicated().sum())

airports 0
cancelled_diverted 945


flights 31
weather 0


In [39]:
# Remove exact duplicate rows from the two tables that reported any, keeping the
# first occurrence of each. The surviving rows keep their original index labels.
flights = flights.drop_duplicates(keep="first")
cancelled_diverted = cancelled_diverted.drop_duplicates(keep="first")

In [40]:
# Re-count duplicated rows in each table to confirm the de-duplication worked.
for label, frame in (
    ("airports", airports),
    ("cancelled_diverted", cancelled_diverted),
    ("flights", flights),
    ("weather", weather),
):
    print(label, frame.duplicated().sum())

airports 0
cancelled_diverted 0
flights 0
weather 0


# Find missing values

In [41]:
# Number of missing values per column, for each table.
airports_nas = airports.isna().sum()
cancelled_diverted_nas = cancelled_diverted.isna().sum()
flights_nas = flights.isna().sum()
weather_nas = weather.isna().sum()

# Print every report under its label, with a separator line between consecutive reports.
na_reports = (
    ("airports_nas", airports_nas),
    ("cancelled_diverted_nas", cancelled_diverted_nas),
    ("flights_nas", flights_nas),
    ("weather_nas", weather_nas),
)
for position, (label, counts) in enumerate(na_reports):
    if position:
        print(50 * "*")
    print(f"{label}: ")
    print(counts)

airports_nas: 
iata_code    0
airport      0
city         0
state        0
country      0
latitude     0
longitude    0
dtype: int64
**************************************************
cancelled_diverted_nas: 
flightdate            0
day_of_week           0
airline               0
tail_number           0
cancelled             0
diverted              0
dep_airport           0
dep_cityname          0
deptime_label         0
dep_delay             0
dep_delay_tag         0
dep_delay_type        0
arr_airport           0
arr_cityname          0
arr_delay             0
arr_delay_type        0
flight_duration       0
distance_type         0
delay_carrier         0
delay_weather         0
delay_nas             0
delay_security        0
delay_lastaircraft    0
dtype: int64
**************************************************
flights_nas: 
flightdate            0
day_of_week           0
airline               0
tail_number           0
dep_airport           0
dep_cityname          0
deptime_label    

In [None]:
# Print the schema summary of every table, each followed by a "=" separator line.
for label, frame in (
    ("airports", airports),
    ("cancelled_diverted", cancelled_diverted),
    ("flights", flights),
    ("weather", weather),
):
    print(f"{label} df info:")
    frame.info()
    print(50 * "=")

# Data profiling

In [43]:
# Build an explorative ydata-profiling HTML report for each table.
# ProfileReport is already imported in the notebook's first cell.
REPORT_DIR = Path("./profiling reports")
# Create the output folder up front so the cell does not depend on it already existing.
REPORT_DIR.mkdir(parents=True, exist_ok=True)


def build_profile(df, title):
    """Profile ``df``, write the report to ``REPORT_DIR/<title>.html`` and return it.

    Args:
        df: The DataFrame to profile.
        title: Report title; also used as the HTML file name (without extension).

    Returns:
        The generated ProfileReport object.
    """
    report = ProfileReport(df=df, title=title, explorative=True)
    report.to_file(output_file=REPORT_DIR / f"{title}.html")
    return report


profile_airports = build_profile(airports, "profile_airports")
profile_cancelled_diverted = build_profile(cancelled_diverted, "profile_cancelled_diverted")
profile_flights = build_profile(flights, "profile_flights")
profile_weather = build_profile(weather, "profile_weather")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'could not convert string to float: 'Endeavor Air'')


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'could not convert string to float: 'Endeavor Air'')


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]