In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [4]:
optimized_dtypes = {
    'YEAR': 'int16',
    'MONTH': 'int8',
    'DAY': 'int8',
    'DAY_OF_WEEK': 'int8',
    'FLIGHT_NUMBER': 'int16',
    'SCHEDULED_DEPARTURE': 'int16',
    'DEPARTURE_TIME': 'float32',
    'DEPARTURE_DELAY': 'float32',
    'TAXI_OUT': 'float32',
    'WHEELS_OFF': 'float32',
    'SCHEDULED_TIME': 'float32',
    'ELAPSED_TIME': 'float32',
    'AIR_TIME': 'float32',
    'DISTANCE': 'int16',
    'WHEELS_ON': 'float32',
    'TAXI_IN': 'float32',
    'SCHEDULED_ARRIVAL': 'int16',
    'ARRIVAL_TIME': 'float32',
    'ARRIVAL_DELAY': 'float32',
    'DIVERTED': 'int8',
    'CANCELLED': 'int8',
    'AIRLINE': 'category',
    'ORIGIN_AIRPORT': 'category',
    'DESTINATION_AIRPORT': 'category'
}

In [5]:
df = pd.read_csv('flights.csv', dtype=optimized_dtypes)
print("DataFrame created successfully.")
print(f"The dataset has {df.shape[0]:,} rows and {df.shape[1]} columns.")
print("Memory usage of the DataFrame:")
print(df.memory_usage(deep=True).sum() / (1024 ** 2), "MB")

DataFrame created successfully.
The dataset has 5,819,079 rows and 31 columns.
Memory usage of the DataFrame:
1061.2577543258667 MB


In [6]:
print("\n## Descriptive Statistics ##")
print(df.describe().round(2))


## Descriptive Statistics ##
            YEAR       MONTH         DAY  DAY_OF_WEEK  FLIGHT_NUMBER  \
count  5819079.0  5819079.00  5819079.00   5819079.00     5819079.00   
mean      2015.0        6.52       15.70         3.93        2173.09   
std          0.0        3.41        8.78         1.99        1757.06   
min       2015.0        1.00        1.00         1.00           1.00   
25%       2015.0        4.00        8.00         2.00         730.00   
50%       2015.0        7.00       16.00         4.00        1690.00   
75%       2015.0        9.00       23.00         6.00        3230.00   
max       2015.0       12.00       31.00         7.00        9855.00   

       SCHEDULED_DEPARTURE  DEPARTURE_TIME  DEPARTURE_DELAY    TAXI_OUT  \
count           5819079.00      5732926.00       5732926.00  5730032.00   
mean               1329.60         1335.21             9.37       16.07   
std                 483.75          496.42            37.08        8.90   
min                  

In [7]:
print("\n## Missing Values Count ##")
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0].sort_values(ascending=False))


## Missing Values Count ##
CANCELLATION_REASON    5729195
SECURITY_DELAY         4755640
AIR_SYSTEM_DELAY       4755640
LATE_AIRCRAFT_DELAY    4755640
WEATHER_DELAY          4755640
AIRLINE_DELAY          4755640
ARRIVAL_DELAY           105071
ELAPSED_TIME            105071
AIR_TIME                105071
TAXI_IN                  92513
WHEELS_ON                92513
ARRIVAL_TIME             92513
TAXI_OUT                 89047
WHEELS_OFF               89047
DEPARTURE_DELAY          86153
DEPARTURE_TIME           86153
TAIL_NUMBER              14721
SCHEDULED_TIME               6
dtype: int64


In [8]:
print("\n## Average Arrival Delay by Airline (in minutes) ##")
airline_delays = df.groupby('AIRLINE')['ARRIVAL_DELAY'].mean().sort_values(ascending=False)
print(airline_delays.round(2))


## Average Arrival Delay by Airline (in minutes) ##
AIRLINE
NK    14.47
F9    12.50
B6     6.68
EV     6.59
MQ     6.46
OO     5.85
UA     5.43
VX     4.74
WN     4.37
US     3.71
AA     3.45
HA     2.02
DL     0.19
AS    -0.98
Name: ARRIVAL_DELAY, dtype: float32


  airline_delays = df.groupby('AIRLINE')['ARRIVAL_DELAY'].mean().sort_values(ascending=False)


In [9]:
print("\n## Average Departure Delay by Hour of Day ##")
# Create an 'hour' column from the scheduled departure time
df['DEPARTURE_HOUR'] = (df['SCHEDULED_DEPARTURE'] // 100).astype('int8')
# Group by hour and calculate the mean delay
hourly_delays = df.groupby('DEPARTURE_HOUR')['DEPARTURE_DELAY'].mean()
print(hourly_delays.round(2))


## Average Departure Delay by Hour of Day ##
DEPARTURE_HOUR
0      7.20
1      8.09
2      7.27
3      8.92
4     10.34
5      1.90
6      2.11
7      3.25
8      4.69
9      5.68
10     6.89
11     7.78
12     8.95
13     9.80
14    11.18
15    11.94
16    12.93
17    13.84
18    14.99
19    15.08
20    15.29
21    13.72
22    11.77
23     9.52
Name: DEPARTURE_DELAY, dtype: float32


In [10]:
print("\n## Total Flight Cancellations by Month ##")
monthly_cancellations = df.groupby('MONTH')['CANCELLED'].sum().sort_values(ascending=False)
print(monthly_cancellations)


## Total Flight Cancellations by Month ##
MONTH
2     20517
1     11982
3     11002
6      9120
12     8063
5      5694
8      5052
7      4806
11     4599
4      4520
10     2454
9      2075
Name: CANCELLED, dtype: int64
