 # Notebook description

 This notebook continues the analysis of flight data and delays.

Notebook configuration

In [130]:
from sqlalchemy import create_engine
from sqlalchemy.engine import URL
import pandas as pd 
import plotly.express as px
import os
from dotenv import  load_dotenv

In [None]:
# Load the Postgres password from the .env file into the process environment.
# The path is a raw string: in a normal string "\P", "\d" and "\." are invalid
# escape sequences (a SyntaxWarning on Python 3.12+); the resulting path value
# is unchanged.
load_dotenv(r"D:\Python\devcontainer\.env")

Create url and engine

In [131]:
# The password is read from the POSTGRES_PASSWORD environment variable
# instead of being written into the notebook.
db_password = os.getenv("POSTGRES_PASSWORD")

url = URL.create(
    drivername="postgresql+psycopg2",
    username='postgres',
    password=db_password,
    host='localhost',
    database='postgres',
)

# Engine used by every SQL read below.
engine = create_engine(url)

Loading an SQL database using Pandas

In [132]:
def read_sql_table(table_name):
    """Read `table_name` from the database into a DataFrame.

    The query goes through the notebook-level `engine`, so the engine cell
    must have been executed first.

    Parameters
    ----------
    table_name : str
        Value passed to `pd.read_sql` as its `sql` argument.

    Returns
    -------
    pandas.DataFrame
        Whatever `pd.read_sql` returns for that argument.
    """
    return pd.read_sql(table_name, engine)

 Loading data frame `flight_df`

In [133]:
# flight_df_02.csv is the file created at the end of notebook 04_Data_Analysis_02.ipynb.
flight_df = pd.read_csv(filepath_or_buffer=r'flight_df_02.csv')

 #  `airport_list`

Loading data frame `airport_list_df`

In [134]:
# Airport lookup table, loaded through the shared SQL helper.
airport_list_df = read_sql_table('airport_list')

Checking duplicates for a column `origin_airport_id`

In [135]:
# True if any airport id appears more than once in the lookup table.
airport_list_df['origin_airport_id'].duplicated().any()

False

Merge `flight_df` and `airport_list_df` using `origin_airport_id`

In [137]:
# Attach the origin city name to every flight.
# `validate` makes pandas raise a MergeError if `origin_airport_id` is ever
# duplicated in `airport_list_df`, instead of silently multiplying flight rows
# in the left join.
flight_df = flight_df.merge(
    airport_list_df[['origin_airport_id', 'origin_city_name']],
    how='left',
    on='origin_airport_id',
    validate='many_to_one',
)

Merge `flight_df` and `airport_list_df` using `dest_airport_id` (matched against `origin_airport_id` in `airport_list_df`)

In [138]:
# Attach the destination city name by matching `dest_airport_id` against the
# same airport lookup. Columns present on both sides receive the default
# `_x` / `_y` suffixes.
# `validate` makes pandas raise a MergeError if the lookup key is duplicated,
# instead of silently multiplying flight rows in the left join.
flight_df = flight_df.merge(
    airport_list_df[['origin_airport_id', 'origin_city_name']],
    how='left',
    left_on='dest_airport_id',
    right_on='origin_airport_id',
    validate='many_to_one',
)

In [139]:
# Tidy up the suffixed columns from the two airport merges: drop the redundant
# right-hand airport id, then give the remaining columns their final names.
suffixed_to_final = {
    'origin_city_name_x': 'origin_city_name',
    'origin_city_name_y': 'destination_city_name',
    'origin_airport_id_x': 'origin_airport_id',
}
flight_df = (
    flight_df
    .drop(columns='origin_airport_id_y')
    .rename(columns=suffixed_to_final)
)


 ## Analysis by airports and routes

Determine the airport from which the most planes departed.

In [141]:
# One random row; no seed is set, so the row shown changes on every run.
flight_df.sample(n=1)

Unnamed: 0,id,month,day_of_month,day_of_week,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,dest_airport_id,crs_dep_time,...,security_delay,late_aircraft_delay,year,is_delayed,is_weekend,distance_agg,manufacture_year,manufacture_year_agg,origin_city_name,destination_city_name
242240,312025,2,9,6,DL,N954AT,1622,14107,14747,1224,...,0.0,67.0,2019,1,1,"(1100, 1200]",2000.0,"(1999, 2002]","Phoenix, AZ","Seattle, WA"


In [142]:
# Non-null counts of every column per origin city, busiest first (ranked by
# the `id` count).
top_airports_origin_df = (
    flight_df
    .groupby('origin_city_name')
    .count()
    .sort_values(by='id', ascending=False)
    .reset_index()
)

In [143]:
# Five busiest origin cities.
top_airports_origin_df.head(5)

Unnamed: 0,origin_city_name,id,month,day_of_month,day_of_week,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,dest_airport_id,...,nas_delay,security_delay,late_aircraft_delay,year,is_delayed,is_weekend,distance_agg,manufacture_year,manufacture_year_agg,destination_city_name
0,"Atlanta, GA",123162,123162,123162,123162,123162,123162,123162,123162,123162,...,23228,23228,23228,123162,123162,123162,123162,119753,119532,123162
1,"Chicago, IL",105437,105437,105437,105437,105437,105437,105437,105437,105437,...,28124,28124,28124,105437,105437,105437,105437,102805,102755,105437
2,"Los Angeles, CA",87849,87849,87849,87849,87849,87849,87849,87849,87849,...,16392,16392,16392,87849,87849,87849,87849,87404,87404,87849
3,"New York, NY",75785,75785,75785,75785,75785,75785,75785,75785,75785,...,17971,17971,17971,75785,75785,75785,75785,75575,75574,75785
4,"Denver, CO",64525,64525,64525,64525,64525,64525,64525,64525,64525,...,14991,14991,14991,64525,64525,64525,64525,64409,64409,64525


Determine the airport to which the most planes arrived

In [144]:
# Non-null counts of every column per destination city, busiest first (ranked
# by the `id` count).
top_airports_destination_df = (
    flight_df
    .groupby('destination_city_name')
    .count()
    .sort_values(by='id', ascending=False)
    .reset_index()
)

 # Adding weather data


 Loading data frame `airport_weather`

In [154]:
# Weather observations table, loaded through the shared SQL helper.
airport_weather_df = read_sql_table('airport_weather')

Deleting redundant columns

In [155]:
# List the available columns before deciding which ones to drop.
airport_weather_df.columns

Index(['id', 'station', 'name', 'date', 'awnd', 'pgtm', 'prcp', 'snow', 'snwd',
       'tavg', 'tmax', 'tmin', 'wdf2', 'wdf5', 'wsf2', 'wsf5', 'wt01', 'wt02',
       'wt03', 'wt04', 'wt05', 'wt06', 'wt07', 'wt08', 'wt09', 'wesd', 'wt10',
       'psun', 'tsun', 'sn32', 'sx32', 'tobs', 'wt11', 'wt18'],
      dtype='object')

In [156]:
# Columns not needed for the delay analysis.
redundant_weather_columns = [
    'id', 'pgtm',
    'tavg', 'tmin', 'wdf2', 'wdf5', 'wsf2', 'wsf5', 'wt01', 'wt02',
    'wt03', 'wt04', 'wt05', 'wt06', 'wt07', 'wt08', 'wt09', 'wesd', 'wt10',
    'psun', 'tsun', 'sn32', 'sx32', 'tobs', 'wt11', 'wt18',
]
airport_weather_df = airport_weather_df.drop(columns=redundant_weather_columns)

Merge `airport_list_df` into `airport_weather_df` (matching on `name`), overwriting `airport_weather_df` with the result.

In [160]:
# Add `origin_airport_id` to each weather row by matching on `name`.
# The inner join keeps only weather rows whose name exists in the airport list.
airport_weather_df = pd.merge(
    airport_weather_df,
    airport_list_df[['origin_airport_id', 'name']],
    how='inner',
    on='name',
)

 ## Merge `airport_weather_df` and `flight_df`.

Change the `date` column type to `DATETIME` in the `airport_weather_df` frame.

In [167]:
# Convert `date` to datetime so it can be used as a merge key with `flight_df`.
airport_weather_df['date'] = pd.to_datetime(arg=airport_weather_df['date'])

In [169]:
# Check dtypes and non-null counts after the merge and the date conversion.
airport_weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43394 entries, 0 to 43393
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   station            43394 non-null  object        
 1   name               43394 non-null  object        
 2   date               43394 non-null  datetime64[ns]
 3   awnd               43386 non-null  float64       
 4   prcp               43368 non-null  float64       
 5   snow               29578 non-null  float64       
 6   snwd               29007 non-null  float64       
 7   tmax               43386 non-null  float64       
 8   origin_airport_id  43394 non-null  int64         
dtypes: datetime64[ns](1), float64(5), int64(1), object(2)
memory usage: 3.0+ MB


Create a `date` column in the `flight_df` frame.

In [184]:
# The next cell builds the date from columns named `year`, `month` and `day`,
# so `day_of_month` is renamed to `day` first.
flight_df = flight_df.rename({'day_of_month': 'day'}, axis='columns')

In [186]:
# Build one datetime per flight from its year / month / day columns.
date_parts = flight_df[['year', 'month', 'day']]
flight_df['date'] = pd.to_datetime(date_parts)

In [205]:
# Keep `date` as a midnight-normalized datetime so it matches the weather
# `date` key in the merge below.
# `.dt.normalize()` replaces the earlier datetime -> '%Y-%m-%d' string ->
# datetime round trip: it yields the same values without formatting and
# re-parsing every row.
flight_df['date'] = pd.to_datetime(flight_df['date']).dt.normalize()

Merge `airport_weather_df` and `flight_df` based on: `origin_airport_id`, `date`.

In [209]:
# Attach the origin airport's weather for the flight date to every flight.
# The left join keeps flights that have no matching weather row (their weather
# columns become NaN).
# NOTE(review): no uniqueness check on (`origin_airport_id`, `date`) in
# `airport_weather_df` is visible here; duplicates would multiply flight rows.
# Consider adding validate='many_to_one' once that is confirmed.
flight_df = pd.merge(
    flight_df,
    airport_weather_df,
    how='left',
    on=['origin_airport_id', 'date'],
)

In [210]:
# One random row to confirm the weather columns were attached (unseeded, so
# the row shown changes on every run).
flight_df.sample(n=1)

Unnamed: 0,id,month,day,day_of_week,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,dest_airport_id,crs_dep_time,...,origin_city_name,destination_city_name,date,station,name,awnd,prcp,snow,snwd,tmax
450905,576262,5,6,1,DL,N916DN,2017,14100,10397,1636,...,"Philadelphia, PA","Atlanta, GA",2019-05-06,USW00013739,"PHILADELPHIA INTERNATIONAL AIRPORT, PA US",7.16,0.0,0.0,0.0,73.0


## Analysis of the effect of maximum temperature on delays

Analysis for the column `tmax`.

In [212]:
# One random row as a reminder of the available columns (unseeded, so the row
# shown changes on every run).
flight_df.sample(n=1)

Unnamed: 0,id,month,day,day_of_week,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,dest_airport_id,crs_dep_time,...,origin_city_name,destination_city_name,date,station,name,awnd,prcp,snow,snwd,tmax
787509,1019157,2,3,7,AS,N926VA,1946,14771,12892,900,...,"San Francisco, CA","Los Angeles, CA",2019-02-03,USW00023234,"SAN FRANCISCO INTERNATIONAL AIRPORT, CA US",13.42,0.14,,,56.0


Outliers

In [219]:
# Summary statistics of `tmax`, with extra upper percentiles to expose the tail.
tmax_percentiles = [0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]
flight_df['tmax'].describe(percentiles=tmax_percentiles).round(2)

count    1051956.00
mean          70.33
std           17.97
min          -10.00
10%           45.00
25%           59.00
50%           72.00
75%           84.00
90%           92.00
95%           95.00
99%          106.00
max          115.00
Name: tmax, dtype: float64

In [223]:
# Origin cities that recorded a `tmax` above 114 (i.e. the observed maximum of 115).
flight_df.loc[flight_df['tmax'] > 114, 'origin_city_name'].unique()

array(['Phoenix, AZ'], dtype=object)

 ### Comment
Temperatures around 115 °F occur naturally in Phoenix, AZ during the period covered by the data, i.e. from June to September (the highest temperature on record is 122 °F), so these values are not dismissed as outliers.

source:
- https://www.currentresults.com/Weather/Arizona/Places/phoenix-temperatures-by-month-average.php
- https://www.azfamily.com/page/what-they-dont-tell-you-about-arizonas-temperature-extremes/

Removing null values in `tmax`

In [224]:
# True if at least one flight has no `tmax` value.
flight_df['tmax'].isna().any()

True

In [233]:
# Keep only flights with a known `tmax`; `flight_df` itself is left untouched.
tmax_clean_flight_df = flight_df.dropna(axis='index', subset=['tmax'])

In [235]:
# Sanity check: no missing `tmax` values should remain.
tmax_clean_flight_df['tmax'].isna().any()

False

In [237]:
# One random row of the cleaned frame (unseeded, so the row shown changes on
# every run).
tmax_clean_flight_df.sample(n=1)

Unnamed: 0,id,month,day,day_of_week,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,dest_airport_id,crs_dep_time,...,origin_city_name,destination_city_name,date,station,name,awnd,prcp,snow,snwd,tmax
554296,704285,5,8,3,9E,N932XJ,3358,12953,11057,1859,...,"New York, NY","Charlotte, NC",2019-05-08,USW00014732,"LAGUARDIA AIRPORT, NY US",7.83,0.0,0.0,0.0,68.0


In [240]:
# For every distinct `tmax`: the number of flights (`count`) and the share of
# delayed flights (`mean` of the 0/1 `is_delayed` flag).
delayed_by_tmax = tmax_clean_flight_df.groupby('tmax')['is_delayed']
agg_tmax_clean_flight_df = delayed_by_tmax.agg(['count', 'mean']).reset_index()

In [244]:
# Show the aggregate ordered from coldest to hottest.
agg_tmax_clean_flight_df.sort_values(by='tmax')

Unnamed: 0,tmax,count,mean
0,-10.0,127,0.661417
1,1.0,178,0.455056
2,3.0,7,0.571429
3,5.0,295,0.477966
4,6.0,186,0.413978
...,...,...,...
107,111.0,678,0.168142
108,112.0,407,0.164619
109,113.0,300,0.166667
110,114.0,588,0.192177


In [243]:
# One random row of the aggregate (unseeded, so the row shown changes on
# every run).
agg_tmax_clean_flight_df.sample(n=1)

Unnamed: 0,tmax,count,mean
91,95.0,13244,0.210057


Graph of the effect of temperature on delay

In [246]:
# Scatter of the share of delayed flights against the daily maximum temperature.
fig = px.scatter(
    agg_tmax_clean_flight_df,
    x='tmax',
    y='mean',
    title='Temperature vs. Delay',
)

fig.update_xaxes(title_text='Temperature °F')

# `mean` is a fraction between 0 and 1, while the axis is labelled in percent.
# tickformat '.0%' renders the ticks as percentages so the scale matches the label.
fig.update_yaxes(title_text='Delayed Flights [%]', tickformat='.0%')

fig.show()

## Comments on the results

Temperature affects the percentage of delayed flights mainly at temperatures up to about 40 °F (about 4.4 °C). This is consistent with real-world operations: in cold conditions aircraft need to be de-iced and the runway has to be continuously maintained for takeoff and landing, both of which lead to delays.