# Task 10
# Test Hypothesis: Higher delays tend to have higher fuel consumption per passenger

Note: fuel consumption is around 3-4 liters per passenger per 100 km in the industry.

In [1]:
import pandas as pd
import numpy as np

## Step 1. Find out the number of passengers carried (monthly) by each air carrier
Also find out the distance covered monthly by each air carrier.  
This information is found in the passengers DB.

In [2]:
# Load the cleaned passengers table; the first CSV column is used as the row index.
passengers = pd.read_csv(
    'passengers_cleaned.csv',
    index_col=0,
)

  mask |= (ar1 == a)


In [3]:
# Print the column names and dtypes of the passengers table.
passengers.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2350494 entries, 0 to 2350496
Data columns (total 38 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   departures_scheduled   float64
 1   departures_performed   float64
 2   payload                float64
 3   seats                  float64
 4   passengers             float64
 5   freight                float64
 6   mail                   float64
 7   distance               float64
 8   ramp_to_ramp           float64
 9   air_time               float64
 10  unique_carrier         object 
 11  airline_id             int64  
 12  unique_carrier_name    object 
 13  region                 object 
 14  carrier                object 
 15  carrier_name           object 
 16  carrier_group          int64  
 17  carrier_group_new      int64  
 18  origin_airport_id      int64  
 19  origin_city_market_id  int64  
 20  origin                 object 
 21  origin_city_name       object 
 22  origin_country    

In [4]:
# get date range
# Series.min()/.max() are vectorised and skip NaN; the Python builtins min()/max()
# iterate the Series element by element, which is slow on a table of this size.
print(f'The passengers DB is containing data from {passengers.year.min()} to {passengers.year.max()}')

The passengers DB is containing data from 2015 to 2019


In [5]:
# Restrict to 2018 onwards so the period matches the flights database (2018-2019).
passengers = passengers.loc[passengers['year'] >= 2018]

In [6]:
# Monthly totals of passengers and distance for each carrier.
# Selecting the columns with a list makes .sum() return a DataFrame directly.
db_passengers = (
    passengers
    .groupby(['unique_carrier', 'year', 'month'])[['passengers', 'distance']]
    .sum()
)
db_passengers.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,passengers,distance
unique_carrier,year,month,Unnamed: 3_level_1,Unnamed: 4_level_1
02Q,2018,3,25.0,3339.0
02Q,2018,4,119.0,8585.0
02Q,2018,6,86.0,13998.0
02Q,2019,2,552.0,16318.0
02Q,2019,5,158.0,12035.0


## Step 2: Find out monthly total fuel consumption per air carrier.
This information can be found in the fuels DB


In [7]:
# Load the cleaned fuel table; the first CSV column is used as the row index.
fuel = pd.read_csv(
    'fuel_cleaned.csv',
    index_col=0,
)

In [8]:
# Print the column names, non-null counts and dtypes of the fuel table.
fuel.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3025 entries, 1 to 3034
Data columns (total 25 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   month              3025 non-null   int64  
 1   airline_id         3025 non-null   float64
 2   unique_carrier     3025 non-null   object 
 3   carrier            3025 non-null   object 
 4   carrier_name       3025 non-null   object 
 5   carrier_group_new  3025 non-null   int64  
 6   sdomt_gallons      3025 non-null   float64
 7   satl_gallons       3025 non-null   float64
 8   spac_gallons       3025 non-null   float64
 9   slat_gallons       3025 non-null   float64
 10  sint_gallons       3025 non-null   float64
 11  ts_gallons         3025 non-null   float64
 12  tdomt_gallons      3025 non-null   float64
 13  tint_gallons       3025 non-null   float64
 14  total_gallons      3025 non-null   float64
 15  sdomt_cost         3025 non-null   float64
 16  satl_cost          3025 

In [9]:
# get date range
# Series.min()/.max() are vectorised and skip NaN, unlike the Python builtins
# min()/max(), which iterate the Series element by element.
print(f'The fuels DB is containing data from {fuel.year.min()} to {fuel.year.max()}')

The fuels DB is containing data from 2015 to 2019


In [10]:
# Restrict to 2018 onwards so the period matches the flights database (2018-2019).
fuel = fuel.loc[fuel['year'] >= 2018]

In [11]:
# Total gallons consumed per carrier and calendar month, kept as a one-column DataFrame.
db_fuel = (
    fuel
    .groupby(['unique_carrier', 'year', 'month'])['total_gallons']
    .sum()
    .to_frame()
)
db_fuel.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_gallons
unique_carrier,year,month,Unnamed: 3_level_1
09Q,2018,1,1467426.0
09Q,2018,2,1648225.0
09Q,2018,3,2539355.0
09Q,2018,4,1119351.0
09Q,2018,5,975540.0


## Step 3: Find out monthly delay per air carrier 
This information can be found in flights DB

In [2]:
# Load the flights table (rows with nulls already removed, per the file name).
# NOTE(review): the recorded run printed a warning while reading this file; if it is
# pandas' mixed-dtype warning, passing explicit dtypes would silence it - confirm.
flights = pd.read_csv(
    'flights_nullsremoved.csv',
)

  interactivity=interactivity, compiler=compiler, result=result)


In [13]:
# extract month from fl_date feature
import datetime

# Parse fl_date into datetimes, then derive calendar month and year columns from it.
flights['fl_date'] = pd.to_datetime(flights['fl_date'])
flights['month'] = flights['fl_date'].dt.month
flights['year'] = flights['fl_date'].dt.year

NameError: name 'flights' is not defined

In [None]:
# get date range
# Series.min()/.max() are vectorised and skip NaN, unlike the Python builtins
# min()/max(), which iterate the Series element by element.
print(f'The flights DB is containing data from {flights.year.min()} to {flights.year.max()}')

**Trying code on a sample**

In [None]:
# Take the first 1000 rows (by position) to try the aggregation on a small sample.
flights_sample = flights.iloc[:1000]

In [None]:
# Sum departure and arrival delays per carrier and calendar month (sample only).
flights_sample = (
    flights_sample
    .groupby(by=['op_unique_carrier', 'year', 'month'])[['dep_delay', 'arr_delay']]
    .sum()
)

In [None]:
# Preview the first rows of the sample.
flights_sample.head()

In [None]:
# Combined delay = departure delay + arrival delay, row by row.
flights_sample['monthly_delay'] = flights_sample['dep_delay'].add(flights_sample['arr_delay'])

In [None]:
# Preview the first rows of the sample again.
flights_sample.head()

**Running same code on flights db**

In [None]:
# Aggregate the full flights table per carrier and calendar month:
# sum departure and arrival delays, then add their total as monthly_delay.
db_flights = (
    flights
    .groupby(['op_unique_carrier', 'year', 'month'])[['dep_delay', 'arr_delay']]
    .sum()
    .assign(monthly_delay=lambda agg: agg['dep_delay'] + agg['arr_delay'])
)

db_flights.head()

In [None]:
# Name the index levels (unique_carrier, year, month); the first level holds
# op_unique_carrier values and is renamed to unique_carrier.
db_flights.index = db_flights.index.set_names(['unique_carrier', 'year', 'month'])

In [None]:
# Display the per-carrier, per-month delay aggregate.
db_flights

### Merge databases

In [None]:
# Combine the three monthly per-carrier tables on (unique_carrier, year, month):
# fuel is the base, passengers are attached with a left join (fuel rows are kept
# even without passenger data), and delays with an inner join (only carrier-months
# present in the flights aggregate survive).
data_merged = (
    db_fuel
    .merge(db_passengers, how='left', on=['unique_carrier', 'year', 'month'])
    .merge(db_flights, how='inner', on=['unique_carrier', 'year', 'month'])
)

In [None]:
# Display the merged fuel / passengers / delay table.
data_merged

In [None]:
# Keep only rows with a positive total_gallons (this drops rows, not columns).
# .copy() makes the result an independent frame, so adding columns to
# data_merged later does not trigger pandas' SettingWithCopyWarning.
data_merged = data_merged[data_merged.total_gallons > 0].copy()
data_merged

In [None]:
# reset_index
# NOTE: reset_index() returns a new DataFrame and its result is not assigned here,
# so this cell only displays a flattened view; data_merged itself keeps its index.
data_merged.reset_index()

In [None]:
# compute consumption (in gallons per mile and in liters per 100km) per passenger

# NOTE(review): distance and passengers are summed separately per carrier-month
# before the merge, so this divides by sum(distance) * sum(passengers) rather than
# by summed passenger-miles - possibly why the result is far from the 3-4 L/100km
# benchmark. Confirm, and consider aggregating passengers * distance instead.

# gallons per mile per passenger
# Zero distance or zero passengers is masked to NaN so the ratio never becomes
# +/-inf, which matplotlib would later reject as an axis limit.
data_merged['gallons_per_mile_per_passenger'] = (
    data_merged.total_gallons
    / data_merged.distance.replace(0, np.nan)
    / data_merged.passengers.replace(0, np.nan)
)

# liters per 100km per passenger (more used and understandable)
# 1 mile = 1.60934 km and 1 liter = 0.264172 US gallons, hence
# gallons/mile -> liters/km is a division by (1.60934 * 0.264172).
data_merged['liters_per_100km_per_passenger'] = (
    data_merged['gallons_per_mile_per_passenger'] * 100 / (1.60934 * 0.264172)
)

In [None]:
# Preview the first rows of the merged table.
data_merged.head()

In [None]:
# the liters_per_100km_per_passenger measure is supposed to be around 3-4 liters. Seems to be an issue here

## Plot the results

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# plot
# Upper y-limit = largest finite value. Series.max() skips NaN (the builtin max()
# is unreliable when NaN is present), and +/-inf is masked first because
# matplotlib raises on non-finite axis limits.
y_top = (
    data_merged.liters_per_100km_per_passenger
    .replace([np.inf, -np.inf], np.nan)
    .max()
)

fig, ax = plt.subplots(figsize=(12, 7))
ax.scatter(data_merged.monthly_delay, data_merged.liters_per_100km_per_passenger)
if pd.notna(y_top):
    # only set the limit when at least one finite value exists
    ax.set_ylim(0, y_top)
ax.set_ylabel('consumption per passenger (in liters/100km)')
ax.set_xlabel('flight delay (minutes)')
ax.set_title('Consumptions per flight delays')

In [None]:
# Same scatter as the previous plot, zoomed on the low-consumption cluster
# by capping the y-axis at 0.007.

fig, ax = plt.subplots(figsize=(12, 7))
ax.scatter(data_merged.monthly_delay, data_merged.liters_per_100km_per_passenger)
ax.set_ylim(0, 0.007)
ax.set_ylabel('consumption per passenger (in liters/100km)')
ax.set_xlabel('flight delay (minutes)')
ax.set_title('Consumptions per flight delays')

Conclusion: consumption per passenger does not appear to be related to flight delay. Some points show much higher consumption per passenger, which might be explained by emptier flights. Let's check this by leaving passengers out and looking only at fuel consumption per distance unit.

# Check hypothesis: Higher delays tend to have a higher fuel consumption per mile
Compare consumption in gallons per mile and see if there is a relation with delays

In [None]:
# Fuel burned per mile: total gallons divided by total distance.
# A zero distance is masked to NaN so the ratio never becomes +/-inf,
# which would make the axis limit of the following plot invalid.
data_merged['cons_per_mile'] = data_merged.total_gallons / data_merged.distance.replace(0, np.nan)
data_merged.head()

In [None]:
# Scatter of fuel consumption per mile against total monthly delay.
# Upper y-limit = largest finite value. Series.max() skips NaN (the builtin max()
# is unreliable when NaN is present), and +/-inf is masked first because
# matplotlib raises on non-finite axis limits.
y_top = data_merged.cons_per_mile.replace([np.inf, -np.inf], np.nan).max()

fig, ax = plt.subplots(figsize=(12, 7))
ax.scatter(data_merged.monthly_delay, data_merged.cons_per_mile)
if pd.notna(y_top):
    # only set the limit when at least one finite value exists
    ax.set_ylim(0, y_top)
ax.set_ylabel('fuel consumption(in gallons/mile)')
ax.set_xlabel('flight delay (minutes)')
ax.set_title('Consumptions per flight delays')

In [None]:
# Looking at this one, it seems like higher delays tend to lead to slightly higher fuel consumption

In [None]:
# export merged_db
# to_csv is a DataFrame method; `pd.data_merged_to_csv` does not exist on the
# pandas module and raised AttributeError. The output path is kept unchanged.
data_merged.to_csv('EDA_Task10_data')