In [24]:
#load packages
import pandas as pd 
import xlsxwriter

In [2]:
# Read the cleaned FW tail-number CSV; the 'dates' column is parsed to datetime on load.
fw = pd.read_csv(
    "../Datasets/FW/FW_tail_numbers(cleaned).csv",
    encoding='latin-1',
    parse_dates=['dates'],
)

In [3]:
# Inspect the data: column dtypes, non-null counts and memory usage
fw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13031 entries, 0 to 13030
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   tail_number  13031 non-null  object        
 1   dates        13031 non-null  datetime64[ns]
 2   aircraft     13005 non-null  object        
 3   origin       13030 non-null  object        
 4   destination  13031 non-null  object        
 5   departure    13031 non-null  object        
 6   arrival      12788 non-null  object        
 7   duration     12783 non-null  object        
 8   NAME         13031 non-null  object        
dtypes: datetime64[ns](1), object(8)
memory usage: 916.4+ KB


In [4]:
# Glimpse at the first rows of the data
fw.head()

Unnamed: 0,tail_number,dates,aircraft,origin,destination,departure,arrival,duration,NAME
0,N7025P,2020-12-24,,"Near Red Bluff, CA","Near Emigrant Gap, CA",09:25AM PST,10:02AM PST,0:36,A PRECIOUS LIFE FLIGHT LLC ...
1,N7025P,2020-11-27,,Redding Muni,"Near Chico, CA",11:26AM PST,11:42AM PST,0:16,A PRECIOUS LIFE FLIGHT LLC ...
2,N269GJ,2020-12-18,LJ60,Aurora Muni,Fort Lauderdale Exec,08:53AM CST,12:20PM EST,2:26,AEROCARE MEDICAL TRANSPORT SYSTEM INC ...
3,N888CP,2021-02-11,LJ31,Aurora Muni,Aurora Muni,10:32AM CST,11:25AM CST,0:53,AEROCARE MEDICAL TRANSPORT SYSTEMS INC ...
4,N888CP,2021-01-24,LJ31,Akron-Canton Rgnl,Aurora Muni,04:41PM EST,04:58PM CST,1:17,AEROCARE MEDICAL TRANSPORT SYSTEMS INC ...


In [5]:
# Check for null values: count the missing entries in each column
fw.isna().sum()

tail_number      0
dates            0
aircraft        26
origin           1
destination      0
departure        0
arrival        243
duration       248
NAME             0
dtype: int64

In [6]:
# A missing duration is taken to mean the flight was cancelled or diverted,
# so only the rows that actually have a duration are kept.
has_duration = fw['duration'].notna()
fw = fw[has_duration]

In [7]:
# Discard flights that were only scheduled (their duration field holds the text 'Scheduled').
not_scheduled = fw['duration'] != 'Scheduled'
fw = fw[not_scheduled]

In [8]:
# Parse the "H:MM" duration strings once and reuse the result for both derived
# columns (the original parsed the same column three times).
# NOTE(review): the "%H:%M" format cannot represent durations of 24 hours or
# more; such a value would raise here -- confirm none are expected.
parsed_duration = pd.to_datetime(fw['duration'], format="%H:%M")

# `fw` is the result of row filtering in earlier cells, so build a new frame
# with .assign() rather than writing columns into it in place; this avoids
# SettingWithCopyWarning and attribute-style column assignment.
fw = fw.assign(
    # duration as a datetime.time value (H:M)
    duration=parsed_duration.dt.time,
    # duration expressed in whole minutes
    duration_minutes=parsed_duration.dt.hour * 60 + parsed_duration.dt.minute,
)

# Rearrange for convenience: keep the existing column order, but place
# duration_minutes right after the flight columns and NAME last. Selecting by
# label instead of by position keeps this correct if the column count changes.
leading_columns = [col for col in fw.columns if col not in ('duration_minutes', 'NAME')]
fw = fw[leading_columns + ['duration_minutes', 'NAME']]

In [9]:
# Only consider flights lasting at least 5 minutes; shorter flights are not
# regarded as meaningful for this analysis.
# (This explanation must be a plain `#` comment: a line starting with `!!` is
# interpreted by IPython as a shell command.)
MIN_DURATION_MINUTES = 5
fw = fw.query("duration_minutes >= @MIN_DURATION_MINUTES")

In [18]:
# Number of flights for each service (NAME) and each of its tail numbers.
# Computed once and reused for both results below.
flights_per_tail = fw.groupby("NAME").tail_number.value_counts()

# Name the counts explicitly instead of renaming a column by its old label:
# the default name that value_counts() gives its result differs between pandas
# versions, so a rename keyed on 'tail_number' can silently do nothing.
total_flights = flights_per_tail.rename('Number of Flights').to_frame()

# Average flights per day: flight count divided by the number of distinct
# dates on which that tail number flew. The division aligns on the
# (NAME, tail_number) index.
active_days = fw.groupby(['NAME', 'tail_number']).dates.nunique()
avg_total_flights = (flights_per_tail.sort_index() / active_days).rename('Average Flights per Day')

In [19]:
# Flight-duration statistics (in minutes) for every (NAME, tail_number) pair,
# with the aggregate columns relabelled for the report.
stats = (
    fw.groupby(["NAME", 'tail_number'])
    .duration_minutes
    .agg(['sum', 'mean', 'median', 'max', 'min'])
    .rename(
        columns={
            'sum': 'Total Duration(min)',
            'mean': 'Average Duration(min)',
            'median': 'Median Duration(min)',
            'max': 'MAX Duration(min)',
            'min': 'MIN Duration(min)',
        }
    )
)

In [22]:
# Combined summary table: flight counts, average flights per day and the
# duration statistics, all aligned on the (NAME, tail_number) index.
total_stats = total_flights.join(
    [avg_total_flights, stats]
)
total_stats.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Number of Flights,Average Flights per Day,Total Duration(min),Average Duration(min),Median Duration(min),MAX Duration(min),MIN Duration(min)
NAME,tail_number,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A PRECIOUS LIFE FLIGHT LLC,N7025P,2,1.0,52,26.0,26.0,36,16
AEROCARE MEDICAL TRANSPORT SYSTEM INC,N269GJ,1,1.0,146,146.0,146.0,146,146
AEROCARE MEDICAL TRANSPORT SYSTEMS INC,N888CP,110,3.055556,7525,68.409091,45.0,224,9
AEROMED TRANSPORT CO LLC,N80YD,43,1.592593,2489,57.883721,50.0,263,5
AIR AMBULANCE BY AIR TREK INC,N644AT,96,2.526316,8247,85.90625,86.5,190,9


In [23]:
# Origins: how many times each tail number of each service departed from each origin.
# The counts are named explicitly rather than renamed from the 'origin' label,
# because the default name value_counts() gives its result differs between
# pandas versions and a label-based rename can silently do nothing.
Origins = (
    fw.groupby(['NAME', 'tail_number'])
    .origin
    .value_counts()
    .rename('Number of Flights')
    .to_frame()
)
Origins.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Number of Flights
NAME,tail_number,origin,Unnamed: 3_level_1
A PRECIOUS LIFE FLIGHT LLC,N7025P,"Near Red Bluff, CA",1
A PRECIOUS LIFE FLIGHT LLC,N7025P,Redding Muni,1
AEROCARE MEDICAL TRANSPORT SYSTEM INC,N269GJ,Aurora Muni,1
AEROCARE MEDICAL TRANSPORT SYSTEMS INC,N888CP,Aurora Muni,34
AEROCARE MEDICAL TRANSPORT SYSTEMS INC,N888CP,Chicago Midway Intl,23


In [25]:
# Write each summary table to its own worksheet of a single Excel workbook,
# using XlsxWriter as the engine.
# The context manager saves and closes the file when the block exits (also if
# an error occurs while writing); the explicit writer.save() call it replaces
# is no longer available in recent pandas releases.
with pd.ExcelWriter('..//Datasets/FW/FW_stats.xlsx', engine='xlsxwriter') as writer:
    total_stats.to_excel(writer, sheet_name='stats')
    Origins.to_excel(writer, sheet_name='origin')