In [46]:
# Importing all libraries
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd 

#Ignoring ugly warnings
warnings.filterwarnings("ignore")

In [47]:
# Creating DataFrame with CSV file 
df = pd.read_csv('../archive/AviationData.csv', encoding = 'latin1')

# Calling DataFrame
df

Unnamed: 0,Event.Id,Investigation.Type,Accident.Number,Event.Date,Location,Country,Latitude,Longitude,Airport.Code,Airport.Name,...,Purpose.of.flight,Air.carrier,Total.Fatal.Injuries,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Broad.phase.of.flight,Report.Status,Publication.Date
0,20001218X45444,Accident,SEA87LA080,1948-10-24,"MOOSE CREEK, ID",United States,,,,,...,Personal,,2.0,0.0,0.0,0.0,UNK,Cruise,Probable Cause,
1,20001218X45447,Accident,LAX94LA336,1962-07-19,"BRIDGEPORT, CA",United States,,,,,...,Personal,,4.0,0.0,0.0,0.0,UNK,Unknown,Probable Cause,19-09-1996
2,20061025X01555,Accident,NYC07LA005,1974-08-30,"Saltville, VA",United States,36.922223,-81.878056,,,...,Personal,,3.0,,,,IMC,Cruise,Probable Cause,26-02-2007
3,20001218X45448,Accident,LAX96LA321,1977-06-19,"EUREKA, CA",United States,,,,,...,Personal,,2.0,0.0,0.0,0.0,IMC,Cruise,Probable Cause,12-09-2000
4,20041105X01764,Accident,CHI79FA064,1979-08-02,"Canton, OH",United States,,,,,...,Personal,,1.0,2.0,,0.0,VMC,Approach,Probable Cause,16-04-1980
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88884,20221227106491,Accident,ERA23LA093,2022-12-26,"Annapolis, MD",United States,,,,,...,Personal,,0.0,1.0,0.0,0.0,,,,29-12-2022
88885,20221227106494,Accident,ERA23LA095,2022-12-26,"Hampton, NH",United States,,,,,...,,,0.0,0.0,0.0,0.0,,,,
88886,20221227106497,Accident,WPR23LA075,2022-12-26,"Payson, AZ",United States,341525N,1112021W,PAN,PAYSON,...,Personal,,0.0,0.0,0.0,1.0,VMC,,,27-12-2022
88887,20221227106498,Accident,WPR23LA076,2022-12-26,"Morgan, UT",United States,,,,,...,Personal,MC CESSNA 210N LLC,0.0,0.0,0.0,0.0,,,,


In [48]:
# Checking info of DataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88889 entries, 0 to 88888
Data columns (total 31 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Event.Id                88889 non-null  object 
 1   Investigation.Type      88889 non-null  object 
 2   Accident.Number         88889 non-null  object 
 3   Event.Date              88889 non-null  object 
 4   Location                88837 non-null  object 
 5   Country                 88663 non-null  object 
 6   Latitude                34382 non-null  object 
 7   Longitude               34373 non-null  object 
 8   Airport.Code            50132 non-null  object 
 9   Airport.Name            52704 non-null  object 
 10  Injury.Severity         87889 non-null  object 
 11  Aircraft.damage         85695 non-null  object 
 12  Aircraft.Category       32287 non-null  object 
 13  Registration.Number     87507 non-null  object 
 14  Make                    88826 non-null

In [49]:
# Filling NaN with "Airplane" value
df['Aircraft.Category'].fillna(value = 'Airplane', inplace = True)
df["Aircraft.Category"].value_counts()

Aircraft.Category
Airplane             84219
Helicopter            3440
Glider                 508
Balloon                231
Gyrocraft              173
Weight-Shift           161
Powered Parachute       91
Ultralight              30
Unknown                 14
WSFT                     9
Powered-Lift             5
Blimp                    4
UNK                      2
Rocket                   1
ULTR                     1
Name: count, dtype: int64

In [50]:
# Removes all non-airplane related series
df = df[df["Aircraft.Category"].isin(['Airplane'])]
df["Aircraft.Category"].value_counts()

Aircraft.Category
Airplane    84219
Name: count, dtype: int64

In [51]:
# Dropping all Amateurly built airplanes and removing the column
df = df[df['Amateur.Built'] != 'Yes']
df.drop(columns = ['Amateur.Built'], inplace = True)

In [52]:
# Only Personal and Instructional flights
df = df[df['Purpose.of.flight'].isin(['Personal', 'Intructional'])]

In [53]:
# Calculate the percentage of values being NaN for each column
rows = len(df)
missing = df.isna().sum()
percentage_missing = missing / rows

In [54]:
# Put the data in a DataFrame and sort it
percentage_missing_df = pd.DataFrame({'Missing' : percentage_missing})
percentage_missing_df.sort_values('Missing', ascending = False, inplace = True)

In [55]:
#printing columns with more than 10% missing values
print(percentage_missing_df[percentage_missing_df['Missing'] > 0.1])

                         Missing
Schedule                0.993543
Air.carrier             0.893938
FAR.Description         0.671258
Longitude               0.618767
Latitude                0.618644
Airport.Code            0.389482
Airport.Name            0.355994
Broad.phase.of.flight   0.255972
Publication.Date        0.163438
Total.Serious.Injuries  0.137316
Total.Minor.Injuries    0.128870
Total.Fatal.Injuries    0.122928


In [56]:
# Drop columns with over 50% missing values
cols_to_drop = list(percentage_missing_df[percentage_missing_df['Missing'] > 0.5].index)
df.drop(columns = cols_to_drop, axis = 1, inplace = True)
print(cols_to_drop)

['Schedule', 'Air.carrier', 'FAR.Description', 'Longitude', 'Latitude']


In [57]:
# Drop records that are not accidents in United States
before = len(df)
df = df[(df['Investigation.Type'] == 'Accident') & (df['Country'] == 'United States')]
dropped = before - len(df)
print(str(dropped) + ' rows dropped.')

1057 rows dropped.


In [58]:
# Convert Date to a datetime, add a Year & Month column and remove data before 1998
df['Event.Date'] = pd.to_datetime(df['Event.Date'])

In [59]:
#Add a day, month & year column
df['Year'] = df['Event.Date'].dt.year
df['Month.Abbr'] = df['Event.Date'].dt.month_name().str[:3]
df['Day.Name.Abbr'] = df['Event.Date'].dt.day_name().str[:3]

# Remove data before 1998
df = df[df['Year'] >= 1995]

In [60]:
# Merge different capitalizations of Make togheter
df['Make'] = df['Make'].str.title()
df['Make'].value_counts().nlargest(10)

Make
Cessna      7945
Piper       5013
Beech       1800
Mooney       613
Bellanca     364
Maule        320
Aeronca      286
Stinson      219
Champion     217
Luscombe     216
Name: count, dtype: int64

In [61]:
# Merge same airport names togheter
df['Airport.Name'].replace(to_replace = '(?i)^.*private.*$', value = 'PRIVATE', inplace = True, regex = True)
df['Airport.Name'].replace(to_replace = '(?i)none', value = 'NONE', inplace = True, regex = True)
df['Airport.Name'].value_counts().nlargest(10)

Airport.Name
PRIVATE            422
MERRILL FIELD       28
NORTH LAS VEGAS     22
PVT                 21
NONE                19
North Las Vegas     17
BIG BEAR CITY       16
Merrill Field       15
CENTENNIAL          15
VAN NUYS            15
Name: count, dtype: int64

In [62]:
# Merge same registration numbers togheter
df['Registration.Number'].replace(to_replace = '(?i)none', value = 'NONE', inplace = True, regex = True)
df['Registration.Number'].value_counts().nlargest(10)

Registration.Number
NONE      14
UNREG     12
N5408Y     4
N9299D     4
N3331R     4
N99811     3
N55HU      3
N41VK      3
N146P      3
N3764Z     3
Name: count, dtype: int64

In [63]:
# Merge weather condition unknowns
df['Weather.Condition'].replace(to_replace = ['Unk', 'UNK'], value = 'Unknown', inplace = True, regex = False)
df['Weather.Condition'].value_counts()

Weather.Condition
VMC        19940
IMC         1457
Unknown       86
Name: count, dtype: int64

In [64]:
# Split location in city and state for tableau interface
df['City'] = df['Location'].str.split(',').str[0]
df['State'] = df['Location'].str.split(',').str[1]
df[['City', 'State']].head(10)

Unnamed: 0,City,State
36598,LANCASTER,CA
36599,TORREON,NM
36600,FOLEY,AL
36602,DAVIS,CA
36603,UNION CITY,TN
36604,TEHACHAPI,CA
36606,BUNNELL,FL
36608,MIAMISBURG,OH
36609,INT'L FALLS,MN
36616,DUTCH HARBOR,AK


In [65]:
# Cleans amount excess Fatal values
df['Injury.Severity'] = df['Injury.Severity'].str.split('(').str[0]
dict_ = {"Minor" : "Non-Fatal", "Serious" : "Non-Fatal", 'Non-Fatal': 'Non-Fatal', "Fatal" : "Fatal"}
df['Injury.Severity'] = df['Injury.Severity'].map(dict_) 
df['Injury.Severity'].value_counts()

Injury.Severity
Non-Fatal    17648
Fatal         4083
Name: count, dtype: int64

In [66]:
# Dropping uncesseary columns
df.drop(columns = 'Event.Id', inplace = True)
df.drop(columns = 'Accident.Number', inplace = True)
df.drop(columns = "Registration.Number", inplace = True)
df.drop(columns = "Publication.Date", inplace = True)
df.drop(columns = "Airport.Code", inplace = True)
df.drop(columns = "Airport.Name", inplace = True)

In [67]:
#Resetting the index
df.reset_index().drop(columns = ['index'])

Unnamed: 0,Investigation.Type,Event.Date,Location,Country,Injury.Severity,Aircraft.damage,Aircraft.Category,Make,Model,Number.of.Engines,...,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Broad.phase.of.flight,Report.Status,Year,Month.Abbr,Day.Name.Abbr,City,State
0,Accident,1995-01-01,"LANCASTER, CA",United States,Fatal,Substantial,Airplane,Aero Commander,200D,1.0,...,0.0,1.0,VMC,Cruise,Probable Cause,1995,Jan,Sun,LANCASTER,CA
1,Accident,1995-01-01,"TORREON, NM",United States,Non-Fatal,Substantial,Airplane,Cessna,177B,1.0,...,0.0,2.0,VMC,Takeoff,Probable Cause,1995,Jan,Sun,TORREON,NM
2,Accident,1995-01-02,"FOLEY, AL",United States,Non-Fatal,Destroyed,Airplane,Piper,PA-28-151,1.0,...,0.0,1.0,VMC,Approach,Probable Cause,1995,Jan,Mon,FOLEY,AL
3,Accident,1995-01-02,"DAVIS, CA",United States,Non-Fatal,Substantial,Airplane,Cessna,175,1.0,...,0.0,1.0,VMC,Landing,Probable Cause,1995,Jan,Mon,DAVIS,CA
4,Accident,1995-01-02,"UNION CITY, TN",United States,Non-Fatal,Substantial,Airplane,Beech,V35B,1.0,...,1.0,0.0,VMC,Cruise,Probable Cause,1995,Jan,Mon,UNION CITY,TN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21735,Accident,2022-12-21,"Auburn Hills, MI",United States,Non-Fatal,,Airplane,Cessna,172F,,...,0.0,0.0,,,,2022,Dec,Wed,Auburn Hills,MI
21736,Accident,2022-12-26,"Annapolis, MD",United States,Non-Fatal,,Airplane,Piper,PA-28-151,,...,0.0,0.0,,,,2022,Dec,Mon,Annapolis,MD
21737,Accident,2022-12-26,"Payson, AZ",United States,Non-Fatal,Substantial,Airplane,American Champion Aircraft,8GCBC,1.0,...,0.0,1.0,VMC,,,2022,Dec,Mon,Payson,AZ
21738,Accident,2022-12-26,"Morgan, UT",United States,,,Airplane,Cessna,210N,,...,0.0,0.0,,,,2022,Dec,Mon,Morgan,UT


In [68]:
#Filling NaN values with values for visualization later
df['Aircraft.damage'] = df['Aircraft.damage'].fillna('None')
df['Total.Fatal.Injuries'].fillna(0, inplace = True)
df['Total.Serious.Injuries'].fillna(0, inplace = True)
df['Total.Minor.Injuries'].fillna(0, inplace = True)
df['Total.Uninjured'].fillna(0, inplace = True)

In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21740 entries, 36598 to 88888
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Investigation.Type      21740 non-null  object        
 1   Event.Date              21740 non-null  datetime64[ns]
 2   Location                21737 non-null  object        
 3   Country                 21740 non-null  object        
 4   Injury.Severity         21731 non-null  object        
 5   Aircraft.damage         21740 non-null  object        
 6   Aircraft.Category       21740 non-null  object        
 7   Make                    21739 non-null  object        
 8   Model                   21738 non-null  object        
 9   Number.of.Engines       21180 non-null  float64       
 10  Engine.Type             20579 non-null  object        
 11  Purpose.of.flight       21740 non-null  object        
 12  Total.Fatal.Injuries    21740 non-null  float64

In [70]:
#Creating cleaned CSV file.
df.to_csv('data/cleaned_aviation.csv')