### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

### Read and load dataset into pandas DataFrame

In [2]:
# Read the crash records from the CSV file, report the frame's
# (rows, columns) shape, then preview the first five rows.
data = pd.read_csv(
    filepath_or_buffer='Airplane_Crashes_and_Fatalities_Since_1908_20190820105639.csv'
)
print(data.shape)
data.head(n=5)

(4967, 17)


Unnamed: 0,Date,Time,Location,Operator,Flight #,Route,AC Type,Registration,cn/ln,Aboard,Aboard Passangers,Aboard Crew,Fatalities,Fatalities Passangers,Fatalities Crew,Ground,Summary
0,09/17/1908,17:18,"Fort Myer, Virginia",Military - U.S. Army,,Demonstration,Wright Flyer III,,1.0,2.0,1.0,1.0,1.0,1.0,0.0,0.0,"During a demonstration flight, a U.S. Army fly..."
1,09/07/1909,,"Juvisy-sur-Orge, France",,,Air show,Wright Byplane,SC1,,1.0,0.0,1.0,1.0,0.0,0.0,0.0,Eugene Lefebvre was the first pilot to ever be...
2,07/12/1912,06:30,"Atlantic City, New Jersey",Military - U.S. Navy,,Test flight,Dirigible,,,5.0,0.0,5.0,5.0,0.0,5.0,0.0,First U.S. dirigible Akron exploded just offsh...
3,08/06/1913,,"Victoria, British Columbia, Canada",Private,,,Curtiss seaplane,,,1.0,0.0,1.0,1.0,0.0,1.0,0.0,The first fatal airplane accident in Canada oc...
4,09/09/1913,18:30,Over the North Sea,Military - German Navy,,,Zeppelin L-1 (airship),,,20.0,,,14.0,,,0.0,The airship flew into a thunderstorm and encou...


In [3]:
# Number of distinct non-null Summary texts. nunique() counts them directly
# instead of building a full frequency table just to count its rows.
data['Summary'].nunique()

4808

In [4]:
# Share of missing values per column, expressed as a percentage.
data.isna().mean().mul(100)

Date                      0.000000
Time                     30.400644
Location                  0.080532
Operator                  0.201329
Flight #                 73.525267
Route                    15.582847
AC Type                   0.301993
Registration              5.496275
cn/ln                    13.448762
Aboard                    0.362392
Aboard Passangers         4.610429
Aboard Crew               4.550030
Fatalities                0.161063
Fatalities Passangers     4.872156
Fatalities Crew           4.852023
Ground                    0.825448
Summary                   1.288504
dtype: float64

#### Working on the Date column

In [5]:
# Parse the month/day/year strings with an explicit format so every row is
# interpreted the same way (no per-value format guessing), then split the
# parsed dates into separate Year, Month and Day columns via the .dt accessor.
data['Date'] = pd.to_datetime(data['Date'], format='%m/%d/%Y')
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data['Day'] = data['Date'].dt.day

In [6]:
# Year, Month and Day now hold the date parts, so the original Date column
# is no longer needed.
data = data.drop(columns=['Date'])
data.head()

Unnamed: 0,Time,Location,Operator,Flight #,Route,AC Type,Registration,cn/ln,Aboard,Aboard Passangers,Aboard Crew,Fatalities,Fatalities Passangers,Fatalities Crew,Ground,Summary,Year,Month,Day
0,17:18,"Fort Myer, Virginia",Military - U.S. Army,,Demonstration,Wright Flyer III,,1.0,2.0,1.0,1.0,1.0,1.0,0.0,0.0,"During a demonstration flight, a U.S. Army fly...",1908,9,17
1,,"Juvisy-sur-Orge, France",,,Air show,Wright Byplane,SC1,,1.0,0.0,1.0,1.0,0.0,0.0,0.0,Eugene Lefebvre was the first pilot to ever be...,1909,9,7
2,06:30,"Atlantic City, New Jersey",Military - U.S. Navy,,Test flight,Dirigible,,,5.0,0.0,5.0,5.0,0.0,5.0,0.0,First U.S. dirigible Akron exploded just offsh...,1912,7,12
3,,"Victoria, British Columbia, Canada",Private,,,Curtiss seaplane,,,1.0,0.0,1.0,1.0,0.0,1.0,0.0,The first fatal airplane accident in Canada oc...,1913,8,6
4,18:30,Over the North Sea,Military - German Navy,,,Zeppelin L-1 (airship),,,20.0,,,14.0,,,0.0,The airship flew into a thunderstorm and encou...,1913,9,9


In [7]:
# Labels of all object-dtype columns; these are treated as categorical.
cat_cols = data.select_dtypes(include=['object']).columns

In [8]:
# Display the labels of the object-dtype (categorical) columns.
cat_cols

Index(['Time', 'Location', 'Operator', 'Flight #', 'Route', 'AC Type',
       'Registration', 'cn/ln', 'Summary'],
      dtype='object')

In [9]:
# Labels of every column whose dtype is not object; these are treated as numerical.
num_cols = data.select_dtypes(exclude=['object']).columns
num_cols

Index(['Aboard', 'Aboard Passangers', 'Aboard Crew', 'Fatalities',
       'Fatalities Passangers', 'Fatalities Crew', 'Ground', 'Year', 'Month',
       'Day'],
      dtype='object')

### Categorical Columns Treatment

In [10]:
# Percentage of missing values in each categorical column.
data[cat_cols].isna().mean().mul(100)

Time            30.400644
Location         0.080532
Operator         0.201329
Flight #        73.525267
Route           15.582847
AC Type          0.301993
Registration     5.496275
cn/ln           13.448762
Summary          1.288504
dtype: float64

In [11]:
# Flight # is missing in the majority of rows, so the column is discarded.
data = data.drop(columns=['Flight #'])
data.shape

(4967, 18)

In [12]:
# Report how many distinct non-null Time values exist (nunique() counts them
# directly, without building a frequency table), then remove the column and
# confirm the new frame shape.
print(data['Time'].nunique())
data = data.drop(columns=['Time'])
print(data.shape)

1062
(4967, 17)


In [14]:
# Frequency of each crash location, most common first.
data['Location'].value_counts(sort=True, ascending=False)

Moscow, Russia                17
Manila, Philippines           14
New York, New York            14
Cairo, Egypt                  13
Sao Paulo, Brazil             13
                              ..
Shanghai, China , China        1
Near Gaspe, Quebec, Canada     1
Near Batagai, Russia           1
Riyadh, Saudi Arabia           1
Beale AFB, California          1
Name: Location, Length: 4101, dtype: int64

In [16]:
# Remove the Location column from the frame.
data = data.drop(columns=['Location'])

In [17]:
# Frequency of each operator, most common first.
data['Operator'].value_counts(sort=True, ascending=False)

Aeroflot                            255
Military - U.S. Air Force           140
Air France                           72
Deutsche Lufthansa                   63
United Air Lines                     44
                                   ... 
Hargreaves Airways                    1
Fleming Airways System Transport      1
Nuna Air                              1
Kogalmavia                            1
Victoria Airlines                     1
Name: Operator, Length: 2242, dtype: int64