In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [6]:
# Read the top-10-makes extract from disk and preview its first rows.
makes_csv = '../../Data/top_10_makes.csv'
top_makes = pd.read_csv(makes_csv)
top_makes.head()

Unnamed: 0.1,Unnamed: 0,investigation_type,location,injury_severity,aircraft_damage,make,model,weather_condition,broad_phase_of_flight,event_year,event_month
0,0,Accident,"Santa Ana, CA",0,Substantial,piper,PA-12,VMC,VMC,2007,12
1,3,Accident,"Cherokee, AL",1,Substantial,bell,206L-3,VMC,VMC,2007,12
2,6,Accident,"Crystal Falls, MI",0,Substantial,cessna,172F,VMC,VMC,2007,12
3,7,Accident,"Gunnison, CO",0,Substantial,piper,PA-46-310P,VMC,VMC,2007,12
4,8,Accident,"Venice, LA",1,Destroyed,bell,206L1,IMC,IMC,2007,12


In [None]:
# Dimensions of the makes frame as (rows, columns).
top_makes.shape

(110103, 10)

In [None]:
# Per-column dtypes of the makes frame.
top_makes.dtypes

Unnamed: 0                int64
investigation_type       object
injury_severity           int64
aircraft_damage          object
make                     object
model                    object
weather_condition        object
broad_phase_of_flight    object
event_year                int64
event_month               int64
dtype: object

In [7]:
# Read the top-10-models extract from disk and preview its first rows.
models_csv = '../../Data/top_10_models.csv'
top_models = pd.read_csv(models_csv)
top_models.head()

Unnamed: 0.1,Unnamed: 0,investigation_type,injury_severity,aircraft_damage,make,model,weather_condition,broad_phase_of_flight,event_year,event_month
0,9,Accident,0,Substantial,cessna,172,VMC,VMC,2007,12
1,30,Accident,0,Substantial,cessna,182,VMC,VMC,2007,12
2,41,Accident,0,Substantial,cessna,152,VMC,VMC,2007,12
3,73,Accident,0,Substantial,cessna,172,VMC,VMC,2007,12
4,75,Accident,0,Substantial,cessna,182,VMC,VMC,2007,12


In [None]:
# Dimensions of the models frame as (rows, columns).
top_models.shape

(29833, 10)

In [None]:
# Remove the 'Unnamed: 0' column (presumably an index saved by an earlier
# export -- confirm upstream). errors='ignore' keeps this cell safe to re-run
# once the column is already gone instead of raising KeyError.
top_models = top_models.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# The aircraft model is the target; every remaining column is a feature.
X = top_models.drop(columns=['model'])
y = top_models.model

In [None]:
# Number of target rows.
y.shape

(29833,)

Let's look at our baseline accuracy:

In [None]:
# Share of rows per aircraft model; the largest share is the majority-class baseline.
y.value_counts(normalize = True)

PA-28    0.195790
150      0.172125
172      0.141286
152      0.090236
PA-22    0.080582
182      0.078336
PA-18    0.067174
PA-24    0.066738
180      0.055945
PA-25    0.051788
Name: model, dtype: float64

So, in this particular multiclass model, the majority class is PA-28, and if we predicted PA-28 for every result, we would be right 19.58% of the time.

In [None]:
# Dimensions of the feature frame as (rows, columns).
X.shape

(29833, 8)

I'm going to take a look at my 'broad_phase_of_flight' column:

In [None]:
# Count each distinct label in the column to spot inconsistent spellings.
X['broad_phase_of_flight'].value_counts()

Unknown    24312
VMC         5274
IMC          218
UNK           29
Name: broad_phase_of_flight, dtype: int64

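Okay, because I know 'UNK' means unknown, I'm going to merge it into the existing 'Unknown' category. (Side note: VMC and IMC are weather-condition codes, so this column looks like it mirrors `weather_condition` rather than recording a flight phase — worth verifying upstream.)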

In [None]:
# Fold the 'UNK' spelling into 'Unknown'. replace() rewrites only the listed
# label and leaves every other value as-is, whereas mapping through an
# exhaustive dict would silently turn any unlisted category into NaN.
X['broad_phase_of_flight'] = X['broad_phase_of_flight'].replace({'UNK': 'Unknown'})

In [None]:
# Re-count the labels to verify that 'UNK' has been absorbed into 'Unknown'.
X['broad_phase_of_flight'].value_counts()

Unknown    24341
VMC         5274
IMC          218
Name: broad_phase_of_flight, dtype: int64

In [None]:
# Confirm the column has no missing values after the relabelling.
X['broad_phase_of_flight'].isnull().sum()

0

Okay, now that I have my variables in the condition I want them, I can train-test-split and start modeling.

In [None]:
# Split features and target into train/test partitions (default test size);
# the fixed seed keeps the partition reproducible across runs.
split_parts = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

Because I still have a lot of object-type data, I'm going to dummify (one-hot encode) both my features (X) and my target (y).

In [None]:
# One-hot encode the training data first, then force the test data onto the
# exact same columns (same set, same order). Encoding each split on its own
# can yield different columns when a category is absent from one split, which
# would silently misalign features for the scaler and the network.
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

# Same alignment for the one-hot target so class columns match between splits.
y_train = pd.get_dummies(y_train)
y_test = pd.get_dummies(y_test).reindex(columns=y_train.columns, fill_value=0)

Let's check out the shapes to make sure they align:

In [None]:
# Training feature matrix shape after one-hot encoding.
X_train.shape

(22374, 19)

In [None]:
# Test feature matrix shape; the column count must equal the training one.
X_test.shape

(7459, 19)

In [None]:
# One-hot training target shape: one column per aircraft model class.
y_train.shape

(22374, 10)

In [None]:
# One-hot test target shape; the column count must equal the training one.
y_test.shape

(7459, 10)

Great. So, now we can scale the data.

In [None]:
# Learn the scaling statistics from the training features only, then apply
# that same transform to both splits so no test information leaks into the fit.
sc = StandardScaler()
sc.fit(X_train)

X_train_sc = sc.transform(X_train)
X_test_sc = sc.transform(X_test)

Great, we're ready to go. The first model I'll try is a neural network with two hidden layers (19 and 16 units) feeding a softmax output layer:

In [None]:
# Read the layer widths that depend on the data from the arrays themselves,
# so the network still builds if the number of dummy columns or classes changes.
n_features = X_train_sc.shape[1]
n_classes = y_train.shape[1]  # y_train is one-hot here: one column per class

# Two ReLU hidden layers (19 and 16 units) followed by a softmax layer that
# outputs one probability per aircraft model class.
model = Sequential([
    Dense(19, activation='relu', input_shape=(n_features,)),
    Dense(16, activation='relu'),
    Dense(n_classes, activation='softmax'),
])

In [None]:
# Multiclass setup: categorical cross-entropy pairs with the one-hot targets;
# accuracy is tracked so it can be compared with the baseline.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for 20 epochs; the scaled test split doubles as the validation set,
# so its loss/accuracy is reported after every epoch.
holdout = (X_test_sc, y_test)
res = model.fit(
    X_train_sc,
    y_train,
    validation_data=holdout,
    epochs=20,
    verbose=1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


Okay, so we have a validation accuracy of 42.87%, which isn't great, but is a significant improvement on our baseline accuracy of 19.58%. Next, I'm going to try a Random Forest model to see if it performs better on this particular data.

In order to run my Random Forest model, I need to have a one-column y variable. Let's make that:

In [None]:
# Single-column target for the Random Forest, taken straight from the frame.
y_rf = top_models['model']

In [None]:
# Row counts per aircraft model before the labels are encoded.
y_rf.value_counts()

PA-28    5841
150      5135
172      4215
152      2692
PA-22    2404
182      2337
PA-18    2004
PA-24    1991
180      1669
PA-25    1545
Name: model, dtype: int64

In [None]:
# Integer code per aircraft model, numbered 1-10 in descending order of
# frequency in the data.
model_codes = {
    'PA-28': 1,
    '150': 2,
    '172': 3,
    '152': 4,
    'PA-22': 5,
    '182': 6,
    'PA-18': 7,
    'PA-24': 8,
    '180': 9,
    'PA-25': 10,
}
y_rf = y_rf.map(model_codes)

In [None]:
# Row counts per integer code; these should match the per-model counts from before the mapping.
y_rf.value_counts()

1     5841
2     5135
3     4215
4     2692
5     2404
6     2337
7     2004
8     1991
9     1669
10    1545
Name: model, dtype: int64

In [None]:
# Re-split from the raw feature frame with the integer-coded target; the same
# seed is used as for the earlier split.
rf_split = train_test_split(X, y_rf, random_state=42)
X_train, X_test, y_train, y_test = rf_split

In [None]:
# One-hot encode the training features, then force the test features onto the
# exact same columns (same set, same order). Encoding each split on its own
# can produce mismatched columns when a category is missing from one split.
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Fresh scaler for this split: statistics come from the training features only
# and are then applied unchanged to both splits.
sc = StandardScaler()
sc.fit(X_train)

X_train_sc = sc.transform(X_train)
X_test_sc = sc.transform(X_test)

Okay, so now that the variables are split, dummified, and scaled, we can instantiate and fit the Random Forest model. (Tree-based models don't need scaled inputs, so the grid search below is fit on the unscaled dummy features.) I'm going to gridsearch over the model with a variety of parameters to see which end up giving the best score.

In [None]:
# Base estimator for the grid search; the fixed seed makes the forest reproducible.
rf = RandomForestClassifier(random_state = 42)

In [None]:
# Hyperparameter grid for the forest. 'sqrt' replaces the old 'auto' alias:
# for classifiers the two mean the same thing, but 'auto' was deprecated in
# scikit-learn 1.1 and removed in 1.3, where it raises an error.
params = {
    'n_estimators' : [70, 75, 100],
    'max_features' : [None, 'sqrt'],
    'max_depth' : [None, 5, 6]
}

# n_jobs=-2 runs on all but one CPU core; cv and scoring stay at their defaults.
gs = GridSearchCV(rf, param_grid = params, n_jobs = -2)
# Fit on the unscaled dummy features (tree ensembles do not need scaling).
gs.fit(X_train, y_train)
print(gs.best_score_) # mean cross-validated score of the best parameter set
gs.best_params_

0.4379189307307523


{'max_depth': 5, 'max_features': None, 'n_estimators': 75}

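Okay! So after all that, not a significantly improved score, but there is SOME improvement: the best Random Forest reaches a mean cross-validated accuracy of 43.79% on the training data, versus 42.87% validation accuracy for the neural network. (These two numbers are measured on different data, so the comparison is only approximate.)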

Let's make predictions to see which aircraft models our rf model thinks will most commonly occur in
future NTSB crash data:

In [None]:
# Predict a class code for every held-out row with the fitted grid search.
preds = gs.predict(X_test)

In [None]:
# Wrap the predictions in a single-column DataFrame named 'predictions'.
preds = pd.DataFrame(preds, columns = ['predictions'])

In [None]:
# Share of test-set predictions assigned to each class code.
preds.predictions.value_counts(normalize = True)

So, here we can see that the Random Forest model with the best parameters assigns
the Piper PA-28 aircraft (number 1 in our list) to 41.33% of the rows in the test set, which suggests it would
label a similar share of future crash data as PA-28. Since our accuracy is only about 43%, we can assume the true share won't be exactly that
percentage - but the model still predicts Piper PA-28 to be the majority class.