# Segment 2 - Machine Learning Model Refinement 


In [1]:
# dependencies

import pandas as pd
import psycopg2 as pg
import plotly.express as px
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split
from collections import Counter
from imblearn.ensemble import BalancedRandomForestClassifier
from sqlalchemy import create_engine

In [3]:

# Posgres Connection

engine = pg.connect("dbname='Capstone_Project' user='postgres' host='127.0.0.1' port='5432' password='samboest'")
usaf_df = pd.read_sql("select * from usaf_table", con=engine)

# Confirming that dataframe created.
usaf_df.head()

Unnamed: 0,index,Crash_Date,Crash_Time,Aircraft_Type,Aircraft_SN,Base,Wing,Squadron,Call_Sign,Operations_Code,...,Pilot_Egress,Pilot_Status,Pilot_Condition,Pilot_Recovered,Co_Pilot_Nav_Rank,Co_Pilot_Nav,Co_Pilot_Nav_Egress,Co_Pilot_Nav_Status,Co_Pilot_Nav_Condition,Co_Pilot_Recovered
0,0,1962-02-02,_,C-123B,564370,TSN,464TCW,_,_,IC,...,Crash,KIA,_,_,Capt,Larsen R. D.,Crash,KIA,_,_
1,1,1962-02-11,_,SC-47A,4315732,BHA,_,4400CCTS,_,IC,...,Crash,KIA,_,_,Capt,Hartson S. G.,Crash,KIA,_,_
2,3,1962-08-28,_,T-28B,538376,_,_,_,_,IC,...,_,KIA,_,_,Capt,_,_,_,_,_
3,4,1962-10-15,_,U-10,625909,_,_,_,_,IC,...,Crash,KIA,_,_,Capt,Foxx R. L.,Crash,KIA,_,_
4,5,1962-10-16,_,T-28B,538365,_,_,_,_,IC,...,Ejection,Recovered,Minor injuries,_,Capt,_,_,_,_,_


## Pre-Model Cleaning

In [4]:
# Capture capture variables for model". 

usaf_df_model = usaf_df.filter(['Crash_Date','Aircraft_Type','Base', 'Defense_Type', 'Mission_Phase', 'Pilot_Status'], axis=1)

usaf_df_model.head()

Unnamed: 0,Crash_Date,Aircraft_Type,Base,Defense_Type,Mission_Phase,Pilot_Status
0,1962-02-02,C-123B,TSN,_,Enroute,KIA
1,1962-02-11,SC-47A,BHA,Gunfire (combat associated),_,KIA
2,1962-08-28,T-28B,_,Gunfire,_,KIA
3,1962-10-15,U-10,_,Gunfire,_,KIA
4,1962-10-16,T-28B,_,Gunfire,_,Recovered


In [5]:
# Remove rows that have string value "_".  For some reason I couldnt chanin the code to do all columns in one line

usaf_df_modelv3 = usaf_df_model[usaf_df_model["Aircraft_Type"].str.contains("_")==False]

usaf_df_modelv4 = usaf_df_modelv3[usaf_df_modelv3["Base"].str.contains("_")==False]

usaf_df_modelv5 = usaf_df_modelv4[usaf_df_modelv4["Defense_Type"].str.contains("_")==False]

usaf_df_modelv6 = usaf_df_modelv5[usaf_df_modelv5["Mission_Phase"].str.contains("_")==False]

usaf_cleaned_nulls = usaf_df_modelv6[usaf_df_modelv6["Pilot_Status"].str.contains("_")==False]

usaf_cleaned_nulls.head(20)

Unnamed: 0,Crash_Date,Aircraft_Type,Base,Defense_Type,Mission_Phase,Pilot_Status
21,1964-03-24,T-28,BHA,Gunfire,At target,KIA
24,1964-06-26,T-28D,BHA,Gunfire,At target,Recovered
25,1964-08-06,B-57B,CLK,Gunfire,Enroute,KIA
28,1964-08-29,A-1E,BHA,30cal AAA,Enroute,KIA
31,1964-09-26,A-1E,BHA,Gunfire,At target,Recovered
32,1964-10-02,A-1E,BHA,Gunfire,At target,KIA
34,1964-10-26,A-1E,TSN,Gunfire,At target,KIA
36,1964-11-19,T-28D,UDN,30cal AAA,Enroute,KIA
38,1964-12-01,A-1E,BHA,Gunfire,Enroute,Recovered
46,1965-03-02,F-105D,KRT,Heavy 37mm AAA,At target,Recovered


In [20]:
#Start with dummy coding text variables

usaf_df_dummy = pd.get_dummies(usaf_cleaned_nulls, columns=["Aircraft_Type", "Base", "Defense_Type", "Mission_Phase"])

usaf_df_dummy.head()

Unnamed: 0,Crash_Date,Pilot_Status,Aircraft_Type_A-1,Aircraft_Type_A-1E,Aircraft_Type_A-1G,Aircraft_Type_A-1H,Aircraft_Type_A-1H/J,Aircraft_Type_A-1J,Aircraft_Type_A-26,Aircraft_Type_A-26A,...,Defense_Type_Small arms (poss.),Defense_Type_Small arms (prob. combat associated),Defense_Type_Small arms (prob.),Defense_Type_Target debris (combat associated),Mission_Phase_At target,Mission_Phase_Enroute,Mission_Phase_On recce,Mission_Phase_On station,Mission_Phase_Return,Mission_Phase_Transit
21,1964-03-24,KIA,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
24,1964-06-26,Recovered,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
25,1964-08-06,KIA,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
28,1964-08-29,KIA,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
31,1964-09-26,Recovered,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [21]:
# Clean Target variable pilot status - Get labels and counts

usaf_df_dummy["Pilot_Status"].value_counts()



Recovered                  511
KIA                        348
POW (returned)             156
MIA                        151
POW (died)                   5
POW                          4
KIA (chute failure)          3
Recoverd                     2
u                            2
KIA (chute failed)           1
Recovered DaNang             1
POW (died in captivity)      1
recovered                    1
Name: Pilot_Status, dtype: int64

In [22]:
# Clean Target variable pilot status - Create Binomial with dictionary.map

recovered = {'Recovered': 1, 'KIA': 0,'POW (returned)': 1,'MIA': 0,'POW (died)': 0,'POW': 1,'KIA (chute failure)': 0,'Recoverd': 1, 'u': 0, 
    'KIA (chute failed)': 0, 'Recovered DaNang': 1, 'POW (died in captivity)':0, 'Recovered (chute fail)': 1,'recovered': 1,'KIA,body MIA,PJ abandnd': 1, 'POW (died in captivity)': 0}


usaf_df_dummy["Target"] = usaf_df_dummy["Pilot_Status"].map(recovered)

usaf_df_dummy["Target"].value_counts

<bound method IndexOpsMixin.value_counts of 21      0
24      1
25      0
28      0
31      1
       ..
1533    0
1535    1
1536    0
1537    1
1538    1
Name: Target, Length: 1186, dtype: int64>

## Model Testing

### Random Forest

In [34]:
# Create feature Variables

Y = usaf_df_dummy["Target"]

x = usaf_df_dummy.drop(["Target", "Pilot_Status", "Crash_Date" ], axis=1)

x.head()

Unnamed: 0,Aircraft_Type_A-1,Aircraft_Type_A-1E,Aircraft_Type_A-1G,Aircraft_Type_A-1H,Aircraft_Type_A-1H/J,Aircraft_Type_A-1J,Aircraft_Type_A-26,Aircraft_Type_A-26A,Aircraft_Type_A-37,Aircraft_Type_A-37A,...,Defense_Type_Small arms (poss.),Defense_Type_Small arms (prob. combat associated),Defense_Type_Small arms (prob.),Defense_Type_Target debris (combat associated),Mission_Phase_At target,Mission_Phase_Enroute,Mission_Phase_On recce,Mission_Phase_On station,Mission_Phase_Return,Mission_Phase_Transit
21,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
24,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
25,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
28,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
31,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [35]:
x.dtypes

Aircraft_Type_A-1           uint8
Aircraft_Type_A-1E          uint8
Aircraft_Type_A-1G          uint8
Aircraft_Type_A-1H          uint8
Aircraft_Type_A-1H/J        uint8
                            ...  
Mission_Phase_Enroute       uint8
Mission_Phase_On recce      uint8
Mission_Phase_On station    uint8
Mission_Phase_Return        uint8
Mission_Phase_Transit       uint8
Length: 318, dtype: object

In [None]:
#x['Crash_Date'] = x['Crash_Date'].dt.year

In [37]:
x.dtypes 

Aircraft_Type_A-1           uint8
Aircraft_Type_A-1E          uint8
Aircraft_Type_A-1G          uint8
Aircraft_Type_A-1H          uint8
Aircraft_Type_A-1H/J        uint8
                            ...  
Mission_Phase_Enroute       uint8
Mission_Phase_On recce      uint8
Mission_Phase_On station    uint8
Mission_Phase_Return        uint8
Mission_Phase_Transit       uint8
Length: 318, dtype: object

In [38]:
# Check the balance of our target values - We have greater than 1000 and the target of recovered versus non-recovered is roughly proportional 
Y.value_counts()

1    675
0    511
Name: Target, dtype: int64

In [39]:
X_train, X_test, y_train, y_test = train_test_split(x, Y, random_state=78)

In [40]:
brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
modle=brfc.fit(X_train, y_train)

In [41]:
# Calculated the balanced accuracy score
y_pred = brfc.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5687300203429235

In [31]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 61,  50],
       [ 73, 113]], dtype=int64)

In [32]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.46      0.55      0.61      0.50      0.58      0.33       111
          1       0.69      0.61      0.55      0.65      0.58      0.34       186

avg / total       0.60      0.59      0.57      0.59      0.58      0.33       297



### Decision Tree

## Logistic Regression and Sampling Techniques

Random Oversampling

In [42]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({1: 489, 0: 489})

In [43]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)

model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [44]:
from sklearn.metrics import balanced_accuracy_score

y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5820255739610578

In [45]:
confusion_matrix(y_test, y_pred)

array([[ 57,  54],
       [ 65, 121]], dtype=int64)

In [46]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.47      0.51      0.65      0.49      0.58      0.33       111
          1       0.69      0.65      0.51      0.67      0.58      0.34       186

avg / total       0.61      0.60      0.56      0.60      0.58      0.34       297



SMOTE Oversampling

In [47]:
from imblearn.over_sampling import SMOTE

X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy=1.0).fit_resample(
    X_train, y_train
)
from collections import Counter

Counter(y_resampled)

Counter({1: 489, 0: 489})

In [48]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [49]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5547079337401918

In [50]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 39,  72],
       [ 45, 141]], dtype=int64)

In [51]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.46      0.35      0.76      0.40      0.52      0.26       111
          1       0.66      0.76      0.35      0.71      0.52      0.28       186

avg / total       0.59      0.61      0.50      0.59      0.52      0.27       297



In [52]:
# Resample the data using the ClusterCentroids resampler

from imblearn.under_sampling import ClusterCentroids

cluster = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cluster.fit_resample(X_train, y_train)

Counter(y_resampled)

  self.estimator_.fit(_safe_indexing(X, target_class_indices))


Counter({0: 400, 1: 400})

In [53]:
# Train Model

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [54]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5525283347863994

In [55]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 60,  51],
       [ 81, 105]], dtype=int64)

In [56]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.43      0.54      0.56      0.48      0.55      0.30       111
          1       0.67      0.56      0.54      0.61      0.55      0.31       186

avg / total       0.58      0.56      0.55      0.56      0.55      0.31       297



### SMOTEEN

In [57]:
from imblearn.combine import SMOTEENN

smoteenn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smoteenn.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 86, 1: 112})

In [58]:
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [59]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.4730456262714327

In [60]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 34,  77],
       [ 67, 119]], dtype=int64)

In [61]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.34      0.31      0.64      0.32      0.44      0.19       111
          1       0.61      0.64      0.31      0.62      0.44      0.20       186

avg / total       0.51      0.52      0.43      0.51      0.44      0.20       297

