# Segment 1 - Machine Learning Model Mock-up 
### Sam Boester 10/23/2022

In [38]:
# dependencies

import pandas as pd
import plotly.express as px
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split
from collections import Counter
from imblearn.ensemble import BalancedRandomForestClassifier

In [8]:
# Loading Mock CVS, this will be updated to a posgres connection in future segments

# read in CVS


file_path = "Vietnam_10.23.22/Resources/Cleaned/Vietnam_USAF_Losses_Cleaned_Data.csv"

usaf_df = pd.read_csv(file_path)

usaf_df.head()




Unnamed: 0,Crash Date,Crash Time,Aircraft Type,Aircraft S/N,Base,Wing,Squadron,Call Sign,Operations Code,Mission Type,...,Pilot Egress,Pilot Status,Pilot Condition,Pilot Recovered,Co-Pilot/Nav Rank,Co-Pilot/Nav,Co-Pilot/Nav Egress,Co-Pilot/Nav Status,Co-Pilot/Nav Condition,Co-Pilot Recovered
0,620202,_,C-123B,564370,TSN,464TCW,_,_,IC,,...,Crash,KIA,_,_,Capt,Larsen R. D.,Crash,KIA,_,_
1,620211,_,SC-47A,4315732,BHA,_,4400CCTS,_,IC,,...,Crash,KIA,_,_,Capt,Hartson S. G.,Crash,KIA,_,_
2,620613,_,T-28B,0,_,_,_,_,IC,,...,_,Recovered,Uninjured,_,u,_,_,_,_,_
3,620828,_,T-28B,538376,_,_,_,_,IC,,...,_,KIA,_,_,Capt,_,_,_,_,_
4,621015,_,U-10,625909,_,_,_,_,IC,,...,Crash,KIA,_,_,Capt,Foxx R. L.,Crash,KIA,_,_


## Pre-Model Cleaning

In [9]:
# Capture capture variables for model". 

usaf_df_model = usaf_df.filter(['Crash Date','Aircraft Type','Base', 'Hit Country', 'Loss Longitude', 'Loss Latitude', 'Pilot Status'], axis=1)

usaf_df_model.head()

Unnamed: 0,Crash Date,Aircraft Type,Base,Hit Country,Loss Longitude,Loss Latitude,Pilot Status
0,620202,C-123B,TSN,SVn,_,_,KIA
1,620211,SC-47A,BHA,SVn,10700E,1145N,KIA
2,620613,T-28B,_,SVn,_,_,Recovered
3,620828,T-28B,_,SVn,_,_,KIA
4,621015,U-10,_,SVn,_,_,KIA


In [18]:
# remove spaces from column names"
usaf_df_model.columns = usaf_df_model.columns.str.replace(' ', '')

usaf_df_model.head()

Unnamed: 0,CrashDate,AircraftType,Base,HitCountry,LossLongitude,LossLatitude,PilotStatus
0,620202,C-123B,TSN,SVn,_,_,KIA
1,620211,SC-47A,BHA,SVn,10700E,1145N,KIA
2,620613,T-28B,_,SVn,_,_,Recovered
3,620828,T-28B,_,SVn,_,_,KIA
4,621015,U-10,_,SVn,_,_,KIA


In [33]:
# Remove rows that have string value "_".  For some reason I couldnt chanin the code to do all columns in one line

#update to a loop? like the below sample code?

#for col in cols_to_check:
    #df[col] = df[col].map(lambda x: x.replace(';',''))

usaf_df_modelv2 = usaf_df_model[usaf_df_model["LossLongitude"].str.contains("_")==False]

usaf_df_modelv3 = usaf_df_modelv2[usaf_df_modelv2["AircraftType"].str.contains("_")==False]

usaf_df_modelv4 = usaf_df_modelv3[usaf_df_modelv3["Base"].str.contains("_")==False]

usaf_df_modelv5 = usaf_df_modelv4[usaf_df_modelv4["HitCountry"].str.contains("_")==False]

usaf_df_modelv6 = usaf_df_modelv5[usaf_df_modelv5["PilotStatus"].str.contains("_")==False]

usaf_cleaned_nulls = usaf_df_modelv6[usaf_df_modelv6["LossLatitude"].str.contains("_")==False]

usaf_cleaned_nulls.head(20)

Unnamed: 0,CrashDate,AircraftType,Base,HitCountry,LossLongitude,LossLatitude,PilotStatus
1,620211,SC-47A,BHA,SVn,10700E,1145N,KIA
32,640922,A-1E,BHA,SVn,10535E,0930N,KIA
33,640922,A-1E,BHA,SVn,10535E,0930N,Recovered
34,640926,A-1E,BHA,SVn,10604E,1005N,Recovered
35,641002,A-1E,BHA,SVn,10627E,0934N,KIA
37,641026,A-1E,TSN,SVn,10635E,1048N,KIA
43,641201,A-1E,BHA,SVn,10654E,1118N,Recovered
44,641231,O-1F,DNG,SVn,10754E,1608N,MIA
46,650113,F-105D,KRT,LS,10340E,1935N,Recovered
49,650210,A-1E,BHA,SVn,10635E,0934N,Recovered


In [None]:
# Add code - remove E and N from loss lat/lon

In [34]:
#Start with dummy coding text variables

usaf_df_dummy = pd.get_dummies(usaf_cleaned_nulls, columns=["AircraftType", "Base", "HitCountry"])

usaf_df_dummy.head()

Unnamed: 0,CrashDate,LossLongitude,LossLatitude,PilotStatus,AircraftType_A-1,AircraftType_A-1E,AircraftType_A-1G,AircraftType_A-1H,AircraftType_A-1H/J,AircraftType_A-1J,...,Base_V35,Base_VTA,HitCountry_CMB,HitCountry_HNN,HitCountry_LS,HitCountry_NVn,HitCountry_Nvn,HitCountry_SVn,HitCountry_THL,HitCountry_u
1,620211,10700E,1145N,KIA,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32,640922,10535E,0930N,KIA,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
33,640922,10535E,0930N,Recovered,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
34,640926,10604E,1005N,Recovered,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
35,641002,10627E,0934N,KIA,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [None]:
# Create Binomial (Recovered/Non-Recovered)


df['Name'] = df['Name'].replace(to_replace='Jane', value='Joan')
print(df)

## Random Forest Draft

In [35]:
# Create feature Variables

X = usaf_df_dummy.drop(["PilotStatus"], axis=1)

y = usaf_df_dummy["PilotStatus"]

X.head()

Unnamed: 0,CrashDate,LossLongitude,LossLatitude,AircraftType_A-1,AircraftType_A-1E,AircraftType_A-1G,AircraftType_A-1H,AircraftType_A-1H/J,AircraftType_A-1J,AircraftType_A-26,...,Base_V35,Base_VTA,HitCountry_CMB,HitCountry_HNN,HitCountry_LS,HitCountry_NVn,HitCountry_Nvn,HitCountry_SVn,HitCountry_THL,HitCountry_u
1,620211,10700E,1145N,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32,640922,10535E,0930N,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
33,640922,10535E,0930N,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
34,640926,10604E,1005N,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
35,641002,10627E,0934N,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [36]:
# Check the balance of our target values - We have greater than 1000 and the target of recovered versus non-recovered is roughly proportional
y.value_counts()

Recovered                  590
KIA                        411
POW (returned)             172
MIA                        168
POW (died)                   8
POW                          4
KIA (chute failure)          3
Recoverd                     2
u                            2
KIA (chute failed)           1
Recovered DaNang             1
POW (died in captivity)      1
Recovered (chute fail)       1
recovered                    1
KIA,body MIA,PJ abandnd      1
Name: PilotStatus, dtype: int64

In [None]:
# Splitting into Train and Test sets - modify
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [None]:
# Resample the training data with the BalancedRandomForestClassifier - may need to change estimator count
# create model instance

brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

modle=brfc.fit(X_train, y_train)

In [None]:
# Calculated the balanced accuracy score
y_pred = brfc.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))