### Importing libraries and data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, make_scorer, f1_score

In [2]:
from google.colab import files
uploaded = files.upload()

Saving NPS_Initial_Planning_Point.csv to NPS_Initial_Planning_Point.csv


Cleaning data

In [3]:
# Data from the isrid3 drive
# Preprocessing
df = pd.read_csv("NPS_Initial_Planning_Point.csv")
df = df[df['Total Number of Subjects'] == 1]
df_consolidated = df[['Subject 1: Age', 'Subject 1: Level of Fitness',
                      'Subject 1: Level of Experience', 'Incident Environment',
                      'Subject 1: Status', 'Total Hours']]
df_consolidated = df_consolidated.rename(columns={
    'Subject 1: Age': 'Age',
    'Subject 1: Level of Fitness': 'Physical Fitness',
    'Subject 1: Level of Experience': 'Experience',
    'Incident Environment': 'Environment',
    'Subject 1: Status': 'Status'
})

df_cleaned = pd.DataFrame()
df_cleaned['Age'] = df_consolidated['Age'].fillna(0)
df_cleaned['Physical Fitness'] = df_consolidated['Physical Fitness'].fillna('na')
df_cleaned['Experience'] = df_consolidated['Experience'].fillna('na')
df_cleaned['Environment'] = df_consolidated['Environment'].fillna('na')
df_cleaned['Status'] = df_consolidated['Status'].fillna('na')
df_cleaned['Total Hours'] = df_consolidated['Total Hours'].fillna(0)

X = df_cleaned[['Age', 'Physical Fitness', 'Experience', 'Environment', 'Total Hours']]
y = df_cleaned['Status']

df_cleaned[df_cleaned['Status'] == 'Ill or Injured'] = 'Ill_Injured'
df_cleaned[df_cleaned['Status'] == 'DECEASED'] = 'DOA'

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  df_cleaned[df_cleaned['Status'] == 'Ill or Injured'] = 'Ill_Injured'
  df_cleaned[df_cleaned['Status'] == 'Ill or Injured'] = 'Ill_Injured'


Setting up Cross-Validation

In [4]:
def macro_f1(y_true, y_pred):
    return f1_score(y_true, y_pred, average='macro', zero_division=0)

def weighted_f1(y_true, y_pred):
    return f1_score(y_true, y_pred, average='weighted', zero_division=0)

macro_f1_scorer = make_scorer(macro_f1)
weighted_f1_scorer = make_scorer(weighted_f1)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

##Logistic Regression

In [7]:
ct = make_column_transformer(
    (StandardScaler(), ['Age', 'Total Hours']),
    (OneHotEncoder(), ['Physical Fitness', 'Experience', 'Environment'])
)

lr = make_pipeline(
    ct,
    LogisticRegression()
)

cv_scores_accuracy = cross_val_score(lr, X, y, cv=kf, scoring='accuracy')
cv_scores_f1_macro = cross_val_score(lr, X, y, cv=kf, scoring=macro_f1_scorer)
cv_scores_f1_weighted = cross_val_score(lr, X, y, cv=kf, scoring=weighted_f1_scorer)

print("\nAccuracy Scores:", cv_scores_accuracy)
print("Mean Accuracy: {:.3f} (+/- {:.3f})".format(cv_scores_accuracy.mean(), cv_scores_accuracy.std() * 2))
print("\nF1 Scores (macro):", cv_scores_f1_macro)
print("Mean F1 Score (macro): {:.3f} (+/- {:.3f})".format(cv_scores_f1_macro.mean(), cv_scores_f1_macro.std() * 2))
print("\nF1 Scores (weighted):", cv_scores_f1_weighted)
print("Mean F1 Score (weighted): {:.3f} (+/- {:.3f})".format(cv_scores_f1_weighted.mean(), cv_scores_f1_weighted.std() * 2))

lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print("\nLogistic Regression Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy Scores: [0.58865248 0.65957447 0.60283688 0.66666667 0.67857143]
Mean Accuracy: 0.639 (+/- 0.073)

F1 Scores (macro): [0.29634831 0.33027417 0.42539683 0.31221831 0.34369585]
Mean F1 Score (macro): 0.342 (+/- 0.090)

F1 Scores (weighted): [0.53267193 0.62309826 0.57553754 0.63826191 0.6418623 ]
Mean F1 Score (weighted): 0.602 (+/- 0.084)

Logistic Regression Report:
              precision    recall  f1-score   support

  Alive_Well       0.71      0.39      0.50        57
         DOA       0.00      0.00      0.00         9
 Ill_Injured       0.55      0.90      0.69        68
   Not_Found       0.00      0.00      0.00         7

    accuracy                           0.59       141
   macro avg       0.32      0.32      0.30       141
weighted avg       0.55      0.59      0.53       141


Confusion Matrix:
[[22  0 35  0]
 [ 1  0  8  0]
 [ 7  0 61  0]
 [ 1  0  6  0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


##SVM

In [9]:
svm = make_pipeline(
    ct,
    SVC(kernel='rbf', random_state=42)
)

cv_scores_accuracy = cross_val_score(svm, X, y, cv=kf, scoring='accuracy')
cv_scores_f1_macro = cross_val_score(svm, X, y, cv=kf, scoring=macro_f1_scorer)
cv_scores_f1_weighted = cross_val_score(svm, X, y, cv=kf, scoring=weighted_f1_scorer)

print("\nAccuracy Scores:", cv_scores_accuracy)
print("Mean Accuracy: {:.3f} (+/- {:.3f})".format(cv_scores_accuracy.mean(), cv_scores_accuracy.std() * 2))
print("\nF1 Scores (macro):", cv_scores_f1_macro)
print("Mean F1 Score (macro): {:.3f} (+/- {:.3f})".format(cv_scores_f1_macro.mean(), cv_scores_f1_macro.std() * 2))
print("\nF1 Scores (weighted):", cv_scores_f1_weighted)
print("Mean F1 Score (weighted): {:.3f} (+/- {:.3f})".format(cv_scores_f1_weighted.mean(), cv_scores_f1_weighted.std() * 2))

svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
print("\nSupport Vector Machine Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy Scores: [0.62411348 0.63829787 0.65248227 0.70212766 0.7       ]
Mean Accuracy: 0.663 (+/- 0.064)

F1 Scores (macro): [0.31071071 0.29277588 0.37532367 0.39404762 0.35143717]
Mean F1 Score (macro): 0.345 (+/- 0.076)

F1 Scores (weighted): [0.5589334  0.59321866 0.60164999 0.66058764 0.65596352]
Mean F1 Score (weighted): 0.614 (+/- 0.078)

Support Vector Machine Report:
              precision    recall  f1-score   support

  Alive_Well       0.88      0.37      0.52        57
         DOA       0.00      0.00      0.00         9
 Ill_Injured       0.57      0.99      0.72        68
   Not_Found       0.00      0.00      0.00         7

    accuracy                           0.62       141
   macro avg       0.36      0.34      0.31       141
weighted avg       0.63      0.62      0.56       141


Confusion Matrix:
[[21  0 36  0]
 [ 1  0  8  0]
 [ 1  0 67  0]
 [ 1  0  6  0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Stacking model

In [10]:
ct = make_column_transformer(
    (StandardScaler(), ['Age', 'Total Hours']),
    (OneHotEncoder(handle_unknown='ignore'), ['Physical Fitness', 'Experience', 'Environment'])
)

# Create base models with transformers
lr = make_pipeline(
    ct,
    LogisticRegression(max_iter=1000, multi_class='multinomial', class_weight='balanced')
)

svm = make_pipeline(
    ct,
    SVC(kernel='rbf', random_state=42, class_weight='balanced', probability=True)
)

# Create stacking classifier with the pipelines
estimators = [
    ('svm', svm),
    ('lr', lr)
]

stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(random_state=42),
    cv=3
)

# Perform cross-validation
cv_scores_accuracy = cross_val_score(stacking_clf, X, y, cv=kf, scoring='accuracy')
cv_scores_f1_macro = cross_val_score(stacking_clf, X, y, cv=kf, scoring=macro_f1_scorer)
cv_scores_f1_weighted = cross_val_score(stacking_clf, X, y, cv=kf, scoring=weighted_f1_scorer)

print("\nAccuracy Scores:", cv_scores_accuracy)
print("Mean Accuracy: {:.3f} (+/- {:.3f})".format(cv_scores_accuracy.mean(), cv_scores_accuracy.std() * 2))
print("\nF1 Scores (macro):", cv_scores_f1_macro)
print("Mean F1 Score (macro): {:.3f} (+/- {:.3f})".format(cv_scores_f1_macro.mean(), cv_scores_f1_macro.std() * 2))
print("\nF1 Scores (weighted):", cv_scores_f1_weighted)
print("Mean F1 Score (weighted): {:.3f} (+/- {:.3f})".format(cv_scores_f1_weighted.mean(), cv_scores_f1_weighted.std() * 2))

stacking_clf.fit(X, y)
y_pred = stacking_clf.predict(X)
print("\nFinal Model Report:")
print(classification_report(y, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y, y_pred))




Accuracy Scores: [0.58865248 0.63829787 0.57446809 0.67375887 0.67857143]
Mean Accuracy: 0.631 (+/- 0.085)

F1 Scores (macro): [0.29634831 0.26360544 0.2870915  0.31508896 0.3408198 ]
Mean F1 Score (macro): 0.301 (+/- 0.052)

F1 Scores (weighted): [0.53267193 0.59328412 0.52258842 0.64294638 0.63702013]
Mean F1 Score (weighted): 0.586 (+/- 0.101)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Final Model Report:
              precision    recall  f1-score   support

  Alive_Well       0.66      0.54      0.59       247
         DOA       1.00      0.05      0.09        42
 Ill_Injured       0.68      0.87      0.76       386
   Not_Found       0.00      0.00      0.00        28
          na       0.00      0.00      0.00         1

    accuracy                           0.67       704
   macro avg       0.47      0.29      0.29       704
weighted avg       0.66      0.67      0.63       704


Confusion Matrix:
[[133   0 114   0   0]
 [ 17   2  23   0   0]
 [ 49   0 337   0   0]
 [  3   0  25   0   0]
 [  1   0   0   0   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import pickle
with open('trained_pipeline-0.1.0.pkl', 'wb') as file:
  pickle.dump(stacking_clf, file)

In [None]:
!zip -r ./trained_pipeline-0.1.0.pkl.zip ./trained_pipeline-0.1.0.pkl

  adding: trained_pipeline-0.1.0.pkl (deflated 83%)
