In [1]:
import pandas as pd                  # Pandas
import numpy as np                   # Numpy
from matplotlib import pyplot as plt # Matplotlib

# Package to implement ML Algorithms
import sklearn
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.metrics import f1_score

# Package for data partitioning
from sklearn.model_selection import train_test_split

# Package to record time
import time

# Module to save and load Python objects to and from files
import pickle 

# Ignore all warnings (no category filter is given, so this is not limited to deprecation warnings)
import warnings
warnings.filterwarnings('ignore')

# Display inline plots as vector-based (svg)
%config InlineBackend.figure_formats = ['svg']

%matplotlib inline

In [2]:
# Read the fetal health CSV into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer='fetal_health.csv')
df.head(n=5)

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,73.0,0.5,43.0,...,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0,2.0
1,132.0,0.006,0.0,0.006,0.003,0.0,0.0,17.0,2.1,0.0,...,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0,1.0
2,133.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.1,0.0,...,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0,1.0
3,134.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0,1.0
4,132.0,0.007,0.0,0.008,0.0,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0,1.0


In [3]:
# Drop every row that contains at least one null value.
# Reassigning the result (instead of mutating with inplace=True) follows the
# recommended pandas idiom: it keeps the statement chainable and avoids
# silently modifying the object that the previous cell displayed.
df = df.dropna()

# Summarise column names, non-null counts and dtypes of the cleaned frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2126 entries, 0 to 2125
Data columns (total 22 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   baseline value                                          2126 non-null   float64
 1   accelerations                                           2126 non-null   float64
 2   fetal_movement                                          2126 non-null   float64
 3   uterine_contractions                                    2126 non-null   float64
 4   light_decelerations                                     2126 non-null   float64
 5   severe_decelerations                                    2126 non-null   float64
 6   prolongued_decelerations                                2126 non-null   float64
 7   abnormal_short_term_variability                         2126 non-null   float64
 8   mean_value_of_short_term_variability  

In [4]:
# Separate the predictors (every column except the label) from the label column
X = df.drop(columns=['fetal_health'])

y = df['fetal_health']

In [5]:
# Hold out 20% of the data as the final test set
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=1)

# Carve a validation split out of the TRAINING data only. The ensemble weights
# are derived from this split so that the test set is never used for model
# selection; weighting by test-set F1 would leak test information into the
# ensemble and make the test metrics computed in later cells optimistic.
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train,
                                              test_size=0.25,
                                              random_state=1,
                                              stratify=y_train)

rf_model = RandomForestClassifier(random_state=1)
dt_model = DecisionTreeClassifier(random_state=1)
ada_model = AdaBoostClassifier(random_state=1)

estimators = [
    ('rf', rf_model),
    ('dt', dt_model),
    ('ada', ada_model)
]

# Macro F1 of each base model on the validation split (same order as `estimators`)
val_f1_scores = []
for _, model in estimators:
    model.fit(X_fit, y_fit)
    val_f1_scores.append(f1_score(y_val, model.predict(X_val), average='macro'))

# Normalise the validation scores so the voting weights sum to 1.
# Fall back to uniform weights if every score is 0 (avoids a division by zero).
total_val_f1 = sum(val_f1_scores)
if total_val_f1 > 0:
    weights = [score / total_val_f1 for score in val_f1_scores]
else:
    weights = [1 / len(estimators)] * len(estimators)

# Refit every base model on the full training set; these are the models that
# the following cells evaluate, plot and pickle.
for _, model in estimators:
    model.fit(X_train, y_train)

# Test-set predictions and macro F1 of the individual models (reporting only;
# they no longer influence the ensemble weights)
rf_pred = rf_model.predict(X_test)
dt_pred = dt_model.predict(X_test)
ada_pred = ada_model.predict(X_test)

rf_f1 = f1_score(y_test, rf_pred, average='macro')
dt_f1 = f1_score(y_test, dt_pred, average='macro')
ada_f1 = f1_score(y_test, ada_pred, average='macro')

# Soft voting averages the predicted class probabilities using `weights`
voting_clf = VotingClassifier(
    estimators=estimators,
    voting='soft',
    weights=weights
)

voting_clf.fit(X_train, y_train)

In [6]:
def save_confusion_matrix(model, model_name, filename):
    """Plot a fitted model's confusion matrix on the test set and save it.

    Reads the notebook-level `X_test` and `y_test`.

    Parameters
    ----------
    model : fitted classifier exposing `predict` and `classes_`
    model_name : str
        Human-readable name appended to the plot title.
    filename : str
        Path of the image file to write.

    Returns
    -------
    tuple
        `(y_pred, cm, disp)`: the test-set predictions, the confusion matrix
        and the `ConfusionMatrixDisplay` that was plotted.
    """
    y_pred = model.predict(X_test)
    # Pass the labels explicitly so rows/columns follow the model's class order
    cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)

    fig, ax = plt.subplots(figsize=(5, 5))
    disp.plot(cmap='PuRd', ax=ax)
    ax.set_title(f'Confusion Matrix - {model_name}')
    fig.savefig(filename, bbox_inches='tight')
    # Close the figure so it is not rendered inline and its memory is released
    plt.close(fig)
    return y_pred, cm, disp


# Set the font size once, before any figure is created, so that all four
# figures are built with the same setting.
plt.rcParams.update({'font.size': 12})

# Random Forest Confusion Matrix
y_pred_rf, cm_rf, disp_rf = save_confusion_matrix(
    rf_model, 'Random Forest', "confusion_mat_random_forest.svg")

# Decision Tree Confusion Matrix
y_pred_dt, cm_dt, disp_dt = save_confusion_matrix(
    dt_model, 'Decision Tree', "confusion_mat_decision_tree.svg")

# AdaBoost Confusion Matrix
y_pred_ada, cm_ada, disp_ada = save_confusion_matrix(
    ada_model, 'AdaBoost', "confusion_mat_adaboost.svg")

# Voting Classifier Confusion Matrix
y_pred_voting, cm_voting, disp_voting = save_confusion_matrix(
    voting_clf, 'Voting Classifier', "confusion_mat_voting_classifier.svg")

In [7]:
# Display names passed to classification_report as target_names.
# NOTE(review): presumably the sorted labels in `fetal_health` correspond to
# Normal, Suspect, Pathological in that order - confirm against the dataset docs.
CLASS_NAMES = ['Normal', 'Suspect', 'Pathological']
REPORT_COLUMNS = ['precision', 'recall', 'f1-score', 'support']
# Aggregate rows removed so the saved report only contains per-class metrics
SUMMARY_ROWS = ['accuracy', 'macro avg', 'weighted avg']


def save_classification_report(model, csv_path):
    """Build a per-class classification report for a fitted model and save it.

    Reads the notebook-level `X_test` and `y_test`.

    Parameters
    ----------
    model : fitted classifier exposing `predict`
    csv_path : str
        Path of the CSV file to write.

    Returns
    -------
    tuple
        `(y_pred, report, report_df)`: the test-set predictions, the raw report
        dictionary and the per-class DataFrame that was written to `csv_path`.
    """
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, target_names=CLASS_NAMES, output_dict=True)
    # Rows become classes/aggregates after the transpose; keep only class rows
    report_df = (
        pd.DataFrame(report)
        .transpose()[REPORT_COLUMNS]
        .drop(SUMMARY_ROWS)
    )
    report_df.to_csv(csv_path)
    return y_pred, report, report_df


# Random Forest Classification Report
y_pred_rf, rf_report, rf_report_df = save_classification_report(
    rf_model, 'class_report_random_forest.csv')

# Decision Tree Classification Report
y_pred_dt, dt_report, dt_report_df = save_classification_report(
    dt_model, 'class_report_decision_tree.csv')

# AdaBoost Classification Report
y_pred_ada, ada_report, ada_report_df = save_classification_report(
    ada_model, 'class_report_adaboost.csv')

# Voting Classifier Classification Report
y_pred_voting, voting_report, voting_report_df = save_classification_report(
    voting_clf, 'class_report_voting_classifier.csv')

In [8]:
def save_feature_importance(importances, model_name, filename):
    """Rank feature importances, plot them as a horizontal bar chart and save it.

    Reads the notebook-level `X` for the feature names.

    Parameters
    ----------
    importances : array-like
        One importance value per column of `X`, in column order.
    model_name : str
        Human-readable name appended to the plot title.
    filename : str
        Path of the image file to write.

    Returns
    -------
    pandas.DataFrame
        Columns `Feature` and `Importance`, sorted by descending importance.
    """
    ranked = (
        pd.DataFrame(list(zip(X.columns, importances)),
                     columns=['Feature', 'Importance'])
        .sort_values('Importance', ascending=False)
        .reset_index(drop=True)
    )

    fig, ax = plt.subplots(figsize=(10, 5))
    # The two-colour list is cycled, so bars alternate purple / pink
    ax.barh(ranked['Feature'], ranked['Importance'], color=['purple', 'pink'])
    ax.set_xlabel("Importance")
    ax.set_ylabel("Input Feature")
    ax.set_title(f'Feature Importance - {model_name}')
    fig.tight_layout()
    fig.savefig(filename)
    # Close the figure so it is not rendered inline and its memory is released
    plt.close(fig)
    return ranked


# Random Forest Feature Importance Plot
importance_rf = rf_model.feature_importances_
feature_imp_rf = save_feature_importance(
    importance_rf, 'Random Forest', "feature_imp_random_forest.svg")

# Decision Tree Feature Importance Plot
importance_dt = dt_model.feature_importances_
feature_imp_dt = save_feature_importance(
    importance_dt, 'Decision Tree', "feature_imp_decision_tree.svg")

# AdaBoost Feature Importance Plot
importance_ada = ada_model.feature_importances_
feature_imp_ada = save_feature_importance(
    importance_ada, 'AdaBoost', "feature_imp_adaboost.svg")

# Voting Classifier Feature Importance Plot
# The voting classifier has no importances of its own here; they are
# approximated as the sum of the base models' importances scaled by the
# voting weights.
rf_importance = rf_model.feature_importances_
dt_importance = dt_model.feature_importances_
ada_importance = ada_model.feature_importances_

weighted_sum_importance = (weights[0] * rf_importance +
                           weights[1] * dt_importance +
                           weights[2] * ada_importance)

feature_imp_voting = save_feature_importance(
    weighted_sum_importance, 'Voting Classifier', "feature_imp_voting_classifier.svg")

In [9]:
# Persist every fitted model to its own pickle file.
# Maps output path -> fitted estimator.
models_to_save = {
    'random_forest_model.pickle': rf_model,
    'decision_tree_model.pickle': dt_model,
    'adaboost_model.pickle': ada_model,
    'voting_classifier_model.pickle': voting_clf,
}

for pickle_path, fitted_model in models_to_save.items():
    # The context manager closes the file even if pickle.dump raises,
    # so no file handle is leaked.
    with open(pickle_path, 'wb') as model_file:
        pickle.dump(fitted_model, model_file)