In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import missingno as msno

from configure import DATA_DIR
from src.load_data import load_data

In [13]:
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [14]:
# load data
df = load_data(DATA_DIR/'processed/fetal_health_clean.csv')

In [None]:
df.columns.to_list()


In [16]:
# Define features and target variable
X = df.drop(['fetal_health'], axis = 1)
y = df['fetal_health']

In [None]:
from src.baseline import escalar_features
# Setting up a standard scaler for the features and analyzing it thereafter
X_scaled = escalar_features(X)
X_scaled.describe().T

In [27]:
from sklearn.model_selection import train_test_split
# Splitting the training and test variables
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.20, random_state=25)

In [33]:
# Building pipelines of model for various classifiers

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, recall_score

pipeline_lr = Pipeline([('lr_classifier', LogisticRegression())])
pipeline_dt = Pipeline([('dt_classifier', DecisionTreeClassifier())])
pipeline_gbcl = Pipeline([('gbcl_classifier', GradientBoostingClassifier())])
pipeline_rf = Pipeline([('rf_classifier', RandomForestClassifier())])
pipeline_knn = Pipeline([('knn_classifier', KNeighborsClassifier())])

# List of all the pipelines
pipelines = [pipeline_lr, pipeline_dt, pipeline_gbcl, pipeline_rf, pipeline_knn]

# Dictionary of pipelines and classifier types for ease of reference
pipe_dict = {0: 'Logistic Regression', 1: 'Decision Tree', 2: 'Gradient Boost', 3:'RandomForest', 4: 'KNN'}


# Fitting the pipelines
for pipe in pipelines:
    pipe.fit(X_train, y_train)

In [34]:
# Evaluating the models
results = []
for i, model in enumerate(pipelines):
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    f1 = f1_score(y_test, model.predict(X_test), average='weighted')
    recall = recall_score(y_test, model.predict(X_test), average='weighted')
    results.append([pipe_dict[i], train_score, test_score, f1, recall])
results_df = pd.DataFrame(results, columns=['Model', 'Train Score', 'Test Score', 'F1 Score', 'Recall Score'])
results_df.sort_values(by='Test Score', ascending=False)    

Unnamed: 0,Model,Train Score,Test Score,F1 Score,Recall Score
1,Decision Tree,0.999408,0.92435,0.925071,0.92435
2,Gradient Boost,0.994083,0.921986,0.920913,0.921986
3,RandomForest,0.999408,0.919622,0.916982,0.919622
4,KNN,0.930769,0.869976,0.863847,0.869976
0,Logistic Regression,0.869231,0.851064,0.836383,0.851064
