# Analysis & Classification of Fetal Heart Rates

### Team 2
- Juan Pablo Díaz López
- Jose Pablo González Flores
- Juan Sebastian Neira González

In [None]:
#Library importation
import os
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from dotenv import load_dotenv
from plotting import Plotting

In [None]:
# Load the CTG workbook into a DataFrame.
# The workbook path and sheet name are read from the CTG and CTG_sheet
# environment variables. load_dotenv() is called first so that values kept in
# a local .env file are visible to os.getenv(); by default it does not
# override variables that are already set in the environment.
load_dotenv()
ctg_path = os.getenv('CTG')
ctg_sheet = os.getenv('CTG_sheet')
if not ctg_path or not ctg_sheet:
    # Fail early with a readable message instead of an opaque pandas error.
    raise EnvironmentError(
        "Set the CTG (workbook path) and CTG_sheet (sheet name) environment "
        "variables, e.g. in a .env file, before running this cell."
    )
# skiprows=1 skips the first spreadsheet row, so the header is read from the
# row that follows it.
data = pd.read_excel(ctg_path, sheet_name=ctg_sheet, skiprows=1)

In [None]:
# Keep only the named columns: anything whose header starts with "Unnamed"
# is filtered out.
is_unnamed = data.columns.str.contains('^Unnamed')
data = data.loc[:, ~is_unnamed]

# Columns to discard (described in the original notebook as raw data); the
# columns that remain are the ones used in the rest of the analysis.
col = [
    'b', 'e', 'AC', 'FM', 'UC', 'DL', 'DS', 'DP', 'DR', 'NSP',
    'A', 'B', 'C', 'D', 'E', 'AD', 'DE', 'LD', 'FS', 'SUSP',
]
data = data.drop(col, axis=1)

In [None]:
data.head(5)

In [None]:
data.isna().sum()

In [None]:
# Remove every row that has at least one missing value.
data = data.dropna()

In [None]:
# Map the spreadsheet headers to descriptive snake_case names.
# rename() ignores labels that are not present in the frame; 'AD', 'DE', 'LD',
# 'FS' and 'SUSP' are removed by the earlier column drop, so their entries
# have no effect here.
new_names = {
    # signal features
    'LB': 'bl_FHR',
    'AC.1': 'accel',
    'FM.1': 'fetal_mov',
    'UC.1': 'uterine_contr',
    'DL.1': 'light_decel',
    'DS.1': 'severe_decel',
    'DP.1': 'prolong_decel',
    # pattern flags
    'SH': 'sh_pattern',
    'AD': 'ad_pattern',
    'DE': 'de_pattern',
    'LD': 'ld_pattern',
    'FS': 'fs_pattern',
    'SUSP': 'sus_pattern',
}
data = data.rename(columns=new_names)

In [None]:
data.info()

# EDA

## EDA Univariate

### feature baseline_FHR (Fetal Heart Rate)

In [None]:
plotter = Plotting(plot_option=1, x='bl_FHR', data=data)
plotter.plot()

#### Conclusion
The main statistics are:
- mean: 133.3038
- median: 133.0000
- std dev: 9.8408

There don't seem to be any outliers in the distribution.

### Feature acceleration

In [None]:
plotter = Plotting(plot_option=1, x='accel', data=data)
plotter.plot()

#### Conclusion
The main statistics are:
- mean: 0.0031
- median: 0.0016
- std dev: 0.0038

Although a few outlier values appear in the distribution, they are close enough to the rest of the data to be treated as part of the main distribution.

### Feature fetal movement

In [None]:
plotter = Plotting(plot_option=1, x='fetal_mov', data=data)
plotter.plot()

#### Conclusion
The main statistics are:
- mean: 0.0094
- median: 0.0000
- std dev: 0.0466
- outlier rule: >= 0.2

### Feature uterine contractions

In [None]:
plotter = Plotting(plot_option=1, x='uterine_contr', data=data)
plotter.plot()

#### Conclusion
The main statistics are:
- mean: 0.0043
- median: 0.0044
- std dev: 0.0029

### Feature light decelerations

In [None]:
plotter = Plotting(plot_option=1, x='light_decel', data=data)
plotter.plot()

#### Conclusion
The main statistics are:
- mean: 0.0018
- median: 0.0000
- std dev: 0.0029

### Feature severe decelerations

In [None]:
plotter = Plotting(plot_option=1, x='severe_decel', data=data)
plotter.plot()

#### Conclusion
The main statistics are:
- mean: 0.000004
- median: 0.000000
- std dev: 0.000063
- outlier rule: >= 0.0008

### Feature prolonged decelerations

In [None]:
plotter = Plotting(plot_option=1, x='prolong_decel', data=data)
plotter.plot()

#### Conclusion
The main statistics are:
- mean: 0.000157
- median: 0.000000
- std dev: 0.00058
- outlier rule: >= 0.002

### Feature abnormal short term variability (ASTV)

In [None]:
plotter = Plotting(plot_option=1, x='ASTV', data=data)
plotter.plot()

#### Conclusion
The main statistics are:
- mean: 46.9901
- median: 49.0000
- std dev: 17.1928

### Feature mean value short term variability (MSTV)

In [None]:
plotter = Plotting(plot_option=1, x='MSTV', data=data)
plotter.plot()

#### Conclusion
The main statistics are:
- mean: 1.3327
- median: 1.2000
- std dev: 0.8832
- outlier rule: >= 5

### Feature abnormal long term variability (ALTV)

In [None]:
plotter = Plotting(plot_option=1, x='ALTV', data=data)
plotter.plot()

#### Conclusion
The main statistics are:
- mean: 9.8466
- median: 0.0000
- std dev: 18.3968
- outlier rule: >= 70

### Feature mean value long term variability (MLTV)

In [None]:
plotter = Plotting(plot_option=1, x='MLTV', data=data)
plotter.plot()

#### Conclusion
The main statistics are:
- mean: 8.1876
- median: 7.4000
- std dev: 5.6282
- outlier rule: >= 30

### Feature Width

In [None]:
plotter = Plotting(plot_option=1, x='Width', data=data)
plotter.plot()

#### Conclusion
The main statistics are:
- mean: 70.4459
- median: 67.5000
- std dev: 38.9557

### Feature min

In [None]:
plotter = Plotting(plot_option=1, x='Min', data=data)
plotter.plot()

#### Conclusion
The main statistics are:
- mean: 93.5795
- median: 93.0000
- std dev: 29.5602

### Feature max

In [None]:
plotter = Plotting(plot_option=1, x='Max', data=data)
plotter.plot()

#### Conclusion
The main statistics are:
- mean: 164.0254
- median: 162.0000
- std dev: 17.9441
- outlier rule: >= 220

### Feature Nmax

In [None]:
plotter = Plotting(plot_option=1, x='Nmax', data=data)
plotter.plot()

#### Conclusion
The main statistics are:
- mean: 4.0682
- median: 3.0000
- std dev: 2.9494

### Feature Nzeros

In [None]:
plotter = Plotting(plot_option=1, x='Nzeros', data=data)
plotter.plot()

#### Conclusion
The main statistics are:
- mean: 0.3236
- median: 0.0000
- std dev: 0.7060
- outlier rule: >= 6

### Feature Mode

In [None]:
plotter = Plotting(plot_option=1, x='Mode', data=data)
plotter.plot()

#### Conclusion
The main statistics are:
- mean: 137.4520
- median: 139.0000
- std dev: 16.3813
- outlier rule: < 70

### Feature Mean

In [None]:
plotter = Plotting(plot_option=1, x='Mean', data=data)
plotter.plot()

#### Conclusion
The main statistics are:
- mean: 134.6105
- median: 136.0000
- std dev: 15.5935

### Feature Median

In [None]:
plotter = Plotting(plot_option=1, x='Median', data=data)
plotter.plot()

#### Conclusion
The main statistics are:
- mean: 138.0903
- median: 139.0000
- std dev: 14.4666

### Feature Variance

In [None]:
plotter = Plotting(plot_option=1, x='Variance', data=data)
plotter.plot()

#### Conclusion
The main statistics are:
- mean: 18.8081
- median: 7.0000
- std dev: 28.9776
- outlier rule: >= 200

### Feature Tendency

In [None]:
# Relative frequency of each Tendency value, drawn as horizontal bars.
x = "Tendency"
shares = data[x].value_counts(normalize=True)
pd.DataFrame(shares).plot.barh()
plt.title(f"Bar plot of {x}")
plt.show()
# Summary statistics of the column, shown as the cell output.
data[x].describe()

#### Conclusion
For most instances the histogram tendency is 0, i.e. the histogram is symmetric; among the remaining instances, most tendencies are positive.

### Feature Class

In [None]:
# Relative frequency of each CLASS value, drawn as vertical bars.
x = "CLASS"
shares = data[x].value_counts(normalize=True)
pd.DataFrame(shares).plot.bar()
plt.title(f"Bar plot of {x}")
plt.show()
# Summary statistics of the column, shown as the cell output.
data[x].describe()

#### Conclusion
Over 25% of the entries are categorized as class 2, which makes it the largest class.
The class with the fewest entries is class 3.

## EDA to target variable

### Feature baseline FHR vs Class

In [None]:
plotter = Plotting(plot_option=2, x='bl_FHR', data=data)
plotter.plot()

In [None]:
data[["bl_FHR","CLASS"]].groupby("CLASS").mean().add_suffix("_mean").round(2)

In [None]:
data[["bl_FHR","CLASS"]].groupby("CLASS").median().add_suffix("_median")

In [None]:
data[["bl_FHR","CLASS"]].groupby("CLASS").std().add_suffix("_std")

### Feature acceleration vs Class

In [None]:
plotter = Plotting(plot_option=2, x='accel', data=data)
plotter.plot()

In [None]:
data[["accel","CLASS"]].groupby("CLASS").mean().add_suffix("_mean").round(4)

In [None]:
data[["accel","CLASS"]].groupby("CLASS").median().add_suffix("_median")

In [None]:
data[["accel","CLASS"]].groupby("CLASS").std().add_suffix("_std")

### Feature fetal movement vs Class

In [None]:
plotter = Plotting(plot_option=2, x='fetal_mov', data=data)
plotter.plot()

In [None]:
data[["fetal_mov","CLASS"]].groupby("CLASS").mean().add_suffix("_mean").round(4)

In [None]:
data[["fetal_mov","CLASS"]].groupby("CLASS").median().add_suffix("_median")

In [None]:
data[["fetal_mov","CLASS"]].groupby("CLASS").std().add_suffix("_std")

### Feature uterine contraction vs Class

In [None]:
plotter = Plotting(plot_option=2, x='uterine_contr', data=data)
plotter.plot()

In [None]:
data[["uterine_contr","CLASS"]].groupby("CLASS").mean().add_suffix("_mean").round(4)

In [None]:
data[["uterine_contr","CLASS"]].groupby("CLASS").median().add_suffix("_median")

In [None]:
data[["uterine_contr","CLASS"]].groupby("CLASS").std().add_suffix("_std")

### Feature light deceleration vs Class

In [None]:
plotter = Plotting(plot_option=2, x='light_decel', data=data)
plotter.plot()

In [None]:
data[["light_decel","CLASS"]].groupby("CLASS").mean().add_suffix("_mean").round(6)

In [None]:
data[["light_decel","CLASS"]].groupby("CLASS").median().add_suffix("_median")

In [None]:
data[["light_decel","CLASS"]].groupby("CLASS").std().add_suffix("_std")

### Feature severe deceleration vs Class

In [None]:
plotter = Plotting(plot_option=2, x='severe_decel', data=data)
plotter.plot()

In [None]:
data[["severe_decel","CLASS"]].groupby("CLASS").mean().add_suffix("_mean").round(6)

In [None]:
data[["severe_decel","CLASS"]].groupby("CLASS").median().add_suffix("_median")

In [None]:
data[["severe_decel","CLASS"]].groupby("CLASS").std().add_suffix("_std")

### Feature prolonged deceleration vs Class

In [None]:
plotter = Plotting(plot_option=2, x='prolong_decel', data=data)
plotter.plot()

In [None]:
data[["prolong_decel","CLASS"]].groupby("CLASS").mean().add_suffix("_mean").round(6)

In [None]:
data[["prolong_decel","CLASS"]].groupby("CLASS").median().add_suffix("_median")

In [None]:
data[["prolong_decel","CLASS"]].groupby("CLASS").std().add_suffix("_std")

### Feature ASTV vs Class

In [None]:
plotter = Plotting(plot_option=2, x='ASTV', data=data)
plotter.plot()

In [None]:
data[["ASTV","CLASS"]].groupby("CLASS").mean().add_suffix("_mean").round(2)

In [None]:
data[["ASTV","CLASS"]].groupby("CLASS").median().add_suffix("_median")

In [None]:
data[["ASTV","CLASS"]].groupby("CLASS").std().add_suffix("_std")

### Feature MSTV vs Class

In [None]:
plotter = Plotting(plot_option=2, x='MSTV', data=data)
plotter.plot()

In [None]:
data[["MSTV","CLASS"]].groupby("CLASS").mean().add_suffix("_mean").round(2)

In [None]:
data[["MSTV","CLASS"]].groupby("CLASS").median().add_suffix("_median")

In [None]:
data[["MSTV","CLASS"]].groupby("CLASS").std().add_suffix("_std")

### Feature ALTV vs Class

In [None]:
plotter = Plotting(plot_option=2, x='ALTV', data=data)
plotter.plot()

In [None]:
data[["ALTV","CLASS"]].groupby("CLASS").mean().add_suffix("_mean").round(2)

In [None]:
data[["ALTV","CLASS"]].groupby("CLASS").median().add_suffix("_median")

In [None]:
data[["ALTV","CLASS"]].groupby("CLASS").std().add_suffix("_std")

### Feature MLTV vs Class

In [None]:
plotter = Plotting(plot_option=2, x='MLTV', data=data)
plotter.plot()

In [None]:
data[["MLTV","CLASS"]].groupby("CLASS").mean().add_suffix("_mean").round(2)

In [None]:
data[["MLTV","CLASS"]].groupby("CLASS").median().add_suffix("_median")

In [None]:
data[["MLTV","CLASS"]].groupby("CLASS").std().add_suffix("_std")

### Feature Width vs Class

In [None]:
plotter = Plotting(plot_option=2, x='Width', data=data)
plotter.plot()

In [None]:
data[["Width","CLASS"]].groupby("CLASS").mean().add_suffix("_mean").round(4)

In [None]:
data[["Width","CLASS"]].groupby("CLASS").median().add_suffix("_median")

In [None]:
data[["Width","CLASS"]].groupby("CLASS").std().add_suffix("_std")

### Feature Min vs Class

In [None]:
plotter = Plotting(plot_option=2, x='Min', data=data)
plotter.plot()

In [None]:
data[["Min","CLASS"]].groupby("CLASS").mean().add_suffix("_mean").round(4)

In [None]:
data[["Min","CLASS"]].groupby("CLASS").median().add_suffix("_median")

In [None]:
data[["Min","CLASS"]].groupby("CLASS").std().add_suffix("_std")

### Feature Max vs Class

In [None]:
plotter = Plotting(plot_option=2, x='Max', data=data)
plotter.plot()

In [None]:
data[["Max","CLASS"]].groupby("CLASS").mean().add_suffix("_mean").round(4)

In [None]:
data[["Max","CLASS"]].groupby("CLASS").median().add_suffix("_median")

In [None]:
# Per-class standard deviation of the Max feature.
data[["Max","CLASS"]].groupby("CLASS").std().add_suffix("_std")

### Feature Nmax vs Class

In [None]:
# Column labels are case-sensitive: the feature is named 'Nmax'.
plotter = Plotting(plot_option=2, x='Nmax', data=data)
plotter.plot()

In [None]:
data[["Nmax","CLASS"]].groupby("CLASS").mean().add_suffix("_mean").round(4)

In [None]:
data[["Nmax","CLASS"]].groupby("CLASS").median().add_suffix("_median")

In [None]:
data[["Nmax","CLASS"]].groupby("CLASS").std().add_suffix("_std")

### Feature Nzeros vs Class

In [None]:
# Column labels are case-sensitive: the feature is named 'Nzeros'.
plotter = Plotting(plot_option=2, x='Nzeros', data=data)
plotter.plot()

In [None]:
data[["Nzeros","CLASS"]].groupby("CLASS").mean().add_suffix("_mean").round(4)

In [None]:
data[["Nzeros","CLASS"]].groupby("CLASS").median().add_suffix("_median")

In [None]:
data[["Nzeros","CLASS"]].groupby("CLASS").std().add_suffix("_std")

### Feature Mode vs Class

In [None]:
plotter = Plotting(plot_option=2, x='Mode', data=data)
plotter.plot()

In [None]:
data[["Mode","CLASS"]].groupby("CLASS").mean().add_suffix("_mean").round(4)

In [None]:
data[["Mode","CLASS"]].groupby("CLASS").median().add_suffix("_median")

In [None]:
data[["Mode","CLASS"]].groupby("CLASS").std().add_suffix("_std")

### Feature Mean vs Class

In [None]:
plotter = Plotting(plot_option=2, x='Mean', data=data)
plotter.plot()

In [None]:
data[["Mean","CLASS"]].groupby("CLASS").mean().add_suffix("_mean").round(4)

In [None]:
data[["Mean","CLASS"]].groupby("CLASS").median().add_suffix("_median")

In [None]:
data[["Mean","CLASS"]].groupby("CLASS").std().add_suffix("_std")

### Feature Median vs Class

In [None]:
plotter = Plotting(plot_option=2, x='Median', data=data)
plotter.plot()

In [None]:
data[["Median","CLASS"]].groupby("CLASS").mean().add_suffix("_mean").round(4)

In [None]:
data[["Median","CLASS"]].groupby("CLASS").median().add_suffix("_median")

In [None]:
data[["Median","CLASS"]].groupby("CLASS").std().add_suffix("_std")

### Feature Variance vs Class

In [None]:
plotter = Plotting(plot_option=2, x='Variance', data=data)
plotter.plot()

In [None]:
data[["Variance","CLASS"]].groupby("CLASS").mean().add_suffix("_mean").round(4)

In [None]:
data[["Variance","CLASS"]].groupby("CLASS").median().add_suffix("_median")

In [None]:
data[["Variance","CLASS"]].groupby("CLASS").std().add_suffix("_std")

## Correlations

In [None]:
# Features included in the correlation matrix (Tendency and CLASS are not in
# this list).
numeric_features = [
    'bl_FHR', 'accel', 'fetal_mov', 'uterine_contr',
    'light_decel', 'severe_decel', 'prolong_decel',
    'ASTV', 'MSTV', 'ALTV', 'MLTV',
    'Width', 'Min', 'Max', 'Nmax', 'Nzeros',
    'Mode', 'Mean', 'Median', 'Variance',
]
data2 = data[numeric_features]

In [None]:
# Pairwise correlation matrix of the selected features.
corr_=data2.corr()
# Render it as a colour-coded table with four decimal places.
corr_.style.background_gradient(cmap='coolwarm').format(precision=4)

In [None]:
## Only to show plots for possible pairings
## No need to run this cell again, 
#sns.pairplot(data2)

# Feature Engineering

We are going to evaluate which method is best for handling the outliers in our DataFrame: dropping them or capping them.

## Handling Outliers (Dropping)

In [None]:
#We copy the dataframe to drop outliers
data2 = data.copy()
data2

In [None]:
#Creating a list of the columns' names that have outliers
col_outliers = ['fetal_mov', 'severe_decel', 'prolong_decel', 'MSTV', 'ALTV', 'MLTV', 'Max', 'Nzeros', 'Variance']

In [None]:
# Drop every row that has an outlier in ANY of the columns in col_outliers.
# A value counts as an outlier when it lies `factor` standard deviations or
# more away from its column mean. The limits are taken from the unfiltered
# data2, so the result does not depend on column order, and the per-column
# conditions are combined so that every column's filter is applied (filtering
# data2 afresh on each iteration would keep only the last column's filter).
factor = 3
candidates = data2[col_outliers]
upper_lim = candidates.mean() + candidates.std() * factor
lower_lim = candidates.mean() - candidates.std() * factor
# Comparing a frame with a Series aligns the Series on the column labels, so
# each column is checked against its own limits.
inside_limits = ((candidates < upper_lim) & (candidates > lower_lim)).all(axis=1)
data_outliers = data2[inside_limits]

In [None]:
data_outliers[col_outliers].describe() #Here we can see the columns that have been modified (the ones with outliers)

In [None]:
data[col_outliers].describe() #We display the original to compare them

In [None]:
pd.set_option('display.max_columns', 500) #We increase the display for the columns

In [None]:
data_outliers.describe()  #Here we can see all the columns on the data frame where the outliers have been dropped

In [None]:
data.describe() #Here we can see all the columns on the original data frame

As we can see above, the statistical summary of the columns changes only slightly compared to the original one.

## Handling Outliers (Capping)

In [None]:
data_cap = data.copy() #We copy the dataframe to cap the outliers
data_cap

In [None]:
# Cap each outlier-prone column at its own 5th and 95th percentiles: values
# above the upper percentile are replaced by it, values below the lower
# percentile are replaced by it.
for column in col_outliers:
    lower_lim, upper_lim = data_cap[column].quantile([.05, .95])
    too_high = data_cap[column] > upper_lim
    data_cap.loc[too_high, column] = upper_lim
    too_low = data_cap[column] < lower_lim
    data_cap.loc[too_low, column] = lower_lim

In [None]:
data_cap.describe()  #Here we can see all the columns on the data frame where the outliers have been capped

In [None]:
data.describe() #Here we can see all the columns on the original data frame

As we can see above, in the columns where outliers existed there is a bigger difference compared to the original
data frame.

We are opting for dropping the outliers: with capping, the values change only in the columns where outliers existed, while the other columns are left untouched. This might have an impact on columns that are related to each other.

For example, fetal movement is usually associated with increases in the FHR, i.e. accelerations. The mean value of fetal movement changes in the table where outliers are capped, but the mean of the accelerations stays the same.

## Scaling (Normalization)

In [None]:
mmscaler = MinMaxScaler()

data_norm = data.copy()

In [None]:
# Rescale only the columns listed in col_outliers with the MinMaxScaler
# created above; all other columns of data_norm are left as they are.
# NOTE(review): the scaler is fitted on all rows, i.e. before the
# train_test_split further down, so test rows influence the fitted min/max —
# confirm this is acceptable.
data_norm[col_outliers] = mmscaler.fit_transform(data_norm[col_outliers])

In [None]:
data_norm[col_outliers].describe()

## Scaling (Standardization)

In [None]:
sc = StandardScaler()

data_stand = data.copy()

In [None]:
# Standardize only the columns listed in col_outliers with the StandardScaler
# created above; all other columns of data_stand are left as they are.
# NOTE(review): the scaler is fitted on all rows, i.e. before the
# train_test_split further down, so test rows influence the fitted mean/std —
# confirm this is acceptable.
data_stand[col_outliers] = sc.fit_transform(data_stand[col_outliers])

In [None]:
data_stand[col_outliers].describe()

# Algorithm

We'll be using Random Forest Classifier for our algorithm and see the results of the three dataframes with different feature engineering techniques applied.

In [None]:
# Capped-outlier frame: CLASS is the target, every other column is a feature.
y = data_cap["CLASS"]
X = data_cap.drop(columns=["CLASS"]).copy()

In [None]:
#splitting into train sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
rf.fit(X_train, y_train)

In [None]:
y_pred = rf.predict(X_test)

In [None]:
# Report test-set performance: overall accuracy, the confusion matrix and the
# per-class classification report, all computed from y_test and y_pred.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Horizontal bar chart of the fitted forest's feature_importances_, one bar
# per column of X.
# NOTE(review): the title says "Coefficient Weight", but the plotted values
# are feature importances — consider renaming the title.
plt.barh(list(X.columns),list(rf.feature_importances_))
plt.title("Coefficient Weight")
plt.show()

In [None]:
# Min-max-scaled frame: CLASS is the target, every other column is a feature.
y = data_norm["CLASS"]
X = data_norm.drop(columns=["CLASS"]).copy()

In [None]:
#splitting into train sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
rf.fit(X_train, y_train)

In [None]:
y_pred = rf.predict(X_test)

In [None]:
# Horizontal bar chart of the fitted forest's feature_importances_, one bar
# per column of X.
# NOTE(review): the title says "Coefficient Weight", but the plotted values
# are feature importances — consider renaming the title.
plt.barh(list(X.columns),list(rf.feature_importances_))
plt.title("Coefficient Weight")
plt.show()

In [None]:
print("Accuracy:", accuracy_score(y_test, y_pred))

In [None]:
# Standardized frame: CLASS is the target, every other column is a feature.
y = data_stand["CLASS"]
X = data_stand.drop(columns=["CLASS"]).copy()

In [None]:
#splitting into train sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
rf.fit(X_train, y_train)

In [None]:
y_pred = rf.predict(X_test)

In [None]:
# Horizontal bar chart of the fitted forest's feature_importances_, one bar
# per column of X.
# NOTE(review): the title says "Coefficient Weight", but the plotted values
# are feature importances — consider renaming the title.
plt.barh(list(X.columns),list(rf.feature_importances_))
plt.title("Coefficient Weight")
plt.show()

In [None]:
print("Accuracy:", accuracy_score(y_test, y_pred))

Overall, the three methods produce the same results, which means that each of them can give coherent and satisfactory results.
We also noticed that the columns with the most weight are the "sleep"- and "pattern"-related ones, as well as accel, while most of the remaining columns carry much less weight.