# 1. Load data

We import useful packages

In [14]:
#basics
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
from scipy.stats import mode
import base64
import io
import os
import requests

#sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, mean_squared_error, recall_score
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier


We import the data 

In [3]:
path = "/Users/erikgutierrezduthiers/Desktop/CW_Intro_AI/data/"
filename = "CTG.xls"

df = pd.read_excel(path+filename, sheet_name = "Data",header=1)
df = df.reindex(np.random.permutation(df.index))
df.head()
# shape: (891, 12) assdsd

Unnamed: 0,b,e,AC,FM,UC,DL,DS,DP,DR,Unnamed: 9,...,E,AD,DE,LD,FS,SUSP,Unnamed: 42,CLASS,Unnamed: 44,NSP
352,118,617,0,11,0,0,0,0,0,,...,-1,-1,-1,-1,1,-1,,9,,3
2050,2,525,0,1,1,0,0,0,0,,...,-1,-1,-1,-1,-1,-1,,3,,1
1860,734,1789,16,0,4,0,0,0,0,,...,-1,-1,-1,-1,-1,-1,,2,,1
1856,552,1500,13,0,4,0,0,0,0,,...,-1,-1,-1,-1,-1,-1,,2,,1
335,0,1199,0,3,0,0,0,0,0,,...,-1,-1,-1,-1,1,-1,,9,,3


**Data description**:

* 2126 fetal cardiotocograms (CTGs).
* CTGs classified by: 
    * morphologic pattern (A, B, C....) - 10-class experiment
    * fetal state (N, S, P) - 3-class experiment

In [4]:
# Columns to be considered in the dataset
columns = ["LB", "AC", "FM", "UC", "DL", "DP", 
           "ASTV", "MSTV", "ALTV", "MLTV", "Width", "Min", 
           "Max", "Nmax", "Nzeros", "Mode", "Mean", "Median", "Variance", "Tendency"]

#Prepare X and y
X = df[columns]
y = df["NSP"]
X.head()

Unnamed: 0,LB,AC,FM,UC,DL,DP,ASTV,MSTV,ALTV,MLTV,Width,Min,Max,Nmax,Nzeros,Mode,Mean,Median,Variance,Tendency
352,140,0,11,0,0,0,78,0.2,86,3.4,8,136,144,1,0,141,140,141,0,0
2050,128,0,1,1,0,0,70,1.9,16,4.1,20,119,139,2,0,130,127,131,1,0
1860,138,16,0,4,0,0,51,0.9,0,1.9,49,122,171,2,0,147,148,149,5,0
1856,138,13,0,4,0,0,51,1.1,0,2.1,49,122,171,3,0,148,148,149,6,0
335,146,0,3,0,0,0,81,0.2,67,3.9,13,137,150,1,0,146,144,146,1,0


In [5]:
print(X.shape)
print(y.shape)


(2126, 20)
(2126,)


# 2. Standardization 


In [6]:
# Apply the standar scaler to features
sc = StandardScaler()
sc.fit(X)
X_std = pd.DataFrame(sc.transform(X),columns=columns)
X_std.head()

Unnamed: 0,LB,AC,FM,UC,DL,DP,ASTV,MSTV,ALTV,MLTV,Width,Min,Max,Nmax,Nzeros,Mode,Mean,Median,Variance,Tendency
0,0.680604,-0.76474,0.101267,-1.285798,-0.628375,-0.27153,1.804078,-1.282833,4.140444,-0.850843,-1.603375,1.435392,-1.116245,-1.04053,-0.458444,0.216638,0.345702,0.201179,-0.649208,-0.524526
1,-0.53909,-0.76474,-0.168154,-0.93448,-0.628375,-0.27153,1.338658,0.642349,0.334556,-0.726441,-1.295261,0.860159,-1.394953,-0.701397,-0.458444,-0.455018,-0.48817,-0.490232,-0.614691,-0.524526
2,0.477322,3.729626,-0.195096,0.119475,-0.628375,-0.27153,0.233285,-0.490111,-0.535361,-1.117419,-0.55065,0.961671,0.388775,-0.701397,-0.458444,0.582996,0.858853,0.754307,-0.476621,-0.524526
3,0.477322,2.886933,-0.195096,0.119475,-0.628375,-0.27153,0.233285,-0.263619,-0.535361,-1.081875,-0.55065,0.961671,0.388775,-0.362263,-0.458444,0.644055,0.858853,0.754307,-0.442103,-0.524526
4,1.290451,-0.76474,-0.11427,-1.285798,-0.628375,-0.27153,1.97861,-1.282833,3.107418,-0.761985,-1.474994,1.469229,-0.781796,-1.04053,-0.458444,0.521936,0.602278,0.546884,-0.614691,-0.524526


Data standardization is used to minimize the difference in the ranges of the features, ensuring the gradient descent moves smoothly towards the minima and that the steps for gradient descent are updated at the same rate for all the features. However, scaling does not significantly affect our accuracy in our case because our features are within the same range (i.e., 0 - 200).  

# 3. Preprocessing

To treat the class imbalance we use SMOTE 

In [9]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=31)

X_sm, y_sm = sm.fit_resample(X_std, y)

# 4. Supervised Learning Models 


We define a class with all the supervised learning models to make evaluating them easier. 

In [10]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, mean_squared_error

class SLClassifiers(): 
    
    
    def __init__(self, X, y): 
        
        self.X = X
        self.y = y 
        
    
    def confusion_matrix(self, y_pred, y_test, target_labels): 
        
        cm = confusion_matrix(y_pred,y_test, normalize="all")
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_labels);
        disp.plot();
        
    
    def train_test_split(self): 
        
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, train_size=0.8,random_state=35)
        
        return X_train, X_test, y_train, y_test
    
        
    def decision_tree(self): 
        
        X_train, X_test, y_train, y_test = self.train_test_split()
        dt = DecisionTreeClassifier(random_state=35)
        dt.fit(X_train, y_train)
        y_pred = dt.predict(X_test)
        print("Decision Tree accuracy: %.2f" % accuracy_score(y_pred,y_test) )
        print("Decision Tree recall: %.2f" % recall_score(y_pred,y_test,average="macro") )
        #self.confusion_matrix(y_pred, y_test, dt.classes_)
        
    
    def random_forest(self, n_estimators=10, max_depth=None, criterion="entropy"): 
        
        X_train, X_test, y_train, y_test = self.train_test_split()
        rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, criterion=criterion)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_test)
        
        print("Random Forest accuracy: %.2f" % accuracy_score(y_pred,y_test) )
        print("Random Forest recall: %.2f" % recall_score(y_pred,y_test,average="macro") )
        #self.confusion_matrix(y_pred, y_test, rf.classes_)

    
    def support_vector_machine(self, kernel = "linear", C=10000): 
       
        X_train, X_test, y_train, y_test = self.train_test_split()
        svm = SVC(kernel=kernel, C=C)
        svm.fit(X_train, y_train)
        y_pred = svm.predict(X_test)
        print("Support Vector Machine accuracy: %.2f" % accuracy_score(y_test, y_pred))
        print("Support Vector Machine recall: %.2f" % recall_score(y_pred,y_test,average="macro") )
        
        #self.confusion_matrix(y_pred, y_test, rf.classes_)
        
        
    def k_nearest_neighbours(self, n=3): 
    
        X_train, X_test, y_train, y_test = self.train_test_split()
        knn = KNeighborsClassifier(n_neighbors=n)
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        print("K-Nearest Neighbours accuracy: %.2f" % accuracy_score(y_pred,y_test) )
        print("K-Nearest Neighbours recall: %.2f" % recall_score(y_pred,y_test,average="macro") )
        
        
    def gaussian_naive_bayes(self):
       
        X_train, X_test, y_train, y_test = self.train_test_split()
        gnb = GaussianNB()
        gnb.fit(X_train, y_train);
        y_pred = gnb.predict(X_test)
        print("Gaussian Naive Bayes accuracy: %.2f" % accuracy_score(y_pred,y_test) )
        print("Gaussian Naive Bayes recall: %.2f" % recall_score(y_pred,y_test,average="macro") )


slc = SLClassifiers(X, y)

    

Let us apply all the supervised learning models to our raw data to see how well the classifiers perform.   

In [11]:
dt = slc.decision_tree()
rf = slc.random_forest(n_estimators=100)
svm = slc.support_vector_machine()
knn = slc.k_nearest_neighbours()
gnb = slc.gaussian_naive_bayes()

Decision Tree accuracy: 0.90
Decision Tree recall: 0.84
Random Forest accuracy: 0.94
Random Forest recall: 0.90
Support Vector Machine accuracy: 0.91
Support Vector Machine recall: 0.85
K-Nearest Neighbours accuracy: 0.88
K-Nearest Neighbours recall: 0.82
Gaussian Naive Bayes accuracy: 0.80
Gaussian Naive Bayes recall: 0.68


Now we apply all the supervised learning models to our processed data.

In [13]:
slc = SLClassifiers(X_sm, y_sm)

dt = slc.decision_tree()
rf = slc.random_forest(n_estimators=100)
svm = slc.support_vector_machine()
knn = slc.k_nearest_neighbours()
gnb = slc.gaussian_naive_bayes()

Decision Tree accuracy: 0.96
Decision Tree recall: 0.96
Random Forest accuracy: 0.98
Random Forest recall: 0.98
Support Vector Machine accuracy: 0.90
Support Vector Machine recall: 0.90
K-Nearest Neighbours accuracy: 0.96
K-Nearest Neighbours recall: 0.96
Gaussian Naive Bayes accuracy: 0.79
Gaussian Naive Bayes recall: 0.81


# 5. Unsupervised Learning Models

We define a class with all the unsupervised learning models to make evaluating them easier. 

In [15]:
class ULClassifiers():
    
    def __init__(self, X, y): 

        self.X = X
        self.y = y 
        
        
    def mask(self, clusters): 
        
        labels = np.zeros_like(clusters)

        for i in range(3): 
            mask = (clusters == i)
            labels[mask] = mode(self.y[mask])[0]
        
        return labels
    
    
    def PCA(self, n=2): 
        
        pca = PCA(n_components=n)
        pca.fit(self.X)
        X_pca = pca.transform(self.X)
        
        return X_pca

    
    def gaussian_mixture_models(self, n=3):
        
        gmm = GaussianMixture(n_components=n).fit(self.X)
        clusters = gmm.predict(self.X)
        labels = self.mask(clusters)
        print("Gaussian Mixture Models accuracy: %.2f" % accuracy_score(self.y, labels))
        
    
    def k_means(self, n=3, rs=0):
    
        km = KMeans(n_clusters=n, random_state=rs)
        clusters = km.fit_predict(self.X)
        labels = self.mask(clusters)
        print("k Means accuracy: %.2f" % accuracy_score(self.y, labels))
    

We apply all the unsupervised learning models to our raw data to see how well the classifiers perform. 

In [16]:
ulc = ULClassifiers(X, y)

gmm = ulc.gaussian_mixture_models()
km = ulc.k_means()

Gaussian Mixture Models accuracy: 0.78
k Means accuracy: 0.78


Then, we apply all the unsupervised learning models to the processed data.

In [17]:
ulc = ULClassifiers(X_sm, y_sm)

gmm = ulc.gaussian_mixture_models()
km = ulc.k_means()

Gaussian Mixture Models accuracy: 0.68
k Means accuracy: 0.71


# 6. Future work: hyperparameter tuning

When we evaluated the SL models, the results showed that random forest is the best. In this case, we get an accuracy of `0.98`. If the results had been lower, now would be the time to perform hyperparameter tuning to optimize it even further. 

**Note**: this is a simplistic approach since we do not know the optimal model architecture for the rest of the models. Thus, a hyper-parameter configuration for one of the other models might render better results than the Random Forest Classifier. However, investing time in finding all the different possible configurations for every model deviated from the objective of our analysis. 

Lets look at how hyperparameter tuning would look like for the Random Forest. 

**Random Forest hyperparameters**:

1. `max_dept`: longest path between the root node and the leaf node.
2. `min_sample_split`: minimum required number of observations in any given node to split it.
3. `max_leaf_nodes`: restricts the growth of the tree.
4. `min_samples_leaf`: minimum number of samples that should be present in the leaf node after splitting a node.
5. `n_estimators`: Number of trees in the forest.
6. `max_sample`: fraction of the original dataset is given to any individual tree.
7. `max_features`: number of maximum features provided to each tree in a random forest.

A way to hyperparameter tune is using GridSearch. 

The computational complexity of a hyperparameter tuning job depends primarily on the number of hyperparameters whose range of values GridSearch has to search through during optimization. That is why we limit our search to the parameters we think will give us better results.

These are: 

- `n_estimator`: by building forests with a large number of trees (high number of estimators), we can create a more robust model with less variance at the cost of a greater training time. 
- `criterion`: fine-tuning the split criteria could lead to different forests. Since there are only two possible values, we will try both measures to see which leads to a more minor error.
- `min_samples_split`: fine-tuning the number of features to consider when splitting at each node is fundamental. Therefore it should be considered when using a search approach to find the best hyperparameters for our forest.

The following code block shows how to hyperparmeter tune using grid search:

In [20]:
# specify the important parameters.
grid_param = {
'n_estimators': [90, 100, 115, 130], 
'criterion': ['gini', 'entropy'], 
'min_samples_split': range(1, 10, 1),
}

grid_search = GridSearchCV(estimator=rf, param_grid=grid_param, cv=5, verbose=0)

# feed the training data set to grid_search
## grid_search.fit(X_train, y_train)

# to see the best parameters as per our grid
## grid_search.best_param_