In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import math
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from matplotlib import pyplot
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, roc_auc_score, accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from collections import Counter


In [2]:
def cleaning(diabetes):
    
    """ diabetes is a dataframe """
    
    # Primero quitamos las columnas que no queremos
    
    diabetes = diabetes.drop([
                            # Valores de segunda medida
                            "diagnosdm", "tiempoadm","fecha2","peso2","talla2","cintura2","ntratami2",
                            "X25oh2","urico2","crea2","colester2","triglice2","hdl2","ldl2","glucemia2",
                            "microalc2", "cistatin2","fibrinog2","pcr2","hbglicos2","insulina2","homa2",
                            "antiagr2","diureti2","betablo2","alfablo2","calcioa2","ieca2", "tiempo.censo",
                            "araii2","tas_s2","tad_s2","fc_s2","diferenciafechas", "imc2", "sm2", "hta2", 
                            "ncrit_sm2", "epi2", "dislipe2", "Unnamed: 0", "fechaglucometria", "fnacimien",
                            "fecha1", "nsagrado",
                              
                            # tratamientos
                            "dislipe1", "antiagr1", "diureti1", "betablo1", "alfablo1", "calcioa1", "ieca1", "araii1",
                            
                            # otros
                            "peso1", "colester1", "progres_microalc"
                              ], axis=1)
    
    # Nos aseguramos de que las columnas tengan sus tipos de clase correspondiente
    
    for key in diabetes.keys(): 
        if diabetes[key].dtype == object:    
            diabetes[key] = diabetes[key].astype(float)
            
        if diabetes[key].dtype == bool:    
            diabetes[key] = diabetes[key].astype(int)
            
    return diabetes

In [3]:
def complete_data(diabetes):
    
    "If value is Nan and column is hbglicos delete row"
    
    """    
    diabetes = diabetes.dropna(subset=['hbglicos1'])
    diabetes = diabetes.reset_index(drop=True)
    """
    
    "If value is Nan, input the average of the column"
    
    imputer = SimpleImputer(strategy="median")
    imputer.fit(diabetes)
    X = imputer.transform(diabetes)
    
    #Hay que volver a transformar a dataFrame, porque ahora diabetes es una matriz sin nombres 
    diabetes = pd.DataFrame(X, columns=diabetes.columns, index=diabetes.index)
    
    return diabetes

In [4]:
def standarize(data):
    
    "Use standarization to scale the data"
    
    standard_scaler = StandardScaler()
    standard_scaler.fit(data)
    X = standard_scaler.transform(data)
    
    # Hay que volver a transformar a dataFrame, porque ahora diabetes es una matriz sin nombres     
    data_scaled = pd.DataFrame(X, columns=data.columns, index=data.index)
    
    # Los valores booleanos se vuelven a meter, ya que no se cambian
    data_scaled[["sexo", "ecv", "diabete2"]] = data[["sexo", "ecv", "diabete2"]]
      
    return data_scaled


In [5]:
def get_features_lables(df):
    
    "Separate the features and labels from the database"
    
    features = df.drop(['diabete2'], axis=1)
    labels = df["diabete2"]
    
    return features, labels
    

In [6]:
def train_and_test(X, y, grid_model, n_iterations):
    
    """ 
    Divide the data in different train and test splits,
    search for each split a model with grid search,
    compute the metrics for each split and return the average metric
    return also the best model
    """
    
    recall = 0
    precision = 0
    accuracy = 0
    auc = 0
    
    sss = StratifiedShuffleSplit(n_splits=n_iterations, test_size=0.1, random_state=3)
    
    for train_index, test_index in sss.split(X, y):
        X_train, X_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]
        
        # Fit the grid to the train data for this iteration
        grid.fit(X_train, y_train)

        # Get the best estimator
        clf = grid.best_estimator_
        
        # Compute the predicitions
        grid_predictions = grid.predict(X_test)
        y_pred = grid_predictions
        
        # Compute and save metrics
        precision += precision_score(y_test, y_pred)
        recall += recall_score(y_test, y_pred)
        accuracy += accuracy_score(y_test, y_pred)
        auc += roc_auc_score(y_test, y_pred)
        
    precision = precision / n_iterations 
    accuracy = accuracy / n_iterations
    recall = recall / n_iterations
    auc = auc / n_iterations
    
    return clf, accuracy, precision, recall, auc

In [7]:
def compute_confusion_matrix(acc, prec, recall, length):
    
    """Compute the confusion matrix from a recall, precision and accuracy"""
    
    a = np.array([
        [1 - prec, - prec, 0, 0],
        [1 - recall, 0, - recall, 0],
        [1 - acc, - acc, - acc, 1 - acc],
        [1, 1, 1, 1]
    ])

    b = np.array([0, 0, 0, length])
    
    # Solve the linear problem
    x = np.linalg.solve(a, b)
    
    tp, fp, fn, tn = x[0], x[1], x[2], x[3]
    tp, fp, fn, tn = round(tp), round(fp), round(fn), round(tn)
    
    print("\nconfusion matrix")
    print("[tp, fn ]")
    print("[fp, tn ]")
    print("\n[{} {}]".format(tp,fn))
    print("[{} {}]".format(fp,tn))

# 1) GET AND TRANSFORM DATA 

## Load, Clean, Complete

In [8]:
# Load Data

diabetes_orig = pd.read_csv("diabetes.csv", decimal=',')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 3000)

In [9]:
diabetes_orig

Unnamed: 0.1,Unnamed: 0,nsagrado,fechaglucometria,sexo,fnacimien,ecv,fecha1,peso1,talla1,cintura1,ntratami1,X25oh1,urico1,crea1,colester1,triglice1,hdl1,ldl1,glucemia1,microalc1,cistatin1,fibrinog1,pcr1,hbglicos1,insulina1,homa1,dislipe1,antiagr1,diureti1,betablo1,alfablo1,calcioa1,ieca1,araii1,tas_s1,tad_s1,fc_s1,diagnosdm,tiempoadm,fecha2,peso2,talla2,cintura2,ntratami2,X25oh2,urico2,crea2,colester2,triglice2,hdl2,ldl2,glucemia2,microalc2,cistatin2,fibrinog2,pcr2,hbglicos2,insulina2,homa2,dislipe2,diabete2,antiagr2,diureti2,betablo2,alfablo2,calcioa2,ieca2,araii2,tas_s2,tad_s2,fc_s2,diferenciafechas,imc1,imc2,sm1,sm2,hta1,hta2,ncrit_sm1,ncrit_sm2,epi1,epi2,progres_microalc,tiempo.censo,TU100,AO140,gluc_media,coef.var,gmi,mage,conga2,dfa,apen,sampen,permentrop,co,P_sd1,P_sd2,P_e
0,0,1,2012-10-30,1,22/8/1935,False,2012-09-10,65.0,160.0,89.0,2.0,20.0,5.4,1.3,172,65,86.0,73.0,106,0.0,1.19,523.0,4.87,6.3,7.3,1.91,True,False,False,False,False,True,False,False,137.0,51.0,71.0,,,2013-11-26,64.0,160.0,,2.0,15.0,5.0,1.1,184.0,56.0,96.0,77.0,96.0,0.0,1.26,465.0,3.19,5.9,7.1,1.68,True,False,False,False,False,False,True,False,True,150.0,57.0,65.0,442.0,25.4,25.0,1.0,0.0,1.0,1.0,3,1.0,39.6662042362821,48.204063374914,False,442.0,0.62248,68.5,98.42,0.2,5.665,56.35,25.87,0.92745,0.293,0.1887,0.9967,59.4059,1.7058,26.8306,15.8225
1,1,2,2013-05-13,0,28/7/1970,False,2013-03-20,93.0,176.0,110.0,0.0,22.0,4.8,0.8,199,217,49.0,107.0,92,0.0,0.64,294.0,3.19,5.8,17.7,4.02,False,False,False,False,False,False,False,False,143.0,85.0,100.0,,,2016-06-17,91.0,176.0,103.0,0.0,40.0,6.0,0.8,196.0,257.0,48.0,129.0,75.0,1.97,0.86,313.0,3.3,5.7,13.7,2.54,False,False,False,False,False,False,False,False,False,129.0,75.0,67.0,1185.0,30.0,29.4,1.0,1.0,1.0,1.0,3,3.0,110.182148380645,107.884482247986,True,1185.0,0.41493,295.5,107.27,0.21,5.875,53.3335,30.95,1.0092,0.3666,0.2745,0.9968,199.6197,1.7989,29.3355,16.3145
2,2,3,2014-11-28,0,12/4/1953,True,2014-12-11,100.0,172.0,120.0,2.0,26.0,7.1,1.1,146,107,55.0,70.0,114,0.0,0.99,,3.44,5.5,9.7,2.73,True,True,False,False,False,True,False,True,126.0,64.0,67.0,,,2016-05-25,89.0,172.0,126.0,2.0,,6.9,1.0,122.0,84.0,45.0,60.0,94.0,0.0,1.0,,29.4,5.6,20.6,4.78,True,False,True,False,False,False,True,False,True,127.0,68.0,72.0,531.0,33.8,30.1,1.0,0.0,1.0,1.0,3,2.0,72.0706947201851,79.7445408411253,False,531.0,0.71528,0.0,90.12,0.17,5.465,34.3855,15.715,0.85676,0.3702,0.3162,0.9975,85.92599,1.1842,18.0015,14.3428
3,3,4,2012-10-30,0,4/11/1944,True,2012-10-22,78.0,171.0,96.0,2.0,56.0,4.8,0.9,153,43,62.0,82.0,110,4.59,0.79,368.0,3.19,6.0,7.3,1.98,True,True,False,False,False,True,False,True,137.0,81.0,65.0,,,2016-01-18,74.5,171.0,,2.0,39.0,4.3,0.9,189.0,66.0,65.0,111.0,99.0,3.35,0.88,356.0,3.27,5.6,7.7,1.88,True,False,True,False,False,False,True,False,True,147.0,81.0,63.0,1183.0,26.7,25.5,0.0,0.0,1.0,1.0,2,1.0,88.068127368492,85.6279912136014,False,1183.0,0.58819,39.5,95.38,0.21,5.59,49.0285,21.595,0.88439,0.411,0.3635,0.9962,99.61246,1.6722,26.4029,15.5557
4,4,5,2014-01-24,1,24/4/1960,True,2013-09-12,66.0,160.0,100.0,1.0,40.0,5.0,0.7,183,168,,,106,2.59,0.71,334.0,3.19,5.2,,,True,True,False,False,False,False,False,True,127.0,72.0,82.0,,,2016-06-14,66.0,160.0,95.0,1.0,58.0,5.8,0.9,176.0,97.0,46.0,111.0,107.0,0.0,0.86,350.0,3.3,5.4,27.5,7.27,True,False,True,False,False,False,False,False,True,152.0,81.0,82.0,1006.0,25.8,25.8,1.0,1.0,1.0,1.0,4,4.0,99.2367796242698,71.7075268665681,False,1006.0,0.48958,5.0,98.79,0.2,5.675,46.378,20.2,0.8277,0.408,0.3383,0.9972,70.45952,1.7185,24.3725,14.3563
5,5,6,2012-07-17,1,3/12/1942,False,2012-06-20,86.1,165.0,104.0,2.0,,2.7,0.8,189,82,77.0,96.0,99,46.16,0.62,297.0,3.19,5.7,9.5,2.32,False,False,False,True,False,True,False,False,126.0,75.0,75.0,,,2015-02-13,88.0,165.0,109.0,2.0,57.0,4.9,0.9,181.0,115.0,58.0,100.0,129.0,12.56,1.0,373.0,15.2,5.7,9.4,2.99,False,False,False,False,True,False,True,False,False,129.0,85.0,78.0,968.0,31.6,32.3,0.0,1.0,1.0,1.0,2,3.0,75.4654861947156,64.0844586853372,False,968.0,0.13767,367.5,113.14,0.15,6.02,49.1665,25.675,0.96611,0.3692,0.2532,0.9982,152.62411,1.6036,23.5914,14.6312
6,6,7,2013-12-10,1,17/8/1968,False,2013-11-10,93.0,148.0,108.0,4.0,5.0,6.7,0.6,210,179,54.0,120.0,100,4.14,0.68,330.0,9.66,5.5,31.1,7.68,False,False,True,False,False,False,False,False,121.0,67.0,80.0,,,2014-06-17,93.0,148.0,108.0,3.0,38.0,6.5,0.6,196.0,165.0,63.0,100.0,85.0,0.0,0.8,292.0,7.06,5.5,17.0,3.57,False,False,False,True,False,False,True,False,True,120.0,84.0,93.0,219.0,42.5,42.5,1.0,1.0,1.0,1.0,4,3.0,110.43434716436,110.43434716436,False,219.0,0.39931,157.5,103.15,0.19,5.775,49.7225,26.67,0.96967,0.4238,0.3722,0.997,100.08957,1.7275,26.3135,15.203
7,7,8,2013-03-04,1,24/2/1955,False,2013-05-07,62.0,156.0,89.0,0.0,30.0,3.8,0.8,225,53,90.0,124.0,112,0.0,0.72,306.0,3.19,5.9,10.6,2.9,False,False,False,False,False,False,False,False,134.0,83.0,87.0,,,2016-05-04,66.0,156.0,,0.0,,4.8,1.0,213.0,50.0,78.0,125.0,111.0,5.96,0.91,419.0,3.27,5.9,9.4,2.58,False,False,False,False,False,False,False,False,False,124.0,69.0,68.0,1093.0,25.5,27.1,1.0,,1.0,1.0,3,2.0,81.527972282329,60.9523115891595,True,1093.0,0.69271,170.0,93.98,0.23,5.555,59.833,23.58,0.96889,0.2523,0.1616,0.9957,203.32102,1.5522,29.7843,18.9961
8,8,9,2013-02-08,1,29/7/1940,False,2012-11-23,79.0,149.0,105.0,4.0,25.0,5.5,0.7,184,134,64.0,93.0,106,6.0,0.81,393.0,3.19,6.2,25.2,6.6,False,False,True,False,False,True,False,True,120.0,71.0,77.0,,,2016-02-15,76.0,149.0,110.0,4.0,34.0,5.6,0.8,176.0,105.0,69.0,86.0,101.0,0.0,1.02,414.0,3.27,5.9,17.8,4.44,False,False,False,True,False,False,True,False,True,152.0,72.0,71.0,1179.0,35.6,34.2,1.0,1.0,1.0,1.0,3,3.0,86.8377143235127,72.3508879239436,False,1179.0,0.49912,59.5,101.43,0.15,5.735,40.2915,19.715,0.89435,0.3913,0.3086,0.998,106.22451,1.3861,20.9251,14.7668
9,9,10,2012-05-29,1,2/12/1941,False,2012-05-16,70.0,145.0,105.0,4.0,21.0,5.6,0.7,142,112,72.0,86.0,98,12.17,0.67,403.0,3.34,5.5,24.7,5.98,True,False,True,True,False,True,False,False,144.0,81.0,69.0,,,2016-05-30,65.0,145.0,,5.0,,4.5,0.8,147.0,126.0,75.0,47.0,86.0,5.16,0.8,301.0,3.27,5.3,14.5,3.08,True,False,False,True,True,False,True,False,True,146.0,73.0,69.0,1475.0,33.3,30.9,0.0,0.0,1.0,1.0,2,1.0,88.0663276607072,72.8609143242131,False,1475.0,0.81597,0.0,85.09,0.19,5.345,33.25,17.79,0.90085,0.3838,0.3159,0.9968,76.90064,1.2188,22.604,18.5445


In [10]:
# Eliminamos las columnas no necesarias y fijamos los tipos de clases para cada una
diabetes = cleaning(diabetes_orig.copy())

# Completamos los datos faltantes
diabetes = complete_data(diabetes)

In [11]:
diabetes

Unnamed: 0,sexo,ecv,talla1,cintura1,ntratami1,X25oh1,urico1,crea1,triglice1,hdl1,ldl1,glucemia1,microalc1,cistatin1,fibrinog1,pcr1,hbglicos1,insulina1,homa1,tas_s1,diabete2,imc1,sm1,hta1,ncrit_sm1,epi1,TU100,AO140,gluc_media,coef.var,gmi,mage,conga2,dfa,apen,sampen,permentrop,co,P_sd1,P_sd2,P_e
0,1.0,0.0,160.0,89.0,2.0,20.0,5.4,1.3,65.0,86.0,73.0,106.0,0.0,1.19,523.0,4.87,6.3,7.3,1.91,137.0,0.0,25.4,1.0,1.0,3.0,39.666204,0.62248,68.5,98.42,0.2,5.665,56.35,25.87,0.92745,0.293,0.1887,0.9967,59.4059,1.7058,26.8306,15.8225
1,0.0,0.0,176.0,110.0,0.0,22.0,4.8,0.8,217.0,49.0,107.0,92.0,0.0,0.64,294.0,3.19,5.8,17.7,4.02,143.0,0.0,30.0,1.0,1.0,3.0,110.182148,0.41493,295.5,107.27,0.21,5.875,53.3335,30.95,1.0092,0.3666,0.2745,0.9968,199.6197,1.7989,29.3355,16.3145
2,0.0,1.0,172.0,120.0,2.0,26.0,7.1,1.1,107.0,55.0,70.0,114.0,0.0,0.99,344.0,3.44,5.5,9.7,2.73,126.0,0.0,33.8,1.0,1.0,3.0,72.070695,0.71528,0.0,90.12,0.17,5.465,34.3855,15.715,0.85676,0.3702,0.3162,0.9975,85.92599,1.1842,18.0015,14.3428
3,0.0,1.0,171.0,96.0,2.0,56.0,4.8,0.9,43.0,62.0,82.0,110.0,4.59,0.79,368.0,3.19,6.0,7.3,1.98,137.0,0.0,26.7,0.0,1.0,2.0,88.068127,0.58819,39.5,95.38,0.21,5.59,49.0285,21.595,0.88439,0.411,0.3635,0.9962,99.61246,1.6722,26.4029,15.5557
4,1.0,1.0,160.0,100.0,1.0,40.0,5.0,0.7,168.0,54.0,104.0,106.0,2.59,0.71,334.0,3.19,5.2,11.75,2.895,127.0,0.0,25.8,1.0,1.0,4.0,99.23678,0.48958,5.0,98.79,0.2,5.675,46.378,20.2,0.8277,0.408,0.3383,0.9972,70.45952,1.7185,24.3725,14.3563
5,1.0,0.0,165.0,104.0,2.0,27.0,2.7,0.8,82.0,77.0,96.0,99.0,46.16,0.62,297.0,3.19,5.7,9.5,2.32,126.0,0.0,31.6,0.0,1.0,2.0,75.465486,0.13767,367.5,113.14,0.15,6.02,49.1665,25.675,0.96611,0.3692,0.2532,0.9982,152.62411,1.6036,23.5914,14.6312
6,1.0,0.0,148.0,108.0,4.0,5.0,6.7,0.6,179.0,54.0,120.0,100.0,4.14,0.68,330.0,9.66,5.5,31.1,7.68,121.0,0.0,42.5,1.0,1.0,4.0,110.434347,0.39931,157.5,103.15,0.19,5.775,49.7225,26.67,0.96967,0.4238,0.3722,0.997,100.08957,1.7275,26.3135,15.203
7,1.0,0.0,156.0,89.0,0.0,30.0,3.8,0.8,53.0,90.0,124.0,112.0,0.0,0.72,306.0,3.19,5.9,10.6,2.9,134.0,0.0,25.5,1.0,1.0,3.0,81.527972,0.69271,170.0,93.98,0.23,5.555,59.833,23.58,0.96889,0.2523,0.1616,0.9957,203.32102,1.5522,29.7843,18.9961
8,1.0,0.0,149.0,105.0,4.0,25.0,5.5,0.7,134.0,64.0,93.0,106.0,6.0,0.81,393.0,3.19,6.2,25.2,6.6,120.0,0.0,35.6,1.0,1.0,3.0,86.837714,0.49912,59.5,101.43,0.15,5.735,40.2915,19.715,0.89435,0.3913,0.3086,0.998,106.22451,1.3861,20.9251,14.7668
9,1.0,0.0,145.0,105.0,4.0,21.0,5.6,0.7,112.0,72.0,86.0,98.0,12.17,0.67,403.0,3.34,5.5,24.7,5.98,144.0,0.0,33.3,0.0,1.0,2.0,88.066328,0.81597,0.0,85.09,0.19,5.345,33.25,17.79,0.90085,0.3838,0.3159,0.9968,76.90064,1.2188,22.604,18.5445


## Standarize

In [12]:
diabetes_scaled = diabetes.copy()
diabetes_scaled = standarize(diabetes_scaled)

## Didive into features and labels

In [13]:
# Separar diabetes y diabetes scaled en features y labels

X, y = get_features_lables(diabetes)
X_scaled, y_scaled = get_features_lables(diabetes_scaled)

# 2) FEATURE SELECTION

In [14]:
from sklearn.feature_selection import mutual_info_classif, RFE
from sklearn.svm import LinearSVC
import seaborn as sns

## Correlation Matrix

In [15]:
# Calculate the correlation matrix and take the absolute value
corr_matrix = X_scaled.corr().abs()

# Create a True/False mask and apply it
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
tri_df = corr_matrix.mask(mask)

In [16]:
# List column names of highly correlated features (r > 0.9)
to_drop = list()

for col in tri_df.keys():
    for row in tri_df.keys():
        if tri_df[col][row] > 0.9:
            to_drop.append((col, row))
        
for row in to_drop:
    print(row)

('insulina1', 'homa1')
('gluc_media', 'gmi')
('coef.var', 'permentrop')
('mage', 'conga2')
('conga2', 'P_sd2')
('apen', 'sampen')


In [17]:
columns_to_drop = ['homa1', 'gluc_media', 'permentrop', 'conga2', 'apen']

In [18]:
# Drop the features in the to_drop list
X = X.drop(['homa1', 'gluc_media', 'permentrop', 'conga2', 'apen'], axis=1)
X_scaled = X_scaled.drop(['homa1', 'gluc_media', 'permentrop', 'conga2', 'apen'], axis=1)

## Info Gain

In [19]:
mutual_info = dict(zip(X.columns,
                    mutual_info_classif(X, y, n_neighbors = 3, random_state = 21)
                    ))

In [20]:
mutual_info_ordered = dict(sorted(mutual_info.items(),  key=lambda x: x[1], reverse=True))

In [21]:
mutual_info_ordered

{'TU100': 0.05645409236979315,
 'gmi': 0.0383123785812407,
 'epi1': 0.03769417976247791,
 'sm1': 0.023504236210417195,
 'hbglicos1': 0.0233956367474335,
 'pcr1': 0.014728999476763649,
 'mage': 0.013783265876977424,
 'ncrit_sm1': 0.01306371500803194,
 'P_sd2': 0.013008583154656028,
 'crea1': 0.009432009589277479,
 'X25oh1': 0.008606062324509223,
 'ldl1': 0.0080064028874971,
 'urico1': 0.006792835233691452,
 'P_e': 0.006688472492348163,
 'sampen': 0.006643280804989216,
 'cintura1': 0.0031557694666899394,
 'P_sd1': 0.0024283585573383615,
 'ntratami1': 0.002179772915814393,
 'sexo': 0.0018497225696814112,
 'ecv': 0.0,
 'talla1': 0.0,
 'triglice1': 0.0,
 'hdl1': 0.0,
 'glucemia1': 0.0,
 'microalc1': 0.0,
 'cistatin1': 0.0,
 'fibrinog1': 0.0,
 'insulina1': 0.0,
 'tas_s1': 0.0,
 'imc1': 0.0,
 'hta1': 0.0,
 'AO140': 0.0,
 'coef.var': 0.0,
 'dfa': 0.0,
 'co': 0.0}

## Recursive Feature Elimination

In [22]:
selector = RFE(LinearSVC(), n_features_to_select = 1, step=1)
selector = selector.fit(X_scaled, y_scaled)

rfe_selection = dict(zip(X_scaled.columns,
                    selector.ranking_
                    ))



In [23]:
rfe_selection_ordered = dict(sorted(rfe_selection.items(),  key=lambda x: x[1], reverse=False))

In [24]:
rfe_selection_ordered 

{'hbglicos1': 1,
 'TU100': 2,
 'sexo': 3,
 'fibrinog1': 4,
 'imc1': 5,
 'triglice1': 6,
 'ncrit_sm1': 7,
 'urico1': 8,
 'microalc1': 9,
 'cistatin1': 10,
 'co': 11,
 'gmi': 12,
 'AO140': 13,
 'epi1': 14,
 'crea1': 15,
 'cintura1': 16,
 'P_sd2': 17,
 'dfa': 18,
 'P_sd1': 19,
 'mage': 20,
 'sampen': 21,
 'glucemia1': 22,
 'insulina1': 23,
 'coef.var': 24,
 'X25oh1': 25,
 'sm1': 26,
 'pcr1': 27,
 'talla1': 28,
 'ldl1': 29,
 'P_e': 30,
 'hdl1': 31,
 'ecv': 32,
 'ntratami1': 33,
 'tas_s1': 34,
 'hta1': 35}

## Value of both Importances
mutual_info_ordered / rfe_selection_ordered

In [25]:
importance_values = dict()

for key in rfe_selection_ordered.keys():
    importance_values[key] = mutual_info_ordered[key] / rfe_selection_ordered[key] 

In [26]:
importance_values_ordered = dict(sorted(importance_values.items(),  key=lambda x: x[1], reverse=True))

In [27]:
importance_values_ordered 

{'TU100': 0.028227046184896576,
 'hbglicos1': 0.0233956367474335,
 'gmi': 0.003192698215103392,
 'epi1': 0.002692441411605565,
 'ncrit_sm1': 0.00186624500114742,
 'sm1': 0.000904009085016046,
 'urico1': 0.0008491044042114315,
 'P_sd2': 0.0007652107738032957,
 'mage': 0.0006891632938488712,
 'crea1': 0.0006288006392851653,
 'sexo': 0.0006165741898938037,
 'pcr1': 0.0005455184991393944,
 'X25oh1': 0.0003442424929803689,
 'sampen': 0.0003163467049994865,
 'ldl1': 0.00027608285818955515,
 'P_e': 0.0002229490830782721,
 'cintura1': 0.0001972355916681212,
 'P_sd1': 0.00012780834512307165,
 'ntratami1': 6.605372472164828e-05,
 'fibrinog1': 0.0,
 'imc1': 0.0,
 'triglice1': 0.0,
 'microalc1': 0.0,
 'cistatin1': 0.0,
 'co': 0.0,
 'AO140': 0.0,
 'dfa': 0.0,
 'glucemia1': 0.0,
 'insulina1': 0.0,
 'coef.var': 0.0,
 'talla1': 0.0,
 'hdl1': 0.0,
 'ecv': 0.0,
 'tas_s1': 0.0,
 'hta1': 0.0}

In [28]:
list_importance_values = list()

for col in importance_values_ordered.keys():
    list_importance_values.append(col)

In [29]:
list_importance_values[:5]

['TU100', 'hbglicos1', 'gmi', 'epi1', 'ncrit_sm1']

## Creating the reduced features sets
X_red and X_red_scaled

In [30]:
top = 5

In [31]:
X = X[list_importance_values[:top]]
X_scaled = X_scaled[list_importance_values[:top]]

# 3) DATA AUGMENTATION

In [32]:
import imblearn
print(imblearn.__version__)

0.9.1


In [33]:
from imblearn.over_sampling import SMOTE

# transform the dataset

oversample = SMOTE(sampling_strategy=0.2, k_neighbors = 5)
X, y = oversample.fit_resample(X, y)
X_scaled, y_scaled = oversample.fit_resample(X_scaled, y_scaled)

# summarize class distribution
counter = Counter(y)
print(counter)

Counter({0.0: 192, 1.0: 38})


# 4) TESTING

# SVM

In [35]:
from sklearn.svm import SVC

# defining parameter range

"""param_grid = {'C': [0.1, 1, 10, 100],
              'gamma': [0.001, 0.01, 0.1],
              'kernel': ['rbf', 'sigmoid', 'linear', 'poly']
            }"""

param_grid = {'C': [10, 100],
              'gamma': [0.01, 0.1],
              'kernel': ['rbf', 'sigmoid']
            }
 
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 1, scoring = 'f1', cv=10)
 

In [36]:
svm, acc, prec, recall, auc = train_and_test(X_scaled, y_scaled, grid, 10)

Fitting 10 folds for each of 8 candidates, totalling 80 fits
Fitting 10 folds for each of 8 candidates, totalling 80 fits
Fitting 10 folds for each of 8 candidates, totalling 80 fits
Fitting 10 folds for each of 8 candidates, totalling 80 fits
Fitting 10 folds for each of 8 candidates, totalling 80 fits
Fitting 10 folds for each of 8 candidates, totalling 80 fits
Fitting 10 folds for each of 8 candidates, totalling 80 fits
Fitting 10 folds for each of 8 candidates, totalling 80 fits
Fitting 10 folds for each of 8 candidates, totalling 80 fits
Fitting 10 folds for each of 8 candidates, totalling 80 fits


In [37]:
print("BEST CLASSIFIER :", svm)
print("\naccuracy: {:.2f}".format(acc))
print("precision: {:.2f}".format(prec))
print("recall: {:.2f}".format(recall))
print("auc: {:.2f}".format(auc))


BEST CLASSIFIER : SVC(C=100, gamma=0.1)

accuracy: 0.91
precision: 0.74
recall: 0.78
auc: 0.86


In [38]:
compute_confusion_matrix(acc, prec, recall, len(y))


confusion matrix
[tp, fn ]
[fp, tn ]

[33 10]
[11 176]


# Random Forest

In [39]:
from sklearn.ensemble import RandomForestClassifier

# defining parameter range

param_grid = {'n_estimators': [400],
              'criterion': ["gini"],
              'max_depth': [10]
            }
 
grid = GridSearchCV(RandomForestClassifier(), param_grid, refit = True, verbose = 10, scoring = 'f1', cv=10)

In [40]:
rf, acc, prec, recall, auc = train_and_test(X, y, grid, 10)

Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV 1/10; 1/1] START criterion=gini, max_depth=10, n_estimators=400.............
[CV 1/10; 1/1] END criterion=gini, max_depth=10, n_estimators=400;, score=0.800 total time=   0.3s
[CV 2/10; 1/1] START criterion=gini, max_depth=10, n_estimators=400.............
[CV 2/10; 1/1] END criterion=gini, max_depth=10, n_estimators=400;, score=1.000 total time=   0.3s
[CV 3/10; 1/1] START criterion=gini, max_depth=10, n_estimators=400.............
[CV 3/10; 1/1] END criterion=gini, max_depth=10, n_estimators=400;, score=0.500 total time=   0.3s
[CV 4/10; 1/1] START criterion=gini, max_depth=10, n_estimators=400.............
[CV 4/10; 1/1] END criterion=gini, max_depth=10, n_estimators=400;, score=0.400 total time=   0.3s
[CV 5/10; 1/1] START criterion=gini, max_depth=10, n_estimators=400.............
[CV 5/10; 1/1] END criterion=gini, max_depth=10, n_estimators=400;, score=0.667 total time=   0.3s
[CV 6/10; 1/1] START criterion=gini, ma

[CV 5/10; 1/1] END criterion=gini, max_depth=10, n_estimators=400;, score=0.400 total time=   0.3s
[CV 6/10; 1/1] START criterion=gini, max_depth=10, n_estimators=400.............
[CV 6/10; 1/1] END criterion=gini, max_depth=10, n_estimators=400;, score=0.667 total time=   0.3s
[CV 7/10; 1/1] START criterion=gini, max_depth=10, n_estimators=400.............
[CV 7/10; 1/1] END criterion=gini, max_depth=10, n_estimators=400;, score=0.750 total time=   0.3s
[CV 8/10; 1/1] START criterion=gini, max_depth=10, n_estimators=400.............
[CV 8/10; 1/1] END criterion=gini, max_depth=10, n_estimators=400;, score=0.571 total time=   0.3s
[CV 9/10; 1/1] START criterion=gini, max_depth=10, n_estimators=400.............
[CV 9/10; 1/1] END criterion=gini, max_depth=10, n_estimators=400;, score=1.000 total time=   0.3s
[CV 10/10; 1/1] START criterion=gini, max_depth=10, n_estimators=400............
[CV 10/10; 1/1] END criterion=gini, max_depth=10, n_estimators=400;, score=0.800 total time=   0.3s


[CV 10/10; 1/1] END criterion=gini, max_depth=10, n_estimators=400;, score=0.571 total time=   0.3s
Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV 1/10; 1/1] START criterion=gini, max_depth=10, n_estimators=400.............
[CV 1/10; 1/1] END criterion=gini, max_depth=10, n_estimators=400;, score=0.667 total time=   0.3s
[CV 2/10; 1/1] START criterion=gini, max_depth=10, n_estimators=400.............
[CV 2/10; 1/1] END criterion=gini, max_depth=10, n_estimators=400;, score=0.667 total time=   0.3s
[CV 3/10; 1/1] START criterion=gini, max_depth=10, n_estimators=400.............
[CV 3/10; 1/1] END criterion=gini, max_depth=10, n_estimators=400;, score=1.000 total time=   0.3s
[CV 4/10; 1/1] START criterion=gini, max_depth=10, n_estimators=400.............
[CV 4/10; 1/1] END criterion=gini, max_depth=10, n_estimators=400;, score=0.400 total time=   0.3s
[CV 5/10; 1/1] START criterion=gini, max_depth=10, n_estimators=400.............
[CV 5/10; 1/1] END criterion=gini, max

In [41]:
print("BEST CLASSIFIER :", rf)
print("\naccuracy: {:.2f}".format(acc))
print("precision: {:.2f}".format(prec))
print("recall: {:.2f}".format(recall))
print("auc: {:.2f}".format(auc))

BEST CLASSIFIER : RandomForestClassifier(max_depth=10, n_estimators=400)

accuracy: 0.87
precision: 0.71
recall: 0.45
auc: 0.70


In [43]:
compute_confusion_matrix(acc, prec, recall, len(y))


confusion matrix
[tp, fn ]
[fp, tn ]

[18 22]
[8 182]
