In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, recall_score, accuracy_score, classification_report

#from fcapy.context import FormalContext
#from fcapy.lattice import ConceptLattice

#from fcapy.visualizer import LineVizNx
import matplotlib.pyplot as plt

plt.rcParams['figure.facecolor'] = (1,1,1,1)

#import neural_lib as nl

# Pipeline

In [None]:
# Load the raw diabetes table from a hard-coded absolute path.
df = pd.read_csv('/content/diabetes.csv')
print(df.shape)
# Preview a single row. The seed is fixed so the displayed row is the same on
# every re-run, matching the random_state=0 used by the rest of the notebook.
df.sample(random_state=0)

(768, 9)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
505,10,75,82,0,0,33.3,0.263,38,0


# Step 0. Binarize the data

In [None]:
# Binarize the numeric features into boolean indicator columns.
# Each feature is split by thresholds into indicators that together cover every
# possible value, so each row sets exactly one indicator per feature.
df_bin = pd.DataFrame({
    # 1: Pregnancies, split at 6
    'Pregnancies<6': df['Pregnancies'] < 6,
    'Pregnancies>=6': df['Pregnancies'] >= 6,
    # 2: Glucose, split at 150
    'Glucose>=150': df['Glucose'] >= 150,
    'Glucose<150': df['Glucose'] < 150,
    # 3: BMI, split at 40. The lower indicator uses <= so that a BMI of exactly
    # 40 is covered; with strict '<' on both sides such a row had both BMI
    # indicators False, unlike every other feature in this table.
    'BMI>40': df['BMI'] > 40,
    'BMI<=40': df['BMI'] <= 40,
    # 7: DiabetesPedigreeFunction, split at 1
    'DiabetesPedigreeFunction>=1': df['DiabetesPedigreeFunction'] >= 1,
    'DiabetesPedigreeFunction<1': df['DiabetesPedigreeFunction'] < 1,
    # 8: Age, three bands: [60, inf), [40, 60), (-inf, 40)
    'Age>=60': df['Age'] >= 60,
    'Age>=40<60': (df['Age'] < 60) & (df['Age'] >= 40),
    'Age<40': df['Age'] < 40,
    # Target: True when Outcome equals 1
    'Outcome': df['Outcome'] == 1,
})


In [None]:
# Name of the target column inside df_bin.
y_feat = 'Outcome'

# 70/30 split of the binarized table, seeded for reproducibility.
df_train, df_test = train_test_split(df_bin, train_size=0.7, random_state=0)

# Separate features from target and cast the boolean columns to 0/1 integers.
X_train = df_train.drop(columns=y_feat).to_numpy(dtype='int')
y_train = df_train[y_feat].to_numpy(dtype='int')
X_test = df_test.drop(columns=y_feat).to_numpy(dtype='int')
y_test = df_test[y_feat].to_numpy(dtype='int')

# Show the first ten training rows as a sanity check.
X_train[:10]


array([[1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1],
       [1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1],
       [1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1],
       [1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0],
       [1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0],
       [1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0],
       [0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1],
       [1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0],
       [0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0],
       [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]])

Default tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline: a decision tree with default hyperparameters (only the seed is set).
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)

# Score the held-out split with the three metrics used throughout the notebook.
y_pred = clf.predict(X_test)
for label, metric in (
    ('Recall score:', recall_score),
    ('F1     score:', f1_score),
    ('Accuracy score:', accuracy_score),
):
    print(label, metric(y_test, y_pred))

Recall score: 0.47297297297297297
F1     score: 0.5
Accuracy score: 0.696969696969697


In [None]:
# Depth of the tree currently bound to `clf`; when cells are run in order this
# is the baseline tree fitted with default hyperparameters in the previous cell.
clf.get_depth()

6

GridSearchCV with decision trees

(scoring with recall)

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the decision tree. It is reused by the f1-scored
# search in a later cell.
# 'log_loss' is intentionally not listed: scikit-learn treats it as the same
# Shannon-entropy criterion as 'entropy', so including it only repeats a third
# of the fits with identical scores, and older scikit-learn releases reject it.
parameters = {'criterion': ['gini', 'entropy'],
              'max_depth' : [1,2,3,4,5,6],
              'min_samples_split': [2,5, 8, 10],
              'min_samples_leaf': [1,2,3,5, 7, 10]}
tree = DecisionTreeClassifier(random_state=0)
# Cross-validated exhaustive search that selects the candidate with the best recall.
clf = GridSearchCV(tree, parameters, verbose = 3, scoring = 'recall')
clf.fit(X_train, y_train)

In [None]:
# Evaluate the recall-tuned tree search on the held-out split.
y_pred = clf.predict(X_test)
scorers = {
    'Recall score:': recall_score,
    'F1     score:': f1_score,
    'Accuracy score:': accuracy_score,
}
for label, scorer in scorers.items():
    print(label, scorer(y_test, y_pred))

Recall score: 0.5405405405405406
F1     score: 0.5298013245033113
Accuracy score: 0.6926406926406926


(scoring with f1)

In [None]:

# Same decision-tree search as before, but candidates are ranked by F1.
# Relies on `parameters` holding the tree grid defined in an earlier cell.
tree = DecisionTreeClassifier(random_state=0)
clf = GridSearchCV(
    estimator=tree,
    param_grid=parameters,
    scoring='f1',
    verbose=3,
)
clf.fit(X_train, y_train)

In [None]:
# Evaluate the F1-tuned tree search on the held-out split.
y_pred = clf.predict(X_test)
labels = ['Recall score:', 'F1     score:', 'Accuracy score:']
metrics = [recall_score, f1_score, accuracy_score]
for label, metric in zip(labels, metrics):
    print(label, metric(y_test, y_pred))

Recall score: 0.5
F1     score: 0.5211267605633804
Accuracy score: 0.7056277056277056


Simple gradient boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Baseline: gradient boosting with default hyperparameters (only the seed is set).
clf = GradientBoostingClassifier(random_state=0)
clf.fit(X_train, y_train)

# Score the held-out split with the same three metrics as the tree models.
y_pred = clf.predict(X_test)
for label, metric in (
    ('Recall score:', recall_score),
    ('F1     score:', f1_score),
    ('Accuracy score:', accuracy_score),
):
    print(label, metric(y_test, y_pred))

Recall score: 0.5135135135135135
F1     score: 0.5277777777777778
Accuracy score: 0.7056277056277056


GridSearchCV with gradient boosting

(scoring with recall)

In [None]:
# Hyperparameter grid for gradient boosting.
# NOTE: this rebinds `parameters`, replacing the decision-tree grid; re-running
# an earlier tree search cell after this one would pick up this grid instead.
parameters = {
    'learning_rate': [1, 0.1, 0.01, 0.001],
    'n_estimators': [10, 50, 100, 200],
    'subsample': [1.0, 0.75, 0.5, 0.25],
}
gb = GradientBoostingClassifier(random_state=0)
# Cross-validated exhaustive search that selects the candidate with the best recall.
clf = GridSearchCV(
    estimator=gb,
    param_grid=parameters,
    scoring='recall',
    verbose=3,
)
clf.fit(X_train, y_train)

In [None]:
# Evaluate the recall-tuned gradient boosting search on the held-out split.
y_pred = clf.predict(X_test)
scorers = {
    'Recall score:': recall_score,
    'F1     score:': f1_score,
    'Accuracy score:': accuracy_score,
}
for label, scorer in scorers.items():
    print(label, scorer(y_test, y_pred))

Recall score: 0.5135135135135135
F1     score: 0.5277777777777778
Accuracy score: 0.7056277056277056


(scoring with f1)

In [None]:
# Same gradient boosting search as before, but candidates are ranked by F1.
# Relies on `parameters` holding the gradient boosting grid from an earlier cell.
gb = GradientBoostingClassifier(random_state=0)
clf = GridSearchCV(
    estimator=gb,
    param_grid=parameters,
    scoring='f1',
    verbose=3,
)
clf.fit(X_train, y_train)


In [None]:
# Evaluate the F1-tuned gradient boosting search on the held-out split.
y_pred = clf.predict(X_test)
labels = ['Recall score:', 'F1     score:', 'Accuracy score:']
metrics = [recall_score, f1_score, accuracy_score]
for label, metric in zip(labels, metrics):
    print(label, metric(y_test, y_pred))

Recall score: 0.4594594594594595
F1     score: 0.5230769230769231
Accuracy score: 0.7316017316017316
