In [1]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import classification_report
from numpy import mean
from numpy import std
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedStratifiedKFold
from matplotlib import pyplot

In [3]:
import pandas as pd
import numpy as np
import io
from google.colab import files
uploaded = files.upload()
data = pd.read_csv(io.BytesIO(uploaded["kidney_disease_train.csv"]))


Saving kidney_disease_train.csv to kidney_disease_train.csv


In [5]:
x = data
y = data['classification']
column_trans = make_column_transformer((OneHotEncoder(handle_unknown='ignore'),
                                        [6, 7, 8,9,17,18,19,20,21,22,23,24,25]),
                                      remainder='passthrough')

xtrain, xtest, ytrain, ytest=train_test_split(x, y, test_size=0.15)
xtrain = xtrain.replace((np.nan), "0.0").reset_index(drop=True)
xtest = xtest.replace((np.nan), "0.0").reset_index(drop=True)


In [6]:
import os, ast
CPU_NUMBER = 1
if 'RUNTIME_HARDWARE_SPEC' in os.environ:
    CPU_NUMBER = int(ast.literal_eval(os.environ['RUNTIME_HARDWARE_SPEC'])['num_cpu'])

In [7]:
extra_trees_classifier = ExtraTreesClassifier(
    bootstrap=True,
    class_weight="balanced",
    criterion="entropy",
    max_depth=5,
    max_features=0.9978125453053365,
    min_samples_split=3,
    n_estimators=66,
    n_jobs=CPU_NUMBER,
    random_state=33,
)
pipe = make_pipeline(column_trans, extra_trees_classifier)

pipe.fit(xtrain.values, ytrain.values.ravel());

pipe.predict(xtest.values)

array(['ckd', 'ckd', 'ckd', 'notckd', 'ckd', 'notckd', 'ckd', 'ckd',
       'notckd', 'ckd', 'notckd', 'notckd', 'notckd', 'ckd', 'ckd',
       'notckd', 'ckd', 'ckd', 'ckd', 'notckd', 'ckd', 'ckd', 'ckd',
       'notckd', 'ckd', 'notckd', 'notckd', 'ckd', 'ckd', 'ckd', 'ckd',
       'ckd', 'ckd', 'ckd', 'ckd', 'notckd', 'ckd', 'ckd', 'notckd',
       'notckd', 'notckd', 'notckd'], dtype=object)

**Accuracy**

In [8]:
from sklearn.metrics import get_scorer

ypred = pipe.predict(xtrain.values)
scorer = get_scorer('accuracy')
score = scorer(pipe, xtest.values, ytest.values)
print("Accuracy for training data:",accuracy_score(ytrain,ypred))
print("Accuracy for test data:",score)

Accuracy for training data: 1.0
Accuracy for test data: 1.0


**Confusion Matrix**

In [9]:
ypred = pipe.predict(xtest.values)

cm = confusion_matrix(ytest, ypred)
print(cm)

[[26  0]
 [ 0 16]]


**Classification Report**

In [10]:
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

         ckd       1.00      1.00      1.00        26
      notckd       1.00      1.00      1.00        16

    accuracy                           1.00        42
   macro avg       1.00      1.00      1.00        42
weighted avg       1.00      1.00      1.00        42



**Hyperparameter Tuning**

In [11]:
#n_estimators
parameters = {
    'n_estimators': [10, 20, 50, 100, 200, 500, 1000, 1200, 1500, 1800, 1900, 2000, 2100, 3000]
}
clf = GridSearchCV(extra_trees_classifier, parameters, cv=5, verbose=3, n_jobs=-1)
pipe = make_pipeline(column_trans, clf)

pipe.fit(xtrain.values, ytrain.values.ravel());

pred=pipe.predict(xtest.values)
print(classification_report(ytest, pred))
 

Fitting 5 folds for each of 14 candidates, totalling 70 fits
              precision    recall  f1-score   support

         ckd       1.00      1.00      1.00        26
      notckd       1.00      1.00      1.00        16

    accuracy                           1.00        42
   macro avg       1.00      1.00      1.00        42
weighted avg       1.00      1.00      1.00        42



In [12]:
#criterion
parameters = {
    'criterion': ['gini', 'entropy']
}
clf = GridSearchCV(extra_trees_classifier, parameters, cv=5, verbose=3, n_jobs=-1)
pipe = make_pipeline(column_trans, clf)

pipe.fit(xtrain.values, ytrain.values.ravel());

pred=pipe.predict(xtest.values)
print(classification_report(ytest, pred))

Fitting 5 folds for each of 2 candidates, totalling 10 fits
              precision    recall  f1-score   support

         ckd       1.00      1.00      1.00        26
      notckd       1.00      1.00      1.00        16

    accuracy                           1.00        42
   macro avg       1.00      1.00      1.00        42
weighted avg       1.00      1.00      1.00        42



In [13]:
#max_depth
parameters = {
    'max_depth': [1, 2, 5, 8, 13, 21, 34, 53, 54, 55, 89, None]
}
clf = GridSearchCV(extra_trees_classifier, parameters, cv=5, verbose=3, n_jobs=-1)
pipe = make_pipeline(column_trans, clf)

pipe.fit(xtrain.values, ytrain.values.ravel());

pred=pipe.predict(xtest.values)
print(classification_report(ytest, pred))

Fitting 5 folds for each of 12 candidates, totalling 60 fits
              precision    recall  f1-score   support

         ckd       1.00      1.00      1.00        26
      notckd       1.00      1.00      1.00        16

    accuracy                           1.00        42
   macro avg       1.00      1.00      1.00        42
weighted avg       1.00      1.00      1.00        42



In [14]:
#min_samples_leaf
parameters = {
    'min_samples_leaf': [1, 2, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377]
}
clf = GridSearchCV(extra_trees_classifier, parameters, cv=5, verbose=3, n_jobs=-1)
pipe = make_pipeline(column_trans, clf)

pipe.fit(xtrain.values, ytrain.values.ravel());

pred=pipe.predict(xtest.values)
print(classification_report(ytest, pred))

Fitting 5 folds for each of 12 candidates, totalling 60 fits
              precision    recall  f1-score   support

         ckd       1.00      1.00      1.00        26
      notckd       1.00      1.00      1.00        16

    accuracy                           1.00        42
   macro avg       1.00      1.00      1.00        42
weighted avg       1.00      1.00      1.00        42



In [15]:
#max_leaf_nodes
parameters = {
    'max_leaf_nodes': [2, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, None]
}
clf = GridSearchCV(extra_trees_classifier, parameters, cv=5, verbose=3, n_jobs=-1)
pipe = make_pipeline(column_trans, clf)

pipe.fit(xtrain.values, ytrain.values.ravel());

pred=pipe.predict(xtest.values)
print(classification_report(ytest, pred))

Fitting 5 folds for each of 12 candidates, totalling 60 fits
              precision    recall  f1-score   support

         ckd       1.00      1.00      1.00        26
      notckd       1.00      1.00      1.00        16

    accuracy                           1.00        42
   macro avg       1.00      1.00      1.00        42
weighted avg       1.00      1.00      1.00        42



**Validation Method**

In [16]:
cv_scores = cross_val_score(pipe, xtrain, ytrain, cv=5 )
print("Cross Validation average score: %.2f" % cv_scores.mean())

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Cross Validation average score: 1.00
