In [1]:
import pandas as pd
kidney_disease = pd.read_csv(r'Data/clean_dataset.csv')
display(kidney_disease)

Unnamed: 0,id,age,blood_pressure,specific_gravity,albumin,sugar,red_blood_cells,pus_cell,pus_cell_clumps,bacteria,...,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,appetite,peda_edema,aanemia,classification
0,0,48.0,80.0,1.020,1.0,0.0,,True,False,False,...,44,7800,5.2,True,True,False,True,False,False,ckd
1,1,7.0,50.0,1.020,4.0,0.0,,True,False,False,...,38,6000,,False,False,False,True,False,False,ckd
2,2,62.0,80.0,1.010,2.0,3.0,True,True,False,False,...,31,7500,,False,True,False,False,False,True,ckd
3,3,48.0,70.0,1.005,4.0,0.0,True,False,True,False,...,32,6700,3.9,True,False,False,False,True,True,ckd
4,4,51.0,80.0,1.010,2.0,0.0,True,True,False,False,...,35,7300,4.6,False,False,False,True,False,False,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
367,395,55.0,80.0,1.020,0.0,0.0,True,True,False,False,...,47,6700,4.9,False,False,False,True,False,False,notckd
368,396,42.0,70.0,1.025,0.0,0.0,True,True,False,False,...,54,7800,6.2,False,False,False,True,False,False,notckd
369,397,12.0,80.0,1.020,0.0,0.0,True,True,False,False,...,49,6600,5.4,False,False,False,True,False,False,notckd
370,398,17.0,60.0,1.025,0.0,0.0,True,True,False,False,...,51,7200,5.9,False,False,False,True,False,False,notckd


# Select data and target

In [2]:
kidney_disease_data = kidney_disease[['age', 'hypertension', 'diabetes_mellitus', 'serum_creatinine', 'coronary_artery_disease']]
kidney_disease_target = kidney_disease['classification']

# Best fit finder

In [3]:
from sklearn.base import ClassifierMixin

def split_dataset(test_size):
    return train_test_split(
        kidney_disease_data, 
        kidney_disease_target, 
        test_size=test_size, 
        random_state=0)

def classifier_score_train_test(classifier : ClassifierMixin, test_size):
    x_train, x_test, y_train, y_test = split_dataset(test_size)
    classifier.fit(x_train, y_train)
    return classifier.score(x_test, y_test)


def optimize_classifier_train_test(classifier : ClassifierMixin, min_test_size=10, percent_step=1):
    test_size_max = 0
    accuracy_max = 0
    maxed = []
    not_runnable = []
    for test_size in range(min_test_size, 100):
        try:
            accuracy = classifier_score_train_test(classifier, test_size*percent_step*0.01)
        except:
            not_runnable.append(test_size)
        else:
            if accuracy == 1:
                maxed.append(test_size)
            if accuracy > accuracy_max:
                accuracy_max = accuracy
                test_size_max = test_size
    if len(maxed) > 1:
        print('Warning - classifier max accuracy reached for values:')
        print(maxed)
    return test_size_max, accuracy_max

# Split train test

In [4]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    kidney_disease_data,
    kidney_disease_target,
    random_state=0
)

# K-nearest Neighbors

In [5]:
from sklearn.neighbors import KNeighborsClassifier

# Standard (0.2 test, 0.8 train, 5 neighbors)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train, y_train)
print("Standard accuracy (20% test, 5 neighbors): {:.3f}".format(knn.score(x_test, y_test)))

# Optimized
neighbors_max = 0
classifier_max = 0
test_size_max = 0
for neighbors in range(1, 20):
    test_size_score, classifier_score = optimize_classifier_train_test(KNeighborsClassifier(n_neighbors=neighbors))
    if classifier_score > classifier_max:
        classifier_max = classifier_score
        test_size_max = test_size_score
        neighbors_max = neighbors
print("Max accuracy ({:d}% test, {:d} neighbors): {:.3f}".format(test_size_max, neighbors_max, classifier_max))

Standard accuracy (20% test, 5 neighbors): 0.871
Max accuracy (10% test, 5 neighbors): 0.921


# Decision Tree

In [6]:
from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier(random_state=0)

# Standard
decision_tree.fit(x_train, y_train)
print("Standard accuracy (20% test): {:.3f}".format(decision_tree.score(x_test, y_test)))

# Optimized
test_size_max, classifier_max = optimize_classifier_train_test(decision_tree)
print("Max accuracy ({:d}% test): {:.3f}".format(test_size_max, classifier_max))

Standard accuracy (20% test): 0.935
Max accuracy (35% test): 0.954


# Random Forest

In [7]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=5, random_state=0)

# Standard
forest.fit(x_train, y_train)
print("Standard accuracy (20% test): {:.3f}".format(forest.score(x_test, y_test)))

# Optimized
trees_max = 0
classifier_max = 0
test_size_max = 0
for trees in range(1, 30):
    test_size_score, classifier_score = optimize_classifier_train_test(RandomForestClassifier(n_estimators=trees, random_state=0))
    if classifier_score > classifier_max:
        classifier_max = classifier_score
        test_size_max = test_size_score
        trees_max = trees
print("Max accuracy ({:d}% test, {:d} trees): {:.3f}".format(test_size_max, trees_max, classifier_max))

Standard accuracy (20% test): 0.957
Max accuracy (13% test, 1 trees): 0.980


# Logistic Regression

In [8]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0)

# Standard
clf.fit(x_train, y_train)
print("Standard accuracy (20% test): {:.3f}".format(clf.score(x_test, y_test)))

# Optimized
test_size_max, classifier_max = optimize_classifier_train_test(clf)
print("Max accuracy ({:d}% test): {:.3f}".format(test_size_max, classifier_max))

Standard accuracy (20% test): 0.946
Max accuracy (19% test): 0.958
