In [1]:
import pandas as pd
import graphviz
import numpy as np

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [2]:
# Data import
data_diabetes = pd.read_table('patients_data.txt', sep='\t',header=None)
classes_diabetes = pd.read_table('patients_classes.txt', sep='\t',header=None)

In [3]:
# Decision Trees
clf = tree.DecisionTreeClassifier()
score_tree = cross_val_score(clf, data_diabetes, classes_diabetes, cv=5).mean()

In [4]:
# Tree Visualization
clf = clf.fit(data_diabetes, classes_diabetes)
feature_names = ['age', 'hba1c', 'insuline taken', 'other drugs taken']
classes = ['DR', 'NDR']
dot_data = tree.export_graphviz(clf, out_file=None,
    feature_names=feature_names,
    class_names=classes,
    filled=True,
    rounded=True,
    special_characters=True)
graph = graphviz.Source(dot_data)
graph.render("diabetes remission")

'diabetes remission.pdf'

In [5]:
# Random Forest
clf = RandomForestClassifier(max_depth=2, random_state=0)
score_forest = cross_val_score(clf, data_diabetes, np.ravel(classes_diabetes,), cv=5).mean()

In [6]:
# DiaRem
df = data_diabetes
df.columns = ['Age', 'Hba', 'Drugs', 'Treatment']
n = [0] * df.shape[0]
df['score'] = n
df['pred_remission'] = n

df.loc[(df['Age'] >= 40) & (df['Age']) < 50, 'score'] += 1
df.loc[(df['Age'] >= 50) & (df['Age']) < 60, 'score'] += 2
df.loc[(df['Age'] >= 60), 'score'] += 3
df.loc[(df['Hba'] >= 6.5) & (df['Hba'] < 7.0), 'score'] += 2
df.loc[(df['Hba'] >= 7.0) & (df['Hba'] < 9), 'score'] += 4
df.loc[df['Hba'] >= 9.0, 'score'] += 6
df.loc[df['Drugs'] == 1, 'score'] += 3
df.loc[df['Treatment'] == 1, 'score'] += 10

df.loc[df['score'] >= 7, 'pred_remission'] = 1
y_pred = df['pred_remission']

In [7]:
# Results
print("Decision Trees accuracy is " + str(score_tree))
print("Random Forest accuracy is " + str(score_forest))
print("DiaRem method accuracy is " + str(accuracy_score(classes_diabetes, y_pred)))

Decision Trees accuracy is 0.6849999999999999
Random Forest accuracy is 0.775
DiaRem method accuracy is 0.605


# Conclusion

After a few experiments making use of the cross validation, we can conclude that the predictive power of the three models can be ranked quite easily. In most of the prediction experiments, the Random Forest accuracy is the highest (around 0.77), while the lowest is the DiaRem one (around 0.6). Between those two is the Decision Trees accuracy (around 0.66). It highlights the power of machine learning methods as we just showed that they beat quite significantly state-of-the-art methods based on categorization, with very few training data available. We yet can observe that the gap between the Decision Trees method and the Random Forest method is in most of experiments higher than the one between DiaRem and Decision Trees.