In [26]:
import numpy as np
from scipy.io import arff
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB

try:
    # scikit-learn >= 0.20 provides the imputer in sklearn.impute.
    from sklearn.impute import SimpleImputer
except ImportError:
    # Older releases only ship the legacy class; expose it under the new name.
    from sklearn.preprocessing import Imputer
    SimpleImputer = Imputer
# Compact array display: three decimals, no scientific notation for small values.
np.set_printoptions(suppress=True, precision=3)

In [27]:
# Read the ARFF file from the working directory; loadarff returns the
# records together with the attribute metadata.
arff_path = 'diabetes.arff'
data, meta = arff.loadarff(arff_path)

In [28]:
# Split every record into its first eight fields (features) and its ninth
# field (class label).
#
# The matrices are built in one pass from Python lists: calling np.append
# inside the loop re-copies the whole array for every record, and the
# np.float / np.str aliases used previously no longer exist in current NumPy.
records = [list(record) for record in data]

# Feature matrix, shape (n_samples, 8); the reshape keeps that shape even
# when `data` is empty.
X = np.array([fields[0:8] for fields in records], dtype=float).reshape(-1, 8)

# 1-D label vector. The cast to str yields text labels even if the loader
# delivered the class column as bytes.
y = np.array([fields[8] for fields in records]).astype(str)

In [29]:
# Gaussian naive Bayes classifier with default settings; the cross-validation
# cells that follow reuse this estimator.
clf = GaussianNB()
# Show the estimator's configuration.
print(clf)

GaussianNB(priors=None)


In [30]:
# GaussianNB on the raw feature matrix: 10-fold cross-validation with the
# estimator's default scorer, reported as mean +/- standard deviation in percent.
scores = cross_val_score(clf, X, y, cv=10)
mean_pct, std_pct = 100 * scores.mean(), 100 * scores.std()
print("{0:4.2f} +/- {1:4.2f} %".format(mean_pct, std_pct))

75.65 +/- 3.30 %


In [31]:
# Work on a copy so the raw matrix X stays available for the other experiments.
X2 = X.copy()

# Zeros in feature columns 1-5 are treated as missing values and replaced by
# the median of their column. SimpleImputer supersedes the Imputer class that
# was removed from sklearn.preprocessing; it always imputes column-wise, so
# the former axis=0 argument is no longer passed.
# NOTE(review): the imputer is fitted on every row, so cross-validation folds
# scored on X2 share imputation statistics; wrap the imputer and classifier
# in a Pipeline if a leak-free estimate is required.
med_imp = SimpleImputer(missing_values=0, strategy='median')
med_imp.fit(X2[:, 1:6])
X2[:, 1:6] = med_imp.transform(X2[:, 1:6])

In [32]:
# GaussianNB on X2 (the median-imputed copy of the features): 10-fold
# cross-validation, reported as mean +/- standard deviation in percent.
scores = cross_val_score(clf, X2, y, cv=10)
mean_pct, std_pct = 100 * scores.mean(), 100 * scores.std()
print("{0:4.2f} +/- {1:4.2f} %".format(mean_pct, std_pct))

75.00 +/- 3.38 %


In [33]:
from sklearn.preprocessing import StandardScaler
# Fit a standard scaler on the current X2 and overwrite X2 with the scaled
# values; the fitted scaler stays available as `scaler`.
scaler = StandardScaler().fit(X2)
X2 = scaler.transform(X2)

In [34]:
# GaussianNB on X2 after standard scaling: 10-fold cross-validation, reported
# as mean +/- standard deviation in percent.
scores = cross_val_score(clf, X2, y, cv=10)
mean_pct, std_pct = 100 * scores.mean(), 100 * scores.std()
print("{0:4.2f} +/- {1:4.2f} %".format(mean_pct, std_pct))

75.00 +/- 3.38 %


In [38]:
# Preview the first 20 rows of the transformed feature matrix.
X2[:20]

array([[ 0.64 ,  0.866, -0.032,  0.671, -0.182,  0.167,  0.468,  1.426],
       [-0.845, -1.205, -0.528, -0.012, -0.182, -0.852, -0.365, -0.191],
       [ 1.234,  2.017, -0.694, -0.012, -0.182, -1.333,  0.604, -0.106],
       [-0.845, -1.074, -0.528, -0.695, -0.541, -0.634, -0.921, -1.042],
       [-1.142,  0.504, -2.679,  0.671,  0.317,  1.549,  5.485, -0.02 ],
       [ 0.343, -0.186,  0.133, -0.012, -0.182, -0.998, -0.818, -0.276],
       [-0.251, -1.435, -1.852,  0.329, -0.61 , -0.212, -0.676, -0.616],
       [ 1.828, -0.219, -0.032, -0.012, -0.182,  0.414, -1.02 , -0.361],
       [-0.548,  2.477, -0.197,  1.809,  4.661, -0.285, -0.948,  1.681],
       [ 1.234,  0.11 ,  1.953, -0.012, -0.182, -0.023, -0.724,  1.766],
       [ 0.046, -0.383,  1.622, -0.012, -0.182,  0.749, -0.848, -0.276],
       [ 1.828,  1.524,  0.133, -0.012, -0.182,  0.807,  0.197,  0.065],
       [ 1.828,  0.57 ,  0.63 , -0.012, -0.182, -0.779,  2.927,  2.022],
       [-0.845,  2.214, -1.025, -0.695,  8.17 , -0.

In [39]:
# Locate every zero in feature columns 1-5 of the raw matrix. `rows` holds the
# row index of each hit (repeated when a row contains several zeros) and
# `cols` the column offset within the 1:6 slice.
is_zero = X[:, 1:6] == 0
rows, cols = np.where(is_zero)

In [40]:
# Drop every row listed in `rows` from the features and from the labels, using
# the same de-duplicated index list so both stay aligned.
rows_to_drop = np.unique(rows)
X3 = np.delete(X, rows_to_drop, axis=0)
y3 = np.delete(y, rows_to_drop, axis=0)

In [41]:
# Size of the feature matrix that remains after the row deletion.
X3.shape

(392, 8)

In [42]:
# GaussianNB on X3/y3 (rows with a zero in columns 1-5 removed): 10-fold
# cross-validation, reported as mean +/- standard deviation in percent.
scores = cross_val_score(clf, X3, y3, cv=10)
mean_pct, std_pct = 100 * scores.mean(), 100 * scores.std()
print("{0:4.2f} +/- {1:4.2f} %".format(mean_pct, std_pct))

77.27 +/- 6.30 %


In [43]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression baseline for comparison with GaussianNB.
# The solver is pinned to 'liblinear', the value shown in this notebook's
# recorded output. Newer scikit-learn releases default to 'lbfgs', which would
# change the scores below and may emit convergence warnings on the unscaled
# features in X.
clf2 = LogisticRegression(solver='liblinear')
# Show the estimator's configuration.
print(clf2)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [44]:
# Logistic regression on the raw feature matrix: 10-fold cross-validation,
# reported as mean +/- standard deviation in percent.
scores = cross_val_score(clf2, X, y, cv=10)
mean_pct, std_pct = 100 * scores.mean(), 100 * scores.std()
print("{0:4.2f} +/- {1:4.2f} %".format(mean_pct, std_pct))

76.70 +/- 3.54 %


In [45]:
# Logistic regression on X2 (imputed and standardized features): 10-fold
# cross-validation, reported as mean +/- standard deviation in percent.
scores = cross_val_score(clf2, X2, y, cv=10)
mean_pct, std_pct = 100 * scores.mean(), 100 * scores.std()
print("{0:4.2f} +/- {1:4.2f} %".format(mean_pct, std_pct))

76.56 +/- 3.55 %


In [46]:
# Logistic regression on X3/y3 (rows with a zero in columns 1-5 removed):
# 10-fold cross-validation, reported as mean +/- standard deviation in percent.
scores = cross_val_score(clf2, X3, y3, cv=10)
mean_pct, std_pct = 100 * scores.mean(), 100 * scores.std()
print("{0:4.2f} +/- {1:4.2f} %".format(mean_pct, std_pct))

76.26 +/- 5.77 %
