In [77]:
import pandas
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
import numpy
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [14]:
# Debug switch: when True, the descriptive-statistics cell prints its summaries.
isDebug = True

In [48]:
# Load the diabetes data set from CSV.
# Column labels are supplied explicitly via `names=`, so pandas reads the
# first line of the file as data rather than as a header row.
names = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',  # last column; used as the prediction target in later cells
]
data_path = './data/pima-indians-diabetes.data'
data = pandas.read_csv(data_path, names=names)

In [20]:
# Understand data with descriptive statistics
if isDebug:
    # First few rows, to eyeball the raw values.
    print(data.head())
    # (rows, columns) of the frame.
    print(data.shape)
    # Data type of each attribute.
    print(data.dtypes)
    # Per-column summary statistics (count, mean, std, quartiles, ...).
    print(data.describe())
    # Pairwise correlation between the columns; computed once and reused below.
    correlations = data.corr()
    print(correlations)
    print("----------")
    # Correlation of every column with the label, strongest first.
    # The label column is called 'class' (see the `names` list given to
    # read_csv); looking up 'Class' raises KeyError.
    print(correlations['class'].sort_values(ascending=False))

   PregnantTimes  PlasmaGlucoseConcentration  BloodPressure  \
0              6                         148             72   
1              1                          85             66   
2              8                         183             64   
3              1                          89             66   
4              0                         137             40   

   TricepsSkinFoldThickness  SerumInsulin  BodyMassIndex  DiabetesPedigree  \
0                        35             0           33.6             0.627   
1                        29             0           26.6             0.351   
2                         0             0           23.3             0.672   
3                        23            94           28.1             0.167   
4                        35           168           43.1             2.288   

   Age  Class  
0   50      1  
1   31      0  
2   32      1  
3   21      0  
4   33      1  
(768, 9)
PregnantTimes                   int64
PlasmaGlu

In [32]:
# Understand data with visualization
# Grid of pairwise scatter plots for all columns.
scatter_matrix(data)
# Open a separate figure for the histogram: plt.hist draws into the current
# axes, which would otherwise be one of the scatter-matrix panels.
plt.figure()
# The column is called 'age' (see the `names` list given to read_csv);
# looking up 'Age' raises KeyError.
plt.hist(data['age'])
plt.xlabel('age')
plt.ylabel('count')
plt.show()

In [56]:
# Prepare for modeling by pre-processing the data
# Split the raw values into input and output components:
# the first eight columns are the inputs, the ninth is the label.
array = data.values
X, Y = array[:, :8], array[:, 8]

# Standardize the inputs with a scaler fitted on the full input matrix.
# NOTE(review): the scaler sees all rows before cross-validation splits them,
# so held-out folds influence the scaling — confirm this is acceptable.
scaler = StandardScaler()
scaler.fit(X)
rescaledX = scaler.transform(X)

# Summarize the transformed data: first two rows, three decimals.
numpy.set_printoptions(precision=3)
print(rescaledX[:2, :])

# Alternative transform: sklearn's `normalize` with its default settings.
normalizedX = normalize(X)
print(normalizedX[:2, :])

[[ 0.64   0.848  0.15   0.907 -0.693  0.204  0.468  1.426]
 [-0.845 -1.123 -0.161  0.531 -0.693 -0.684 -0.365 -0.191]]
[[ 0.034  0.828  0.403  0.196  0.     0.188  0.004  0.28 ]
 [ 0.008  0.716  0.556  0.244  0.     0.224  0.003  0.261]]


In [62]:
# Algorithm Evaluation With Resampling Methods
# 10-fold cross-validation of logistic regression on the raw, standardized
# and normalized inputs (printed in that order).
#
# KFold is created without random_state: the folds were never shuffled, so the
# seed had no effect, and recent scikit-learn releases raise ValueError when
# random_state is set while shuffle is False.
kfold = KFold(n_splits=10)
for features in (X, rescaledX, normalizedX):
    model = LogisticRegression()
    results = cross_val_score(model, features, Y, cv=kfold)
    # Format inside the print call: on Python 3 print() returns None, so
    # applying % to its result raises TypeError.
    print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

Accuracy: 76.951% (4.841%)
Accuracy: 77.996% (5.009%)
Accuracy: 64.843% (7.868%)


In [64]:
# Algorithm Evaluation Metrics
# Score logistic regression by log loss on the raw and the standardized inputs.
# The estimator and the splitter are created here so the cell does not depend
# on whatever `model` / `kfold` happen to hold from previously executed cells.
kfold = KFold(n_splits=10)
model = LogisticRegression()
# 'neg_log_loss' reports the negated log loss, so values closer to 0 are better.
scoring = 'neg_log_loss'
for features in (X, rescaledX):
    results = cross_val_score(model, features, Y, cv=kfold, scoring=scoring)
    # Format inside the print call: on Python 3 print() returns None, so
    # applying % to its result raises TypeError.
    print("Logloss: %.3f (%.3f)" % (results.mean(), results.std()))

Logloss: -0.493 (0.047)
Logloss: -0.484 (0.061)


In [67]:
# Spot-check algorithms
# Cross-validated negated mean squared error of k-nearest-neighbours, first on
# the raw inputs and then on the standardized inputs. Reuses the `kfold`
# splitter defined in an earlier cell.
# NOTE(review): this is a regressor scored with MSE, while Y is the 0/1 class
# label — confirm a classifier with a classification metric was not intended.
scoring = 'neg_mean_squared_error'
model = KNeighborsRegressor()

results = cross_val_score(estimator=model, X=X, y=Y, scoring=scoring, cv=kfold)
print(results.mean())

results = cross_val_score(estimator=model, X=rescaledX, y=Y, scoring=scoring, cv=kfold)
print(results.mean())

-0.196342447027
-0.178674641148


In [73]:
# Model comparison and selection (raw inputs)
# Candidate models, each paired with a short label used in the report.
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
]
scoring = 'accuracy'
# One splitter shared by all candidates so every model is scored on the same
# folds. random_state is omitted: the folds are not shuffled, so it had no
# effect, and recent scikit-learn releases raise ValueError when it is set
# while shuffle is False.
kfold = KFold(n_splits=10)
# Per-model fold scores and the matching labels, in evaluation order.
results = []
names = []
# Evaluate each model in turn and print mean (std) of the fold accuracies.
for name, model in models:
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: 0.769515 (0.048411)
LDA: 0.773462 (0.051592)


In [74]:
# Model comparison and selection (standardized inputs)
# Candidate models, each paired with a short label used in the report.
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
]
scoring = 'accuracy'
# One splitter shared by all candidates so every model is scored on the same
# folds. random_state is omitted: the folds are not shuffled, so it had no
# effect, and recent scikit-learn releases raise ValueError when it is set
# while shuffle is False.
kfold = KFold(n_splits=10)
# Per-model fold scores and the matching labels, in evaluation order.
results = []
names = []
# Evaluate each model in turn and print mean (std) of the fold accuracies.
for name, model in models:
    cv_results = cross_val_score(model, rescaledX, Y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: 0.779956 (0.050088)
LDA: 0.773462 (0.051592)


In [76]:
# Improve accuracy with algorithm tuning
# Grid-search the Ridge regularization strength over a fixed list of values,
# using GridSearchCV's default cross-validation and scoring.
# NOTE(review): Ridge is a regression model while Y is the 0/1 class label, so
# best_score_ is presumably a regression score, not an accuracy — confirm.
alphas = numpy.array([1, 0.1, 0.01, 0.001, 0.0001, 0])
param_grid = {'alpha': alphas}
model = Ridge()
grid = GridSearchCV(model, param_grid)
grid.fit(X, Y)
# Best mean cross-validated score, then the alpha that achieved it.
print(grid.best_score_)
print(grid.best_estimator_.alpha)

0.279617559313
1.0


In [78]:
# Improve Accuracy with Ensemble Predictions
# 10-fold cross-validated accuracy of a random forest on the raw inputs.
seed = 7
num_trees = 100
max_features = 3
# random_state is omitted from KFold: the folds are not shuffled, so it had no
# effect, and recent scikit-learn releases raise ValueError when it is set
# while shuffle is False.
kfold = KFold(n_splits=10)
# The forest is the stochastic component of this cell, so the seed goes here
# to make the reported score repeatable from run to run.
model = RandomForestClassifier(n_estimators=num_trees, max_features=max_features,
                               random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

0.765516062884


In [None]:
# Finalize And Save Your Model
