In [1]:
import pandas
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
import numpy
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Debug switch: when True, the descriptive-statistics cell prints its summaries.
isDebug = True

In [3]:
# Column labels handed to read_csv for the nine fields of the data file
names = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]
# Location of the Pima Indians diabetes data, relative to the notebook
data_path = './data/pima-indians-diabetes.data'
# Load the CSV into a DataFrame using the labels above
data = pandas.read_csv(filepath_or_buffer=data_path, names=names)

In [5]:
# Descriptive statistics, printed only when the debug switch is on
if isDebug:
    # Each entry is wrapped in a lambda so the summaries are computed and
    # printed one at a time, in exactly this order.
    summaries = (
        lambda: data.head(),      # first few rows
        lambda: data.shape,       # dimensions of the data
        lambda: data.dtypes,      # data type of each attribute
        lambda: data.describe(),  # distribution summary of each attribute
        lambda: data.corr(),      # pairwise correlation between attributes
    )
    for summarize in summaries:
        print(summarize())

   preg  plas  pres  skin  test  mass   pedi  age  class
0     6   148    72    35     0  33.6  0.627   50      1
1     1    85    66    29     0  26.6  0.351   31      0
2     8   183    64     0     0  23.3  0.672   32      1
3     1    89    66    23    94  28.1  0.167   21      0
4     0   137    40    35   168  43.1  2.288   33      1
(768, 9)
preg       int64
plas       int64
pres       int64
skin       int64
test       int64
mass     float64
pedi     float64
age        int64
class      int64
dtype: object
             preg        plas        pres        skin        test        mass  \
count  768.000000  768.000000  768.000000  768.000000  768.000000  768.000000   
mean     3.845052  120.894531   69.105469   20.536458   79.799479   31.992578   
std      3.369578   31.972618   19.355807   15.952218  115.244002    7.884160   
min      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000   
25%      1.000000   99.000000   62.000000    0.000000    0.000000   27.300000

In [32]:
# Understand data with visualization
# Pairwise scatter plots of every attribute.
scatter_matrix(data)

# Histogram of the 'age' column. The column labels are all lower-case, so
# data['Age'] raised KeyError. A fresh figure is opened first so the
# histogram is not drawn into one of the scatter-matrix subplots.
plt.figure()
plt.hist(data['age'])
plt.show()

In [6]:
# Prepare for modeling by pre-processing the data
array = data.values
# Separate the array into input (first eight columns) and output (ninth column) components
X, Y = array[:, :8], array[:, 8]
# Standardize the inputs with a scaler fitted on the whole input matrix
scaler = StandardScaler()
scaler.fit(X)
rescaledX = scaler.transform(X)
# Summarize the transformed data: first two rows, three digits of precision
numpy.set_printoptions(precision=3)
print(rescaledX[:2, :])

[[ 0.64   0.848  0.15   0.907 -0.693  0.204  0.468  1.426]
 [-0.845 -1.123 -0.161  0.531 -0.693 -0.684 -0.365 -0.191]]


In [19]:
# Hold-out split and cross-validation demo on the iris dataset
from sklearn import datasets
from sklearn import svm
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

iris = datasets.load_iris()

# A random split into training and test sets (40% held out for testing)
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.4, random_state=0)

# Hold-out accuracy of a linear SVM for C = 1 and C = 2
for c in range(1, 3):
    clf = svm.SVC(kernel='linear', C=c).fit(X_train, y_train)
    print(clf.score(X_test, y_test))

# 5-fold cross-validation of the C = 1 model on the full dataset
clf = svm.SVC(kernel='linear', C=1)
scores = cross_val_score(clf, iris.data, iris.target, cv=5)
print(scores)

# Use other cross validation strategies by passing a cross validation iterator instead
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
results = cross_val_score(clf, iris.data, iris.target, cv=cv)
print(results)

0.966666666667
0.9
[ 0.967  1.     0.967  0.967  1.   ]
[ 0.978  0.978  1.   ]


In [20]:
# Algorithm Evaluation With Resampling Methods
# random_state only has an effect together with shuffle=True, and current
# scikit-learn raises ValueError when it is set without shuffling. The
# splitter never shuffled, so dropping the argument keeps the same folds.
kfold = KFold(n_splits=10)
model = LogisticRegression()
# Score the same model on the raw inputs first, then on the standardized
# inputs. Note that rescaledX was standardized on the full dataset, i.e.
# before any train/test split takes place.
for features in (X, rescaledX):
    results = cross_val_score(model, features, Y, cv=kfold)
    # The % formatting must happen inside print(): in Python 3 print()
    # returns None, so applying % to its result raises TypeError.
    print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

Accuracy: 76.951% (4.841%)
Accuracy: 77.996% (5.009%)


In [39]:
# Pipeline makes it easier to compose estimators, providing this behavior under cross-validation
from sklearn import metrics
from sklearn import preprocessing
from sklearn import svm
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.pipeline import make_pipeline

# Cross-validation strategies. They are all built here so the cell does not
# depend on whichever `kfold` an earlier cell happened to leave behind.
kfold = KFold(n_splits=10)  # no random_state: it is only valid with shuffle=True
# Seeded so that repeated runs (and both estimators below) use the same folds;
# 7 is the seed value used elsewhere in this notebook.
skf1 = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
skf2 = StratifiedKFold(n_splits=10)

# The scaler is a pipeline step, so it is re-fitted on the training part of
# every fold. The classifier is created here rather than read from the
# global `model`, which other cells rebind to different estimators.
logisticRegressionModel = make_pipeline(preprocessing.StandardScaler(), LogisticRegression())
clf = make_pipeline(preprocessing.StandardScaler(), svm.SVC(C=5))

# (estimator, splitter) pairs, evaluated and reported in this order
evaluations = [
    (logisticRegressionModel, kfold),
    (logisticRegressionModel, skf1),  # Stratified k-fold, shuffled
    (logisticRegressionModel, skf2),  # Stratified k-fold, unshuffled
    (clf, kfold),
    (clf, skf1),
]
for estimator, splitter in evaluations:
    results = cross_val_score(estimator, X, Y, cv=splitter)
    # Format inside print(): in Python 3 print() returns None, so applying
    # % to its result raises TypeError.
    print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

# Accuracy of the out-of-fold predictions; kept as the last expression so the
# notebook displays it as the cell's output.
predicted = cross_val_predict(clf, X, Y, cv=kfold)
metrics.accuracy_score(Y, predicted)

Accuracy: 77.996% (5.009%)
Accuracy: 77.220% (4.485%)


Accuracy: 77.088% (3.618%)


Accuracy: 77.088% (5.970%)


Accuracy: 73.836% (5.397%)


0.77083333333333337

In [64]:
# Algorithm Evaluation Metrics
# NOTE(review): `model` and `kfold` are whatever earlier cells left behind;
# this scorer needs a classifier that can predict probabilities - confirm the
# cell order before relying on the numbers.
# 'neg_log_loss' reports the negated log loss, so values closer to zero are better.
scoring = 'neg_log_loss'
# Raw inputs first, then the standardized inputs
for features in (X, rescaledX):
    results = cross_val_score(model, features, Y, cv=kfold, scoring=scoring)
    # Format inside print(): in Python 3 print() returns None, so applying
    # % to its result raises TypeError.
    print("Logloss: %.3f (%.3f)" % (results.mean(), results.std()))

Logloss: -0.493 (0.047)
Logloss: -0.484 (0.061)


In [67]:
# Spot-check an algorithm: k-nearest-neighbours regression scored by negated mean squared error
# NOTE(review): Y holds the 'class' column, so a regressor is being scored
# against class labels here - confirm this is intended.
scoring = 'neg_mean_squared_error'
model = KNeighborsRegressor()
# Raw inputs first, standardized inputs second
for inputs in (X, rescaledX):
    results = cross_val_score(model, inputs, Y, scoring=scoring, cv=kfold)
    print(results.mean())

-0.196342447027
-0.178674641148


In [73]:
# Model comparison and selection on the raw inputs
# prepare models as (label, estimator) pairs
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
]
scoring = 'accuracy'
# One splitter serves every model, so it is built once outside the loop.
# random_state is omitted: it only has an effect with shuffle=True and
# current scikit-learn raises ValueError when it is set without shuffling.
kfold = KFold(n_splits=10)
# evaluate each model in turn, collecting the per-fold scores and labels
results = []
# NOTE: this rebinds `names`, which the data-loading cell uses for the CSV
# column labels.
names = []
for name, model in models:
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # label: mean accuracy (standard deviation across folds)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: 0.769515 (0.048411)
LDA: 0.773462 (0.051592)


In [74]:
# Model comparison and selection on the standardized inputs
# prepare models as (label, estimator) pairs
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
]
scoring = 'accuracy'
# One splitter serves every model, so it is built once outside the loop.
# random_state is omitted: it only has an effect with shuffle=True and
# current scikit-learn raises ValueError when it is set without shuffling.
kfold = KFold(n_splits=10)
# evaluate each model in turn, collecting the per-fold scores and labels
results = []
# NOTE: this rebinds `names`, which the data-loading cell uses for the CSV
# column labels.
names = []
for name, model in models:
    cv_results = cross_val_score(model, rescaledX, Y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # label: mean accuracy (standard deviation across folds)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: 0.779956 (0.050088)
LDA: 0.773462 (0.051592)


In [76]:
# Algorithm tuning: grid-search the `alpha` parameter of a Ridge model
alphas = numpy.array([1, 0.1, 0.01, 0.001, 0.0001, 0])
param_grid = {'alpha': alphas}
model = Ridge()
grid = GridSearchCV(model, param_grid)
grid.fit(X, Y)
# Report the best score found, then the alpha of the best estimator
print(grid.best_score_)
print(grid.best_estimator_.alpha)

0.279617559313
1.0


In [78]:
# Improve Accuracy with Ensemble Predictions
num_trees = 100
max_features = 3
seed = 7
# random_state is not passed to KFold: it only has an effect with
# shuffle=True and current scikit-learn raises ValueError otherwise.
kfold = KFold(n_splits=10)
# The seed goes to the forest instead, so repeated runs of this cell
# build the same trees and print the same score.
model = RandomForestClassifier(n_estimators=num_trees, max_features=max_features, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold)
# Mean accuracy across the ten folds
print(results.mean())

0.765516062884


In [None]:
# Finalize And Save Your Model