In [None]:
# Load the Pima Indians diabetes CSV from a URL into a pandas DataFrame.
# NOTE(review): confirm this URL still resolves; if not, point it at a local copy.

from pandas import read_csv

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"

# Column labels applied to the file: eight feature columns followed by 'class'.
# 'preg' used to carry a trailing space ('preg '), which made data['preg'] fail
# and was inconsistent with the other labels.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

data = read_csv(url, names=names)

# (rows, columns) of the loaded frame
print(data.shape)

In [None]:
# Peek at the first five rows (rich display, since this is the cell's last expression)
data.head(n=5)

In [None]:
# Data description: summary statistics for every column
from pandas import set_option

set_option('display.width', 100)
# Use the fully qualified key: the bare 'precision' pattern matches more than
# one option in newer pandas releases and is rejected as ambiguous.
set_option('display.precision', 3)

description = data.describe()
print(description)

In [None]:
# Class distribution: number of rows per value of the 'class' column

class_counts = data.groupby('class').size()
# print() as a function -- the Python 2 print statement is a SyntaxError on
# Python 3 and every other cell in this notebook already uses the call form.
print(class_counts)

In [None]:
# Pairwise Pearson correlations between all attributes
correlations = data.corr(
    method="pearson",
)
print(correlations)

In [None]:
# Skewness of each column's univariate distribution
skew = data.skew()

print(skew)

# TIPS

1. Review the numbers
2. Ask why
3. Write down ideas

# Data visualization

## Univariate plots

### Histograms

In [None]:
import matplotlib.pyplot as plt

# One histogram panel per column of the frame
data.hist()

plt.show()

In [None]:
# Density plots: one panel per column on a 3x3 grid, each with its own x-axis

data.plot.density(
    subplots=True,
    layout=(3, 3),
    sharex=False,
)
plt.show()

In [None]:
# Box-and-whisker plots: one panel per column on a 3x3 grid, axes not shared

data.plot.box(
    subplots=True,
    layout=(3, 3),
    sharex=False,
    sharey=False,
)
plt.show()

## Multivariate Plots

In [None]:
# Correlation matrix plot
import numpy as np

# Compute the matrix that is actually plotted below. This was previously
# assigned to a misspelled name ('corrections') and never used, so the plot
# silently depended on a variable left over from an earlier cell.
correlations = data.corr()

# Draw the matrix as a colour-coded grid on a fixed [-1, 1] scale
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)

fig.colorbar(cax)

# One tick per column label, derived from `names` rather than a hard-coded 9
ticks = np.arange(len(names))

ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(names)
ax.set_yticklabels(names)

plt.show()

In [None]:
# Scatter plot matrix: pairwise scatter plots of every column against every other
# The function lives in pandas.plotting; the old pandas.tools.plotting module
# no longer exists in current pandas releases.
from pandas.plotting import scatter_matrix

scatter_matrix(data)

plt.show()

# Prepare Data

In [None]:
# Rescale every feature into the [0, 1] range
from numpy import set_printoptions
from sklearn.preprocessing import MinMaxScaler

# First eight columns are the inputs, the ninth column is the target
X = data.iloc[:, :8]
Y = data.iloc[:, 8]

scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)

# Show three decimals when printing numpy arrays from here on
set_printoptions(precision=3)

print(rescaledX[:5, :])

In [None]:
# Standardize data (mean 0, standard deviation 1 per feature)
from sklearn.preprocessing import StandardScaler

scaler_std = StandardScaler()
scaler_std = scaler_std.fit(X)  # fit() returns the fitted scaler itself
rescale_std_X = scaler_std.transform(X)

print(rescale_std_X[:5, :])

In [None]:
# Normalize data (rescale each row to a length of 1)

from sklearn.preprocessing import Normalizer

scaler_nor = Normalizer()
scaler_nor = scaler_nor.fit(X)  # fit() returns the fitted transformer itself
normalizedX = scaler_nor.transform(X)

print(normalizedX[:5, :])


In [None]:
# Binarize data: values above the threshold become 1, the rest become 0

from sklearn.preprocessing import Binarizer

binarizer = Binarizer(threshold=0.0)
binarizer = binarizer.fit(X)  # fit() returns the fitted transformer itself
binaryX = binarizer.transform(X)

print(binaryX[:5, :])

# Feature Selection

In [None]:
# Univariate feature selection with the chi-squared test
from sklearn.feature_selection import SelectKBest, chi2

# Keep the four features with the highest chi-squared score
test = SelectKBest(
    score_func=chi2,
    k=4,
)
fit = test.fit(X, Y)

# Score of every input feature, in column order
print(fit.scores_)

# Reduce X to the selected columns and show the first five rows
features = fit.transform(X)
print(features[:5, :])

得分最高的特征被保留下来，这里是 plas、test、mass、age。

In [None]:
# Recursive feature elimination (RFE) with a logistic regression estimator
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# feature extraction
model = LogisticRegression()
# n_features_to_select must be passed by keyword: current scikit-learn
# releases no longer accept it as a positional argument.
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, Y)

# Number of features kept, boolean mask of kept columns, and per-column rank
print("特征数目：{}".format(fit.n_features_))
print("Selected Features:{}".format(fit.support_))
print("Feature ranking: {}".format(fit.ranking_))

In [None]:
# Principal component analysis: project the features onto three components
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
fit = pca.fit(X)

# Share of the total variance captured by each component
print("Explain Variance:{}".format(fit.explained_variance_ratio_))
# Component vectors expressed in the original feature space
print(fit.components_)

In [None]:
# Feature importance with an Extra Trees classifier
from sklearn.ensemble import ExtraTreesClassifier

# Seed the randomized ensemble so the importances are the same on every
# Restart & Run All (same seed value the other cells in this notebook use).
model = ExtraTreesClassifier(random_state=42)
model.fit(X, Y)

# One importance score per input feature, in column order
print(model.feature_importances_)

In [None]:
# Evaluate the performance of ML algorithms with resampling:
# a single train/test split with 33% of the rows held out
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=42,
)

model = LogisticRegression()
model.fit(X_train, Y_train)

# Mean accuracy on the held-out rows
result = model.score(X_test, Y_test)

print("Accuracy:{:.3f}%".format(100.0 * result))

In [None]:
# k-fold cross-validation (10 folds)

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# random_state only takes effect when shuffle=True; newer scikit-learn raises
# ValueError when a seed is given with shuffling disabled. Shuffling is
# enabled here so that the seed is actually honoured.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
model = LogisticRegression()
results = cross_val_score(model, X, Y, cv=kfold)

# Mean and standard deviation of the per-fold accuracy, as percentages
print("Accuracy: {0:.3f}% (std: {1:.3f}%)".format(results.mean()*100.0,
                                                results.std()*100.0))

In [None]:
# Leave-one-out cross-validation: each row is the test set exactly once
from sklearn.model_selection import LeaveOneOut

loocv = LeaveOneOut()
model = LogisticRegression()
results = cross_val_score(model, X, Y, cv=loocv)

# Mean and standard deviation of the per-split accuracy, as percentages
print(
    "Accuracy: {0:.3f}% (std: {1:.3f}%)".format(
        100.0 * results.mean(),
        100.0 * results.std(),
    )
)

In [None]:
# Repeated random train-test splits: 10 independent 67/33 splits
from sklearn.model_selection import ShuffleSplit

kfold = ShuffleSplit(
    n_splits=10,
    test_size=0.33,
    random_state=42,
)
model = LogisticRegression()
results = cross_val_score(model, X, Y, cv=kfold)

# Mean and standard deviation of the per-split accuracy, as percentages
print(
    "Accuracy: {0:.3f}% ({1:.3f}%)".format(
        100.0 * results.mean(),
        100.0 * results.std(),
    )
)

# ML Performance Metrics

In [None]:
# Classification accuracy under 10-fold cross-validation

# random_state only takes effect when shuffle=True; newer scikit-learn raises
# ValueError when a seed is given with shuffling disabled. Shuffling is
# enabled here so that the seed is actually honoured.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

model = LogisticRegression()
scoring = 'accuracy'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)

# Mean and standard deviation of the per-fold accuracy, as percentages
print("准确率：{0:.3f}%({1:.3f}%)".format(results.mean()*100.0,
                                     results.std()*100.0))

In [None]:
# Logarithmic loss (scored as its negative), using the model and
# cross-validation splitter already defined in the notebook

results = cross_val_score(
    model,
    X,
    Y,
    cv=kfold,
    scoring='neg_log_loss',
)
print("Log loss: {0:.3f} ({1:.3f})".format(results.mean(), results.std()))

In [None]:
# Area under the ROC curve, using the model and cross-validation
# splitter already defined in the notebook

results = cross_val_score(
    model,
    X,
    Y,
    cv=kfold,
    scoring='roc_auc',
)

print("AUC: {0:.3f}({1:.3f})".format(results.mean(), results.std()))

In [None]:
# Confusion matrix on a held-out 33% test split

from sklearn.metrics import confusion_matrix

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=42,
)

model = LogisticRegression()
model.fit(X_train, Y_train)

# Predicted labels for the held-out rows
predicted = model.predict(X_test)

# Rows are the true classes, columns are the predicted classes
matrix = confusion_matrix(Y_test, predicted)
print(matrix)

In [None]:
# Classification report for the held-out predictions
# (uses Y_test and predicted from the notebook's existing state)

from sklearn.metrics import classification_report

report = classification_report(
    Y_test,
    predicted,
)

print(report)