# Welcome
Hi, members of the Watermelon Biting Group! We will use this data set for Chapter 2 of Zhou's Machine Learning textbook. You are welcome to fork this notebook and make your own.

# Data Pre-processing
Before we get started, there is some pre-processing work to be done. You can just run the code blocks below.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# Any results you write to the current directory are saved as output.

# Load the CSV from the working directory into a pandas DataFrame.
# When the file sits in the "../input/" directory instead, use:
# df = pd.read_csv("../input/data.csv")
df = pd.read_csv(filepath_or_buffer="data.csv")

In [None]:
# Preview the first 20 rows of the freshly loaded frame.
df.head(n=20)

In [None]:
# Print the (rows, columns) dimensions of the frame.
print((len(df.index), len(df.columns)))

In [None]:
# Remove the two columns that are not used as features. `errors='ignore'`
# keeps this cell re-runnable after the columns have already been dropped.
df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

# Encode the label: malignant 'M' -> 1, benign 'B' -> 0.
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on the `df['diagnosis']` selection: with pandas
# Copy-on-Write that selection is a separate object, so the in-place call
# would leave `df` unchanged. Values that are already 0/1 pass through
# untouched, so re-running the cell is harmless.
label_codes = {'M': 1, 'B': 0}
df['diagnosis'] = (
    df['diagnosis']
    .map(lambda label: label_codes.get(label, label))
    .astype(int)
)

We have removed the features that are not useful to us, and transformed the "M" "B" labels to "1" and "0" values.

In [None]:
# Preview the first five rows after cleaning.
df.head(n=5)

In [None]:
# Dimensions of the cleaned frame as a (rows, columns) tuple.
(len(df.index), len(df.columns))

In [None]:
# Show the dtype pandas assigned to every column.
types = df.dtypes
print(types)

# Descriptive Statistics

In [None]:
# Per-column summary statistics of the cleaned frame.
description = df.describe()
print(description)

In [None]:
# Class distribution: 357 samples are benign (0) and 212 are malignant (1).
diagnosis_groups = df.groupby('diagnosis')
class_counts = diagnosis_groups.size()
print(class_counts)

In [None]:
# Correlations Between Attributes
# Pairwise Pearson correlation of every column with every other column.
pd.set_option('display.width', 100)
# Use the fully qualified option name: the bare pattern 'precision' matches
# more than one option key on recent pandas releases and raises OptionError.
pd.set_option('display.precision', 3)
corrections = df.corr(method='pearson')
print(corrections)

# Skew of Univariate Distributions

In [None]:
# Skewness of each column's distribution.
skew = df.skew()
print(skew)

# Visualization
## Univariate Plots
### 1 Histograms

In [None]:
import matplotlib.pyplot as plt

# Column names in frame order and the column count; later plotting cells
# reuse both.
features = list(df.columns)
n_feature = len(features)

# Column 0 is the `diagnosis` label, so plotting starts at column 1.
for idx, column_name in enumerate(features[1:], start=1):
    df.iloc[:, idx].hist()
    plt.title(column_name)
    plt.show()

### Density plots

In [None]:
# One density plot per feature column (the label in column 0 is skipped).
for idx, column_name in enumerate(features[1:n_feature], start=1):
    df.iloc[:, idx].plot.density()
    plt.title(column_name)
    plt.show()

### Box and Whisker Plots

In [None]:
# One box-and-whisker plot per feature column (the label in column 0 is skipped).
for idx, column_name in enumerate(features[1:n_feature], start=1):
    df.iloc[:, idx].plot.box()
    plt.title(column_name)
    plt.show()

## Multivariate Plots
### Correlation Matrix Plot

In [None]:
# Pairwise correlation matrix of all columns, drawn as a labelled heat map.
correlations = df.corr()

# NOTE(review): this changes matplotlib's global font size, so every figure
# drawn after this cell inherits it.
plt.rcParams['font.size'] = 40

fig = plt.figure(figsize=(31, 31))
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)

# Take the tick positions and labels from the matrix itself instead of a
# hard-coded 31, so the axes always match the columns actually correlated.
tick_labels = list(correlations.columns)
ticks = np.arange(len(tick_labels))

ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(tick_labels, rotation=90)
ax.set_yticklabels(tick_labels)

plt.show()

In [None]:
# More generic correlation matrix plot: the same heat map without
# per-column tick labels.

fig, ax = plt.subplots(figsize=(32, 32))
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
plt.show()

In [None]:
### Scatter plot matrix
# `scatter_matrix` lives in `pandas.plotting`; the old `pandas.tools.plotting`
# module no longer exists in current pandas releases.
from pandas.plotting import scatter_matrix

# NOTE(review): this changes matplotlib's global default figure size, so
# figures created later without an explicit `figsize` inherit it.
plt.rcParams['figure.figsize'] = [50.0, 50.0]

# Pairwise scatter plots of every column against every other column.
scatter_matrix(df)
plt.show()

# Prepare your data 

- Rescale data
- Standardize data
- Normalize data
- Binarize data

In [None]:
## Rescale data (between 0 and 1)

from numpy import set_printoptions
from sklearn.preprocessing import MinMaxScaler

# Raw values of the frame as a NumPy array.
array = df.values

# Column 0 is the diagnosis label (the output class); columns 1..30 are the inputs.
Y = array[:, 0]
X = array[:, 1:31]

# Fit the scaler on X, then map every feature into the [0, 1] range.
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit(X).transform(X)

# Show the first five transformed rows with three decimals.
set_printoptions(precision=3)
print(rescaledX[:5, :])

In [None]:
## Standardize Data

from sklearn.preprocessing import StandardScaler

# Fit the scaler on X first, then apply it to the same data.
scaler_std = StandardScaler()
scaler_std.fit(X)
rescalerX_std = scaler_std.transform(X)

# Show the first five rows and first six columns with three decimals.
set_printoptions(precision=3)
print(rescalerX_std[:5, :6])

In [None]:
## Normalize Data

from sklearn.preprocessing import Normalizer

# Fit the normalizer on X, then transform the same data.
scaler_norm = Normalizer()
scaler_norm.fit(X)
normalizedX = scaler_norm.transform(X)

# Show the first five rows and first four columns with seven decimals.
set_printoptions(precision=7)
print(normalizedX[:5, :4])

In [None]:
## Binarize Data

from sklearn.preprocessing import Binarizer

# Threshold every value in X at 0.0.
binarizer = Binarizer(threshold=0.0)
binarizer.fit(X)
binaryX = binarizer.transform(X)

# Show the first ten transformed rows.
set_printoptions(precision=3)
print(binaryX[:10, :])

# Feature Selection 

In [None]:
# Univariate selection
# Score every feature with a chi-squared test against the class label and
# keep the 8 best-scoring ones.

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Fit the selector on the inputs and labels.
test = SelectKBest(score_func=chi2, k=8)
fit = test.fit(X, Y)

# Per-feature chi-squared scores.
set_printoptions(precision=4)
print(fit.scores_)

# First five rows of the reduced feature matrix.
features_selected = fit.transform(X)
print(features_selected[:5, :])

In [None]:
# Recursive Feature Elimination
# Repeatedly fits a logistic regression and eliminates features until only
# the requested number remains.

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# feature extraction
model = LogisticRegression()
# `n_features_to_select` is passed by keyword; current scikit-learn releases
# no longer accept it positionally.
rfe = RFE(model, n_features_to_select=3)  # Top 3 features
fit = rfe.fit(X, Y)

# The `%` formatting must sit inside the print() call: in Python 3 print()
# returns None, so `print("...") % value` raises a TypeError. One-element
# tuples keep array values from being unpacked as multiple format arguments.
print("Num Features: %d" % (fit.n_features_,))
print("Selected Features: %s" % (fit.support_,))
print("Feature Ranking: %s" % (fit.ranking_,))

# First five rows of the inputs reduced to the selected features.
features_rfe = fit.transform(X)
print(features_rfe[0:5, :])

In [None]:
# Principal Component Analysis
# Project the inputs onto their first three principal components.

from sklearn.decomposition import PCA

# feature extraction
pca = PCA(n_components=3)
fit = pca.fit(X)

# summarize components
# The `%` formatting must sit inside the print() call: in Python 3 print()
# returns None, so `print("...") % value` raises a TypeError.
print("Explained Variance: %s" % (fit.explained_variance_ratio_,))
print(fit.components_)

In [None]:
# Feature Importance
# With Extra Trees Classifier

from sklearn.ensemble import ExtraTreesClassifier

# feature extraction
# Extra-trees are randomized; a fixed random_state makes the printed
# importances identical on every run (42 matches the seed used in the
# resampling cells of this notebook).
model = ExtraTreesClassifier(random_state=42)
model.fit(X, Y)
print(model.feature_importances_)

# Evaluate the Performance of ML Algorithm with Resampling

0. The best way to evaluate the performance of an algorithm would be to make predictions for new data to which you already know the answer.

1. Use clever techniques from statistics called resampling methods that allow you to make accurate estimates for how well your algorithm will perform on new data.

## Five different techniques 

We can use these techniques to split up our training dataset and create useful estimates of performance for our machine learning algorithm:

- Hold-out: Train and test sets
- K-fold cross validation
- Leave one out cross validation
- Repeated random test-train splits
- Bootstrapping (in sklearn: "The deprecated Bootstrap cross-validation iterator was removed.")

In [None]:
# Method 1: hold-out. Split the data once into a training part and a test
# part, fit on the former and score on the latter.

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Share of the samples held back for testing, and the seed that makes the
# split repeatable.
test_size = 0.33
seed = 42

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=test_size, random_state=seed
)

# Fit on the training part only, then measure accuracy on the unseen part.
model = LogisticRegression()
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)

print("Accuracy: {0:.3f}%".format(result*100))

In [None]:
# Method 2: K-fold Cross-validation
# Split the data into `num_folds` parts; each part serves once as the test
# set while the model is trained on the remaining parts.

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

num_folds = 10
seed = 42
# `random_state` only has an effect when shuffling is enabled; current
# scikit-learn releases raise a ValueError when it is set with shuffle=False.
# Shuffling also keeps the folds independent of the row order in the file.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

model = LogisticRegression()
results = cross_val_score(model, X, Y, cv=kfold)

# Mean and spread of the per-fold accuracies, as percentages.
print("Accuracy:{0:.3f}%(std: {1:.3f}%)".format(results.mean()*100, results.std()*100))

In [None]:
# Method 3: Leave One Out Cross Validation
# Every sample is used once as a single-item test set.

from sklearn.model_selection import LeaveOneOut

# Not consumed by LeaveOneOut below.
num_folds = 10

model = LogisticRegression()
loocv = LeaveOneOut()
results_loocv = cross_val_score(model, X, Y, cv=loocv)

# Mean and spread of the per-sample scores, as percentages.
loocv_mean_pct = results_loocv.mean()*100.0
loocv_std_pct = results_loocv.std()*100.0
print("Accuracy: {0:.3f}% (std: {1:.3f}%)".format(loocv_mean_pct, loocv_std_pct))

In [None]:
# Method 4: Random repeated subsampling
# Draw `n_splits` independent random train/test splits and score each one.

from sklearn.model_selection import ShuffleSplit

n_splits = 10
test_size = 0.33
seed = 42

# Note: despite its name, `kfold` holds a ShuffleSplit iterator here.
kfold = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=seed)

model = LogisticRegression()
results_shuffle = cross_val_score(model, X, Y, cv=kfold)

# Mean and spread of the per-split accuracies, as percentages.
shuffle_mean_pct = results_shuffle.mean()*100.0
shuffle_std_pct = results_shuffle.std()*100.0
print("Accuracy:{0:.3f}%(std:{1:.3f}%)".format(shuffle_mean_pct, shuffle_std_pct))

In [None]:
# Method 5: Bootstrapping
#
# This cell is intentionally disabled: everything below is commented out and
# nothing is executed. It relies on the `Bootstrap` cross-validation iterator,
# which the notes above say was deprecated and then removed from sklearn.
# The code is kept only as a record of the intended procedure.
#
# NOTE(review): `n_samples` is computed as rows * (columns - 1); a bootstrap
# iterator would presumably expect the number of rows only. Confirm before
# reviving this code.

# from sklearn import cross_validation

# n_samples = df.shape[0]*(df.shape[1]-1)
# n_bootstraps = 10
# n_train = 0.66
# n_test = 0.33
# seed = 42

# bs = cross_validation.Bootstrap(n_samples, n_bootstraps=n_bootstraps, n_train=n_train, n_test=n_test,
#               random_state=seed)

# model = LogisticRegression()
# results_bs = cross_val_score(model, X, Y, cv=bs)

# print("Accuracy:{0:.3f}%(std:{1:.3f}%)".format(results_bs.mean()*100.0, 
#                                               results_bs.std()*100.0))

# Machine Learning Algorithm Performance Metrics

- Classification Accuracy
- Logarithmic Loss
- Area Under ROC Curve: AUC
- Confusion Matrix
- Classification Report

Run the following code to get train/test data sets, generate a classifier and to make predictions, so that the results can be used to practice different performance measures. 

In [None]:
# Classification Accuracy
# Share of correct predictions, estimated with 10-fold cross-validation.

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# `random_state` only has an effect when shuffling is enabled; current
# scikit-learn releases raise a ValueError when it is set with shuffle=False.
# `kfold` and `model` are reused by the metric cells that follow.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
model = LogisticRegression()

scoring = 'accuracy'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)

# Mean and spread of the per-fold accuracies, as percentages.
print("Accuracy:{0:.3f}%(std: {1:.3f}%)".format(results.mean()*100.0, 
                                                results.std()*100.0))

In [None]:
# Logarithmic Loss

# Log Loss quantifies the accuracy of a classifier by penalising false classifications. 
# Minimising the Log Loss is basically equivalent to maximising the accuracy of the 
# classifier, but there is a subtle twist.

# Smaller logloss is better with 0 representing a perfect logloss.

# The 'neg_log_loss' scorer returns the log loss with its sign flipped, so the
# raw scores are <= 0. The mean is negated before printing so that the value
# reported as "Logloss" is the actual loss, matching the "smaller is better,
# 0 is perfect" reading above. The standard deviation is unaffected by the sign.
scoring = 'neg_log_loss'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)

print("Logloss: {0:.3f}(std:{1:.3f})".format(-results.mean(), results.std()))

In [None]:
# ROC curve
# Fit a logistic regression on a hold-out split, then plot the ROC curve of
# its decision scores on the test part.

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, roc_auc_score

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42
)

# Train on the training part and take the decision scores for the test part.
model = LogisticRegression()
model.fit(X_train, y_train)
y_score = model.decision_function(X_test)

# Points of the ROC curve and the area under it.
fpr, tpr, thresholds = roc_curve(y_test, y_score)
Logistic_Reg_auc = roc_auc_score(y_test, y_score)

print("AUC of LogisticRegression using hold-out method: {0:.3f}".format(Logistic_Reg_auc))

# PLOT roc curve: the classifier's curve plus the dashed diagonal for reference.
plt.figure()

roc_label = 'ROC curve (area = {:.3f})'.format(Logistic_Reg_auc)
plt.plot(fpr, tpr, color='darkorange', label=roc_label)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.legend(loc='lower right')
plt.show()

In [None]:
# AUC
# Area under the ROC curve, scored with the `kfold` splitter defined earlier.

results = cross_val_score(model, X, Y, cv=kfold, scoring='roc_auc')
auc_mean = results.mean()
auc_std = results.std()
print("AUC: {0:.3f}({1:.3f})".format(auc_mean, auc_std))

In [None]:
# Confusion matrix of the classifier on the hold-out test split
from sklearn.metrics import confusion_matrix

# Refit on the training part, then predict labels for the test part.
model.fit(X_train, y_train)
predicted = model.predict(X_test)

# Counts of true labels versus predicted labels.
matrix = confusion_matrix(y_test, predicted)
print(matrix)

In [None]:
# Classification Report
# Text report for the hold-out predictions made in the previous cell.
from sklearn.metrics import classification_report

report = classification_report(y_true=y_test, y_pred=predicted)
print(report)