Perform SVM classification on PCA-reduced features for the Breast Cancer dataset and the Iris dataset.

# With Breast Cancer Dataset

In [1]:
from sklearn import datasets

# Load scikit-learn's bundled breast cancer dataset and split the returned
# object into its feature matrix and its target vector.
breast_cancer = datasets.load_breast_cancer()
breast_data, breast_labels = breast_cancer.data, breast_cancer.target

# Sanity check: feature-matrix shape on the first line, label shape on the second.
print(breast_data.shape, breast_labels.shape, sep="\n")

(569, 30)
(569,)


In [2]:
import numpy as np
import pandas as pd

# Turn the 1-D label vector into a single column so it can be appended to the
# feature matrix. Using -1 lets NumPy infer the row count from the data
# instead of hard-coding the dataset's sample count.
labels = np.reshape(breast_labels, (-1, 1))

# Features followed by the label as the last column.
final_breast_data = np.concatenate([breast_data, labels], axis=1)

# Tabular copy of the combined array (default integer column labels) and the
# names of the original feature columns.
breast_dataset = pd.DataFrame(final_breast_data)
features = breast_cancer.feature_names

# Cell result: preview of the first five combined rows.
final_breast_data[0:5]

array([[1.799e+01, 1.038e+01, 1.228e+02, 1.001e+03, 1.184e-01, 2.776e-01,
        3.001e-01, 1.471e-01, 2.419e-01, 7.871e-02, 1.095e+00, 9.053e-01,
        8.589e+00, 1.534e+02, 6.399e-03, 4.904e-02, 5.373e-02, 1.587e-02,
        3.003e-02, 6.193e-03, 2.538e+01, 1.733e+01, 1.846e+02, 2.019e+03,
        1.622e-01, 6.656e-01, 7.119e-01, 2.654e-01, 4.601e-01, 1.189e-01,
        0.000e+00],
       [2.057e+01, 1.777e+01, 1.329e+02, 1.326e+03, 8.474e-02, 7.864e-02,
        8.690e-02, 7.017e-02, 1.812e-01, 5.667e-02, 5.435e-01, 7.339e-01,
        3.398e+00, 7.408e+01, 5.225e-03, 1.308e-02, 1.860e-02, 1.340e-02,
        1.389e-02, 3.532e-03, 2.499e+01, 2.341e+01, 1.588e+02, 1.956e+03,
        1.238e-01, 1.866e-01, 2.416e-01, 1.860e-01, 2.750e-01, 8.902e-02,
        0.000e+00],
       [1.969e+01, 2.125e+01, 1.300e+02, 1.203e+03, 1.096e-01, 1.599e-01,
        1.974e-01, 1.279e-01, 2.069e-01, 5.999e-02, 7.456e-01, 7.869e-01,
        4.585e+00, 9.403e+01, 6.150e-03, 4.006e-02, 3.832e-02, 2.058e-02

In [3]:
from sklearn.model_selection import train_test_split

# Partition features and labels into train and test subsets. No test_size is
# passed, so scikit-learn's default proportion applies; random_state pins the
# shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    breast_data, breast_labels, random_state=46
)

# Report the resulting shapes side by side.
print(f"{X_train.shape} {X_test.shape}")

(426, 30) (143, 30)


Preprocessing: Principal Component Analysis
-------------------------------------------

We can use PCA to project the 30 original features onto a smaller number of principal components,
while retaining most of the variance (information) in the dataset.



In [4]:
from sklearn import decomposition

# Learn a 20-component, whitened PCA basis from the training split only.
# fit() returns the estimator itself, so the chained call leaves `pca` bound
# to the fitted model.
pca = decomposition.PCA(whiten=True, n_components=20).fit(X_train)

# Cell result: the fitted estimator's repr.
pca

PCA(copy=True, iterated_power='auto', n_components=20, random_state=None,
    svd_solver='auto', tol=0.0, whiten=True)

PCA first centers the data on the per-feature mean of the training set. The principal
components then measure deviations about this mean along orthogonal axes.

In [5]:
# components_ has one row per retained component and one column per input feature.
print(f"{pca.components_.shape}")

(20, 30)


With this projection computed, we can now project our original training
and test data onto the PCA basis:


In [6]:
# Project both splits onto the basis learned from the training data
# (train first, then test).
X_train_pca, X_test_pca = (pca.transform(split) for split in (X_train, X_test))

# One shape per line: train projection, then test projection.
print(X_train_pca.shape, X_test_pca.shape, sep="\n")

(426, 20)
(143, 20)


Doing the Learning: Support Vector Machines
-------------------------------------------

Now we'll perform support-vector-machine classification on this reduced
dataset:

In [7]:
from sklearn import metrics, svm

# Train a support-vector classifier on the PCA-projected training data.
# The kernel is left at SVC's default; only C and gamma are set explicitly.
clf = svm.SVC(gamma=0.001, C=5.0).fit(X_train_pca, y_train)

# Predict the held-out split and print per-class precision / recall / F1.
y_pred = clf.predict(X_test_pca)
print(metrics.classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.98      0.89      0.93        53
           1       0.94      0.99      0.96        90

    accuracy                           0.95       143
   macro avg       0.96      0.94      0.95       143
weighted avg       0.95      0.95      0.95       143



Another interesting metric is the *confusion matrix*, which indicates
how often samples of one class are mistaken for another. The confusion matrix of a perfect
classifier would only have nonzero entries on the diagonal, with zeros
on the off-diagonal:


In [8]:
# Rows are the true classes, columns are the predicted classes.
print(metrics.confusion_matrix(y_true=y_test, y_pred=y_pred))

[[47  6]
 [ 1 89]]


# With Iris Dataset

In [9]:
# Load scikit-learn's bundled iris dataset and separate features from targets.
iris = datasets.load_iris()
iris_data, iris_labels = iris.data, iris.target

# Feature-matrix shape on the first line, label shape on the second.
print(iris_data.shape, iris_labels.shape, sep="\n")

# Rebind `features` to the iris column names; as the last expression it is
# also what the cell displays.
features = iris.feature_names
features

(150, 4)
(150,)


['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [11]:
from sklearn.model_selection import train_test_split

# Split the iris features and labels with the same seed as the breast-cancer
# split. This rebinds X_train / X_test / y_train / y_test to the iris data.
X_train, X_test, y_train, y_test = train_test_split(
    iris_data, iris_labels, random_state=46
)

# Report the resulting shapes side by side.
print(f"{X_train.shape} {X_test.shape}")

(112, 4) (38, 4)


Preprocessing: Principal Component Analysis

We can use PCA to project the 4 iris features onto a smaller number of principal components, while retaining most of the variance (information) in the dataset.

In [12]:
from sklearn import decomposition

# Refit PCA for iris: keep 2 whitened components learned from the iris
# training split. This rebinds `pca`, replacing the breast-cancer model.
pca = decomposition.PCA(whiten=True, n_components=2).fit(X_train)

# One row per retained component, one column per input feature.
print(f"{pca.components_.shape}")

(2, 4)


In [13]:
# Project the iris train and test splits onto the 2-component basis
# (train first, then test).
X_train_pca, X_test_pca = (pca.transform(split) for split in (X_train, X_test))

# One shape per line: train projection, then test projection.
print(X_train_pca.shape, X_test_pca.shape, sep="\n")

(112, 2)
(38, 2)


In [14]:
from sklearn import metrics, svm

# Fit a support-vector classifier on the 2-component iris projection and
# predict the held-out split.
# NOTE(review): C and gamma are the same values used for the breast-cancer
# model; the recorded accuracy for this cell is 0.79, with class 2 recall at
# 0.46 — consider tuning these hyperparameters for iris.
clf = svm.SVC(gamma=0.001, C=5.0).fit(X_train_pca, y_train)
y_pred = clf.predict(X_test_pca)

# Per-class report first, then the confusion matrix
# (rows: true class, columns: predicted class).
print(metrics.classification_report(y_true=y_test, y_pred=y_pred))

print(metrics.confusion_matrix(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       1.00      0.93      0.97        15
           1       0.56      1.00      0.71        10
           2       1.00      0.46      0.63        13

    accuracy                           0.79        38
   macro avg       0.85      0.80      0.77        38
weighted avg       0.88      0.79      0.79        38

[[14  1  0]
 [ 0 10  0]
 [ 0  7  6]]
