# PCA for Breast Cancer Dataset

### MGE30301, 20121229 JunPyoPark

In [8]:
import plotly.plotly as py
from plotly.graph_objs import *
import plotly.tools as tls
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv(
    filepath_or_buffer='https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data', 
    header=None, 
    sep=',')

df.dropna(how="all", inplace=True) # drops the empty line at file-end
print(len(df))
df.head()

569


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


Number of instances: 569  <br>
	Number of attributes: 32 (ID, diagnosis, 30 real-valued input features <br>
	Attribute information <br>

1) ID number <br>
2) Diagnosis (M = malignant, B = benign) <br>
3-32) <br>
<br>
Ten real-valued features are computed for each cell nucleus: <br>
	
    a) radius (mean of distances from center to points on the perimeter) 
	b) texture (standard deviation of gray-scale values)
	c) perimeter
	d) area
	e) smoothness (local variation in radius lengths)
	f) compactness (perimeter^2 / area - 1.0)
	g) concavity (severity of concave portions of the contour)
	h) concave points (number of concave portions of the contour)
	i) symmetry 
	j) fractal dimension ("coastline approximation" - 1)

In [5]:
# split data table into data X and class labels y
X = df.iloc[:,2:32].values
y = df.iloc[:,1].values

X : 569 x 30 matrix where the columns are the different features <br>
y :  Diagnosis (M = malignant, B = benign)

Each sample row x can be pictured as a 30-dimensional vector

### Standardizing

Each of 30 features were measured on different scales, it makes sense to standardize the data.

Although all featrues are real valued, let us continue with the transformation of the data onto unist scale (mean=0 and variance = 1)

In [20]:
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X)

In [21]:
X_std

array([[ 1.09706398, -2.07333501,  1.26993369, ...,  2.29607613,
         2.75062224,  1.93701461],
       [ 1.82982061, -0.35363241,  1.68595471, ...,  1.0870843 ,
        -0.24388967,  0.28118999],
       [ 1.57988811,  0.45618695,  1.56650313, ...,  1.95500035,
         1.152255  ,  0.20139121],
       ..., 
       [ 0.70228425,  2.0455738 ,  0.67267578, ...,  0.41406869,
        -1.10454895, -0.31840916],
       [ 1.83834103,  2.33645719,  1.98252415, ...,  2.28998549,
         1.91908301,  2.21963528],
       [-1.80840125,  1.22179204, -1.81438851, ..., -1.74506282,
        -0.04813821, -0.75120669]])

### Calculating Covariance Matrix

In [9]:
mean_vec = np.mean(X_std, axis=0)
cov_mat = (X_std - mean_vec).T.dot((X_std - mean_vec)) / (X_std.shape[0]-1)
print('Covariance matrix \n%s' %cov_mat)

Covariance matrix 
[[  1.00176056e+00   3.24351929e-01   9.99612069e-01   9.89095475e-01
    1.70881506e-01   5.07014640e-01   6.77955036e-01   8.23976636e-01
    1.48001350e-01  -3.12179472e-01   6.80285970e-01  -9.74887767e-02
    6.75358538e-01   7.37159198e-01  -2.22992026e-01   2.06362656e-01
    1.94545531e-01   3.76831225e-01  -1.04504545e-01  -4.27163418e-02
    9.71245907e-01   2.97530545e-01   9.66835698e-01   9.42739295e-01
    1.19826732e-01   4.14190751e-01   5.27839123e-01   7.45524434e-01
    1.64241985e-01   7.07832563e-03]
 [  3.24351929e-01   1.00176056e+00   3.30113223e-01   3.21650988e-01
   -2.34296930e-02   2.37118951e-01   3.02950254e-01   2.93980713e-01
    7.15266864e-02  -7.65717560e-02   2.76354360e-01   3.87037830e-01
    2.82169018e-01   2.60302460e-01   6.62542133e-03   1.92312595e-01
    1.43545353e-01   1.64139495e-01   9.14323671e-03   5.45533955e-02
    3.53193674e-01   9.13650301e-01   3.58669926e-01   3.44150782e-01
    7.76398084e-02   2.78318729e-0

Eigendecomposition of the raw data based on the cov_mat

In [18]:
eig_vals, eig_vecs = np.linalg.eig(cov_mat)

print('Eigenvectors \n%s' %eig_vecs)
print('\nEigenvalues \n%s' %eig_vals)

Eigenvectors 
[[  2.18902444e-01  -2.33857132e-01  -8.53124284e-03   4.14089623e-02
   -3.77863538e-02   1.87407904e-02   1.24088340e-01   7.45229622e-03
   -2.23109764e-01   9.54864432e-02   4.14714866e-02   5.10674568e-02
    1.19672116e-02  -5.95061348e-02  -5.11187749e-02   1.50583883e-01
   -2.02924255e-01  -1.46712338e-01  -2.25384659e-01  -7.02414091e-01
    2.11460455e-01  -2.11194013e-01  -1.31526670e-01   1.29476396e-01
    1.92264989e-02  -1.82579441e-01   9.85526942e-02  -7.29289034e-02
   -4.96986642e-02   6.85700057e-02]
 [  1.03724578e-01  -5.97060883e-02   6.45499033e-02  -6.03050001e-01
    4.94688505e-02  -3.21788366e-02  -1.13995382e-02  -1.30674825e-01
    1.12699390e-01   2.40934066e-01  -3.02243402e-01   2.54896423e-01
    2.03461333e-01   2.15600995e-02  -1.07922421e-01   1.57841960e-01
    3.87061187e-02   4.11029851e-02  -2.97886446e-02  -2.73661018e-04
   -1.05339342e-02   6.58114593e-05  -1.73573093e-02   2.45566636e-02
   -8.47459309e-02   9.87867898e-02   5

Check the orthogonality

In [15]:
for ev in eig_vecs:
    np.testing.assert_array_almost_equal(1.0, np.linalg.norm(ev))
print('Everything ok!')

Everything ok!


Sorting and Print Eigenvalues

In [16]:
# Make a list of (eigenvalue, eigenvector) tuples
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:,i]) for i in range(len(eig_vals))]

# Sort the (eigenvalue, eigenvector) tuples from high to low
eig_pairs.sort()
eig_pairs.reverse()

# Visually confirm that the list is correctly sorted by decreasing eigenvalues
print('Eigenvalues in descending order:')
for i in eig_pairs:
    print(i[0])

Eigenvalues in descending order:
13.2816076823
5.69135461321
2.81794897723
1.98064047464
1.6487305477
1.20735661197
0.675220113895
0.476617140006
0.416894812368
0.350693456824
0.293915696279
0.261161370221
0.241357496159
0.157009723648
0.0941349650288
0.0798628009546
0.0593990377597
0.0526187835068
0.0494775917768
0.0311594024502
0.029972893911
0.0274394025316
0.024340837767
0.0180550070002
0.015481271375
0.00817763986432
0.00690046387518
0.00158933787114
0.000748803097406
0.000133044822821


Check the cumulative explained variance in order

In [17]:
tot = sum(eig_vals)
var_exp = [(i / tot)*100 for i in sorted(eig_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)

trace1 = Bar(
        x=['PC %s' %i for i in range(1,31)],
        y=var_exp,
        showlegend=False)

trace2 = Scatter(
        x=['PC %s' %i for i in range(1,31)], 
        y=cum_var_exp,
        name='cumulative explained variance')

data = Data([trace1, trace2])

layout=Layout(
        yaxis=YAxis(title='Explained variance in percent'),
        title='Explained variance by different principal components')

fig = Figure(data=data, layout=layout)
py.iplot(fig)

<img src="https://trello-attachments.s3.amazonaws.com/59103d52b56a24582f00dc97/5ab926ab8931f6abf5802534/5307ae6e2de4c00e5e40c795485eadf6/image.png"></img>

The plot above clearly shows that most of the variance (95.15% of the variance to be precise) can be explained by the first 10 principal components.

### Calculating PC Score

In [57]:
reduction_eig_vec = np.matrix(eig_vecs[0:10]).transpose()

Below table sohws PC Score for each data and each PC axis. <br>
There are 569 rows, but I printed only last 5 rows because of the long length of the table.

In [58]:
pc_score = X_std * reduction_eig_vec
col_name = []
for i in range(10):
    col_name.append('PC ' + str(i+1))
score_df = pd.DataFrame(pc_score,columns = col_name)
print('Matrix Size : ', score_df.shape[0], ' x ', score_df.shape[1])
score_df.tail()

Matrix Size :  569  x  10


Unnamed: 0,PC 1,PC 2,PC 3,PC 4,PC 5,PC 6,PC 7,PC 8,PC 9,PC 10
564,0.673816,-1.434012,1.233718,-0.456948,0.81207,0.084353,4.548586,1.069209,1.428688,1.642352
565,0.204172,-0.287256,0.090364,-0.160146,1.546597,0.455844,3.234631,0.360333,-0.049974,1.426389
566,-0.323082,-0.089308,0.05574,-0.225401,0.312937,0.443717,0.852211,-0.151359,-0.288958,0.396241
567,-1.396256,-1.674992,0.563041,0.200963,2.441154,-0.692266,2.648838,0.117838,5.682817,-0.283359
568,-0.461377,0.579047,-1.032332,0.201326,-1.808807,0.912583,-1.446229,-0.566382,-2.705483,0.76377
