<a href="https://colab.research.google.com/github/HenningBuhl/SGM/blob/main/PCA_breast_cancer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PCA Breast Cancer

# TODO

In [None]:
"""

TODO:

"""

# Settings

## Seed Settings

In [None]:
import numpy as np
np.random.seed(0)

## Plot Settings

In [None]:
# Default figure width/height; fed into matplotlib's rcParams in the next cell.
plot_width = 8
plot_height = 8
# Font scaling factor handed to seaborn in the next cell.
font_scale = 1.5
# NOTE(review): title_size, label_size and dpi are not referenced by any other
# cell visible in this notebook - confirm whether they are still needed.
title_size = 28
label_size = 22
dpi = 80

In [None]:
from matplotlib import pyplot as plt
import matplotlib.patches as mpatches
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
# Use the configured width/height as the default size of every figure.
plt.rcParams["figure.figsize"] = (plot_width, plot_height)
#% matplotlib notebook

import seaborn as sns
# Activate seaborn's styling with the configured font scale.
sns.set(font_scale=font_scale) # Affects all plots.

# Data

## Paths

In [None]:
# Name of the data set; also used as directory and archive name below.
data_set_name = "breast cancer"
# Directory that holds everything belonging to this data set.
base_dir = f"./data/{data_set_name}/"
# Destination of the downloaded zip archive.
zip_path = base_dir + f"{data_set_name}.zip"
# CSV file that is read into pandas later on.
data_path = base_dir + "wdbc.data" # "breast-cancer-wisconsin.data"

## Clear Directories

In [None]:
import shutil
#shutil.rmtree("/content/results")

## Download

In [None]:
# Load data from web.
# Shell escape: installs the Google Drive download helper into the environment.
!pip install googledrivedownloader
from google_drive_downloader import GoogleDriveDownloader as gdd
# Google Drive file id of the zipped data set.
file_id = "1miaCgQTpeIYucy4KFMwpT8as6BVkSoxR"
# Store the archive at zip_path; unzip=True asks the helper to extract it as well.
gdd.download_file_from_google_drive(file_id=file_id,
                                dest_path=zip_path,
                                unzip=True)

## Load data into Pandas

In [None]:
# Column names.
# wdbc.data has no header row. Its documented field order is: id, diagnosis,
# then for each of the ten nucleus measurements the mean (fields 3-12), the
# standard error (fields 13-22) and the "worst" value (fields 23-32).
# The "sd ..." group therefore has to come BEFORE the "worst ..." group.
# (The data set description calls the middle group "standard error"; the
# existing "sd" prefix is kept so the column labels themselves do not change.)
names = [
    "id number",
    "diagnosis",

    "mean radius",
    "mean texture",
    "mean perimeter",
    "mean area",
    "mean smoothness",
    "mean compactness",
    "mean concavity",
    "mean concave points",
    "mean symmetry",
    "mean fractal dimension",

    "sd radius",
    "sd texture",
    "sd perimeter",
    "sd area",
    "sd smoothness",
    "sd compactness",
    "sd concavity",
    "sd concave points",
    "sd symmetry",
    "sd fractal dimension",

    "worst radius",
    "worst texture",
    "worst perimeter",
    "worst area",
    "worst smoothness",
    "worst compactness",
    "worst concavity",
    "worst concave points",
    "worst symmetry",
    "worst fractal dimension",
]

In [None]:
import pandas as pd
# Widen the pandas display limits so wide frames are printed without truncation.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Read the CSV using the column names defined above; "?" entries are parsed as NaN.
data = pd.read_csv(data_path, names=names, na_values="?")

In [None]:
# Replace values: encode the diagnosis as malignant ("M") -> 1, benign ("B") -> 0.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection: the latter is a chained
# assignment that leaves `data` untouched under pandas' copy-on-write mode.
# pd.to_numeric makes sure the column ends up with a numeric dtype; the cell
# stays idempotent because already-encoded values pass through unchanged.
data["diagnosis"] = pd.to_numeric(data["diagnosis"].replace({"M": 1, "B": 0}))

## Inspect Data

In [None]:
# Show some rows.
data.head(10)

In [None]:
# Print description.
print(data.describe())

## Value Distribution

In [None]:
# Print number of unique values in each column.
def print_unique_count(df, verbose=0):
    """Print how many distinct values every column of ``df`` contains.

    Args:
        df: The DataFrame to inspect.
        verbose: If truthy, additionally list every distinct value of each
            column together with its number of occurrences.
    """
    # Report the size of the frame that was passed in (not of a global frame).
    print("Data contains: {:3d} instances".format(len(df)))
    for col in df.columns:
        value_counts = df[col].value_counts()
        # str(col) keeps the "s" format spec working for non-string column labels.
        print("Unique values in column {:12s}: {:d}".format(str(col), len(value_counts)))
        if verbose:
            for value, count in value_counts.items():
                print("\tOccurences of value {:10s}: {:d}".format(str(value), int(count)))

In [None]:
print_unique_count(data, 1)

## Null Values

In [None]:
# Print number of null entries in each column.
def print_null_count(df):
    """Print the number of missing (NaN / None) entries in every column of ``df``.

    Args:
        df: The DataFrame to inspect.
    """
    # Report the size of the frame that was passed in (not of a global frame).
    print("Data contains: {:3d} instances".format(len(df)))
    for col in df.columns:
        # str(col) keeps the "s" format spec working for non-string column labels.
        print("Null values in column {:12s}: {:d}".format(str(col), int(df[col].isna().sum())))

In [None]:
print_null_count(data)

In [None]:
# Drop rows with missing values and renumber the remaining rows 0..n-1.
# Later cells concatenate columns of `data` with freshly built frames side by
# side (axis=1), which aligns on the index; gaps left behind by dropped rows
# would misalign those joins and introduce NaN rows.
data = data.dropna().reset_index(drop=True)

In [None]:
data.head(10)

## Normalization

In [None]:
from sklearn.preprocessing import StandardScaler

# Everything after the first two columns ("id number", "diagnosis") is a feature.
features = names[2:]

# Labels and raw feature matrix as NumPy arrays.
y = data["diagnosis"].values
x = data.loc[:, features].values

# Standardise the feature columns with scikit-learn's StandardScaler.
scaler = StandardScaler()
x = scaler.fit_transform(x)

In [None]:
# Standardised features and the diagnosis label side by side in one frame.
scaled_feature_frame = pd.DataFrame(x, columns=features)
diagnosis_frame = pd.DataFrame(y, columns=["diagnosis"])
finalData = pd.concat([scaled_feature_frame, diagnosis_frame], axis=1)

# PCA

## Scikit-Learn

In [None]:
from sklearn.decomposition import PCA

# Fit a two-component PCA on x and keep the projected samples as a DataFrame.
pca = PCA(n_components=2)
components = pd.DataFrame(
    data=pca.fit_transform(x),
    columns=['principal component 1', 'principal component 2'],
)

In [None]:
print(pca.explained_variance_ratio_)

In [None]:
# Attach the diagnosis label to the two principal components.
diagnosis_column = pd.DataFrame(y, columns=["diagnosis"])
finalPCA = pd.concat([components, diagnosis_column], axis=1)

## Own PCA Algorithm

In [None]:
# Breast cancer data.
# NOTE(review): this takes ALL columns of `data` (including "id number" and
# "diagnosis"), and the following toy-data cell rebinds `x` right away, so
# this value is not used when the notebook is run top to bottom.
x = data.to_numpy()

In [None]:
# Toy data.
mu = np.array([5.0, 10.0]) # The desired mean values of the sample.
# The desired covariance matrix. A covariance matrix must be symmetric and
# positive semi-definite; the off-diagonal entries are therefore identical
# and the determinant (3.40 * 1.50 - 2.00 ** 2 = 1.10) is positive.
r = np.array([
        [  3.40, -2.00],
        [ -2.00,  1.50]])
x = np.random.multivariate_normal(mu, r, size=50) # Generate the random samples.
print(x)

In [None]:
# Plot data: first column of x on the horizontal axis, second on the vertical axis.
plt.scatter(x[:, 0], x[:, 1])
plt.show()

In [None]:
# Column-wise mean of the samples.
mean_vec = x.mean(axis=0)
print(mean_vec)

# Centre the samples around the origin (not necessary).
# Running this cell again simply re-centres already centred data.
x = x - mean_vec
print(x)

In [None]:
# Plot data again, now after the mean has been subtracted.
plt.scatter(x[:, 0], x[:, 1])
plt.show()

In [None]:
# Calculate the covariance matrix of the first two columns of x.
# Each list entry passed to np.cov is treated as one variable (row).
cov_mat = np.cov([x[:,0], x[:,1]])
print(cov_mat)

# Or alternatively use scatter matrix (in this case, the only difference is a scaling factor).
# ...

# Even using the correlation matrix is possible.
# ...

In [None]:
# Eigen-decomposition of the covariance matrix.
eig_val, eig_vec = np.linalg.eig(cov_mat)

# The i-th eigenvector is the i-th COLUMN of eig_vec.
for idx in range(len(eig_val)):
    number = idx + 1
    print(f"{number}. Eigenvalue:  {eig_val[idx]}")
    print(f"{number}. Eigenvector: {eig_vec[:, idx]}")
    print('-' * 50)

In [None]:
# Every eigenvalue must solve det(cov_mat - lambda * I) = 0; print the determinants to verify.
identity = np.eye(2)
for i, val in enumerate(eig_val, start=1):
    determinant = np.linalg.det(cov_mat - identity * val)
    print(f"{i}. Eigenvalue {val:.8f} gives determinant of {determinant:.8f}")

In [None]:
# Dot product is 0 if vectors are orthogonal.
# eig_vec.T is unpacked into its rows (the eigenvectors), so this only works for exactly two of them.
np.dot(*eig_vec.T)

In [None]:
# Plot principal component axis.
plt.scatter(x[:, 0], x[:, 1])
origin = np.array([0, 0]) # Origin point.
# Draw the two eigenvectors (rows of eig_vec.T) as arrows starting at the origin.
plt.arrow(*origin, *eig_vec.T[0], color='r', width=.1)
plt.arrow(*origin, *eig_vec.T[1], color='b', width=.1)
# Fixed, identical limits on both axes.
plt.xlim(-5, 5)
plt.ylim(-5, 5)
plt.show()

In [None]:
# Transform data using principal components.
# Multiplying by eig_vec (eigenvectors as columns) expresses each sample in the eigenvector basis.
transformed = x[:,0:2] @ eig_vec
plt.scatter(transformed[:,0], transformed[:,1])
plt.show()

In [None]:
# Check if co-variance matrix of transformed data is a diagonal matrix.
# Off-diagonal entries close to zero mean the transformed coordinates are uncorrelated.
transformed_cov_mat = np.cov([transformed[:,0], transformed[:,1]])
print(transformed_cov_mat)

In [None]:
# Pair every eigenvalue with its eigenvector (a row of eig_vec.T) and order
# the pairs by eigenvalue, largest first.
pairs = sorted(zip(eig_val, eig_vec.T), key=lambda pair: pair[0], reverse=True)
print(pairs)

In [None]:
# Using the k best principal components.
K = 1
# Build the projection matrix with the K leading eigenvectors as COLUMNS.
# (Reshaping the row-stacked eigenvectors to (2, K) is only correct for K == 1;
# for K > 1 it would mix up the vector entries instead of transposing them.)
eig_vec_k = np.column_stack([vec for _, vec in pairs[:K]])
transformed = x[:,0:2] @ eig_vec_k
# Only the first projected coordinate is drawn, on a horizontal line at y = 0.
plt.scatter(transformed[:,0], np.zeros(len(x)))
plt.show()

In [None]:
# Explained variance of principal components.
eig_val_sum = np.sum(eig_val)
# Share of the total variance per component, largest eigenvalue first
# (`pairs` is sorted in descending order), accumulated into a running total
# so the values match the "cumulative" axis label below.
cumsum = np.cumsum([val / eig_val_sum for val, _ in pairs])
print(cumsum)

# Plot cumulative explained variance; the x axis counts components starting at 1.
plt.xlabel("Number of Principal Components")
plt.ylabel("Explained Cumulative Variance")
plt.plot(np.arange(1, len(cumsum) + 1), cumsum)
plt.show()

In [None]:
# Composition of principal components.
# ...

In [None]:
# Apply the covariance matrix itself as a linear transformation to the data
# and inspect the resulting point cloud.
cov_mat_transformed = x @ cov_mat
plt.scatter(cov_mat_transformed[:,0], cov_mat_transformed[:,1])
plt.show()

In [None]:
# Covariance matrix of the data after it was multiplied by cov_mat.
# NOTE: this rebinds transformed_cov_mat, which an earlier cell used for the PCA-transformed data.
transformed_cov_mat = np.cov([cov_mat_transformed[:,0], cov_mat_transformed[:,1]])
print(transformed_cov_mat)

# Hypothesen

## Erste Hypothese

1.) Die Verteilungen für die 1. Hauptkomponente (principal component) der BreastCancer-Daten unterscheiden sich signifikant für die *malignen* (M, bösartigen) und die *benignen* (B, gutartigen) Fälle.

In [None]:
# Summary statistics of the principal components, split by diagnosis (0 vs. 1).
diagnosis_values = finalPCA["diagnosis"]
benign = finalPCA[diagnosis_values == 0]
malignant = finalPCA[diagnosis_values == 1]

print(benign.describe())
print('-' * 200)
print(malignant.describe())

In [None]:
# Scatter plot of the first two principal components, one colour per diagnosis.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_xlabel("1. Hauptkomponente", fontsize=15)
ax.set_ylabel("2. Hauptkomponente", fontsize=15)

targets = [0, 1]
colors = ['b', 'r']
for target, color in zip(targets, colors):
    # Rows belonging to the current diagnosis class.
    subset = finalPCA[finalPCA["diagnosis"] == target]
    ax.scatter(subset["principal component 1"],
               subset["principal component 2"],
               c=color,
               s=50)

ax.legend(labels=["gutartig", "bösartig"])
ax.grid(linewidth=1)
plt.show()

In [None]:
# First principal component only: every sample is drawn on the line y = 0.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_xlabel("1. Hauptkomponente", fontsize=15)

targets = [0, 1]
colors = ['b', 'r']
for target, color in zip(targets, colors):
    # First-component values of the current diagnosis class.
    first_component = finalPCA.loc[finalPCA["diagnosis"] == target, "principal component 1"]
    ax.scatter(first_component,
               np.zeros(len(first_component)),
               c=color,
               s=50)

ax.legend(labels=["gutartig", "bösartig"])
ax.grid(linewidth=1)
plt.show()

In [None]:
# M / B separated 2 PCA plot: one figure per diagnosis class.
# Targets, colours and legend texts are defined up front so the cell does not
# rely on leftovers from earlier cells, and every figure is labelled with the
# class it actually shows (a fixed two-entry legend would label the malignant
# figure as "benign").
targets = [0, 1]
colors = ['b', 'r']
class_labels = ["benign", "malignant"]
for target, color, class_label in zip(targets, colors, class_labels):
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlabel("Principal Component 1", fontsize=15)
    ax.set_ylabel("Principal Component 2", fontsize=15)
    ax.set_title("2 component PCA", fontsize=20)
    indicesToKeep = finalPCA["diagnosis"] == target
    ax.scatter(finalPCA.loc[indicesToKeep, "principal component 1"],
               finalPCA.loc[indicesToKeep, "principal component 2"],
               c=color,
               s=50,
               label=class_label)
    ax.legend()
    ax.grid()
    plt.show()

In [None]:
# M / B separated 1 PCA plot: one figure per diagnosis class, first component only.
# Targets, colours and legend texts are defined up front so the cell does not
# rely on leftovers from earlier cells, and every figure is labelled with the
# class it actually shows (a fixed two-entry legend would label the malignant
# figure as "benign").
targets = [0, 1]
colors = ['b', 'r']
class_labels = ["benign", "malignant"]
for target, color, class_label in zip(targets, colors, class_labels):
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlabel("Principal Component 1", fontsize=15)
    ax.set_title("1 component PCA", fontsize=20)
    indicesToKeep = finalPCA["diagnosis"] == target
    # All samples are placed on the line y = 0.
    ax.scatter(finalPCA.loc[indicesToKeep, "principal component 1"],
               np.zeros(len(finalPCA.loc[indicesToKeep, "principal component 1"])),
               c=color,
               s=50,
               label=class_label)
    ax.legend()
    ax.grid()
    plt.show()

## Zweite Hypothese

2.) Die Verteilungen für die 1. Hauptkomponente (principal component) der BreastCancer Daten unterscheiden sich signifikant für die geradzahligen Fälle und die ungeradzahligen Fälle.

In [None]:
# Distribution of the principal components for even vs. odd id numbers.
# concat(axis=1) aligns rows by index. finalPCA carries a fresh 0..n-1 index,
# so the id column is renumbered the same way; otherwise rows would be
# misaligned whenever `data` has index gaps (e.g. after dropping rows).
id_numbers = data["id number"].reset_index(drop=True)
fullPCAData = pd.concat([id_numbers, finalPCA], axis=1)
even = fullPCAData[fullPCAData["id number"] % 2 == 0]
odd = fullPCAData[fullPCAData["id number"] % 2 == 1]
print(even.describe())
print(200 * '-')
print(odd.describe())

In [None]:
# 2 PCA plot comparing samples with an even id number (blue) against odd ones (red).
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_xlabel("1. Hauptkomponente", fontsize=15)
ax.set_ylabel("2. Hauptkomponente", fontsize=15)

targets = [0, 1]
colors = ['b', 'r']
for target, color in zip(targets, colors):
    # target is the remainder of the id number modulo 2 (0 = even, 1 = odd).
    subset = fullPCAData[fullPCAData["id number"] % 2 == target]
    ax.scatter(subset["principal component 1"],
               subset["principal component 2"],
               c=color,
               s=50)

ax.legend(labels=["gerade", "ungerade"])
ax.grid(linewidth=1)
plt.show()

In [None]:
# 2 PCA plot restricted to samples with an even id number, coloured by diagnosis.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_xlabel("1. Hauptkomponente", fontsize=15)
ax.set_ylabel("2. Hauptkomponente", fontsize=15)

targets = [0, 1]
colors = ['b', 'r']
for target, color in zip(targets, colors):
    # Even-id rows of the current diagnosis class.
    subset = even[even["diagnosis"] == target]
    ax.scatter(subset["principal component 1"],
               subset["principal component 2"],
               c=color,
               s=50)

ax.legend(labels=["gutartig", "bösartig"])
ax.grid(linewidth=1)
plt.show()

In [None]:
# 2 PCA plot restricted to samples with an odd id number, coloured by diagnosis.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_xlabel("1. Hauptkomponente", fontsize=15)
ax.set_ylabel("2. Hauptkomponente", fontsize=15)

targets = [0, 1]
colors = ['b', 'r']
for target, color in zip(targets, colors):
    # Odd-id rows of the current diagnosis class.
    subset = odd[odd["diagnosis"] == target]
    ax.scatter(subset["principal component 1"],
               subset["principal component 2"],
               c=color,
               s=50)

ax.legend(labels=["gutartig", "bösartig"])
ax.grid(linewidth=1)
plt.show()

# Ergänzende Fragen

## Erste Frage

1.) Wie gut wäre eine Klassifikation M / B allein auf Basis der 1. Hauptkomponente?

In [None]:
# Classification model.
from keras.models import Sequential
from keras.layers import Dense, Input
def eval(x, y):
    """Fit a single-neuron sigmoid classifier on one input feature.

    The model consists of one Dense unit with sigmoid activation, i.e. a
    logistic regression, trained with Adam on binary cross-entropy for 100
    epochs with batch size 16. Keras prints the model summary and the
    per-epoch loss / accuracy.

    NOTE(review): the name shadows Python's built-in ``eval``; it is kept so
    the existing calls in this notebook continue to work.

    Args:
        x: One feature value per sample, either as a 1-D sequence (NumPy
            array, pandas Series, list) or as an array of shape (n, 1).
        y: Binary target label per sample.

    Returns:
        None.
    """
    # The input layer is declared with shape (1,), so each sample must be a
    # length-1 vector. Turn a flat vector into an explicit (n, 1) column
    # instead of relying on implicit rank adjustment inside Keras.
    x = np.asarray(x)
    if x.ndim == 1:
        x = x.reshape(-1, 1)

    model = Sequential()
    model.add(Input(shape=(1,)))
    #model.add(Dense(50, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    model.summary()
    model.fit(x, y, batch_size=16, epochs=100)

In [None]:
# Classification on first principal component.
# NOTE: this rebinds the notebook-wide name `x`; `y` still holds the diagnosis labels.
x = finalPCA["principal component 1"].to_numpy()
eval(x, y)

## Zweite Frage

2.) Wie vergleicht sich dies mit einer Klassifikation allein auf Basis der 1. oder 2. Input-Variablen?

In [None]:
# Classification on first column.
# features[0] is the first feature column of the standardised frame; `x` becomes a pandas Series here.
x = finalData[features[0]]
eval(x, y)

In [None]:
# Classification on second column.
# features[1] is the second feature column of the standardised frame; `x` becomes a pandas Series here.
x = finalData[features[1]]
eval(x, y)

In [None]:
# Scatter plot of the first two (standardised) input features, coloured by diagnosis.
feature1, feature2 = features[0], features[1]

fig, ax = plt.subplots(figsize=(8, 8))
ax.set_xlabel(feature1, fontsize=15)
ax.set_ylabel(feature2, fontsize=15)
ax.set_title("2 feature plot", fontsize=20)

targets = [0, 1]
colors = ['b', 'r']
for target, color in zip(targets, colors):
    # Rows belonging to the current diagnosis class.
    subset = finalData[finalData["diagnosis"] == target]
    ax.scatter(subset[feature1],
               subset[feature2],
               c=color,
               s=50)

ax.legend(labels=["benign", "malignant"])
ax.grid()
plt.show()

In [None]:
# 1 Feature plot: first input feature only, every sample drawn on the line y = 0.
feature1 = features[0]

fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)
ax.set_xlabel(feature1, fontsize=15)
# No y label: the vertical position is a constant zero and carries no
# information. (Labelling it with `feature2` was misleading and depended on a
# variable defined in a different cell.)
ax.set_title("1 feature plot", fontsize=20)
targets = [0, 1]
colors = ['b', 'r']
for target, color in zip(targets, colors):
    indicesToKeep = finalData["diagnosis"] == target
    ax.scatter(finalData.loc[indicesToKeep, feature1],
               np.zeros(len(finalData.loc[indicesToKeep, feature1])),
               c=color,
               s=50)
ax.legend(labels=["benign", "malignant"])
ax.grid()
plt.show()

In [None]:
# 1 Feature plot: second input feature only, every sample drawn on the line y = 0.
feature1 = features[1]

fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)
ax.set_xlabel(feature1, fontsize=15)
# No y label: the vertical position is a constant zero and carries no
# information. (Labelling it with `feature2` was misleading and depended on a
# variable defined in a different cell.)
ax.set_title("1 feature plot", fontsize=20)
targets = [0, 1]
colors = ['b', 'r']
for target, color in zip(targets, colors):
    indicesToKeep = finalData["diagnosis"] == target
    ax.scatter(finalData.loc[indicesToKeep, feature1],
               np.zeros(len(finalData.loc[indicesToKeep, feature1])),
               c=color,
               s=50)
ax.legend(labels=["benign", "malignant"])
ax.grid()
plt.show()

# Explained Variance

In [None]:
# Share of the total variance contributed by each original (unscaled) feature.
x = data.loc[:, features].values
covmat = np.cov(x.T)
# The diagonal of the covariance matrix holds the per-feature variances.
diag = np.diag(covmat)

for d in diag:
    share = d/diag.sum()
    print(f"{share:5.5f}")

In [None]:
# Fit a PCA with all components and plot the cumulative explained variance.
pca = PCA().fit(x)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# One x position per component, counted from 1, so x and y have the same
# length (an x range with one extra entry makes plt.plot raise a ValueError).
plt.plot(np.arange(1, len(cumulative_variance) + 1), cumulative_variance)
plt.xlabel('Anzahl der Hauptkomponenten')
plt.ylabel('Kumulative Varianz')
# Configure the grid of THIS figure; `ax` would refer to an axes object left
# over from an earlier cell.
plt.grid(linewidth=1)
plt.show()

In [None]:
# Print the explained variance ratio of every fitted component, one per line.
for v in pca.explained_variance_ratio_:
    print(f"{v:5.5f}")