<a href="https://colab.research.google.com/github/HenningBuhl/SGM/blob/main/PCA_breast_cancer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PCA Breast Cancer

# TODO

In [None]:
"""

TODO:

Eigene PCA schreiben
Fragen aus Themenauswahl
Paper schreiben

"""

# Settings

## Seed Settings

In [None]:
import numpy as np
# Seed NumPy's global random state with a fixed value so repeated runs draw the same numbers.
np.random.seed(0)

## Plot Settings

In [None]:
# Default figure size (fed into matplotlib's "figure.figsize") and the seaborn font scale.
plot_width, plot_height = 12, 8
font_scale = 1.5

# Presumably title/label font sizes and figure resolution; not referenced by the
# plotting cells visible in this notebook -- TODO confirm where they are used.
title_size, label_size = 28, 22
dpi = 80

# Data

## Paths

In [None]:
# All files of this data set live under ./data/<data set name>/.
data_set_name = "breast cancer"
base_dir = "./data/" + data_set_name + "/"
# Destination of the downloaded archive.
zip_path = f"{base_dir}{data_set_name}.zip"
# Data file that is read into pandas.
data_path = f"{base_dir}breast-cancer-wisconsin.data"

## Clear Directories

In [None]:
import shutil
#shutil.rmtree("/content/results")

## Download

In [None]:
# Load data from web.
# Install the Google Drive download helper via a shell command, then import it.
!pip install googledrivedownloader
from google_drive_downloader import GoogleDriveDownloader as gdd
# Google Drive id of the zipped data set.
file_id = "1miaCgQTpeIYucy4KFMwpT8as6BVkSoxR"
# Download the archive to zip_path; unzip=True asks the helper to extract it as well.
gdd.download_file_from_google_drive(file_id=file_id,
                                dest_path=zip_path,
                                unzip=True)

## Load data into Pandas

In [None]:
# Column names, in file order; they are handed to read_csv via `names=`.
names = [
    "Sample code number",
    "Clump Thickness",
    "Uniformity of Cell Size",
    "Uniformity of Cell Shape",
    "Marginal Adhesion",
    "Single Epithelial Cell Size",
    "Bare Nuclei",
    "Bland Chromatin",
    "Normal Nucleoli",
    "Mitoses",
    "Class",
]

# Explicit column dtypes for read_csv.
# - The key has to match the column name exactly ("Class", capital C).
# - The `np.int` alias was removed in NumPy 1.24, so the fixed-width integer
#   type is named explicitly.
# Columns not listed here are left to pandas' type inference.
dtype = {
    "Class": np.int64,
}

In [None]:
import pandas as pd

# Raise pandas' display limits so larger frames are shown without early truncation/wrapping.
for option, limit in (
    ("display.max_rows", 500),
    ("display.max_columns", 500),
    ("display.width", 1000),
):
    pd.set_option(option, limit)

# Parse the data file using the declared column names and dtypes; "?" entries are read as NaN.
data = pd.read_csv(
    data_path,
    names=names,
    dtype=dtype,
    na_values="?",
)

In [None]:
# Recode the class labels: 2 -> 0 (benign), 4 -> 1 (malignant).
data["Class"] = data["Class"].replace({2: 0, 4: 1})

## Inspect Data

In [None]:
# Preview the first 10 rows of the loaded data.
data.head(10)

In [None]:
# Summary statistics per column. Left as the cell's last expression so the
# notebook renders the result as a table instead of plain printed text.
data.describe()

## Value Distribution

In [None]:
# Print number of unique values in each column.
def print_unique_count(df, verbose=0):
    """Print the row count of ``df`` and the number of distinct values per column.

    Args:
        df: DataFrame to summarise. Column labels are formatted with a string
            format spec, so they are expected to be strings.
        verbose: If truthy, additionally print every distinct value of each
            column together with how often it occurs.
    """
    # Report on the frame that was passed in, not on the notebook-global `data`.
    print("Data contains: {:3d} instances".format(len(df)))
    for col in df.columns:
        value_counts = df[col].value_counts()
        values = value_counts.keys().tolist()
        counts = value_counts.tolist()
        print("Unique values in column {:12s}: {:d}".format(col, len(value_counts)))
        if verbose:
            for value, count in zip(values, counts):
                print("\tOccurences of value {:10s}: {:d}".format(str(value), count))

In [None]:
# Verbose mode: also list every distinct value with its number of occurrences.
print_unique_count(data, verbose=1)

## Null Values

In [None]:
# Print number of null entries in each column.
def print_null_count(df):
    """Print the row count of ``df`` and the number of missing entries per column.

    Args:
        df: DataFrame to inspect. Column labels are formatted with a string
            format spec, so they are expected to be strings.
    """
    # Report on the frame that was passed in, not on the notebook-global `data`.
    print("Data contains: {:3d} instances".format(len(df)))
    for col in df.columns:
        # int() turns the NumPy integer returned by .sum() into a plain int for the {:d} spec.
        print("Null values in column {:12s}: {:d}".format(col, int(df[col].isna().sum())))

In [None]:
# Count the missing entries per column of the loaded data.
print_null_count(data)

In [None]:
# Drop every row that contains a missing value, modifying `data` in place.
# NOTE: the index is not reset here, so the row labels have gaps wherever rows
# were removed. Keep that in mind when combining `data` with frames that carry
# a fresh 0..n-1 index.
data.dropna(inplace=True)

In [None]:
# Check how many rows are left after dropping incomplete ones, then show a
# short preview instead of dumping the whole frame.
print(data.shape)
data.head(10)

## Normalization

In [None]:
from sklearn.preprocessing import StandardScaler

# The nine measurement columns used as inputs (sample id and class label are left out).
features = [
    "Clump Thickness",
    "Uniformity of Cell Size",
    "Uniformity of Cell Shape",
    "Marginal Adhesion",
    "Single Epithelial Cell Size",
    "Bare Nuclei",
    "Bland Chromatin",
    "Normal Nucleoli",
    "Mitoses",
]

# Label vector, and the feature matrix standardised column-wise by StandardScaler.
y = data["Class"].to_numpy()
x = StandardScaler().fit_transform(data[features].to_numpy())

In [None]:
# Standardised features plus the class label in one frame (fresh 0..n-1 index).
finalData = pd.DataFrame(x, columns=features).assign(Class=y)

# PCA

## Scikit-Learn

In [None]:
from sklearn.decomposition import PCA

# Fit a two-component PCA on the standardised features and keep the projected
# coordinates as a DataFrame with one named column per component.
pca = PCA(n_components=2)
components = pd.DataFrame(
    pca.fit_transform(x),
    columns=["principal component 1", "principal component 2"],
)

In [None]:
# Share of the total variance explained by each of the fitted components.
print(pca.explained_variance_ratio_)

In [None]:
# Attach the class label to the projected points.
# `components` carries a fresh 0..n-1 index, whereas `data` still has the
# original row labels with gaps left by dropna(). concat(axis=1) aligns on
# index labels, so the label column is re-indexed positionally first;
# otherwise classes would be paired with the wrong samples and NaN rows added.
finalPCA = pd.concat([components, data["Class"].reset_index(drop=True)], axis=1)

## Own Solution

## Visualization

In [None]:
from matplotlib import pyplot as plt
import matplotlib.patches as mpatches
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
# Default figure size for all plots, taken from the plot settings.
plt.rcParams["figure.figsize"] = (plot_width, plot_height)
#% matplotlib notebook

import seaborn as sns
sns.set(font_scale=font_scale) # Affects all plots.

In [None]:
# 2 Feature plot: scatter of two standardised input features, coloured by class.
feature1 = "Clump Thickness"
feature2 = "Marginal Adhesion"

fig, ax = plt.subplots(figsize=(8, 8))
ax.set_xlabel(feature1, fontsize=15)
ax.set_ylabel(feature2, fontsize=15)
ax.set_title("2 feature plot", fontsize=20)

# One scatter series per class. Each series gets its label directly, so the
# legend text cannot get out of step with the plotted colours.
targets = [0, 1]
colors = ['b', 'r']
class_names = ["benign", "malignant"]
for target, color, class_name in zip(targets, colors, class_names):
    indicesToKeep = finalData["Class"] == target
    ax.scatter(finalData.loc[indicesToKeep, feature1],
               finalData.loc[indicesToKeep, feature2],
               c=color,
               s=50,
               label=class_name)
ax.legend()
ax.grid()

In [None]:
# 2 PCA plot: scatter of the first two principal components, coloured by class.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_xlabel("Principal Component 1", fontsize=15)
ax.set_ylabel("Principal Component 2", fontsize=15)
ax.set_title("2 component PCA", fontsize=20)

# One scatter series per class. Each series gets its label directly, so the
# legend text cannot get out of step with the plotted colours.
targets = [0, 1]
colors = ['b', 'r']
class_names = ["benign", "malignant"]
for target, color, class_name in zip(targets, colors, class_names):
    indicesToKeep = finalPCA["Class"] == target
    ax.scatter(finalPCA.loc[indicesToKeep, "principal component 1"],
               finalPCA.loc[indicesToKeep, "principal component 2"],
               c=color,
               s=50,
               label=class_name)
ax.legend()
ax.grid()

In [None]:
# 2 own PCA plot.


# Hypothesen

1.) Die Verteilungen für die 1. Hauptkomponente (principal component) der BreastCancer-Daten unterscheiden sich signifikant für die *malignen* (M, bösartigen) und die *benignen* (B, gutartigen) Fälle.

2.) Die Verteilungen für die 1. Hauptkomponente (principal component) der BreastCancer-Daten unterscheiden sich signifikant für die geradzahligen Fälle und die ungeradzahligen Fälle.

# Ergänzende Fragen

1.) Wie gut wäre eine Klassifikation M / B allein auf Basis der 1. Hauptkomponente?

2.) Wie vergleicht sich dies mit einer Klassifikation allein auf Basis der 1. oder 2. Input-Variablen?