# Breast Cancer Wisconsin - Diagnostic Dataset Analysis


### Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.ticker as mtick
from IPython.display import display, Markdown, Latex
from pprint import pprint
from sklearn.metrics import *
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from collections import defaultdict

# Show every column when a DataFrame is displayed instead of truncating wide tables.
pd.set_option('display.max_columns', None)

___
## Pre-Processing

### Load, Clean, & Standardize Data

In [None]:
# Load the dataset from the CSV file in the working directory.
df = pd.read_csv('data.csv')
display(df.head())

# Clean: drop the 'Unnamed: 32' and 'id' columns, then encode the label as
# 1 for malignant ('M') and 0 for everything else. The comparison is vectorized,
# so no row-by-row iteration is needed.
df = df.drop(columns=['Unnamed: 32', 'id'])
df['diagnosis'] = (df['diagnosis'] == 'M').astype('int64')
display(df.head())

# Standardize every feature column to zero mean and unit (sample) standard deviation.
# NOTE(review): the mean/std are computed over the full dataset, i.e. before the
# train/test split performed later in the notebook, so test rows influence the scaling.
features = [col for col in df.columns if col != 'diagnosis']
df[features] = (df[features] - df[features].mean()) / df[features].std()
display(df.head())

# Target vector and feature matrix used by the rest of the notebook.
y = df['diagnosis']
x = df[features]

___
## Visualizing the Data

### Correlation Matrix

In [None]:
# Pairwise correlations of all columns (label included), with the columns
# ordered alphabetically so the heatmap axes are easy to scan.
corr = df[sorted(df.columns)].corr()

plt.figure(figsize=(15, 8))
sns.heatmap(data=corr, cmap="RdYlGn")
plt.show()

### Features Visualized via Box Plots

In [None]:
# One box per column of df, drawn on a wide figure.
plt.figure(figsize=(20, 8))
chart = sns.boxplot(data=df)
# Rotate the tick labels seaborn generated rather than overwriting them with
# df.columns: the labels then always match the boxes actually drawn.
chart.tick_params(axis='x', labelrotation=90)
plt.show()

### Histogram Analysis: Benign vs. Malignant Tumors Across Features

In [None]:
# Only the float64 columns are plotted; this skips the integer-encoded label.
float_columns = list(df.select_dtypes(include='float64').columns)

# Size the grid from the number of plotted columns (ceiling division) so the
# cell keeps working if the feature count is not exactly 30.
subplot_columns = 5
subplot_rows = max(1, -(-len(float_columns) // subplot_columns))
f, axs = plt.subplots(subplot_rows, subplot_columns, figsize=(15, 15), squeeze=False)

bins = 25
# Split by class once instead of re-filtering df for every feature.
malignant = df[df['diagnosis'] == 1]
benign = df[df['diagnosis'] == 0]

# Draw directly on the axes created above: overlaid, semi-transparent
# histograms of the malignant (red) and benign (green) samples per feature.
for ax, column in zip(axs.flat, float_columns):
    ax.hist(malignant[column], bins=bins, alpha=0.5, label='M', color='r')
    ax.hist(benign[column], bins=bins, alpha=0.5, label='B', color='g')
    ax.legend(loc='upper right')
    ax.set_title(column)

# Hide any grid cells left over when the feature count does not fill the last row.
for ax in axs.flat[len(float_columns):]:
    ax.set_visible(False)

plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
plt.show()

### Principal Component Analysis

In [None]:
# The covariance matrix is symmetric, so use eigh: it returns real eigenvalues
# in a defined (ascending) order, whereas eig gives no ordering guarantee.
eig_vals, eig_vecs = np.linalg.eigh(x.cov().to_numpy())

# Flip to descending order so index 0 is the component explaining the most variance.
eig_vals, eig_vecs = eig_vals[::-1], eig_vecs[:, ::-1]
eig_percents = eig_vals / eig_vals.sum() * 100

# Kept for the cells below: component indices and their percentages, largest first.
sort_ix = tuple(range(len(eig_percents)))
sort_eig_percents = tuple(eig_percents)

In [None]:
# Bar chart of the percentage of variance explained by each principal
# component, largest first (PC1, PC2, ...).
labels = [f'PC{rank}' for rank in range(1, len(sort_eig_percents) + 1)]
# sns.set changes the default figure size for this and subsequent figures.
sns.set(rc={'figure.figsize': (20, 5)})
ax = sns.barplot(x=labels, y=list(sort_eig_percents), color='cornflowerblue')
ax.yaxis.set_major_formatter(mtick.PercentFormatter())
ax.set_title('Principal Components & Percent of Variation Explained', fontsize=15)
ax.set_xlabel('Principal Components', fontsize=12)
ax.set_ylabel('Percent of Variation Explained', fontsize=12)
plt.show()

___
## Modeling

### Final Data Preparation Steps

#### Create Condensed Feature List with PCA

In [None]:
# Project the feature matrix onto its first three principal components.
pca = PCA(n_components=3)
x_ENG = pca.fit(x).transform(x)

# Shapes before and after the reduction.
print(x.shape, x_ENG.shape)

#### Visualize PCA Data

In [None]:
# Scatter plot of the samples in the plane of the first two principal
# components, coloured by diagnosis.
principal_df = pd.DataFrame({"PC1": x_ENG[:, 0], "PC2": x_ENG[:, 1]})
plt.figure(figsize=(6, 6))
plt.xticks(fontsize=12)
plt.yticks(fontsize=14)
# Label the axes with the variance share reported by the fitted PCA itself, so
# the percentages always belong to the components actually plotted.
plt.xlabel(f"PC 1:  {pca.explained_variance_ratio_[0] * 100:.2f}% ", fontsize=12)
plt.ylabel(f"PC 2:  {pca.explained_variance_ratio_[1] * 100:.2f}% ", fontsize=12)
plt.title("Principal Component Analysis of Breast Cancer Dataset", fontsize=15)
targets = [0, 1]
colors = ['g', 'r']
labs = ['Benign', 'Malignant']

for target, color, lab in zip(targets, colors, labs):
    # Positional boolean mask: avoids relying on df and principal_df sharing an index.
    indicesToKeep = (df['diagnosis'] == target).to_numpy()
    plt.scatter(principal_df.loc[indicesToKeep, 'PC1'],
                principal_df.loc[indicesToKeep, 'PC2'], c=color, s=50, label=lab)

plt.legend(prop={'size': 15})
plt.show()

#### Create Function for Flagging Correlated Features

In [None]:
def correlated(dataset, threshold=0.9):
    '''
    Return the set of column names whose absolute correlation with at least one
    earlier column (in the order of the correlation matrix) exceeds `threshold`.

    For each pair above the threshold, only the later column is flagged.
    '''
    corr_matrix = dataset.corr()
    names = corr_matrix.columns
    # Row `pos` of the lower triangle holds the correlations of column `pos`
    # with every column that precedes it.
    return {
        names[pos]
        for pos in range(1, len(names))
        if (corr_matrix.iloc[pos, :pos].abs() > threshold).any()
    }

#### Create DataFrame with Correlated Features Removed

In [None]:
# Build a feature matrix without the columns flagged as highly correlated.
# DataFrame.drop returns a new frame, so x itself is left untouched.
features_to_drop = correlated(x)
x_minus_corr = x.drop(columns=list(features_to_drop))

# Print in sorted order: iterating the set directly gives a different order from run to run.
print('Correlated Features Removed:')
for col in sorted(features_to_drop):
    print(f'\t{col}')

#### Create Testing & Training Data

In [None]:
test_size = 0.2
random_state = 42

# Split all three feature representations (all features, correlated features
# removed, PCA-reduced) and the labels in one call, so every variant shares
# exactly the same train/test rows.
(x_train, x_test,
 x_train_CORR, x_test_CORR,
 x_train_ENG, x_test_ENG,
 y_train, y_test) = train_test_split(
    x, x_minus_corr, x_ENG, y,
    test_size=test_size,
    random_state=random_state,
)

#### Our Classification Models

In [None]:
# (display name, estimator) pairs evaluated throughout the notebook.
# Every randomized estimator gets a fixed seed so reruns give the same results.
clf_models = [('Logistic Regression', LogisticRegression(solver='lbfgs')),
              ('Random Forest', RandomForestClassifier(random_state=0)),
              ('Gradient Boosting', GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)),
              ('KNeighbors Classifier', KNeighborsClassifier(n_neighbors=3))]

### Classification Testing Functions

#### Create Function - Run Single Classifier

In [None]:
def run_classifier(classifier, x_train, y_train, x_test, y_test, print_stats=False):

    '''
    Fit a classifier on the training split and evaluate it on the test split.

    Parameters:
        classifier  - (display name, estimator) pair; the estimator is fitted in place
        x_train, y_train - training features and labels
        x_test, y_test   - held-out features and labels
        print_stats - when truthy, also display a classification report,
                      5-fold cross-validation accuracy and a confusion matrix

    Returns:
        (predictions, importance, score) where
        predictions - estimator predictions for x_test
        importance  - first row of `coef_` or `feature_importances_` when the
                      fitted estimator exposes one of them, otherwise []
        score       - estimator.score(x_test, y_test)
    '''

    name, clf = classifier[0], classifier[1]
    clf.fit(x_train, y_train)
    predictions = clf.predict(x_test)

    if print_stats:
        display(Markdown(f'#### {name} \n'))
        print(classification_report(y_test, predictions, target_names=['Benign (0)', 'Malignant (1)']))

        # Cross-validate on the training data passed to this call (not on the
        # notebook-level x / y), so the reported score matches the feature set
        # being evaluated and never touches the held-out test rows.
        scores = cross_val_score(clf, x_train, y_train, cv=5)
        print('\nCross-Validation Results:')
        print("\t%0.2f accuracy with a standard deviation of %0.2f\n" % (scores.mean(), scores.std()))

        cm = confusion_matrix(y_test, predictions)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Benign (0)', 'Malignant (1)'])
        disp.plot()
        plt.show()

    # Choose the importance measure from what the fitted estimator exposes
    # rather than from its display name, so renamed or additional models work too.
    if hasattr(clf, 'coef_'):
        importance = clf.coef_[0]
    elif hasattr(clf, 'feature_importances_'):
        importance = clf.feature_importances_
    else:
        importance = []

    return predictions, importance, clf.score(x_test, y_test)

#### Create Function - Feature Importance

In [None]:
def feature_importance(x, y, model, test_size=0.2, random_state=42):

    '''
    Compare a model trained on a single feature type against the same model
    trained on all columns of `x`.

    Columns are grouped by the text before their first underscore. For every
    group the model is refitted on that group's columns only, and its test-set
    precision and recall are compared with the all-columns model. The resulting
    table is displayed sorted by recall difference; nothing is returned.

    Parameters:
        x            - feature DataFrame
        y            - labels aligned with `x`
        model        - (display name, estimator) pair as accepted by run_classifier
        test_size    - fraction of rows held out for testing
        random_state - seed for the train/test split (the same split is used for every run)
    '''

    column_types = defaultdict(list)
    for column in x.columns:
        column_types[column.split('_')[0]].append(column)

    def _precision_recall(feature_frame):
        # Split, fit via run_classifier, and score the predictions on the test rows.
        x_train, x_test, y_train, y_test = train_test_split(
            feature_frame, y, test_size=test_size, random_state=random_state)
        predictions = run_classifier(model, x_train, y_train, x_test, y_test)[0]
        return precision_score(y_test, predictions), recall_score(y_test, predictions)

    precision_og, recall_og = _precision_recall(x)

    # Collect plain dict rows and build the frame once; DataFrame.append was
    # removed in pandas 2.0.
    rows = []
    for column_type, columns in column_types.items():
        precision_new, recall_new = _precision_recall(x[columns])
        prec_diff = precision_new - precision_og
        recall_diff = recall_new - recall_og
        rows.append({'model_type': model[0],
                     'feature_type': column_type,
                     'prec_all': precision_og,
                     'prec_one': precision_new,
                     'prec_diff': prec_diff,
                     'prec_diff_pct': f'{prec_diff * 100:.2f}%',
                     'recall_all': recall_og,
                     'recall_one': recall_new,
                     'recall_diff': recall_diff,
                     'recall_diff_pct': f'{recall_diff * 100:.2f}%'})

    importance_df = pd.DataFrame(rows, columns=['model_type', 'feature_type',
                                                'prec_all', 'prec_one', 'prec_diff', 'prec_diff_pct',
                                                'recall_all', 'recall_one', 'recall_diff', 'recall_diff_pct'])
    display(importance_df.sort_values(by=['recall_diff']))

## Classification Testing (All Features)
In this section we establish a baseline by running our classifiers with all features included, then examine their performance and feature importance.

In [None]:
# Baseline: Logistic Regression (first entry of clf_models) on all features.
# r2 receives the third return value of run_classifier, i.e. clf.score on the test split.
model_num = 1
p, i, r2 = run_classifier(
    classifier=clf_models[model_num - 1],
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    print_stats=True,
)

In [None]:
# Baseline: Random Forest (second entry of clf_models) on all features.
model_num = 2
p, i, r2 = run_classifier(
    classifier=clf_models[model_num - 1],
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    print_stats=True,
)

In [None]:
# Baseline: Gradient Boosting (third entry of clf_models) on all features.
model_num = 3
p, i, r2 = run_classifier(
    classifier=clf_models[model_num - 1],
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    print_stats=True,
)

In [None]:
# Baseline: KNeighbors Classifier (fourth entry of clf_models) on all features.
model_num = 4
p, i, r2 = run_classifier(
    classifier=clf_models[model_num - 1],
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    print_stats=True,
)

#### Identify Most Important Features Pt. 1

In [None]:
# Refit the first model (Logistic Regression) on all features and plot the
# per-feature importance values returned by run_classifier.
predictions, importance, score = run_classifier(
    classifier=clf_models[0],
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)

ax = sns.barplot(x=x.columns, y=importance, color='cornflowerblue')
ax.set_title(f'Feature Importance: {clf_models[0][0]}', fontsize=15)
ax.set_ylabel('Importance', fontsize=12)
ax.set_xlabel('Feature', fontsize=12)
# Feature names are long, so draw them vertically.
ax.set_xticklabels(labels=x.columns, rotation=90)
plt.show()

#### Identify Most Important Features Pt. 2

In [None]:
# Per-feature-type precision/recall comparison for the first model on all features.
feature_importance(
    x=x,
    y=y,
    model=clf_models[0],
    test_size=test_size,
    random_state=random_state,
)

## Classification Testing (Correlated Features Removed)

In this section we seek to understand the impact that removing highly correlated features has on predictive performance & feature importance

In [None]:
# Logistic Regression on the dataset with highly correlated features removed.
model_num = 1
p, i, r2 = run_classifier(
    classifier=clf_models[model_num - 1],
    x_train=x_train_CORR,
    y_train=y_train,
    x_test=x_test_CORR,
    y_test=y_test,
    print_stats=True,
)

In [None]:
# Random Forest on the dataset with highly correlated features removed.
model_num = 2
p, i, r2 = run_classifier(
    classifier=clf_models[model_num - 1],
    x_train=x_train_CORR,
    y_train=y_train,
    x_test=x_test_CORR,
    y_test=y_test,
    print_stats=True,
)

In [None]:
# Gradient Boosting on the dataset with highly correlated features removed.
model_num = 3
p, i, r2 = run_classifier(
    classifier=clf_models[model_num - 1],
    x_train=x_train_CORR,
    y_train=y_train,
    x_test=x_test_CORR,
    y_test=y_test,
    print_stats=True,
)

In [None]:
# KNeighbors Classifier on the dataset with highly correlated features removed.
model_num = 4
p, i, r2 = run_classifier(
    classifier=clf_models[model_num - 1],
    x_train=x_train_CORR,
    y_train=y_train,
    x_test=x_test_CORR,
    y_test=y_test,
    print_stats=True,
)

#### Identify Most Important Features Pt. 1

In [None]:
# Refit the first model (Logistic Regression) on the reduced feature set and
# plot the per-feature importance values returned by run_classifier.
predictions, importance, score = run_classifier(
    classifier=clf_models[0],
    x_train=x_train_CORR,
    y_train=y_train,
    x_test=x_test_CORR,
    y_test=y_test,
)

ax = sns.barplot(x=x_minus_corr.columns, y=importance, color='cornflowerblue')
ax.set_title(f'Feature Importance: {clf_models[0][0]}', fontsize=15)
ax.set_ylabel('Importance', fontsize=12)
ax.set_xlabel('Feature', fontsize=12)
# Feature names are long, so draw them vertically.
ax.set_xticklabels(labels=x_minus_corr.columns, rotation=90)
plt.show()

#### Identify Most Important Features Pt. 2

In [None]:
# Per-feature-type precision/recall comparison for the first model on the reduced feature set.
feature_importance(
    x=x_minus_corr,
    y=y,
    model=clf_models[0],
    test_size=test_size,
    random_state=random_state,
)

## Classification Testing (w/ PCA)

In this section we analyze how the models perform on a dataset whose dimensionality has been reduced via PCA.

In [None]:
# Logistic Regression on the PCA-reduced dataset.
model_num = 1
predictions, importance, score = run_classifier(
    classifier=clf_models[model_num - 1],
    x_train=x_train_ENG,
    y_train=y_train,
    x_test=x_test_ENG,
    y_test=y_test,
    print_stats=True,
)

In [None]:
# Random Forest on the PCA-reduced dataset.
model_num = 2
predictions, importance, score = run_classifier(
    classifier=clf_models[model_num - 1],
    x_train=x_train_ENG,
    y_train=y_train,
    x_test=x_test_ENG,
    y_test=y_test,
    print_stats=True,
)

In [None]:
# Gradient Boosting on the PCA-reduced dataset.
model_num = 3
predictions, importance, score = run_classifier(
    classifier=clf_models[model_num - 1],
    x_train=x_train_ENG,
    y_train=y_train,
    x_test=x_test_ENG,
    y_test=y_test,
    print_stats=True,
)

In [None]:
# KNeighbors Classifier on the PCA-reduced dataset.
model_num = 4
predictions, importance, score = run_classifier(
    classifier=clf_models[model_num - 1],
    x_train=x_train_ENG,
    y_train=y_train,
    x_test=x_test_ENG,
    y_test=y_test,
    print_stats=True,
)