# Data
#### We'll be using the "mammographic masses" public dataset from the UCI repository (source: https://archive.ics.uci.edu/ml/datasets/Mammographic+Mass)

The data is stored in the ./data/ directory

In [143]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [144]:
# Column labels for the data file, which is read without a header row.
headers = ["BI-RADS", "Age", "Shape", "Margin", "Density", "Severity"]

# '?' entries in the file are parsed as NaN.
dataset = pd.read_csv(
    './data/mammographic_masses.data',
    names=headers,
    header=None,
    na_values='?',
)
dataset.head()

Unnamed: 0,BI-RADS,Age,Shape,Margin,Density,Severity
0,5.0,67.0,3.0,5.0,3.0,1
1,4.0,43.0,1.0,1.0,,1
2,5.0,58.0,4.0,5.0,3.0,1
3,4.0,28.0,1.0,1.0,3.0,0
4,5.0,74.0,1.0,5.0,,1


In [145]:
# Summary statistics for the numeric columns of the raw dataset.
# NOTE(review): the recorded output shows a BI-RADS max of 55.0 — possibly a data-entry error; confirm.
dataset.describe()

Unnamed: 0,BI-RADS,Age,Shape,Margin,Density,Severity
count,959.0,956.0,930.0,913.0,885.0,961.0
mean,4.348279,55.487448,2.721505,2.796276,2.910734,0.463059
std,1.783031,14.480131,1.242792,1.566546,0.380444,0.498893
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,4.0,45.0,2.0,1.0,3.0,0.0
50%,4.0,57.0,3.0,3.0,3.0,0.0
75%,5.0,66.0,4.0,4.0,3.0,1.0
max,55.0,96.0,4.0,5.0,4.0,1.0


In [146]:
# Count the missing (NaN) entries in each column.
dataset.isna().sum()

BI-RADS      2
Age          5
Shape       31
Margin      48
Density     76
Severity     0
dtype: int64

In [147]:
# `clean` only exists as a local variable inside clean_data, so referencing it
# here fails on a fresh kernel. Build the complete-case frame explicitly instead.
# NOTE(review): assumes `clean` was `dataset.dropna()` — confirm the recorded
# correlation table still matches after re-running.
dataset.dropna().corr()

Unnamed: 0,BI-RADS,Age,Shape,Margin,Density,Severity
BI-RADS,1.0,0.094623,0.18012,0.157771,0.028356,0.223826
Age,0.094623,1.0,0.380096,0.420913,0.052417,0.455216
Shape,0.18012,0.380096,1.0,0.738014,0.073969,0.564763
Margin,0.157771,0.420913,0.738014,1.0,0.12486,0.574498
Density,0.028356,0.052417,0.073969,0.12486,1.0,0.068651
Severity,0.223826,0.455216,0.564763,0.574498,0.068651,1.0


This is a binary classification problem: our goal is to correctly predict the severity of a cancer based on the other features.

The data workflow is inspired by this video: https://www.youtube.com/watch?v=MpFZUshKypk

In [93]:
def stage_details(f):
    """Decorate a pipeline stage so it prints the shape of its result.

    Parameters
    ----------
    f : callable
        Stage function taking a dataframe as its first argument and returning
        an object that exposes a ``.shape`` attribute.

    Returns
    -------
    callable
        A wrapper that calls ``f`` with the same arguments, prints
        ``<stage name> dataframe shape: <shape>`` and returns ``f``'s result
        unchanged.
    """
    # Function-scope import so this cell does not depend on the import cell.
    from functools import wraps

    @wraps(f)  # keep the stage's __name__ / __doc__ visible on the wrapper
    def wrapper(dataframe, *args, **kwargs):
        result = f(dataframe, *args, **kwargs)
        print(f'<{f.__name__}> dataframe shape: {result.shape}')
        return result

    return wrapper

In [171]:
@stage_details
def start_pipeline(dataframe):
    """Open the pipeline by returning a copy of ``dataframe``.

    Later stages then operate on the copy rather than on the object passed in.
    """
    working_copy = dataframe.copy()
    return working_copy

@stage_details
def format_data(dataframe):
    """Return ``dataframe`` without its "BI-RADS" column."""
    return dataframe.drop(columns=["BI-RADS"])

@stage_details
def clean_data(dataframe):
    """Fill missing values using an IterativeImputer fitted on complete rows.

    The imputer is fitted only on the rows of ``dataframe`` that contain no
    NaN, then applied to the whole frame. Every value in the result is rounded
    to the nearest integer.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Numeric frame that may contain NaN values.

    Returns
    -------
    pandas.DataFrame
        Imputed and rounded values, carrying the same column labels and index
        as ``dataframe``.
    """
    imp = IterativeImputer(max_iter=10, random_state=0, verbose=2)
    complete_rows = dataframe.dropna()
    imp.fit(complete_rows)
    imputed = np.round(imp.transform(dataframe))

    # Take labels from the input instead of a hard-coded column list, so the
    # stage keeps working if the upstream column selection changes and the
    # original row index is not silently replaced by a fresh RangeIndex.
    imputed_dataframe = pd.DataFrame(
        data=imputed,
        columns=dataframe.columns,
        index=dataframe.index,
    )
    return imputed_dataframe

@stage_details
def visualise_data(dataframe):
    """Visualisation stage; currently returns ``dataframe`` unchanged.

    The pair-plot call is left commented out below, so this stage draws nothing.
    """
    # sns.pairplot(dataframe, hue='Severity', height=2.5)
    return dataframe


In [172]:
# Run the stages in order: copy -> drop BI-RADS -> impute -> (optional) plot.
(
    dataset
    .pipe(start_pipeline)
    .pipe(format_data)
    .pipe(clean_data)
    .pipe(visualise_data)
)

<start_pipeline> dataframe shape: (961, 6)
<format_data> dataframe shape: (961, 5)
[IterativeImputer] Completing matrix with shape (831, 5)
[IterativeImputer] Ending imputation round 1/10, elapsed time 0.00
[IterativeImputer] Early stopping criterion reached.
[IterativeImputer] Completing matrix with shape (961, 5)
<clean_data> dataframe shape: (961, 5)
<visualise_data> dataframe shape: (961, 5)


Unnamed: 0,Age,Shape,Margin,Density,Severity
0,67.0,3.0,5.0,3.0,1.0
1,43.0,1.0,1.0,3.0,1.0
2,58.0,4.0,5.0,3.0,1.0
3,28.0,1.0,1.0,3.0,0.0
4,74.0,1.0,5.0,3.0,1.0
...,...,...,...,...,...
956,47.0,2.0,1.0,3.0,0.0
957,56.0,4.0,5.0,3.0,1.0
958,64.0,4.0,5.0,3.0,0.0
959,66.0,4.0,5.0,3.0,1.0


In [110]:
# Number of rows with at least one missing value once "BI-RADS" is dropped.
# Computed from the data instead of the hard-coded 961 - 831.
len(dataset) - len(dataset.drop(columns=["BI-RADS"]).dropna())

130