In [1]:
from libraries import * 
%run -i configuration.py
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

## Import Data

In [5]:
features = ["clump_thickness", "uniformity_cell_size", 
           "uniformity_cell_shape", "marginal_adhesion",
           "single_epithelial_cell_size", "bare_nuclei", 
           "bland_chromatin", "normal_nucleoli", "mitoses"]

columns = ["id"] + features + ["class"]

# Read data
dataset = pd.read_csv("./data/breast-cancer-wisconsin.data", header=0, names=columns)

## Rename and Clean

In [6]:
# Rename the class column as "malignant"
dataset = dataset.rename(columns={"class":"malignant"})

# Transform malignant column into a boolean 0,1 variable
dataset["malignant"].replace(to_replace={2:0, 4:1}, inplace=True)

In [7]:
dataset.describe()

Unnamed: 0,id,clump_thickness,uniformity_cell_size,uniformity_cell_shape,marginal_adhesion,single_epithelial_cell_size,bland_chromatin,normal_nucleoli,mitoses,malignant
count,698.0,698.0,698.0,698.0,698.0,698.0,698.0,698.0,698.0,698.0
mean,1071806.791,4.417,3.138,3.211,2.809,3.218,3.438,2.87,1.59,0.345
std,617532.274,2.818,3.053,2.973,2.857,2.215,2.44,3.055,1.716,0.476
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,870258.25,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,0.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,0.0
75%,1238354.0,6.0,5.0,5.0,4.0,4.0,5.0,4.0,1.0,1.0
max,13454352.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,1.0


In [8]:
# Inspect all unique values
for col in features:
    print(f"{col} -> {dataset[col].unique()}")
    
# we see that "bare_nuclei" contains question marks. we'll replace "?" with np.nan
dataset["bare_nuclei"] = dataset["bare_nuclei"].replace(to_replace={"?":np.nan})

#we then drop all rows with missing values
# https://scikit-learn.org/stable/modules/impute.html
"""many real world datasets contain missing values, often encoded as blanks, 
NaNs or other placeholders. These are incompatible with scikit-learn estimators 
which assume that all values in an array are numerical, and that all have and hold meaning."""

# Count null values in each feature (16 found in bare_nuclei)
dataset.isnull().sum(axis=0)

# Drop nan
dataset = dataset.dropna()

clump_thickness -> [ 5  3  6  4  8  1  2  7 10  9]
uniformity_cell_size -> [ 4  1  8 10  2  3  7  5  6  9]
uniformity_cell_shape -> [ 4  1  8 10  2  3  5  6  7  9]
marginal_adhesion -> [ 5  1  3  8 10  4  6  2  9  7]
single_epithelial_cell_size -> [ 7  2  3  1  6  4  5  8 10  9]
bare_nuclei -> ['10' '2' '4' '1' '3' '9' '7' '?' '5' '8' '6']
bland_chromatin -> [ 3  9  1  2  4  5  7  8  6 10]
normal_nucleoli -> [ 2  1  7  4  5  3 10  6  9  8]
mitoses -> [ 1  5  4  2  3  7 10  8  6]


'many real world datasets contain missing values, often encoded as blanks, \nNaNs or other placeholders. These are incompatible with scikit-learn estimators \nwhich assume that all values in an array are numerical, and that all have and hold meaning.'

id                              0
clump_thickness                 0
uniformity_cell_size            0
uniformity_cell_shape           0
marginal_adhesion               0
single_epithelial_cell_size     0
bare_nuclei                    16
bland_chromatin                 0
normal_nucleoli                 0
mitoses                         0
malignant                       0
dtype: int64

## Save Cleaned Dtaset

In [14]:
dataset = dataset.set_index("id")
dataset.to_csv("./data/breast-cancer-wisconsin-cleaned.csv")