### Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from library.sb_utils import save_file

## 1. Data Collection

In [3]:
bc_data = pd.read_csv('../raw_data/breast_cancer_dataset.csv')

In [4]:
bc_data.head()

Unnamed: 0,id,Clump_Thickness,Cell_Size_Uniformity,Cell_Shape_Uniformity,Marginal_Adhesion,Single_Epi_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
0,1,7.581819,9.745087,1.0,4.50341,7.03993,10.0,4.412282,10.0,5.055266,malignant
1,2,5.210921,8.169596,7.841875,6.033275,4.269619,10.0,4.236312,4.84535,1.0,malignant
2,3,4.0,4.594296,2.33038,2.0,3.0,1.0,10.701823,1.101305,1.0,benign
3,4,2.428871,1.0,1.0,1.0,4.099291,1.0,2.0,1.0,1.0,benign
4,5,8.855971,2.697539,6.047068,3.301891,3.0,1.0,5.297592,4.104791,3.115741,malignant


In [5]:
bc_data.info

<bound method DataFrame.info of           id  Clump_Thickness  Cell_Size_Uniformity  Cell_Shape_Uniformity  \
0          1         7.581819              9.745087               1.000000   
1          2         5.210921              8.169596               7.841875   
2          3         4.000000              4.594296               2.330380   
3          4         2.428871              1.000000               1.000000   
4          5         8.855971              2.697539               6.047068   
...      ...              ...                   ...                    ...   
39361  39362         9.927819              9.356578               8.735863   
39362  39363         4.781275              1.000000               1.000000   
39363  39364         5.479501              9.688672               2.000000   
39364  39365         4.000000              1.000000               1.000000   
39365  39366         9.761903              9.817482               4.279460   

       Marginal_Adhesion  Singl

## 2. Data Definition

In [6]:
bc_data.columns

Index(['id', 'Clump_Thickness', 'Cell_Size_Uniformity',
       'Cell_Shape_Uniformity', 'Marginal_Adhesion', 'Single_Epi_Cell_Size',
       'Bare_Nuclei', 'Bland_Chromatin', 'Normal_Nucleoli', 'Mitoses',
       'Class'],
      dtype='object')

In [7]:
bc_data.dtypes

id                         int64
Clump_Thickness          float64
Cell_Size_Uniformity     float64
Cell_Shape_Uniformity    float64
Marginal_Adhesion        float64
Single_Epi_Cell_Size     float64
Bare_Nuclei              float64
Bland_Chromatin          float64
Normal_Nucleoli          float64
Mitoses                  float64
Class                     object
dtype: object

In [8]:
bc_data.describe()

Unnamed: 0,id,Clump_Thickness,Cell_Size_Uniformity,Cell_Shape_Uniformity,Marginal_Adhesion,Single_Epi_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses
count,39366.0,39366.0,39366.0,39366.0,39366.0,39366.0,39366.0,39366.0,39366.0,39366.0
mean,19683.5,4.394013,3.13007,3.203657,2.827221,3.209844,3.497453,3.409142,2.894595,1.591809
std,11364.129685,2.812104,3.039493,2.975983,2.872543,2.220422,3.619992,2.422371,3.069489,1.706766
min,1.0,0.73546,0.564014,1.0,1.0,1.0,-0.117818,1.0,0.758343,1.0
25%,9842.25,2.243989,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0
50%,19683.5,4.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0
75%,29524.75,5.630522,4.553797,4.966335,3.551935,4.003072,6.333434,4.561324,3.797023,1.0
max,39366.0,13.717991,10.933095,12.604289,11.158505,14.414889,13.160789,12.005376,10.700432,12.044924


In [9]:
bc_data.nunique()

id                       39366
Clump_Thickness          26387
Cell_Size_Uniformity     17780
Cell_Shape_Uniformity    16030
Marginal_Adhesion        13232
Single_Epi_Cell_Size     10910
Bare_Nuclei               8561
Bland_Chromatin          11991
Normal_Nucleoli          11095
Mitoses                   2988
Class                        2
dtype: int64

## 3. Data Cleaning

In [10]:
bc_data.isnull().sum()

id                       0
Clump_Thickness          0
Cell_Size_Uniformity     0
Cell_Shape_Uniformity    0
Marginal_Adhesion        0
Single_Epi_Cell_Size     0
Bare_Nuclei              0
Bland_Chromatin          0
Normal_Nucleoli          0
Mitoses                  0
Class                    0
dtype: int64

### Check for outliers: 

Additional Information provided with the dataset indicates that the domains of 'Clump_Thickness', 'Cell_Size_Uniformity', 'Cell_Shape_Uniformity', 'Marginal_Adhesion', 'Single_Epi_Cell_Size', 'Bare_Nuclei', 'Bland_Chromatin', 'Normal_Nucleoli', 'Mitoses' are between 1-10. Below, I take a look at the dataset and identify whether values above 10 or below 1 for theses columns need to be removed.

In [11]:
def check_min_outlier(column_name):
    return len(bc_data[bc_data[column_name]<1])

In [12]:
print('Number of min outliers in Clump_Thickness: ', check_min_outlier('Clump_Thickness'))
print('Number of min outliers in Cell_Size_Uniformity: ', check_min_outlier('Cell_Size_Uniformity'))
print('Number of min outliers in Cell_Shape_Uniformity: ', check_min_outlier('Cell_Shape_Uniformity'))
print('Number of min outliers in Marginal_Adhesion: ', check_min_outlier('Marginal_Adhesion'))
print('Number of min outliers in Single_Epi_Cell_Size: ', check_min_outlier('Single_Epi_Cell_Size'))
print('Number of min outliers in Bare_Nuclei: ', check_min_outlier('Bare_Nuclei'))
print('Number of min outliers in Bland_Chromatin: ', check_min_outlier('Bland_Chromatin'))
print('Number of min outliers in Normal_Nucleoli: ', check_min_outlier('Normal_Nucleoli'))
print('Number of min outliers in Mitoses: ', check_min_outlier('Mitoses'))

Number of min outliers in Clump_Thickness:  3
Number of min outliers in Cell_Size_Uniformity:  8
Number of min outliers in Cell_Shape_Uniformity:  0
Number of min outliers in Marginal_Adhesion:  0
Number of min outliers in Single_Epi_Cell_Size:  0
Number of min outliers in Bare_Nuclei:  35
Number of min outliers in Bland_Chromatin:  0
Number of min outliers in Normal_Nucleoli:  3
Number of min outliers in Mitoses:  0


In [13]:
def check_max_outlier(column_name):
    return len(bc_data[bc_data[column_name]>10])

In [14]:
print('Number of max outliers in Clump_Thickness: ', check_max_outlier('Clump_Thickness'))
print('Number of max outliers in Cell_Size_Uniformity: ', check_max_outlier('Cell_Size_Uniformity'))
print('Number of max outliers in Cell_Shape_Uniformity: ', check_max_outlier('Cell_Shape_Uniformity'))
print('Number of max outliers in Marginal_Adhesion: ', check_max_outlier('Marginal_Adhesion'))
print('Number of max outliers in Single_Epi_Cell_Size: ', check_max_outlier('Single_Epi_Cell_Size'))
print('Number of max outliers in Bare_Nuclei: ', check_max_outlier('Bare_Nuclei'))
print('Number of max outliers in Bland_Chromatin: ', check_max_outlier('Bland_Chromatin'))
print('Number of max outliers in Normal_Nucleoli: ', check_max_outlier('Normal_Nucleoli'))
print('Number of max outliers in Mitoses: ', check_max_outlier('Mitoses'))

Number of max outliers in Clump_Thickness:  1329
Number of max outliers in Cell_Size_Uniformity:  1597
Number of max outliers in Cell_Shape_Uniformity:  1212
Number of max outliers in Marginal_Adhesion:  1343
Number of max outliers in Single_Epi_Cell_Size:  510
Number of max outliers in Bare_Nuclei:  69
Number of max outliers in Bland_Chromatin:  184
Number of max outliers in Normal_Nucleoli:  11
Number of max outliers in Mitoses:  261


Based on the results of these queries, it seems that there is a signficant subset of dataset with max outliers so they can't be removed, however the min outliers are not as significant and can be removed. 

In [15]:
def remove_min_outliers(column_name):
    return (bc_data[bc_data[column_name]>=1])

In [16]:
bc_data = remove_min_outliers('Clump_Thickness')
bc_data = remove_min_outliers('Cell_Size_Uniformity')
bc_data = remove_min_outliers('Cell_Shape_Uniformity')
bc_data = remove_min_outliers('Marginal_Adhesion')
bc_data = remove_min_outliers('Single_Epi_Cell_Size')
bc_data = remove_min_outliers('Bare_Nuclei')
bc_data = remove_min_outliers('Bland_Chromatin')
bc_data = remove_min_outliers('Normal_Nucleoli')
bc_data = remove_min_outliers('Mitoses')

In [17]:
print('Number of min outliers in Clump_Thickness: ', check_min_outlier('Clump_Thickness'))
print('Number of min outliers in Cell_Size_Uniformity: ', check_min_outlier('Cell_Size_Uniformity'))
print('Number of min outliers in Cell_Shape_Uniformity: ', check_min_outlier('Cell_Shape_Uniformity'))
print('Number of min outliers in Marginal_Adhesion: ', check_min_outlier('Marginal_Adhesion'))
print('Number of min outliers in Single_Epi_Cell_Size: ', check_min_outlier('Single_Epi_Cell_Size'))
print('Number of min outliers in Bare_Nuclei: ', check_min_outlier('Bare_Nuclei'))
print('Number of min outliers in Bland_Chromatin: ', check_min_outlier('Bland_Chromatin'))
print('Number of min outliers in Normal_Nucleoli: ', check_min_outlier('Normal_Nucleoli'))
print('Number of min outliers in Mitoses: ', check_min_outlier('Mitoses'))

Number of min outliers in Clump_Thickness:  0
Number of min outliers in Cell_Size_Uniformity:  0
Number of min outliers in Cell_Shape_Uniformity:  0
Number of min outliers in Marginal_Adhesion:  0
Number of min outliers in Single_Epi_Cell_Size:  0
Number of min outliers in Bare_Nuclei:  0
Number of min outliers in Bland_Chromatin:  0
Number of min outliers in Normal_Nucleoli:  0
Number of min outliers in Mitoses:  0


In [18]:
bc_data.describe()

Unnamed: 0,id,Clump_Thickness,Cell_Size_Uniformity,Cell_Shape_Uniformity,Marginal_Adhesion,Single_Epi_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses
count,39317.0,39317.0,39317.0,39317.0,39317.0,39317.0,39317.0,39317.0,39317.0,39317.0
mean,19680.358649,4.393465,3.129445,3.203115,2.826947,3.20955,3.500404,3.40823,2.894197,1.591045
std,11363.605788,2.811865,3.039286,2.975756,2.872436,2.220396,3.620691,2.421583,3.06932,1.705605
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,9839.0,2.244085,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0
50%,19679.0,4.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0
75%,29520.0,5.629741,4.553463,4.965623,3.551612,4.002653,6.338096,4.559898,3.793476,1.0
max,39366.0,13.717991,10.933095,12.604289,11.158505,14.414889,13.160789,12.005376,10.700432,12.044924


In [19]:
datapath = '../data'
save_file(bc_data, 'bc_data_cleaned.csv', datapath)

A file already exists with this name.

Do you want to overwrite? (Y/N)n

Please re-run this cell with a new filename.


In [20]:
check_max_outlier('Cell_Size_Uniformity')/39317

0.04056769336419361