## Wisconsin Diagnostic Breast Cancer (WDBC) Dataset

In [1]:
# load modules
import os

import pandas as pd

from src import read
from src import preprocessing
from src import eda
from src import modelling
from src import experimental

In [2]:
# define data folder path and data file names
# The raw-data folder can be overridden through the WDBC_RAW_DATA_DIR
# environment variable so the notebook is not tied to one machine; when the
# variable is unset the original hard-coded location is used unchanged.
raw_data_folder = os.environ.get(
    "WDBC_RAW_DATA_DIR",
    r"C:\Users\35799\Desktop\cookiecutter-analytical-project\biolizard-internship-marios\data\raw",
)
datafile_name = r"wdbc.data"

# load data
# NOTE(review): the info report printed further down lists 568 entries, while
# the published WDBC dataset has 569 instances -- presumably the first record
# is being consumed as a header row; verify how read.data_load parses the file.
data_df = read.data_load(folder=raw_data_folder, filename=datafile_name, delimiter=',')

# create column labels: "ID", "Diagnosis", then F1..Fk for the k remaining columns
column_labels = ["ID", "Diagnosis"] + [
    "F{}".format(position) for position in range(1, data_df.shape[1] - 1)
]
data_df.columns = column_labels


### Data Info

In [3]:
# preview the first rows of the frame after the column labels were assigned
data_df.head()

Unnamed: 0,ID,Diagnosis,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28,F29,F30
0,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
1,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
2,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
3,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
4,843786,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,0.07613,0.3345,0.8902,2.217,27.19,0.00751,0.03345,0.03672,0.01137,0.02165,0.005082,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244


In [4]:
# data info report
# The call prints the report shown below and returns three values, which later
# cells pass on as the identifier / categorical / continuous feature groups.
# NOTE(review): threshold=20 is presumably the distinct-value cutoff that
# separates categorical from continuous columns -- confirm in src.read.
(
    id_features,
    categorical_features,
    continuous_features,
) = read.data_info(data_df, threshold=20)

DIMENSIONS:
----------------------------------------------------------------------------------------------------
Entries: 568
Features: 32
----------------------------------------------------------------------------------------------------


CATEGORICAL FEATURES:
----------------------------------------------------------------------------------------------------
Features    Data Type    Categories & Counts
----------  -----------  ---------------------
Diagnosis   object       {'B': 357, 'M': 211}
----------------------------------------------------------------------------------------------------


CONTINUOUS FEATURES:
----------------------------------------------------------------------------------------------------
Features    Data Type      Count     Mean      Std      Min     25th    Median      75th       Max
----------  -----------  -------  -------  -------  -------  -------  --------  --------  --------
F1          float64          568   14.12     3.52     6.981   11.698    13

### Data Preprocessing

In [5]:
# data split
# Eight values are returned; the last two are not needed here and are discarded.
# NOTE(review): method="tt" presumably selects a plain train/test split -- confirm
# in src.preprocessing.
X, y, X_train, y_train, X_test, y_test, _, _ = preprocessing.data_split(
    data_df,
    target="Diagnosis",
    method="tt",
    train_proportion=0.8,
    stratify=True,
    random_state=0,
)

In [6]:
# treat missing values
# Takes the split features/targets plus the three feature groups and returns
# one train frame and one test frame.
train_df, test_df = preprocessing.treat_nan(
    X_train,
    y_train,
    X_test,
    y_test,
    id_features,
    categorical_features,
    continuous_features,
    target="Diagnosis",
    drop_nan_rows=False,
    impute_cutoff=0.5,
    categorical_imputer="mode",
    continuous_imputer="median",
)

In [7]:
# treat outliers
# train_df / test_df are rebound to the call's result, so re-running this cell
# feeds the already-treated frames back in (the cell is not idempotent).
# NOTE(review): method="if" matches the Isolation Forest named in the printed
# output; no seed is passed in this call -- confirm whether treat_outliers
# fixes one internally.
train_df, test_df = preprocessing.treat_outliers(
    train_df,
    test_df,
    id_features,
    categorical_features,
    continuous_features,
    target="Diagnosis",
    method="if",
    outlier_fraction=0.1,
)

The following entries are probable outliers as identified by the Isolation Forest technique (train set):


Unnamed: 0,ID,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28,F29,F30,Diagnosis
115,864726,8.95,15.76,58.74,245.2,0.09462,0.1243,0.09263,0.02308,0.1305,0.07163,0.3132,0.9789,3.28,16.94,0.01835,0.0676,0.09263,0.02308,0.02384,0.005601,9.414,17.07,63.34,270.0,0.1179,0.1879,0.1544,0.03846,0.1652,0.07722,B
563,926424,21.56,22.39,142.0,1479.0,0.111,0.1159,0.2439,0.1389,0.1726,0.05623,1.176,1.256,7.673,158.7,0.0103,0.02891,0.05198,0.02454,0.01114,0.004239,25.45,26.4,166.1,2027.0,0.141,0.2113,0.4107,0.2216,0.206,0.07115,M
537,921092,7.729,25.49,47.98,178.8,0.08098,0.04878,0.0,0.0,0.187,0.07285,0.3777,1.462,2.492,19.14,0.01266,0.009692,0.0,0.0,0.02882,0.006872,9.077,30.92,57.17,248.0,0.1256,0.0834,0.0,0.0,0.3058,0.09938,B
351,899987,25.73,17.46,174.2,2010.0,0.1149,0.2363,0.3368,0.1913,0.1956,0.06121,0.9948,0.8509,7.222,153.1,0.006369,0.04243,0.04266,0.01508,0.02335,0.003385,33.13,23.58,229.3,3234.0,0.153,0.5937,0.6451,0.2756,0.369,0.08815,M
350,899667,15.75,19.22,107.1,758.6,0.1243,0.2364,0.2914,0.1242,0.2375,0.07603,0.5204,1.324,3.477,51.22,0.009329,0.06559,0.09953,0.02283,0.05543,0.00733,17.36,24.17,119.4,915.3,0.155,0.5046,0.6872,0.2135,0.4245,0.105,M
107,86355,22.27,19.67,152.8,1509.0,0.1326,0.2768,0.4264,0.1823,0.2556,0.07039,1.215,1.545,10.05,170.0,0.006515,0.08668,0.104,0.0248,0.03112,0.005037,28.4,28.01,206.8,2360.0,0.1701,0.6997,0.9608,0.291,0.4055,0.09789,M
271,8910988,21.75,20.99,147.3,1491.0,0.09401,0.1961,0.2195,0.1088,0.1721,0.06194,1.167,1.352,8.867,156.8,0.005687,0.0496,0.06329,0.01561,0.01924,0.004614,28.19,28.18,195.9,2384.0,0.1272,0.4725,0.5807,0.1841,0.2833,0.08858,M
502,915143,23.09,19.83,152.1,1682.0,0.09342,0.1275,0.1676,0.1003,0.1505,0.05484,1.291,0.7452,9.635,180.2,0.005753,0.03356,0.03976,0.02156,0.02201,0.002897,30.79,23.87,211.5,2782.0,0.1199,0.3625,0.3794,0.2264,0.2908,0.07277,M
317,894329,9.042,18.9,60.07,244.5,0.09968,0.1972,0.1975,0.04908,0.233,0.08743,0.4653,1.911,3.769,24.2,0.009845,0.0659,0.1027,0.02527,0.03491,0.007877,10.06,23.4,68.62,297.1,0.1221,0.3748,0.4609,0.1145,0.3135,0.1055,B
2,84348301,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,M


The following entries are probable outliers as identified by the Isolation Forest technique (test set):


Unnamed: 0,ID,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28,F29,F30,Diagnosis
76,8610637,18.05,16.15,120.2,1006.0,0.1065,0.2146,0.1684,0.108,0.2152,0.06673,0.9806,0.5505,6.311,134.8,0.00794,0.05839,0.04658,0.0207,0.02591,0.007054,22.39,18.91,150.1,1610.0,0.1478,0.5634,0.3786,0.2102,0.3751,0.1108,M
218,88119002,19.53,32.47,128.0,1223.0,0.0842,0.113,0.1145,0.06637,0.1428,0.05313,0.7392,1.321,4.722,109.9,0.005539,0.02644,0.02664,0.01078,0.01332,0.002256,27.9,45.41,180.2,2477.0,0.1408,0.4097,0.3995,0.1625,0.2713,0.07568,M
137,868826,14.95,17.57,96.85,678.1,0.1167,0.1305,0.1539,0.08624,0.1957,0.06216,1.296,1.452,8.419,101.9,0.01,0.0348,0.06577,0.02801,0.05168,0.002887,18.55,21.43,121.4,971.4,0.1411,0.2164,0.3355,0.1667,0.3414,0.07147,M
82,8611792,19.1,26.29,129.1,1132.0,0.1215,0.1791,0.1937,0.1469,0.1634,0.07224,0.519,2.91,5.801,67.1,0.007545,0.0605,0.02134,0.01843,0.03056,0.01039,20.33,32.72,141.3,1298.0,0.1392,0.2817,0.2432,0.1841,0.2311,0.09203,M
378,9013838,11.08,18.83,73.3,361.6,0.1216,0.2154,0.1689,0.06367,0.2196,0.0795,0.2114,1.027,1.719,13.99,0.007405,0.04549,0.04588,0.01339,0.01738,0.004435,13.24,32.82,91.76,508.1,0.2184,0.9379,0.8402,0.2524,0.4154,0.1403,M
504,915276,9.676,13.14,64.12,272.5,0.1255,0.2204,0.1188,0.07038,0.2057,0.09575,0.2744,1.39,1.787,17.67,0.02177,0.04888,0.05189,0.0145,0.02632,0.01148,10.6,18.04,69.47,328.1,0.2006,0.3663,0.2913,0.1075,0.2848,0.1364,B
77,8610862,20.18,23.97,143.7,1245.0,0.1286,0.3454,0.3754,0.1604,0.2906,0.08142,0.9317,1.885,8.649,116.4,0.01038,0.06835,0.1091,0.02593,0.07895,0.005987,23.37,31.72,170.3,1623.0,0.1639,0.6164,0.7681,0.2508,0.544,0.09964,M
338,89812,23.51,24.27,155.1,1747.0,0.1069,0.1283,0.2308,0.141,0.1797,0.05506,1.009,0.9245,6.462,164.1,0.006292,0.01971,0.03582,0.01301,0.01479,0.003118,30.67,30.73,202.4,2906.0,0.1515,0.2678,0.4819,0.2089,0.2593,0.07738,M
235,88299702,23.21,26.97,153.5,1670.0,0.09509,0.1682,0.195,0.1237,0.1909,0.06309,1.058,0.9635,7.247,155.8,0.006428,0.02863,0.04497,0.01716,0.0159,0.003053,31.01,34.51,206.0,2944.0,0.1481,0.4126,0.582,0.2593,0.3103,0.08677,M
8,84501001,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,0.203,0.08243,0.2976,1.599,2.039,23.94,0.007149,0.07217,0.07743,0.01432,0.01789,0.01008,15.09,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075,M


In [8]:
# target feature imbalance check
# Prints the per-class counts and percentages of the training target.
preprocessing.target_balance_check(
    train_df,
    target="Diagnosis",
    imbalance_fraction=0.3,
    graphic=False,
)

Target Feature Levels      Counts    Percentages (%)
-----------------------  --------  -----------------
B                             269               65.9
M                             139               34.1


The tagret feature levels are unbalanced.


In [9]:
# sampling techniques to treat target feature imbalance

# oversampling ratios must be greater or equal to 1 and are based on the class with the most counts
oversampling_ratios = {
    "B": 1,
    "M": 1,
}
# the result is stored under a new name, so train_df itself is left as it was
train_df_oversampled = preprocessing.sampler(
    train_df,
    target="Diagnosis",
    method="over",
    sampling_ratios=oversampling_ratios,
    random_state=0,
    graphic=False,
)

Balanced target feature (oversampling):
Target Feature Levels      Counts    Percentages (%)
-----------------------  --------  -----------------
B                             269                 50
M                             269                 50


In [10]:
# undersampling ratios must be less or equal to 1 and are based on the class with the least counts
undersampling_ratios = {
    "B": 1,
    "M": 1,
}
# the result is stored under a new name, so train_df itself is left as it was
train_df_undersampled = preprocessing.sampler(
    train_df,
    target="Diagnosis",
    method="under",
    sampling_ratios=undersampling_ratios,
    random_state=0,
    graphic=False,
)

Balanced target feature (undersampling):
Target Feature Levels      Counts    Percentages (%)
-----------------------  --------  -----------------
B                             139                 50
M                             139                 50


### Data Exploration

In [11]:
# data sample for examples
# Positional slice: keeps columns at positions 1..10 of train_df and skips
# position 0; in the output shown for this cell these are F1..F10.
sample_df = train_df.iloc[:, 1:11]
sample_df.head()

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10
380,11.04,14.93,70.67,372.7,0.07987,0.07079,0.03546,0.02074,0.2003,0.06246
143,10.75,14.97,68.26,355.3,0.07793,0.05139,0.02251,0.007875,0.1399,0.05688
135,11.71,16.67,74.72,423.6,0.1051,0.06095,0.03592,0.026,0.1339,0.05945
34,16.74,21.59,110.1,869.5,0.0961,0.1336,0.1348,0.06018,0.1896,0.05656
403,12.34,14.95,78.29,469.1,0.08682,0.04571,0.02109,0.02054,0.1571,0.05708


In [12]:
# correlation matrix
# Pearson correlations of the ten sample features, printed as a table.
pearson_correlation_matrix = eda.correlations(
    sample_df,
    type="pearson",
    printout="matrix",
)

Pearson Correlation Matrix:


Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10
F1,1.0,0.267,0.998,0.993,0.104,0.493,0.678,0.795,0.114,-0.387
F2,0.267,1.0,0.272,0.27,-0.032,0.197,0.275,0.241,0.068,-0.096
F3,0.998,0.272,1.0,0.991,0.141,0.543,0.717,0.824,0.142,-0.343
F4,0.993,0.27,0.991,1.0,0.103,0.481,0.676,0.795,0.12,-0.375
F5,0.104,-0.032,0.141,0.103,1.0,0.663,0.511,0.532,0.522,0.607
F6,0.493,0.197,0.543,0.481,0.663,1.0,0.889,0.83,0.499,0.478
F7,0.678,0.275,0.717,0.676,0.511,0.889,1.0,0.937,0.398,0.221
F8,0.795,0.241,0.824,0.795,0.532,0.83,0.937,1.0,0.397,0.094
F9,0.114,0.068,0.142,0.12,0.522,0.499,0.398,0.397,1.0,0.363
F10,-0.387,-0.096,-0.343,-0.375,0.607,0.478,0.221,0.094,0.363,1.0


In [13]:
# correlation heatmap
# Same helper as the previous cell, switched to Spearman and heatmap output.
spearman_correlation_matrix = eda.correlations(
    sample_df,
    type="spearman",
    printout="heatmap",
)

#### Boxplots

In [14]:
# interactive boxplot: one continuous feature at a time, chosen from the
# dropdown widget, split by Diagnosis
eda.box_plot(
    train_df,
    features=continuous_features,
    stratify_var="Diagnosis",
    group_var="",
    title="Boxplot of ",
    xtitle="Diagnosis",
    ytitle="Feature value",
    widget_description="Select: ",
    stratify=True,
    group=False,
)

VBox(children=(HBox(children=(Dropdown(description='Select: ', options=('F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F…

In [15]:
# boxplot grid for the sample features (the column names of sample_df),
# taken from train_df so the Diagnosis column is available for stratifying
eda.box_subplots(
    train_df,
    features=list(sample_df.columns),
    stratify_var="Diagnosis",
    columns=5,
    width=250,
    height=250,
    stratify=True,
)

#### Histograms

In [16]:
# interactive histogram: one sample feature at a time, chosen from the
# dropdown widget, grouped by Diagnosis
eda.hist_plot(
    train_df,
    features=list(sample_df.columns),
    group_var="Diagnosis",
    title="Histogram of ",
    xtitle="Bins",
    ytitle="Count",
    widget_description="Select: ",
    group=True,
)

VBox(children=(VBox(children=(Dropdown(description='Select: ', options=('F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F…

In [17]:
# histogram grid over every column of sample_df
eda.hist_subplots(
    sample_df,
    columns=5,
    width=250,
    height=250,
)

#### Distribution plots

In [18]:
# distribution plot of the first sample feature only; the 0:1 slice keeps it
# as a one-column DataFrame instead of collapsing it to a Series
eda.dist_plot(sample_df.iloc[:, 0:1], title="Distribution Plot:")

In [19]:
# distribution-plot grid over every column of sample_df
eda.dist_subplots(
    sample_df,
    columns=5,
    width=250,
    height=250,
)

#### Dimensionality Reduction

##### PCA

In [20]:
# Principal Component Analysis
# variance plot computed on the training frame only
eda.pca_variance_plot(
    train_df,
    identifier=id_features,
    categorical=categorical_features,
    continuous=continuous_features,
    target="Diagnosis",
)

In [21]:
# PCA, 2 components, 2-D plot; of the four returned values only the second
# and fourth are kept (stored as the train / test frames)
_, pca2d_train_df, _, pca2d_test_df = eda.dimensionality_reduction(
    train_df,
    test_df,
    identifier=id_features,
    categorical=categorical_features,
    continuous=continuous_features,
    target="Diagnosis",
    method="pca",
    plot_type="2d",
    components=2,
)

In [22]:
# PCA, 3 components, 3-D plot; of the four returned values only the second
# and fourth are kept (stored as the train / test frames)
_, pca3d_train_df, _, pca3d_test_df = eda.dimensionality_reduction(
    train_df,
    test_df,
    identifier=id_features,
    categorical=categorical_features,
    continuous=continuous_features,
    target="Diagnosis",
    method="pca",
    plot_type="3d",
    components=3,
)

In [23]:
# PCA, 5 components, "multi" plot type; of the four returned values only the
# second and fourth are kept (stored as the train / test frames)
_, pca5d_train_df, _, pca5d_test_df = eda.dimensionality_reduction(
    train_df,
    test_df,
    identifier=id_features,
    categorical=categorical_features,
    continuous=continuous_features,
    target="Diagnosis",
    method="pca",
    plot_type="multi",
    components=5,
)

##### MDS

In [24]:
# MDS, 2 components, 2-D plot; second and fourth return values are kept
# NOTE(review): no seed is passed in this call -- confirm whether the helper
# fixes one internally, otherwise the embedding may differ between runs.
_, mds2d_train_df, _, mds2d_test_df = eda.dimensionality_reduction(
    train_df,
    test_df,
    identifier=id_features,
    categorical=categorical_features,
    continuous=continuous_features,
    target="Diagnosis",
    method="mds",
    plot_type="2d",
    components=2,
)

In [25]:
# MDS, 3 components, 3-D plot; second and fourth return values are kept
# NOTE(review): no seed is passed in this call -- confirm whether the helper
# fixes one internally, otherwise the embedding may differ between runs.
_, mds3d_train_df, _, mds3d_test_df = eda.dimensionality_reduction(
    train_df,
    test_df,
    identifier=id_features,
    categorical=categorical_features,
    continuous=continuous_features,
    target="Diagnosis",
    method="mds",
    plot_type="3d",
    components=3,
)

In [26]:
# MDS, 5 components, "multi" plot type; second and fourth return values are kept
# NOTE(review): no seed is passed in this call -- confirm whether the helper
# fixes one internally, otherwise the embedding may differ between runs.
_, mds5d_train_df, _, mds5d_test_df = eda.dimensionality_reduction(
    train_df,
    test_df,
    identifier=id_features,
    categorical=categorical_features,
    continuous=continuous_features,
    target="Diagnosis",
    method="mds",
    plot_type="multi",
    components=5,
)

##### t-SNE

In [29]:
# t-SNE, 2 components, 2-D plot, perplexity 30; second and fourth return
# values are kept
# NOTE(review): no seed is passed in this call -- confirm whether the helper
# fixes one internally, otherwise the embedding may differ between runs.
_, tsne2d_train_df, _, tsne2d_test_df = eda.dimensionality_reduction(
    train_df,
    test_df,
    identifier=id_features,
    categorical=categorical_features,
    continuous=continuous_features,
    target="Diagnosis",
    method="tsne",
    plot_type="2d",
    components=2,
    perplexity=30,
)

In [30]:
# t-SNE, 3 components, 3-D plot, perplexity 30; second and fourth return
# values are kept
# NOTE(review): no seed is passed in this call -- confirm whether the helper
# fixes one internally, otherwise the embedding may differ between runs.
_, tsne3d_train_df, _, tsne3d_test_df = eda.dimensionality_reduction(
    train_df,
    test_df,
    identifier=id_features,
    categorical=categorical_features,
    continuous=continuous_features,
    target="Diagnosis",
    method="tsne",
    plot_type="3d",
    components=3,
    perplexity=30,
)

##### UMAP

In [34]:
# UMAP, 2 components, 2-D plot; second and fourth return values are kept
# NOTE(review): "umpa" in the variable names looks like a transposition of
# "umap"; the names are left untouched because other cells may refer to them.
_, umpa2d_train_df, _, umpa2d_test_df = eda.dimensionality_reduction(
    train_df,
    test_df,
    identifier=id_features,
    categorical=categorical_features,
    continuous=continuous_features,
    target="Diagnosis",
    method="umap",
    plot_type="2d",
    components=2,
    neighbors=100,
    min_distance=0.1,
    metric="euclidean",
)

In [35]:
# UMAP, 3 components, 3-D plot; second and fourth return values are kept
# NOTE(review): "umpa" in the variable names looks like a transposition of
# "umap"; the names are left untouched because other cells may refer to them.
_, umpa3d_train_df, _, umpa3d_test_df = eda.dimensionality_reduction(
    train_df,
    test_df,
    identifier=id_features,
    categorical=categorical_features,
    continuous=continuous_features,
    target="Diagnosis",
    method="umap",
    plot_type="3d",
    components=3,
    neighbors=100,
    min_distance=0.1,
    metric="euclidean",
)

In [36]:
# UMAP, 5 components, "multi" plot type; second and fourth return values are kept
# NOTE(review): "umpa" in the variable names looks like a transposition of
# "umap"; the names are left untouched because other cells may refer to them.
_, umpa5d_train_df, _, umpa5d_test_df = eda.dimensionality_reduction(
    train_df,
    test_df,
    identifier=id_features,
    categorical=categorical_features,
    continuous=continuous_features,
    target="Diagnosis",
    method="umap",
    plot_type="multi",
    components=5,
    neighbors=100,
    min_distance=0.1,
    metric="euclidean",
)