In [1]:
import numpy as np
import pandas as pd

import mlcookbook

from sklearn.datasets import load_boston, load_breast_cancer

boston_data = load_boston()
data = pd.DataFrame(boston_data.data, columns=boston_data.feature_names)
data['Target'] = boston_data.target

breast_cancer_data = load_breast_cancer()
categorical_data = pd.DataFrame(breast_cancer_data.data, columns=breast_cancer_data.feature_names)
categorical_data['Target'] = breast_cancer_data.target

In [4]:
print('Percent Missing')
mlcookbook.eda.percent_missing(data)

Percent Missing Test
No missing values


## Outliers

In [7]:
print('IQR outliers')
mlcookbook.eda.iqr_indices_of_outliers(data.iloc[:, 0])

IQR outliers


(array([367, 371, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 384,
        385, 386, 387, 388, 392, 394, 398, 399, 400, 401, 402, 403, 404,
        405, 406, 407, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418,
        419, 420, 422, 425, 426, 427, 429, 431, 434, 435, 436, 437, 438,
        439, 440, 441, 443, 444, 445, 447, 448, 454, 468, 469, 477, 478,
        479], dtype=int64),)

In [9]:
print('Z score outliers')
mlcookbook.eda.z_score_indices_of_outliers(data.iloc[:, 0])

Z score outliers


(array([380, 398, 404, 405, 410, 414, 418, 427], dtype=int64),)

In [10]:
print('Percentile outliers')
mlcookbook.eda.percentile_indices_of_outliers(data.iloc[:, 0])

Percentile outliers


(array([  0, 284, 285, 380, 405, 418], dtype=int64),)

In [11]:
print('Ellipses outliers')
mlcookbook.eda.ellipses_indices_of_outliers(data)

Ellipses outliers


(array([367, 374, 375, 376, 378, 379, 380, 381, 384, 385, 386, 387, 388,
        392, 394, 398, 400, 401, 403, 404, 405, 406, 407, 409, 410, 411,
        412, 413, 414, 415, 416, 417, 418, 419, 422, 425, 426, 427, 434,
        435, 436, 437, 438, 440, 444, 445, 454, 468, 469, 477, 479],
       dtype=int64),)

In [12]:
print('Isolation Forest outliers')
mlcookbook.eda.isolation_forest_indices_of_outliers(data)

Isolation Forest outliers


(array([102, 141, 142, 143, 144, 145, 146, 147, 148, 152, 153, 155, 156,
        159, 161, 162, 163, 166, 195, 197, 204, 225, 253, 257, 258, 261,
        262, 263, 265, 267, 283, 353, 354, 355, 365, 367, 368, 380, 404,
        405, 410, 412, 414, 418, 419, 424, 426, 427, 438, 489, 490],
       dtype=int64),)

In [13]:
print('One class SVM outliers')
mlcookbook.eda.one_class_svm_indices_of_outliers(data)

One class SVM outliers




(array([  0,   2,   3,   5,   6,   7,   9,  10,  12,  13,  15,  16,  17,
         20,  21,  22,  25,  30,  31,  32,  33,  34,  35,  38,  39,  41,
         42,  43,  44,  48,  49,  50,  51,  53,  54,  55,  56,  63,  68,
         73,  75,  76,  79,  81,  89,  93,  94,  97,  99, 101, 107, 109,
        110, 112, 114, 115, 118, 119, 120, 121, 122, 123, 127, 130, 135,
        136, 138, 140, 141, 143, 144, 148, 151, 153, 154, 155, 161, 164,
        166, 167, 168, 171, 172, 176, 177, 180, 181, 185, 186, 192, 194,
        200, 201, 205, 206, 208, 209, 213, 214, 216, 220, 221, 222, 223,
        225, 226, 228, 229, 230, 234, 239, 242, 244, 247, 249, 255, 258,
        259, 260, 262, 263, 264, 265, 266, 267, 268, 271, 272, 273, 281,
        282, 283, 291, 292, 293, 297, 302, 305, 308, 309, 312, 314, 315,
        322, 323, 329, 332, 333, 337, 341, 343, 344, 346, 347, 349, 350,
        352, 353, 354, 359, 363, 364, 371, 373, 374, 375, 376, 378, 382,
        383, 385, 389, 390, 394, 397, 399, 403, 407

In [14]:
print('Outlier Report')
mlcookbook.eda.outlier_report(data)

Outlier Report
Detecting outliers 

Single feature outlier tests
         IQR Z Score Percentile Multiple       IQR %   Z Score % Percentile %  \
CRIM      66       8          6        8    0.130435   0.0158103    0.0118577   
INDUS      0       0          3        0           0           0   0.00592885   
NOX        0       0          2        0           0           0   0.00395257   
RM        30       8          5        8   0.0592885   0.0158103   0.00988142   
AGE        0       0          3        0           0           0   0.00592885   
DIS        5       5          6        5  0.00988142  0.00988142    0.0118577   
TAX        0       0          1        0           0           0   0.00197628   
PTRATIO   15       0          5        0   0.0296443           0   0.00988142   
B         77      25          3       25    0.152174   0.0494071   0.00592885   
LSTAT      7       5          6        5    0.013834  0.00988142    0.0118577   
Target    40       0          3        0   0



{'Ellipses Envelope': (array([367, 373, 374, 375, 376, 378, 379, 380, 381, 384, 385, 386, 387,
         388, 392, 394, 398, 400, 401, 403, 404, 405, 406, 407, 409, 410,
         411, 412, 413, 414, 415, 416, 417, 418, 419, 422, 425, 426, 427,
         434, 435, 436, 437, 438, 440, 444, 445, 468, 469, 477, 479],
        dtype=int64),),
 'Isolation Forest': (array([  8,  48, 126, 141, 142, 143, 144, 145, 146, 147, 148, 149, 152,
         153, 155, 156, 159, 161, 162, 163, 166, 195, 197, 204, 214, 253,
         257, 258, 262, 267, 268, 283, 354, 355, 367, 368, 373, 374, 380,
         384, 410, 412, 414, 418, 419, 424, 426, 427, 438, 457, 490],
        dtype=int64),),
 'One Class SVM': (array([  0,   2,   3,   5,   6,   7,   9,  10,  12,  13,  15,  16,  17,
          20,  21,  22,  25,  30,  31,  32,  33,  34,  35,  38,  39,  41,
          42,  43,  44,  48,  49,  50,  51,  53,  54,  55,  56,  63,  68,
          73,  75,  76,  79,  81,  89,  93,  94,  97,  99, 101, 107, 109,
         110, 

## Preprocessing

In [15]:
print('PCA')
mlcookbook.process.fit_PCA(data)

PCA
Total variance % explained: 99.47547798971576

Variance % explained by principal component:
0 : 52.40359199401038
1 : 11.90709019180578
2 : 9.14680592105265
3 : 6.065593317676156
4 : 5.329659989504557
5 : 4.189201829407195
6 : 3.0784920531932687
7 : 2.4340904719387266
8 : 1.334769086567469
9 : 1.192136172161672
10 : 0.859088493451943
11 : 0.790408698079883
12 : 0.7445497708660539


array([[-0.56709519,  0.15613447,  0.01698635, ..., -0.13715014,
        -0.0358962 ,  0.04556913],
       [-0.42633164,  0.25114179, -0.18768805, ...,  0.04660025,
         0.00684545,  0.0536297 ],
       [-0.58068173,  0.143294  ,  0.00128317, ...,  0.06890589,
        -0.00825431, -0.0158398 ],
       ...,
       [-0.18116716,  0.39693602, -0.20295807, ..., -0.02189769,
        -0.05947226,  0.07618167],
       [-0.17393509,  0.38548429, -0.23495641, ..., -0.01993153,
        -0.04965579,  0.077509  ],
       [-0.14446137,  0.35112255, -0.37054574, ..., -0.05930353,
        -0.02947907,  0.14699223]])

In [19]:
print('Oversampling binary label')
mlcookbook.process.oversample_binary_label(categorical_data, 'Target')  # TODO: Fix this one

Oversampling Test
Initial number of observations in each class:
1    357
0    212
Name: Target, dtype: int64

Oversampled number of observations in each class:
1    357
0    212
Name: Target, dtype: int64


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,Target
19,13.540,14.36,87.46,566.3,0.09779,0.08129,0.066640,0.047810,0.1885,0.05766,...,19.26,99.70,711.2,0.14400,0.17730,0.239000,0.12880,0.2977,0.07259,1
20,13.080,15.71,85.63,520.0,0.10750,0.12700,0.045680,0.031100,0.1967,0.06811,...,20.49,96.09,630.5,0.13120,0.27760,0.189000,0.07283,0.3184,0.08183,1
21,9.504,12.44,60.34,273.9,0.10240,0.06492,0.029560,0.020760,0.1815,0.06905,...,15.66,65.13,314.9,0.13240,0.11480,0.088670,0.06227,0.2450,0.07773,1
37,13.030,18.42,82.61,523.8,0.08983,0.03766,0.025620,0.029230,0.1467,0.05863,...,22.81,84.46,545.9,0.09701,0.04619,0.048330,0.05013,0.1987,0.06169,1
46,8.196,16.84,51.71,201.9,0.08600,0.05943,0.015880,0.005917,0.1769,0.06503,...,21.96,57.26,242.2,0.12970,0.13570,0.068800,0.02564,0.3105,0.07409,1
48,12.050,14.63,78.04,449.3,0.10310,0.09092,0.065920,0.027490,0.1675,0.06043,...,20.70,89.88,582.6,0.14940,0.21560,0.305000,0.06548,0.2747,0.08301,1
49,13.490,22.30,86.91,561.0,0.08752,0.07698,0.047510,0.033840,0.1809,0.05718,...,31.82,99.00,698.8,0.11620,0.17110,0.228200,0.12820,0.2871,0.06917,1
50,11.760,21.60,74.72,427.9,0.08637,0.04966,0.016570,0.011150,0.1495,0.05888,...,25.72,82.98,516.5,0.10850,0.08615,0.055230,0.03715,0.2433,0.06563,1
51,13.640,16.34,87.21,571.8,0.07685,0.06059,0.018570,0.017230,0.1353,0.05953,...,23.19,96.08,656.7,0.10890,0.15820,0.105000,0.08586,0.2346,0.08025,1
52,11.940,18.24,75.71,437.6,0.08261,0.04751,0.019720,0.013490,0.1868,0.06110,...,21.33,83.67,527.2,0.11440,0.08906,0.092030,0.06296,0.2785,0.07408,1


In [21]:
print('Oversampling with SMOTE Test')
# TODO: Fix this
# mlcookbook.process.fit_PCA(categorical_data.drop('Target', axis=1), categorical_data['Target'])

Oversampling with SMOTE Test


In [23]:
print('Target Mean Encoding')
# TODO: Fix this
mlcookbook.process.target_encode(data.drop('Target', axis=1), data.drop('Target', axis=1), data['Target'])

Target Mean Encoding


AttributeError: 'DataFrame' object has no attribute 'name'