# 1) To set up your own data cleaning pipeline

In [3]:
#Load the datasets
def read_dataset(name): #when only one dataset is provided as input
    import pandas as pd
    if name == "gpsa":
        df = pd.read_csv('../datasets/googleplaystore.csv', sep=',', encoding ='ISO-8859-1')
    elif name == "gpsu":
        df = pd.read_csv('../datasets/googleplaystore_reviews.csv', sep=',',encoding = 'ISO-8859-1')  
    elif name == "titanic":
        df = pd.read_csv('../datasets/titanic/titanic_train.csv', sep=',', encoding ='ISO-8859-1')
    elif name == "house":
        df = pd.read_csv('../datasets/house/house_train.csv', sep=',', encoding ='ISO-8859-1')
    else: 
        raise ValueError('Invalid dataset name')               
    return df


In [4]:
read_dataset("house").head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## - Loading your data

In [5]:
import learn2clean.loading.reader as rd 
import learn2clean.normalization.normalizer as nl 
import pandas as pd

# executing profiling function for one dataset as input
rd.profile_summary(read_dataset('house'), plot=False)


Profiling datasets
        Attribute     Type  Num. Missing Values  Num. Unique Values           Sknewness   Kurtosis
0              Id    int64                  0.0              1460.0                   0       -1.2
1      MSSubClass    int64                  0.0                15.0             1.40621    1.57067
2     LotFrontage  float64                259.0               111.0  2.1608659947055435    17.3753
3         LotArea    int64                  0.0              1073.0             12.1951    202.544
4     OverallQual    int64                  0.0                10.0            0.216721  0.0918565
5     OverallCond    int64                  0.0                 9.0            0.692355    1.09852
6       YearBuilt    int64                  0.0               112.0           -0.612831  -0.442155
7    YearRemodAdd    int64                  0.0                61.0           -0.503044     -1.272
8      MasVnrArea  float64                  8.0               328.0  2.6663261001607443   

In [6]:

read_dataset('house')['SalePrice'].head() # the target variable is numerical 


0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

In [17]:

# no encoding of the target variable
d_not_enc = rd.Reader(sep=',',verbose=True, encoding=False) 

# when you have two datasets as inputs: train and test datasets
house  = ["../datasets/house/house_train.csv", "../datasets/house/test.csv"]
house_not_encoded = d_not_enc.train_test_split(house, 'SalePrice')
house_not_encoded['train'].head()
house_not_encoded['test'].head()


Reading csv : house_train.csv ...
Reading data ...
CPU time: 2.6323201656341553 seconds
Profiling datasets
        Attribute     Type  Num. Missing Values  Num. Unique Values           Sknewness   Kurtosis
0              Id  float64                  0.0              1460.0                   0       -1.2
1      MSSubClass  float64                  0.0                15.0             1.40621    1.57067
2     LotFrontage  float64                259.0               111.0  2.1608659947055435    17.3753
3         LotArea  float64                  0.0              1073.0             12.1951    202.544
4     OverallQual  float64                  0.0                10.0            0.216721  0.0918565
5     OverallCond  float64                  0.0                 9.0            0.692355    1.09852
6       YearBuilt  float64                  0.0               112.0           -0.612831  -0.442155
7    YearRemodAdd  float64                  0.0                61.0           -0.503044     -1.272
8

Profiling datasets
        Attribute     Type  Num. Missing Values  Num. Unique Values             Sknewness   Kurtosis
0              Id  float64                  0.0              1459.0                     0       -1.2
1      MSSubClass  float64                  0.0                16.0                1.3453    1.34024
2     LotFrontage  float64                227.0               116.0    0.6611148912627807    2.57186
3         LotArea  float64                  0.0              1106.0               3.11201    20.6714
4     OverallQual  float64                  0.0                10.0               0.18101  0.0334023
5     OverallCond  float64                  0.0                 9.0              0.448703    1.84137
6       YearBuilt  float64                  0.0               106.0             -0.587052  -0.581447
7    YearRemodAdd  float64                  0.0                61.0             -0.399495   -1.41186
8      MasVnrArea  float64                 15.0               304.0     

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,SaleCondition,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
0,896.0,0.0,0.0,,2.0,1Fam,TA,No,468.0,144.0,...,Normal,120.0,Pave,5.0,882.0,AllPub,140.0,1961.0,1961.0,2010.0
1,1329.0,0.0,0.0,,3.0,1Fam,TA,No,923.0,0.0,...,Normal,0.0,Pave,6.0,1329.0,AllPub,393.0,1958.0,1958.0,2010.0
2,928.0,701.0,0.0,,3.0,1Fam,TA,No,791.0,0.0,...,Normal,0.0,Pave,6.0,928.0,AllPub,212.0,1997.0,1998.0,2010.0
3,926.0,678.0,0.0,,3.0,1Fam,TA,No,602.0,0.0,...,Normal,0.0,Pave,7.0,926.0,AllPub,360.0,1998.0,1998.0,2010.0
4,1280.0,0.0,0.0,,2.0,TwnhsE,TA,No,263.0,0.0,...,Normal,144.0,Pave,5.0,1280.0,AllPub,0.0,1992.0,1992.0,2010.0


In [8]:

# otherwise for only one dataset as input, 
# train_test_split function will split the input dataset into train/test dataset
house_train_only  = ["../datasets/house/house_train.csv"]
house_train_only_not_encoded = d_not_enc.train_test_split(house_train_only, 'SalePrice')
house_train_only_not_encoded['train'].head() 


Reading csv : house_train.csv ...
Reading data ...
CPU time: 0.19720816612243652 seconds
Profiling datasets
        Attribute     Type  Num. Missing Values  Num. Unique Values           Sknewness   Kurtosis
0              Id  float64                  0.0              1460.0                   0       -1.2
1      MSSubClass  float64                  0.0                15.0             1.40621    1.57067
2     LotFrontage  float64                259.0               111.0  2.1608659947055435    17.3753
3         LotArea  float64                  0.0              1073.0             12.1951    202.544
4     OverallQual  float64                  0.0                10.0            0.216721  0.0918565
5     OverallCond  float64                  0.0                 9.0            0.692355    1.09852
6       YearBuilt  float64                  0.0               112.0           -0.612831  -0.442155
7    YearRemodAdd  float64                  0.0                61.0           -0.503044     -1.272


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
704,705.0,20.0,RL,70.0,8400.0,Pave,,Reg,Lvl,AllPub,...,0.0,,,,0.0,5.0,2010.0,WD,Normal,213000.0
342,343.0,90.0,RL,,8544.0,Pave,,Reg,Lvl,AllPub,...,0.0,,,,0.0,5.0,2006.0,WD,Normal,87500.0
1353,1354.0,50.0,RL,56.0,14720.0,Pave,,IR1,Lvl,AllPub,...,0.0,,,,0.0,3.0,2010.0,WD,Normal,410000.0
974,975.0,70.0,RL,60.0,11414.0,Pave,,IR1,Lvl,AllPub,...,0.0,,GdPrv,,0.0,10.0,2009.0,WD,Normal,167500.0
319,320.0,80.0,RL,,14115.0,Pave,,Reg,Lvl,AllPub,...,0.0,,,,0.0,6.0,2009.0,WD,Normal,187500.0


## - Normalize your data

In [16]:
# >> Examples of normalization
# The choice for the normalizer : 'ZS', 'MM','DS' or 'Log10'
#    Available strategies=
#       - 'ZS' z-score normalization
#       - 'MM' MinMax scaling
#       - 'DS' decimal scaling
#       - 'Log10 log10 scaling

import learn2clean.normalization.normalizer as nl 

# MM normalization with exclude = None, all numeric variables will be normalized
n1= nl.Normalizer(house_not_encoded.copy(),strategy='MM',exclude=None, verbose =False)
print('"LotArea" original variable\n',house_not_encoded['train']['LotArea'].head())
print('"LotArea" normalized variable\n', n1.transform()['train']['LotArea'].head())

#n1.transform()['train']
n1.transform()['target'].head()

"LotArea" original variable
 0     8450.0
1     9600.0
2    11250.0
3     9550.0
4    14260.0
Name: LotArea, dtype: float64
>>Normalization 
* For train dataset
... train dataset
* For test dataset
... test dataset
Normalization done -- CPU time: 0.0735011100769043 seconds

"LotArea" normalized variable
 0    0.033420
1    0.038795
2    0.046507
3    0.038561
4    0.060576
Name: LotArea, dtype: float64
>>Normalization 
* For train dataset
... train dataset
* For test dataset
... test dataset
Normalization done -- CPU time: 0.06507110595703125 seconds



0    208500.0
1    181500.0
2    223500.0
3    140000.0
4    250000.0
Name: SalePrice, dtype: float64

In [19]:

#ZS normalization
n3= nl.Normalizer(house_not_encoded.copy(),strategy='ZS',exclude=None, verbose = False)
print('ZS normalized variables\n',n3.transform()['train'].head())

#DS scaling
n4= nl.Normalizer(house_not_encoded.copy(),strategy='DS',exclude=None, verbose = False)
print('DS normalized variables\n',n4.transform()['train'].head())

#Log10 scaling
n5= nl.Normalizer(house_not_encoded.copy(),strategy='Log10',exclude=None, verbose = False)
#print('Log10 normalized variables\n',n5.transform()['train'].head())
print('Log10 normalized variables none excluded in train dataset\n',n5.transform()['train']['LotArea'].head())
print('Log10 normalized variables none excluded in test dataset\n',n5.transform()['test']['LotArea'].head())

#Log10 scaling  excluding target variable 'SalePrice'
n6= nl.Normalizer(house_not_encoded.copy(),strategy='Log10',exclude='LotArea',verbose = False)

print('Log10 normalized variables LotArea excluded in train dataset \n',n6.transform()['train']['LotArea'].head())
print('Log10 normalized variables LotArea excluded  in test dataset \n',n6.transform()['test']['LotArea'].head())

n7= nl.Normalizer(house_not_encoded.copy(),strategy='Log10',exclude='Fare', verbose = False)
print('Log10 normalized variables none excluded\n', n5.transform()['train']['YearBuilt'].head())
print("Log10 normalized variables 'YearBuilt' excluded\n")
n7.transform()['train']['YearBuilt'].head()
# here YearBuilt is not excluded from normalization!



>>Normalization 
* For train dataset
... train dataset
* For test dataset
... test dataset
Normalization done -- CPU time: 0.11875510215759277 seconds

ZS normalized variables
    1stFlrSF  2ndFlrSF  3SsnPorch  BedroomAbvGr  BsmtFinSF1  BsmtFinSF2  \
0 -0.793162  1.161454  -0.116299      0.163723    0.575228   -0.288554   
1  0.257052 -0.794891  -0.116299      0.163723    1.171591   -0.288554   
2 -0.627611  1.188943  -0.116299      0.163723    0.092875   -0.288554   
3 -0.521555  0.936955  -0.116299      0.163723   -0.499103   -0.288554   
4 -0.045596  1.617323  -0.116299      1.389547    0.463410   -0.288554   

   BsmtFullBath  BsmtHalfBath  BsmtUnfSF  EnclosedPorch    ...      \
0      1.107431     -0.240978  -0.944267      -0.359202    ...       
1     -0.819684      3.947457  -0.641008      -0.359202    ...       
2      1.107431     -0.240978  -0.301540      -0.359202    ...       
3      1.107431     -0.240978  -0.061648       4.091122    ...       
4      1.107431     -0.24097

0    4.0
1    4.0
2    4.0
3    4.0
4    4.0
Name: YearBuilt, dtype: float64

In [20]:
# To prevent that and avoid normalization for more than one numeric or year/month/date variables, 
# please change the data type into 'object' data type right after loading 
# and before normalization and learn2clean pipeline

house  = ["../datasets/house/house_train.csv", "../datasets/house/test.csv"]
house_not_encoded = d_not_enc.train_test_split(house, 'SalePrice')

# not exhaustively
house_not_encoded['train']['YrSold'] = house_not_encoded['train']['YrSold'].astype('object')
house_not_encoded['train']['YearBuilt'] = house_not_encoded['train']['YearBuilt'].astype('object')
house_not_encoded['train']['MoSold'] = house_not_encoded['train']['MoSold'].astype('object')
house_not_encoded['train']['MiscVal'] = house_not_encoded['train']['MiscVal'].astype('object')
house_not_encoded['train']['GarageYrBlt'] = house_not_encoded['train']['GarageYrBlt'].astype('object')
house_not_encoded['train']['YearRemodAdd'] = house_not_encoded['train']['YearRemodAdd'].astype('object')

normalized_house = nl.Normalizer(house_not_encoded.copy(),strategy='DS',exclude='Age', verbose = False).transform()
normalized_house['train'].head()




Reading csv : house_train.csv ...
Reading data ...
CPU time: 2.4944140911102295 seconds
Profiling datasets
        Attribute     Type  Num. Missing Values  Num. Unique Values           Sknewness   Kurtosis
0              Id  float64                  0.0              1460.0                   0       -1.2
1      MSSubClass  float64                  0.0                15.0             1.40621    1.57067
2     LotFrontage  float64                259.0               111.0  2.1608659947055435    17.3753
3         LotArea  float64                  0.0              1073.0             12.1951    202.544
4     OverallQual  float64                  0.0                10.0            0.216721  0.0918565
5     OverallCond  float64                  0.0                 9.0            0.692355    1.09852
6       YearBuilt  float64                  0.0               112.0           -0.612831  -0.442155
7    YearRemodAdd  float64                  0.0                61.0           -0.503044     -1.272
8

Profiling datasets
        Attribute     Type  Num. Missing Values  Num. Unique Values             Sknewness   Kurtosis
0              Id  float64                  0.0              1459.0                     0       -1.2
1      MSSubClass  float64                  0.0                16.0                1.3453    1.34024
2     LotFrontage  float64                227.0               116.0    0.6611148912627807    2.57186
3         LotArea  float64                  0.0              1106.0               3.11201    20.6714
4     OverallQual  float64                  0.0                10.0               0.18101  0.0334023
5     OverallCond  float64                  0.0                 9.0              0.448703    1.84137
6       YearBuilt  float64                  0.0               106.0             -0.587052  -0.581447
7    YearRemodAdd  float64                  0.0                61.0             -0.399495   -1.41186
8      MasVnrArea  float64                 15.0               304.0     

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BsmtFinSF1,BsmtFinSF2,BsmtFullBath,BsmtHalfBath,BsmtUnfSF,EnclosedPorch,...,PavedDrive,PoolQC,RoofMatl,RoofStyle,SaleCondition,Street,Utilities,YearBuilt,YearRemodAdd,YrSold
0,0.213333,0.845819,1e-07,0.555556,0.755265,1e-07,0.7777778,1e-07,0.169169,1e-07,...,Y,,CompShg,Gable,Normal,Pave,AllPub,2003,2003,2008
1,0.666667,1e-07,1e-07,0.555556,0.873755,1e-07,1e-07,0.9444444,0.297852,1e-07,...,Y,,CompShg,Gable,Normal,Pave,AllPub,1976,1976,2007
2,0.305185,0.8540664,1e-07,0.555556,0.598183,1e-07,0.7777778,1e-07,0.438795,1e-07,...,Y,,CompShg,Gable,Normal,Pave,AllPub,2001,2002,2008
3,0.357536,0.7784651,1e-07,0.555556,0.426237,1e-07,0.7777778,1e-07,0.520202,0.9302441,...,Y,,CompShg,Gable,Abnorml,Pave,AllPub,1915,1970,2006
4,0.565104,0.9020803,1e-07,0.888889,0.718228,1e-07,0.7777778,1e-07,0.482323,1e-07,...,Y,,CompShg,Gable,Normal,Pave,AllPub,2000,2000,2008


## - Replace missing values

In [22]:
#>> Examples for missing value imputation
# Available strategies:
#            - 'EM': only for numerical variables; imputation based on
#                expectation maximization
#            - 'MICE': only for numerical variables  missing at random (MAR);
#                Multivariate Imputation by Chained Equations
#            - 'KNN', only for numerical variables; k-nearest neighbor
#                imputation (k=4) which weights samples using the mean squared
#                difference on features for which two rows both have observed
#                data
#            - 'RAND', 'MF': both for numerical and categorical variables;
#                replace missing values by randomly selected value in the 
#                variable domain or by the most frequent value in the variable
#                domain respectively
#            - 'MEAN', 'MEDIAN': only for numerical variables; replace missing
#                values by mean or median of the numerical variable respectvely
#            - or 'DROP' remove the row with at least one missing value

import learn2clean.imputation.imputer as imp

# replace missing values by the most frequent ones in the training and testing datasets

print('Number of missing values',normalized_house['train'].isnull().sum().sum()) 
imp1 = imp.Imputer(normalized_house.copy(),strategy='MF', verbose=True).transform()
imp2 = imp.Imputer(normalized_house.copy(),strategy='RAND', verbose=True).transform()
imp3 = imp.Imputer(normalized_house.copy(),strategy='DROP', verbose=True).transform()
imp4 = imp.Imputer(normalized_house.copy(),strategy='MEAN', verbose=True).transform() 
imp5 = imp.Imputer(normalized_house.copy(),strategy='KNN', verbose=True).transform()
imp6 = imp.Imputer(normalized_house.copy(),strategy='MEDIAN', verbose=True).transform()
imp6 = imp.Imputer(normalized_house.copy(),strategy='EM', verbose=True).transform()
imp6 = imp.Imputer(normalized_house.copy(),strategy='MICE', verbose=True).transform()


Number of missing values 6965
>>Imputation 
* For train dataset
Before imputation:
Total 6965 missing values in ['LotFrontage', 'MasVnrArea', 'Alley', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual', 'Electrical', 'Fence', 'FireplaceQu', 'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType', 'GarageYrBlt', 'MasVnrType', 'MiscFeature', 'PoolQC']
- 267 numerical missing values in ['LotFrontage', 'MasVnrArea']
- 6698 non-numerical missing values in ['Alley', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual', 'Electrical', 'Fence', 'FireplaceQu', 'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType', 'GarageYrBlt', 'MasVnrType', 'MiscFeature', 'PoolQC']
Most frequent value for  1stFlrSF is: 0.2222222222222222
Most frequent value for  2ndFlrSF is: 9.999999977795539e-08
Most frequent value for  3SsnPorch is: 9.999999977795539e-08
Most frequent value for  BedroomAbvGr is: 0.5555555555555555
Most frequent value for  BsmtFinSF1 is: 9.99999997779

Most frequent value for  Fence is: MnPrv
Most frequent value for  FireplaceQu is: Gd
Most frequent value for  Foundation is: PConc
Most frequent value for  Functional is: Typ
Most frequent value for  GarageCond is: TA
Most frequent value for  GarageFinish is: Unf
Most frequent value for  GarageQual is: TA
Most frequent value for  GarageType is: Attchd
Most frequent value for  Heating is: GasA
Most frequent value for  HeatingQC is: Ex
Most frequent value for  HouseStyle is: 1Story
Most frequent value for  KitchenQual is: TA
Most frequent value for  LandContour is: Lvl
Most frequent value for  LandSlope is: Gtl
Most frequent value for  LotConfig is: Inside
Most frequent value for  LotShape is: Reg
Most frequent value for  MSZoning is: RL
Most frequent value for  MasVnrType is: None
Most frequent value for  MiscFeature is: Shed
Most frequent value for  Neighborhood is: NAmes
Most frequent value for  PavedDrive is: Y
Most frequent value for  PoolQC is: Ex
Most frequent value for  RoofMatl 

Using TensorFlow backend.


After imputation:
Total 6698 missing values
- 0 numerical missing values
- 6698 non-numerical missing values
* For test dataset
Before imputation:
Total 6999 missing values in ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtUnfSF', 'GarageArea', 'GarageCars', 'GarageYrBlt', 'LotFrontage', 'MasVnrArea', 'TotalBsmtSF', 'Alley', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual', 'Exterior1st', 'Exterior2nd', 'Fence', 'FireplaceQu', 'Functional', 'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType', 'KitchenQual', 'MSZoning', 'MasVnrType', 'MiscFeature', 'PoolQC', 'Utilities']
- 330 numerical missing values in ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtUnfSF', 'GarageArea', 'GarageCars', 'GarageYrBlt', 'LotFrontage', 'MasVnrArea', 'TotalBsmtSF']
- 6669 non-numerical missing values in ['Alley', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual', 'Exterior1st', 'Exterior2nd', 'Fence', 'FireplaceQu', 'Functi



After imputation:
Total 6669 missing values
- 0 numerical missing values
- 6669 non-numerical missing values
Imputation done -- CPU time: 29.289858102798462 seconds

>>Imputation 
* For train dataset
Before imputation:
Total 6965 missing values in ['LotFrontage', 'MasVnrArea', 'Alley', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual', 'Electrical', 'Fence', 'FireplaceQu', 'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType', 'GarageYrBlt', 'MasVnrType', 'MiscFeature', 'PoolQC']
- 267 numerical missing values in ['LotFrontage', 'MasVnrArea']
- 6698 non-numerical missing values in ['Alley', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual', 'Electrical', 'Fence', 'FireplaceQu', 'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType', 'GarageYrBlt', 'MasVnrType', 'MiscFeature', 'PoolQC']
After imputation:
Total 6698 missing values
- 0 numerical missing values
- 6698 non-numerical missing values
* For test dataset
Before imputation:
Total 699

## - Detect outliers and remove them

In [23]:
# >> Examples for outlier detection and removal
# Available strategies =
#            'ZS': detects outliers using the robust Zscore as a function
#            of median and median absolute deviation (MAD)
#            'IQR': detects outliers using Q1 and Q3 +/- 1.5*InterQuartile Range
#            'LOF': detects outliers using Local Outlier Factor

                
import learn2clean.outlier_detection.outlier_detector as out

#to remove rows having 30% and more ZSB-based outling values among the numerical variables
out1=out.Outlier_detector(house_not_encoded.copy(), strategy='ZSB', threshold = .3, verbose=True)
out1.transform()

#to remove rows having at least one IQR-based outlying value using threshold '-1'
out2=out.Outlier_detector(house_not_encoded.copy(), strategy='IQR', threshold = -1, verbose=False)
out2.transform()

#to remove rows having 40% and more ZSB-based outling values among the numerical variables; 
# since LOF requires non missing values, rows with NaN are also removed
out3=out.Outlier_detector(house_not_encoded.copy(), strategy='LOF', threshold = .4, verbose=False)
out3.transform()



>>Outlier detection and removal:
* For train dataset
90 outlying rows have been removed:
with indexes: [4, 20, 53, 58, 65, 75, 112, 114, 118, 125, 159, 165, 167, 178, 185, 197, 198, 231, 251, 278, 304, 320, 321, 330, 344, 349, 363, 378, 389, 417, 434, 440, 477, 480, 496, 515, 520, 523, 527, 529, 533, 583, 591, 614, 635, 642, 649, 654, 661, 664, 691, 718, 745, 747, 759, 769, 798, 803, 825, 854, 888, 898, 918, 1007, 1024, 1031, 1032, 1044, 1046, 1067, 1068, 1142, 1169, 1173, 1182, 1184, 1228, 1230, 1243, 1266, 1268, 1284, 1298, 1313, 1350, 1353, 1373, 1386, 1405, 1449]

Outliers:
      1stFlrSF  2ndFlrSF  3SsnPorch Alley  BedroomAbvGr BldgType BsmtCond  \
4       1145.0    1053.0        0.0   NaN           4.0     1Fam       TA   
20      1158.0    1218.0        0.0   NaN           4.0     1Fam       TA   
53      1842.0       0.0        0.0   NaN           0.0     1Fam       TA   
58      1426.0    1519.0        0.0   NaN           3.0     1Fam       TA   
65      1143.0    1330.0     

  r = func(a, **kwargs)
  r = func(a, **kwargs)
  r = func(a, **kwargs)
  r = func(a, **kwargs)
  r = func(a, **kwargs)
  r = func(a, **kwargs)
  r = func(a, **kwargs)
  r = func(a, **kwargs)
  r = func(a, **kwargs)
  r = func(a, **kwargs)
  r = func(a, **kwargs)
  r = func(a, **kwargs)
  r = func(a, **kwargs)
  r = func(a, **kwargs)
  r = func(a, **kwargs)
  r = func(a, **kwargs)
  r = func(a, **kwargs)
  r = func(a, **kwargs)
  r = func(a, **kwargs)
  r = func(a, **kwargs)
  r = func(a, **kwargs)
  r = func(a, **kwargs)
  r = func(a, **kwargs)
  r = func(a, **kwargs)
  r = func(a, **kwargs)
  r = func(a, **kwargs)


1460 outlying rows have been removed
* For test dataset
1459 outlying rows have been removed
Outlier detection and removal done -- CPU time: 0.07787704467773438 seconds


>>Outlier detection and removal:
* For train dataset
LOF requires no missing values, so missing valueshave been removed using DROP.
Error: Need at least one continous variable forLOF outlier detection
 Dataset inchanged
* For test dataset
LOF requires no missing values, so missing valueshave been removed using DROP.
Error: Need at least one continous variable forLOF outlier detection
 Dataset inchanged
Outlier detection and removal done -- CPU time: 0.05606508255004883 seconds



{'train': Empty DataFrame
 Columns: [1stFlrSF, 2ndFlrSF, 3SsnPorch, Alley, BedroomAbvGr, BldgType, BsmtCond, BsmtExposure, BsmtFinSF1, BsmtFinSF2, BsmtFinType1, BsmtFinType2, BsmtFullBath, BsmtHalfBath, BsmtQual, BsmtUnfSF, CentralAir, Condition1, Condition2, Electrical, EnclosedPorch, ExterCond, ExterQual, Exterior1st, Exterior2nd, Fence, FireplaceQu, Fireplaces, Foundation, FullBath, Functional, GarageArea, GarageCars, GarageCond, GarageFinish, GarageQual, GarageType, GarageYrBlt, GrLivArea, HalfBath, Heating, HeatingQC, HouseStyle, Id, KitchenAbvGr, KitchenQual, LandContour, LandSlope, LotArea, LotConfig, LotFrontage, LotShape, LowQualFinSF, MSSubClass, MSZoning, MasVnrArea, MasVnrType, MiscFeature, MiscVal, MoSold, Neighborhood, OpenPorchSF, OverallCond, OverallQual, PavedDrive, PoolArea, PoolQC, RoofMatl, RoofStyle, SaleCondition, ScreenPorch, Street, TotRmsAbvGrd, TotalBsmtSF, Utilities, WoodDeckSF, YearBuilt, YearRemodAdd, YrSold]
 Index: []
 
 [0 rows x 79 columns], 'test': Emp

## - Detect duplicates and remove them

In [28]:
# >> Examples for duplicate detection and removal
# House dataset has no duplicate anyway
# Available strategies =
#        'ED':  exact duplicate detection/removal or
#        'AD':  for aproximate duplicate records detection and removal
#        based on Jaccard similarity 


# import the Duplicate_detector class
import learn2clean.duplicate_detection.duplicate_detector as dup

#Remove exact duplicates with 'ED' strategy of the Duplicate_detector class

dup.Duplicate_detector(house_not_encoded.copy(), strategy='ED', verbose=False).transform()

#Remove approximate duplicates with thresholding Jaccard similarity 
# using 'AD'strategy of the Duplicate_detector class
dup.Duplicate_detector(house_not_encoded.copy(), strategy='AD', threshold = .5, verbose=False).transform()



Reading csv : house_train.csv ...
Reading data ...
CPU time: 0.18453502655029297 seconds
Profiling datasets
        Attribute     Type  Num. Missing Values  Num. Unique Values           Sknewness   Kurtosis
0              Id  float64                  0.0              1460.0                   0       -1.2
1      MSSubClass  float64                  0.0                15.0             1.40621    1.57067
2     LotFrontage  float64                259.0               111.0  2.1608659947055435    17.3753
3         LotArea  float64                  0.0              1073.0             12.1951    202.544
4     OverallQual  float64                  0.0                10.0            0.216721  0.0918565
5     OverallCond  float64                  0.0                 9.0            0.692355    1.09852
6       YearBuilt  float64                  0.0               112.0           -0.612831  -0.442155
7    YearRemodAdd  float64                  0.0                61.0           -0.503044     -1.272


Profiling datasets
        Attribute     Type  Num. Missing Values  Num. Unique Values             Sknewness   Kurtosis
0              Id  float64                  0.0              1459.0                     0       -1.2
1      MSSubClass  float64                  0.0                16.0                1.3453    1.34024
2     LotFrontage  float64                227.0               116.0    0.6611148912627807    2.57186
3         LotArea  float64                  0.0              1106.0               3.11201    20.6714
4     OverallQual  float64                  0.0                10.0               0.18101  0.0334023
5     OverallCond  float64                  0.0                 9.0              0.448703    1.84137
6       YearBuilt  float64                  0.0               106.0             -0.587052  -0.581447
7    YearRemodAdd  float64                  0.0                61.0             -0.399495   -1.41186
8      MasVnrArea  float64                 15.0               304.0     

Number of duplicate rows removed: 0
* For test dataset
Metric is not considered for 'ED'.
Number of duplicate rows removed: 0
Deduplication done -- CPU time: 0.331218957901001 seconds



{'train':       1stFlrSF  2ndFlrSF  3SsnPorch Alley  BedroomAbvGr BldgType BsmtCond  \
 0        856.0     854.0        0.0   NaN           3.0     1Fam       TA   
 1       1262.0       0.0        0.0   NaN           3.0     1Fam       TA   
 2        920.0     866.0        0.0   NaN           3.0     1Fam       TA   
 3        961.0     756.0        0.0   NaN           3.0     1Fam       Gd   
 4       1145.0    1053.0        0.0   NaN           4.0     1Fam       TA   
 5        796.0     566.0      320.0   NaN           1.0     1Fam       TA   
 6       1694.0       0.0        0.0   NaN           3.0     1Fam       TA   
 7       1107.0     983.0        0.0   NaN           3.0     1Fam       TA   
 8       1022.0     752.0        0.0   NaN           2.0     1Fam       TA   
 9       1077.0       0.0        0.0   NaN           2.0   2fmCon       TA   
 10      1040.0       0.0        0.0   NaN           3.0     1Fam       TA   
 11      1182.0    1142.0        0.0   NaN           4.

## - Detect inconsistencies

In [4]:
# >> Examples for inconsistency detection 
# Available consistency checking strategies :
#            - 'CC': checks whether the data satisfy the constraints
#                specified in a 'file_name'_constraint.tdda stored in 'save' directory
#            - 'PC': checks whether the data satisfy the patterns
#                specified in 'file_name'_patterns.txt stored in 'save' directory

# import the Consistency_checker class                
import learn2clean.consistency_checking.consistency_checker as cc
import learn2clean.loading.reader as rd 
d_not_enc = rd.Reader(sep=',',verbose=False, encoding=False) 
house  = ["../datasets/house/house_train.csv", "../datasets/house/test.csv"]
house_not_encoded = d_not_enc.train_test_split(house, 'SalePrice')
            
# discover the constraints from the input (train) dataset and store them in a file entitled 'house_discovered'_constraint.tdda in the 'save' directory
#cc.constraint_discovery(read_dataset('house'), file_name='house_discovered')

# discover the patterns from the input (train) dataset and store them in a file entitled 'house_discovered'_patterns.txt in the 'save' directory
#cc.pattern_discovery(read_dataset('house'), file_name='house_discovered')

# detect pattern violations with respect to a given file of patterns entitled 'house_example'_constraint.tdda" stored in the 'save' directory
cc.Consistency_checker(house_not_encoded, strategy='CC', file_name='house_example',verbose=True).transform()

# detect pattern violations with respect to a given file of patterns entitled 'house_example1'_patterns.txt" stored in the 'save' directory
# with too strong patterns resulting in an empty dataframe fro the training set
cc.Consistency_checker(house_not_encoded, strategy='PC', file_name='house_example1', verbose=False).transform()

# detect pattern violations with respect to a given file of patterns entitled 'house_example2'_patterns.txt" stored in the 'save' directory
cc.Consistency_checker(house_not_encoded, strategy='PC', file_name='house_example2', verbose=False).transform()


>>Consistency checking
* For train dataset
Constraints from the file: house_example_constraints.tdda
Constraints passing: 6

Constraints failing: 1

FIELDS:

HouseStyle: 1 failure  4 passes  type ✓  min_length ✓  max_length ✓  max_nulls ✓  rex ✗

YearBuilt: 0 failures  2 passes  type ✓  max_nulls ✓

SUMMARY:

Constraints passing: 6
Constraints failing: 1
        field  failures  passes  type min_length max_length  max_nulls    rex
0  HouseStyle         1       4  True       True       True       True  False
1   YearBuilt         0       2  True        NaN        NaN       True    NaN
Row index with constraint failure:

[5, 8, 9, 15, 21, 46, 51, 61, 69, 77, 88, 93, 104, 108, 110, 121, 125, 127, 142, 149, 155, 163, 165, 170, 177, 183, 184, 185, 187, 198, 202, 204, 218, 239, 242, 249, 263, 267, 275, 286, 292, 296, 304, 307, 312, 325, 328, 335, 345, 352, 354, 361, 371, 380, 383, 386, 390, 394, 406, 418, 431, 436, 437, 439, 442, 448, 449, 459, 488, 491, 497, 514, 522, 545, 546, 555, 557, 56

{'train':      Alley BldgType BsmtCond BsmtExposure BsmtFinType1 BsmtFinType2 BsmtQual  \
 0      NaN     1Fam       TA           No          GLQ          Unf       Gd   
 1      NaN     1Fam       TA           Gd          ALQ          Unf       Gd   
 2      NaN     1Fam       TA           Mn          GLQ          Unf       Gd   
 3      NaN     1Fam       Gd           No          ALQ          Unf       TA   
 4      NaN     1Fam       TA           Av          GLQ          Unf       Gd   
 6      NaN     1Fam       TA           Av          GLQ          Unf       Ex   
 7      NaN     1Fam       TA           Mn          ALQ          BLQ       Gd   
 10     NaN     1Fam       TA           No          Rec          Unf       TA   
 11     NaN     1Fam       TA           No          GLQ          Unf       Ex   
 12     NaN     1Fam       TA           No          ALQ          Unf       TA   
 13     NaN     1Fam       TA           Av          Unf          Unf       Gd   
 14     NaN     1Fa

## - Select features

In [2]:
# >> Examples for Feature selection
# Available strategies=
#           'MR': using a default threshold on the missing ratio per variable,
#            i.e., variables with 20% (by default) and more missing values
#            are removed
#            'LC': detects pairs of linearly correlated variables and remove one
#            'VAR': uses threshold on the variance
#            'Tree': uses decision tree classification as model for feature
#                selection given the target set for classification task
#                'SVC': uses linear SVC as model for feature selection given
#                 the target set for classification task
#            'WR': uses the selectKbest (k=10) and Chi2 for feature selection
#                given the target set for classification task
#            'L1': uses Lasso L1 for feature selection given the target set for
#                regression task
#            'IMP': uses Random Forest regression for feature selection given
#                the target set for regression task

                
import learn2clean.feature_selection.feature_selector as fs
import learn2clean.loading.reader as rd 
d_not_enc = rd.Reader(sep=',',verbose=False, encoding=False) 
house  = ["../datasets/house/house_train.csv", "../datasets/house/test.csv"]
house_not_encoded = d_not_enc.train_test_split(house, 'SalePrice')
  
#Available strategies for feature selection 
#        'MR': using a default threshold on the missing ratio per variable, i.e., variables
#                with 20% (by default) and more missing values are removed
#        'LC': detects pairs of linearly correlated variables and remove one
#        'VAR': uses threshold on the variance
#        'Tree': uses decision tree classification as model for feature selection given the target set for classification task
#        'SVC': uses linear SVC as model for feature selection given the target set for classification task
#        'WR': uses the selectKbest (k=10) and Chi2 for feature selection given the target set for classification task
#        'L1': uses Lasso L1 for feature selection given the target set for regression task
#        'IMP': uses Random Forest regression for feature selection given the target set for regression task

fs.Feature_selector(dataset = house_not_encoded.copy(), strategy= 'MR', threshold=0.1, exclude=None, verbose=True).transform()

fs.Feature_selector(dataset = house_not_encoded.copy(), strategy= 'LC', threshold=0.2,  exclude=None, verbose=True).transform()

fs.Feature_selector(dataset = house_not_encoded.copy(), strategy= 'L1',  exclude= None, threshold=.3,verbose=True).transform()

fs.Feature_selector(dataset = house_not_encoded.copy(), strategy= 'IMP', exclude = 'SibSp',verbose=True, threshold=.4).transform()

fs.Feature_selector(dataset = house_not_encoded.copy(), strategy= 'Tree',  exclude='Pclass',verbose=True).transform()

fs.Feature_selector(dataset = house_not_encoded.copy(), strategy= 'WR', exclude= None, verbose=True).transform()

fs.Feature_selector(dataset = house_not_encoded.copy(), strategy= 'WR', exclude= 'Sex', verbose=True).transform()

fs.Feature_selector(dataset = house_not_encoded.copy(), strategy= 'SVC',  exclude=None).transform()

fs.Feature_selector(dataset = house_not_encoded.copy(), strategy= 'VAR',  exclude=None).transform()

fs.Feature_selector(dataset = house_not_encoded.copy(), strategy= 'VAR',  exclude='Cabin').transform()



>>Feature selection 
Before feature selection:
79 features 
Apply MR feature selection with missing threshold= 0.1
               missing_fraction
PoolQC                 0.995205
MiscFeature            0.963014
Alley                  0.937671
Fence                  0.807534
FireplaceQu            0.472603
LotFrontage            0.177397
GarageCond             0.055479
GarageFinish           0.055479
GarageQual             0.055479
GarageType             0.055479
GarageYrBlt            0.055479
BsmtExposure           0.026027
BsmtFinType2           0.026027
BsmtQual               0.025342
BsmtFinType1           0.025342
BsmtCond               0.025342
MasVnrArea             0.005479
MasVnrType             0.005479
Electrical             0.000685
MiscVal                0.000000
1stFlrSF               0.000000
Neighborhood           0.000000
MSZoning               0.000000
MSSubClass             0.000000
LowQualFinSF           0.000000
LotShape               0.000000
LotConfig           

Exclude variable invalid. Please choose a variablefrom the input training dataset.
After feature selection:
22 features remain
['GarageYrBlt', 'GarageArea', '2ndFlrSF', 'WoodDeckSF', 'MasVnrArea', 'OverallQual', 'BsmtUnfSF', 'YearBuilt', 'MoSold', 'BsmtFinSF1', 'GarageCars', 'OpenPorchSF', 'FullBath', 'LotArea', 'LotFrontage', 'TotRmsAbvGrd', 'Id', 'GrLivArea', 'OverallCond', '1stFlrSF', 'TotalBsmtSF', 'YearRemodAdd']
Feature selection done -- CPU time: 0.3775022029876709 seconds


>>Feature selection 
Before feature selection:
79 features 
Apply Tree-based feature selection 
Best features to keep ['1stFlrSF', 'BsmtFinSF1', 'BsmtUnfSF', 'GarageArea', 'GarageYrBlt', 'GrLivArea', 'Id', 'LotArea', 'LotFrontage', 'MasVnrArea', 'MoSold', 'OpenPorchSF', 'OverallQual', 'TotRmsAbvGrd', 'TotalBsmtSF', 'WoodDeckSF', 'YearBuilt', 'YearRemodAdd', 'YrSold']
Exclude variable invalid. Please choose a variablefrom the input training dataset.
After feature selection:
19 features remain
['LotArea', 'GrL

{'train':       1stFlrSF  2ndFlrSF  3SsnPorch Alley  BedroomAbvGr BldgType BsmtCond  \
 0          856       854          0   NaN             3     1Fam       TA   
 1         1262         0          0   NaN             3     1Fam       TA   
 2          920       866          0   NaN             3     1Fam       TA   
 3          961       756          0   NaN             3     1Fam       Gd   
 4         1145      1053          0   NaN             4     1Fam       TA   
 5          796       566        320   NaN             1     1Fam       TA   
 6         1694         0          0   NaN             3     1Fam       TA   
 7         1107       983          0   NaN             3     1Fam       TA   
 8         1022       752          0   NaN             2     1Fam       TA   
 9         1077         0          0   NaN             2   2fmCon       TA   
 10        1040         0          0   NaN             3     1Fam       TA   
 11        1182      1142          0   NaN             

## >> Classification 

In [8]:
import learn2clean.classification.classifier as cl
#output is accuracy of classification for k=10 cross-validation and execution time 
#plus a detailed classification report if verbose = True
import learn2clean.loading.reader as rd 
d_enc = rd.Reader(sep=',',verbose=False, encoding=True) 
house  = ["../datasets/house/house_train.csv", "../datasets/house/test.csv"]
house_encoded = d_enc.train_test_split(house, 'SaleCondition')


Cl1 = cl.Classifier(dataset = house_encoded,target = 'SaleCondition',strategy = 'LDA', verbose = False).transform()
Cl2 = cl.Classifier(dataset = house_encoded,target = 'SaleCondition',strategy = 'MNB',verbose = False).transform()
Cl3 = cl.Classifier(dataset = house_encoded,target = 'SaleCondition',strategy = 'NB',verbose = False).transform()
Cl4 = cl.Classifier(dataset = house_encoded,target = 'SaleCondition',strategy = 'CART',verbose = True).transform()



>>Classification task

Accuracy of LDA result for 10 cross-validation : 0.803705337450375

Classification done -- CPU time: 0.15099120140075684 seconds

>>Classification task
Accuracy of Multinomial Naive Bayes classification for 10 cross-validation : 0.214

Classification done -- CPU time: 8.736706018447876 seconds

>>Classification task
Accuracy of Naive Naive Bayes classification for 10 cross-validation : 0.09572121746801941

Classification done -- CPU time: 0.1269359588623047 seconds

>>Classification task
{'mean_fit_time': array([0.00854752, 0.01225059, 0.01615422, 0.01999979, 0.02105217]), 'std_fit_time': array([0.00031048, 0.00020734, 0.00027083, 0.00084551, 0.00045577]), 'mean_score_time': array([0.00046787, 0.00049503, 0.00055208, 0.0006681 , 0.0005549 ]), 'std_score_time': array([5.52533913e-05, 8.20290235e-05, 5.80722472e-05, 2.98350208e-04,
       5.64673857e-05]), 'param_max_depth': masked_array(data=[3, 5, 7, 9, 10],
             mask=[False, False, False, False, False],

## >> Regression

In [9]:
import learn2clean.regression.regressor as rg
# output is MSE and computation time, with regression summary if verbose = True

import learn2clean.loading.reader as rd 
d_not_enc = rd.Reader(sep=',',verbose=False, encoding=False) 
house  = ["../datasets/house/house_train.csv", "../datasets/house/test.csv"]
house_not_encoded = d_not_enc.train_test_split(house, 'SalePrice')
  
    
rg1 = rg.Regressor(dataset = house_not_encoded,target = 'SalePrice',strategy= 'LASSO', verbose = True).transform()

rg3 = rg.Regressor(dataset = house_not_encoded,target = 'SalePrice',strategy= 'OLS',verbose = True).transform()

rg2 = rg.Regressor(dataset = house_not_encoded,target = 'SalePrice',strategy= 'MARS',verbose = True).transform()



>>Regression task
MSE values of cross validation
[[8.23014880e+08 9.43324263e+08 7.85739907e+08 2.11508746e+09
  1.90954945e+09 1.23725599e+09 1.00010508e+09 1.17530792e+09
  4.78814172e+09 9.80573235e+08]
 [8.20323363e+08 9.40411065e+08 7.83080690e+08 2.11443141e+09
  1.90636612e+09 1.23488480e+09 9.98348335e+08 1.16859980e+09
  4.79071404e+09 9.78396852e+08]
 [8.17519024e+08 9.37751124e+08 7.80568693e+08 2.11448351e+09
  1.90299198e+09 1.23321462e+09 9.97060396e+08 1.16237424e+09
  4.79389124e+09 9.75593687e+08]
 [8.14892384e+08 9.35343382e+08 7.78257153e+08 2.11454302e+09
  1.89985067e+09 1.23168256e+09 9.95937646e+08 1.15652492e+09
  4.79720251e+09 9.72969444e+08]
 [8.12430596e+08 9.33162697e+08 7.76123935e+08 2.11459638e+09
  1.89692381e+09 1.23029063e+09 9.94974879e+08 1.15102240e+09
  4.80066142e+09 9.70476627e+08]
 [8.10120644e+08 9.31187363e+08 7.74152175e+08 2.11463494e+09
  1.89419490e+09 1.22903007e+09 9.94160916e+08 1.14584442e+09
  4.80425725e+09 9.68091644e+08]
 [8.0795

## >> Clustering

In [11]:
import learn2clean.clustering.clusterer as ct
# clustering is applied to one dataset (i.e., the training set if two datasets are given in the path)
# output is silhouette, best k, and computation time, plus the training dataset with cluster IDs

ct.Clusterer(dataset = house_not_encoded,strategy= 'KMEANS', verbose=True).transform()
ct.Clusterer(dataset = house_not_encoded,strategy='HCA', verbose = True).transform()
ct.Clusterer(dataset = house_not_encoded,strategy='HCA', metric= 'euclidean', verbose = True).transform()
ct.Clusterer(dataset = house_not_encoded,strategy='HCA', metric= 'cosine', verbose = True).transform()
ct.Clusterer(dataset = house_not_encoded,strategy='HCA', metric= 'cityblock', verbose = True).transform()



>>Clustering task
Note: The clustering is applied on the training dataset only.
Best silhouette = 0.9738  for k= 2
Quality of clustering 0.9738
Labels distribution:
0    1120
1       1
Name: cluster_ID, dtype: int64
Clustering done -- CPU time: 1.1059057712554932 seconds

>>Clustering task
Note: The clustering is applied on the training dataset only.
Best silhouette = 0.9738  for k= 2
Quality of clustering 0.9738
Labels distribution:
0    1120
1       1
Name: cluster_ID, dtype: int64
Clustering done -- CPU time: 0.31883811950683594 seconds

>>Clustering task
Note: The clustering is applied on the training dataset only.
Best silhouette = 0.9738  for k= 2
Quality of clustering 0.9738
Labels distribution:
0    1120
1       1
Name: cluster_ID, dtype: int64
Clustering done -- CPU time: 0.31737494468688965 seconds

>>Clustering task
Note: The clustering is applied on the training dataset only.
Best silhouette = 0.9738  for k= 2
Quality of clustering 0.3559
Labels distribution:
1    1016
0  

{'quality_metric': 0.9738,
 'result': {'train':       1stFlrSF  2ndFlrSF  3SsnPorch  BedroomAbvGr  BsmtFinSF1  BsmtFinSF2  \
  0          856       854          0             3         706           0   
  1         1262         0          0             3         978           0   
  2          920       866          0             3         486           0   
  3          961       756          0             3         216           0   
  4         1145      1053          0             4         655           0   
  5          796       566        320             1         732           0   
  6         1694         0          0             3        1369           0   
  8         1022       752          0             2           0           0   
  9         1077         0          0             2         851           0   
  10        1040         0          0             3         906           0   
  11        1182      1142          0             4         998           0   
  13  

## - Create your own pipeline

In [36]:
# create your preprocessing pipeline for classification

import learn2clean.loading.reader as rd 
import learn2clean.normalization.normalizer as nl 
import learn2clean.feature_selection.feature_selector as fs
import learn2clean.duplicate_detection.duplicate_detector as dd
import learn2clean.outlier_detection.outlier_detector as od
import learn2clean.imputation.imputer as imp
import learn2clean.classification.classifier as cl

d_not_enc = rd.Reader(sep=',',verbose=False, encoding=False) 
house  = ["../datasets/house/house_train.csv", "../datasets/house/test.csv"]
dataset= d_not_enc.train_test_split(house, 'SaleCondition')
  
# not exhaustively
dataset['train']['YrSold'] = dataset['train']['YrSold'].astype('object')
dataset['train']['YearBuilt'] = dataset['train']['YearBuilt'].astype('object')
dataset['train']['MoSold'] = dataset['train']['MoSold'].astype('object')
dataset['train']['MiscVal'] = dataset['train']['MiscVal'].astype('object')
dataset['train']['GarageYrBlt'] = dataset['train']['GarageYrBlt'].astype('object')
dataset['train']['YearRemodAdd'] = dataset['train']['YearRemodAdd'].astype('object')

# eliminate variables with more than30% missing values
d1 = fs.Feature_selector(dataset=dataset.copy(),strategy= 'MR', threshold=.3).transform()
# replace numeric null values by mean 
d2 = imp.Imputer(d1.copy(), strategy = 'MEAN',verbose=False).transform()
# replace remaining non numerical values by the most frequent
d2 = imp.Imputer(d2.copy(), strategy = 'MF',verbose=False).transform()
# eliminate 40 LOF outliers
d4 = od.Outlier_detector(d2, strategy='LOF', threshold= 0.4,verbose=False).transform()
# classify with LDA
cl.Classifier(d4,strategy = 'LDA', target = 'SaleCondition', verbose =True).transform()



>>Feature selection 
Before feature selection:
78 features 
Apply MR feature selection with missing threshold= 0.3
5 features with greater than 0.30 missing values.

List of variables to be removed : ['Alley', 'Fence', 'FireplaceQu', 'MiscFeature', 'PoolQC']
After feature selection:
73 features remain
['LotConfig', 'GarageYrBlt', 'LotShape', 'LowQualFinSF', 'BsmtExposure', 'GarageArea', 'BldgType', 'Condition1', 'Exterior1st', 'HouseStyle', 'LandContour', 'Fireplaces', '2ndFlrSF', 'BsmtFinType2', 'RoofStyle', 'Exterior2nd', 'GarageCond', 'WoodDeckSF', 'BsmtFinType1', 'MasVnrArea', 'PavedDrive', 'OverallQual', 'MSSubClass', 'Utilities', 'Functional', 'BsmtUnfSF', 'CentralAir', 'Foundation', 'YearBuilt', 'MoSold', 'BsmtQual', 'BsmtFinSF1', 'GarageCars', 'MasVnrType', 'ExterCond', 'KitchenQual', 'Electrical', 'OpenPorchSF', 'FullBath', 'Street', 'GarageFinish', 'LotArea', 'BedroomAbvGr', 'LotFrontage', 'HeatingQC', 'YearRemodAdd', 'MSZoning', '3SsnPorch', 'RoofMatl', 'BsmtFullBath', 'Lan



{'quality_metric': 0.8120875303924974}

# Learn2clean data preprocessing pipeline

## Regression with Learn2Clean

In [1]:
import learn2clean.loading.reader as rd 
import learn2clean.qlearning.qlearner as ql
import learn2clean.imputation.imputer as imp

# the results of learn2clean cleaning are stored in 'house_example'_results.txt in 'save' directory

d_not_enc = rd.Reader(sep=',',verbose=True, encoding=False) 
house  = ["../datasets/house/house_train.csv","../datasets/house/test.csv"]
dataset= d_not_enc.train_test_split(house, 'SalePrice')
  
# not exhaustively
dataset['train']['YrSold'] = dataset['train']['YrSold'].astype('object')
dataset['train']['YearBuilt'] = dataset['train']['YearBuilt'].astype('object')
dataset['train']['MoSold'] = dataset['train']['MoSold'].astype('object')
dataset['train']['MiscVal'] = dataset['train']['MiscVal'].astype('object')
dataset['train']['GarageYrBlt'] = dataset['train']['GarageYrBlt'].astype('object')
dataset['train']['YearRemodAdd'] = dataset['train']['YearRemodAdd'].astype('object')
dataset['test']

# according to the profiling of House dataset:
#               missing_fraction
# PoolQC                 0.995205
# MiscFeature            0.963014
# Alley                  0.937671
# Fence                  0.807534

# the dataframes train and test will be empty for many leanr2clean episodes
# returning None for accuracy
print(len(dataset['train'].dropna()), len(dataset['test'].dropna()))
# for this particular case, we can impute data first and then run leanr2clean
d1 = imp.Imputer(dataset.copy(), strategy = 'MF',verbose=False).transform()
print(len(d1['train'].dropna()), len(d1['test'].dropna()))

# Learn2clean finds the best strategy 'LC -> LOF -> MARS' for maximal MSE : 0.2232924236591108 for MARS
# in  98.62 seconds
# The best strategy is stored in EOF of 'house_example2_results.txt' in 'save' directory as
# ('house_example2', 'learn2clean', 'MARS', 'SalePrice', None, 'LC -> LOF -> MARS', 'MSE', 0.2232924236591108, 98.62206888198853)

l2c_c1assification1=ql.Qlearner(dataset = d1.copy(),goal='MARS',target_goal='SalePrice',
                                target_prepare=None, file_name = 'house_example2', verbose = False)
l2c_c1assification1.learn2clean()




Reading csv : house_train.csv ...
Reading data ...
CPU time: 2.574882984161377 seconds
Profiling datasets
        Attribute     Type  Num. Missing Values  Num. Unique Values           Sknewness   Kurtosis
0              Id  float64                  0.0              1460.0                   0       -1.2
1      MSSubClass  float64                  0.0                15.0             1.40621    1.57067
2     LotFrontage  float64                259.0               111.0  2.1608659947055435    17.3753
3         LotArea  float64                  0.0              1073.0             12.1951    202.544
4     OverallQual  float64                  0.0                10.0            0.216721  0.0918565
5     OverallCond  float64                  0.0                 9.0            0.692355    1.09852
6       YearBuilt  float64                  0.0               112.0           -0.612831  -0.442155
7    YearRemodAdd  float64                  0.0                61.0           -0.503044     -1.272
8 

Profiling datasets
        Attribute     Type  Num. Missing Values  Num. Unique Values             Sknewness   Kurtosis
0              Id  float64                  0.0              1459.0                     0       -1.2
1      MSSubClass  float64                  0.0                16.0                1.3453    1.34024
2     LotFrontage  float64                227.0               116.0    0.6611148912627807    2.57186
3         LotArea  float64                  0.0              1106.0               3.11201    20.6714
4     OverallQual  float64                  0.0                10.0               0.18101  0.0334023
5     OverallCond  float64                  0.0                 9.0              0.448703    1.84137
6       YearBuilt  float64                  0.0               106.0             -0.587052  -0.581447
7    YearRemodAdd  float64                  0.0                61.0             -0.399495   -1.41186
8      MasVnrArea  float64                 15.0               304.0     

Total 6965 missing values in ['Alley', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual', 'Electrical', 'Fence', 'FireplaceQu', 'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType', 'GarageYrBlt', 'LotFrontage', 'MasVnrArea', 'MasVnrType', 'MiscFeature', 'PoolQC']
- 267 numerical missing values in ['LotFrontage', 'MasVnrArea']
- 6698 non-numerical missing values in ['Alley', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual', 'Electrical', 'Fence', 'FireplaceQu', 'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType', 'GarageYrBlt', 'MasVnrType', 'MiscFeature', 'PoolQC']
After imputation:
Total 0 missing values
- 0 numerical missing values
- 0 non-numerical missing values
* For test dataset
Before imputation:
Total 13964 missing values in ['Alley', 'BsmtCond', 'BsmtExposure', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFinType1', 'BsmtFinType2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtQual', 'BsmtUnfSF', 'Electrical', 'Exterior1st', 'Exterior2nd', 'Fenc

MSE of MARS with 10 folds for cross-validation: 0.12793770571124324
Regression done -- CPU time: 6.0519118309021 seconds
End Pipeline CPU time: 6.194260120391846 seconds


Strategy# 7 : Greedy traversal for starting state LOF
LOF -> MARS

Start pipeline
-------------

>>Outlier detection and removal:
* For train dataset
30 outlying rows have been removed
* For test dataset
30 outlying rows have been removed
Outlier detection and removal done -- CPU time: 0.19350790977478027 seconds


>>Regression task
MSE of MARS with 10 folds for cross-validation: 0.1337207838769893
Regression done -- CPU time: 8.442368984222412 seconds
End Pipeline CPU time: 8.635962963104248 seconds


Strategy# 8 : Greedy traversal for starting state IQR
IQR -> MARS

Start pipeline
-------------

>>Outlier detection and removal:
* For train dataset
774 outlying rows have been removed
* For test dataset
1590 outlying rows have been removed
Outlier detection and removal done -- CPU time: 0.06523013114929199 seconds




### Classification with Learn2Clean

In [14]:
import learn2clean.loading.reader as rd 
import learn2clean.qlearning.qlearner as ql
import learn2clean.imputation.imputer as imp

# the results of learn2clean cleaning are stored in 'house_example'_results.txt in 'save' directory

d_not_enc = rd.Reader(sep=',',verbose=True, encoding=True) 
house  = ["../datasets/house/house_train.csv","../datasets/house/test.csv"]
dataset= d_not_enc.train_test_split(house, 'SaleCondition')
  
# not exhaustively
dataset['train']['YrSold'] = dataset['train']['YrSold'].astype('object')
dataset['train']['YearBuilt'] = dataset['train']['YearBuilt'].astype('object')
dataset['train']['MoSold'] = dataset['train']['MoSold'].astype('object')
dataset['train']['MiscVal'] = dataset['train']['MiscVal'].astype('object')
dataset['train']['GarageYrBlt'] = dataset['train']['GarageYrBlt'].astype('object')
dataset['train']['YearRemodAdd'] = dataset['train']['YearRemodAdd'].astype('object')
dataset['test']

# according to the profiling of House dataset:
#               missing_fraction
# PoolQC                 0.995205
# MiscFeature            0.963014
# Alley                  0.937671
# Fence                  0.807534

# the dataframes train and test will be empty for many leanr2clean episodes
# returning None for accuracy
print(len(dataset['train'].dropna()), len(dataset['test'].dropna()))
# for this particular case, we can impute data first and then run leanr2clean
d2 = imp.Imputer(dataset.copy(), strategy = 'MF',verbose=False).transform()
print(len(d1['train'].dropna()), len(d2['test'].dropna()))
# Learn2clean finds the best strategy 'ZSB -> ED -> LDA' for maximal accuracy : 0.8241517694272164 for LDA
# in  4.58 seconds
# The best strategy is stored in EOF of 'house_example2_results.txt' in 'save' directory as
# ('house_example2', 'learn2clean', 'LDA', 'SaleCondition', None, 'ZSB -> ED -> LDA', 'accuracy', 0.8241517694272164, 4.3612353801727295)
l2c_c1assification1=ql.Qlearner(dataset = d2,goal='LDA',target_goal='SaleCondition',
                                target_prepare=None, file_name = 'house_example2', verbose = False)
l2c_c1assification1.learn2clean()



Reading csv : house_train.csv ...
Reading data ...
CPU time: 2.9104151725769043 seconds
Profiling datasets
        Attribute     Type  Num. Missing Values  Num. Unique Values           Sknewness   Kurtosis
0              Id  float64                  0.0              1460.0                   0       -1.2
1      MSSubClass  float64                  0.0                15.0             1.40621    1.57067
2     LotFrontage  float64                259.0               111.0  2.1608659947055435    17.3753
3         LotArea  float64                  0.0              1073.0             12.1951    202.544
4     OverallQual  float64                  0.0                10.0            0.216721  0.0918565
5     OverallCond  float64                  0.0                 9.0            0.692355    1.09852
6       YearBuilt  float64                  0.0               112.0           -0.612831  -0.442155
7    YearRemodAdd  float64                  0.0                61.0           -0.503044     -1.272
8

Profiling datasets
        Attribute     Type  Num. Missing Values  Num. Unique Values             Sknewness   Kurtosis
0              Id  float64                  0.0              1459.0                     0       -1.2
1      MSSubClass  float64                  0.0                16.0                1.3453    1.34024
2     LotFrontage  float64                227.0               116.0    0.6611148912627807    2.57186
3         LotArea  float64                  0.0              1106.0               3.11201    20.6714
4     OverallQual  float64                  0.0                10.0               0.18101  0.0334023
5     OverallCond  float64                  0.0                 9.0              0.448703    1.84137
6       YearBuilt  float64                  0.0               106.0             -0.587052  -0.581447
7    YearRemodAdd  float64                  0.0                61.0             -0.399495   -1.41186
8      MasVnrArea  float64                 15.0               304.0     

After imputation:
Total 0 missing values
- 0 numerical missing values
- 0 non-numerical missing values
* For test dataset
Before imputation:
Total 13964 missing values in ['Alley', 'BsmtCond', 'BsmtExposure', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFinType1', 'BsmtFinType2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtQual', 'BsmtUnfSF', 'Electrical', 'Exterior1st', 'Exterior2nd', 'Fence', 'FireplaceQu', 'Functional', 'GarageArea', 'GarageCars', 'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType', 'GarageYrBlt', 'KitchenQual', 'LotFrontage', 'MSZoning', 'MasVnrArea', 'MasVnrType', 'MiscFeature', 'PoolQC', 'TotalBsmtSF', 'Utilities']
- 678 numerical missing values in ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtUnfSF', 'GarageArea', 'GarageCars', 'GarageYrBlt', 'LotFrontage', 'MasVnrArea', 'TotalBsmtSF']
- 13286 non-numerical missing values in ['Alley', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual', 'Electrical', 'Exterior1st', 'Exterior2nd', 'Fence'

Deduplication done -- CPU time: 0.16724205017089844 seconds


>>Classification task

Accuracy of LDA result for 10 cross-validation : 0.8241517694272164

Classification done -- CPU time: 0.09065079689025879 seconds
End Pipeline CPU time: 0.3702876567840576 seconds


Strategy# 7 : Greedy traversal for starting state LOF
LOF -> LDA

Start pipeline
-------------

>>Outlier detection and removal:
* For train dataset
30 outlying rows have been removed
* For test dataset
30 outlying rows have been removed
Outlier detection and removal done -- CPU time: 0.25891613960266113 seconds


>>Classification task

Accuracy of LDA result for 10 cross-validation : 0.8134302526825892

Classification done -- CPU time: 0.08893918991088867 seconds
End Pipeline CPU time: 0.34795117378234863 seconds


Strategy# 8 : Greedy traversal for starting state IQR
IQR -> LDA

Start pipeline
-------------

>>Outlier detection and removal:
* For train dataset
1594 outlying rows have been removed
* For test dataset
1590 o

In [15]:
# Learn2clean finds the best strategy WR -> IQR -> NB for maximal accuracy : 0.6120754716981132 for NB
# in  4.31 seconds
# The best strategy is stored in EOF of 'house_example2_results.txt' in 'save' directory as
# ('house_example2', 'learn2clean', 'NB', 'SaleCondition', None, 'WR -> IQR -> NB', 'accuracy', 0.6120754716981132, 4.3143150806427)

l2c_c1assification1=ql.Qlearner(d2,goal='NB',target_goal='SaleCondition',
                                target_prepare=None, file_name = 'house_example2', verbose = False)
l2c_c1assification1.learn2clean()

Start Learn2Clean
Learn2Clean - Pipeline construction -- CPU time: 0.1809401512145996 seconds
=== Start Pipeline Execution ===


Strategy# 0 : Greedy traversal for starting state DS
DS -> IQR -> NB

Start pipeline
-------------
>>Normalization 
* For train dataset
... train dataset
* For test dataset
... test dataset
Normalization done -- CPU time: 0.15661215782165527 seconds


>>Outlier detection and removal:
* For train dataset
0 outlying rows have been removed
* For test dataset
0 outlying rows have been removed
Outlier detection and removal done -- CPU time: 0.06927299499511719 seconds


>>Classification task
Accuracy of Naive Naive Bayes classification for 10 cross-validation : 0.06783144912641316

Classification done -- CPU time: 0.10594511032104492 seconds
End Pipeline CPU time: 0.3321809768676758 seconds


Strategy# 1 : Greedy traversal for starting state MM
MM -> AD -> NB

Start pipeline
-------------
>>Normalization 
* For train dataset
... train dataset
* For test dataset
..

Accuracy of Naive Naive Bayes classification for 10 cross-validation : 0.0971025841816758

Classification done -- CPU time: 0.10449409484863281 seconds
End Pipeline CPU time: 0.14294123649597168 seconds


Strategy# 10 : Greedy traversal for starting state PC
PC -> NB

Start pipeline
-------------
>>Consistency checking
* For train dataset
0
* For test dataset
0
Consistency checking done -- CPU time: 0.03556370735168457 seconds

>>Classification task
Error: Need at least one continous variable and  10  observations for regression
Classification done -- CPU time: 0.00701594352722168 seconds
End Pipeline CPU time: 0.04272270202636719 seconds


Strategy# 11 : Greedy traversal for starting state ED
ED -> NB

Start pipeline
-------------

>>Duplicate detection and removal:
* For train dataset
Metric is not considered for 'ED'.
Total number of rows: 2919
Number of duplicated rows: 0
After deduplication: Number of rows: 2919
* For test dataset
Metric is not considered for 'ED'.
Total number of

## Random data preprocessing pipelines

In [17]:
import learn2clean.loading.reader as rd 
import learn2clean.qlearning.qlearner as ql

# the results of random cleaning are stored in 'house_example2'_results_file.txt in 'save' directory
# appended to the EOF 

#random preprocessing pipeline for LASSO regression
random1=ql.Qlearner(d1,goal='LASSO',target_goal='SalePrice',target_prepare=None, verbose = False)
random1.random_cleaning('house_example2')

#random preprocessing pipeline for OLS regression
random2=ql.Qlearner(d1,goal='OLS',target_goal='SalePrice',target_prepare=None, verbose = True)
random2.random_cleaning('house_example2')

#random preprocessing pipeline for MARS regression
random3=ql.Qlearner(d1,goal='MARS',target_goal='SalePrice',target_prepare=None, verbose = False)
random3.random_cleaning('house_example2')





--------------------------
Random cleaning strategy:
 MM -> LASSO
--------------------------

Start pipeline
-------------
>>Normalization 
* For train dataset
... train dataset
* For test dataset
... test dataset
Normalization done -- CPU time: 0.08523893356323242 seconds


>>Regression task
Best alpha =  1.2
MSE of LASSO with 10  folds for cross-validation: 1369429490.2711565
Regression done -- CPU time: 0.2566490173339844 seconds
End Pipeline CPU time: 0.34195613861083984 seconds
('house_example2', 'random', 'LASSO', 'SalePrice', None, 'MM -> LASSO', 'MSE', ({'quality_metric': 1369429490.2711565}, 0.34195613861083984))


--------------------------
Random cleaning strategy:
 MM -> WR -> LOF -> ED -> OLS
--------------------------

Start pipeline
-------------
>>Normalization 
* For train dataset
MM normalizing...
... train dataset
* For test dataset
MM normalizing...
... test dataset
Normalization done -- CPU time: 0.08022880554199219 seconds


>>Feature selection 
Before feature s

30 outlying rows have been removed
with indexes: [13, 235, 465, 490, 788, 811, 1126, 1147, 1160, 1189, 1233, 1261, 1579, 1741, 1928, 1940, 2049, 2052, 2070, 2089, 2107, 2218, 2346, 2410, 2426, 2510, 2531, 2605, 2670, 2915]

Outliers:
      TotRmsAbvGrd  EnclosedPorch  MasVnrArea  MiscVal  LotFrontage  \
13        0.384615       0.000000    0.191250      0.0     0.239726   
235       0.230769       0.000000    0.377500      0.0     0.000000   
465       0.307692       0.000000    0.011250      0.0     0.133562   
490       0.153846       0.000000    0.000000      0.0     0.133562   
788       0.153846       0.000000    0.000000      0.0     0.099315   
811       0.153846       0.000000    0.105625      0.0     0.133562   
1126      0.384615       0.000000    0.081250      0.0     0.109589   
1147      0.384615       0.000000    0.000000      0.0     0.184932   
1160      0.384615       0.000000    0.000000      0.0     0.010274   
1189      0.384615       0.000000    0.000000      0.0  

... test dataset
Normalization done -- CPU time: 0.1367490291595459 seconds


>>Duplicate detection and removal:
* For train dataset
Metric is not considered for 'ED'.
Total number of rows: 1460
Number of duplicated rows: 0
After deduplication: Number of rows: 1460
* For test dataset
Metric is not considered for 'ED'.
Total number of rows: 2919
Number of duplicated rows: 0
After deduplication: Number of rows: 2919
Deduplication done -- CPU time: 0.1633467674255371 seconds


>>Regression task
MSE of MARS with 10 folds for cross-validation: 1.413304412828155
Regression done -- CPU time: 3.4923551082611084 seconds
End Pipeline CPU time: 3.7925782203674316 seconds
('house_example2', 'random', 'MARS', 'SalePrice', None, 'ZS -> Tree -> ED -> MARS', 'MSE', ({'quality_metric': 1.413304412828155}, 3.7925782203674316))


{'quality_metric': 1.413304412828155}

In [19]:
# no preprocessing for regression: results appended to the EOF 'house_example2'_results.txt 

no_prep1=ql.Qlearner(d1,goal='LASSO',target_goal='SalePrice',target_prepare=None, verbose = False)
no_prep1.no_prep('house_example2')

no_prep2=ql.Qlearner(d1,goal='OLS',target_goal='SalePrice',target_prepare=None, verbose = False)
no_prep2.no_prep('house_example2')

no_prep3=ql.Qlearner(d1,goal='MARS',target_goal='SalePrice',target_prepare=None, verbose = False)
no_prep3.no_prep('house_example2')


Start pipeline
-------------

>>Regression task
Best alpha =  0.001
MSE of LASSO with 10  folds for cross-validation: 1368302034.803517
Regression done -- CPU time: 0.27175116539001465 seconds
End Pipeline CPU time: 0.27179503440856934 seconds

Start pipeline
-------------

>>Regression task
Regression done -- CPU time: 0.04834580421447754 seconds
End Pipeline CPU time: 0.048429012298583984 seconds

Start pipeline
-------------

>>Regression task
MSE of MARS with 10 folds for cross-validation: 0.13180375406591996
Regression done -- CPU time: 7.822583198547363 seconds
End Pipeline CPU time: 7.8226189613342285 seconds


In [20]:
# no preprocessing for classification: results appended to the EOF 'house_example2'_results.txt 

no_prep1=ql.Qlearner(d2,goal='CART',target_goal='SaleCondition',target_prepare=None, verbose = False)
no_prep1.no_prep('house_example2')

no_prep2=ql.Qlearner(d2,goal='LDA',target_goal='SaleCondition',target_prepare=None, verbose = False)
no_prep2.no_prep('house_example2')

no_prep3=ql.Qlearner(d2,goal='NB',target_goal='SaleCondition',target_prepare=None, verbose = False)
no_prep3.no_prep('house_example2')


Start pipeline
-------------

>>Classification task
Avg accuracy of CART classification for 10 cross-validation : 0.8828365878725591

Classification done -- CPU time: 99.2328782081604 seconds
End Pipeline CPU time: 99.23292994499207 seconds

Start pipeline
-------------

>>Classification task

Accuracy of LDA result for 10 cross-validation : 0.8074683110654334

Classification done -- CPU time: 0.09183597564697266 seconds
End Pipeline CPU time: 0.09188604354858398 seconds

Start pipeline
-------------

>>Classification task
Accuracy of Naive Naive Bayes classification for 10 cross-validation : 0.10551558752997602

Classification done -- CPU time: 0.12027192115783691 seconds
End Pipeline CPU time: 0.12030696868896484 seconds
