# 1) To set up your own data cleaning pipeline

In [1]:
#Load the datasets
def read_dataset(name): #when only one dataset is provided as input
    import pandas as pd
    if name == "gpsa":
        df = pd.read_csv('../datasets/googleplaystore.csv', sep=',', encoding ='ISO-8859-1')
    elif name == "gpsu":
        df = pd.read_csv('../datasets/googleplaystore_reviews.csv', sep=',',encoding = 'ISO-8859-1')  
    elif name == "titanic":
        df = pd.read_csv('../datasets/titanic/titanic_train.csv', sep=',', encoding ='ISO-8859-1')
    elif name == "house":
        df = pd.read_csv('../datasets/house/house_train.csv', sep=',', encoding ='ISO-8859-1')
    else: 
        raise ValueError('Invalid dataset name')               
    return df


In [2]:
read_dataset("gpsu").head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


## - Loading your data

In [3]:
import learn2clean.loading.reader as rd 
import learn2clean.normalization.normalizer as nl 
import pandas as pd

# executing profiling function for one dataset as input
rd.profile_summary(read_dataset('gpsu'), plot=False)


Profiling datasets
                Attribute     Type  Num. Missing Values  Num. Unique Values             Sknewness  Kurtosis
0      Sentiment_Polarity  float64              26863.0              6196.0  -0.10457655084633158  0.646756
1  Sentiment_Subjectivity  float64              26863.0              4531.0   -0.3063336025424886 -0.282853
2                     App   object                  0.0              1074.0                   N/A       N/A
3       Translated_Review   object              26868.0             27995.0                   N/A       N/A
4               Sentiment   object              26863.0                 4.0                   N/A       N/A


In [4]:

read_dataset('gpsu')['Sentiment'].head() # the target variable is numerical 


0    Positive
1    Positive
2         NaN
3    Positive
4    Positive
Name: Sentiment, dtype: object

In [5]:
# encoding of the target variable
import learn2clean.loading.reader as rd 
d_enc = rd.Reader(sep=',',verbose=True, encoding=True) 

gpsu  = ["../datasets/googleplaystore_reviews.csv"]
gpsu_encoded = d_enc.train_test_split(gpsu, 'Sentiment')
gpsu_encoded['train'].head()
gpsu_encoded['test'].head()


Reading csv : googleplaystore_reviews.csv ...
Reading data ...
CPU time: 3.3271310329437256 seconds
Profiling datasets
                Attribute     Type  Num. Missing Values  Num. Unique Values             Sknewness  Kurtosis
0      Sentiment_Polarity  float64              26863.0              6196.0  -0.10457655084633158  0.646756
1  Sentiment_Subjectivity  float64              26863.0              4531.0   -0.3063336025424886 -0.282853
2                     App   object                  0.0              1074.0                   N/A       N/A
3       Translated_Review   object              26868.0             27995.0                   N/A       N/A
4               Sentiment   object              26863.0                 4.0                   N/A       N/A

> Number of categorical features in the training set: 3
> Number of numerical features in the training set: 2
> Number of data samples : 64295

> Top sparse features (% missing values on dataset set):
Translated_Review         41.6

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
964,4 in a Row,,3,,
38801,Easy Installer - Apps On SD,What guys thinking cool,2,0.35,0.65
45485,File Manager,,3,,
6230,Alto's Adventure,I would enjoy game lot option watch video keep playing actually worked. Instead either freezes s...,2,0.07619,0.504762
44484,Farm Heroes Saga,,3,,


## - Normalize your data

In [6]:
# >> Examples of normalization
# The choice for the normalizer : 'ZS', 'MM','DS' or 'Log10'
#    Available strategies=
#       - 'ZS' z-score normalization
#       - 'MM' MinMax scaling
#       - 'DS' decimal scaling
#       - 'Log10 log10 scaling

import learn2clean.normalization.normalizer as nl 

# MM normalization with exclude = None, all numeric variables will be normalized
n1= nl.Normalizer(gpsu_encoded.copy(),strategy='MM',exclude='Sentiment')

n1.transform()['train'].head()


>>Normalization 
* For train dataset
... train dataset
* For test dataset
... test dataset
Normalization done -- CPU time: 0.06097912788391113 seconds



Unnamed: 0,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,App,Translated_Review
0,2,1.0,0.533333,10 Best Foods for You,"I like eat delicious food. That's I'm cooking food myself, case ""10 Best Foods"" helps lot, also ..."
1,2,0.625,0.288462,10 Best Foods for You,This help eating healthy exercise regular basis
2,3,,,10 Best Foods for You,
3,2,0.7,0.875,10 Best Foods for You,Works great especially going grocery store
4,2,1.0,0.3,10 Best Foods for You,Best idea us


In [7]:

#ZS normalization
n1= nl.Normalizer(gpsu_encoded.copy(),strategy='ZS',exclude='Sentiment', verbose = False)
n1.transform()['train'].head()

#DS scaling
n2= nl.Normalizer(gpsu_encoded.copy(),strategy='DS',exclude='Sentiment', verbose = False)
n2.transform()['train'].head()


>>Normalization 
* For train dataset
... train dataset
* For test dataset
... test dataset
Normalization done -- CPU time: 0.06198620796203613 seconds

>>Normalization 
* For train dataset
... train dataset
* For test dataset
... test dataset
Normalization done -- CPU time: 0.07776117324829102 seconds



Unnamed: 0,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,App,Translated_Review
0,2,1.0,0.518519,10 Best Foods for You,"I like eat delicious food. That's I'm cooking food myself, case ""10 Best Foods"" helps lot, also ..."
1,2,0.605105,0.20973,10 Best Foods for You,This help eating healthy exercise regular basis
2,3,,,10 Best Foods for You,
3,2,0.737485,0.934083,10 Best Foods for You,Works great especially going grocery store
4,2,1.0,0.213675,10 Best Foods for You,Best idea us


## - Replace missing values

In [8]:
#>> Examples for missing value imputation
# Available strategies:
#            - 'EM': only for numerical variables; imputation based on
#                expectation maximization
#            - 'MICE': only for numerical variables  missing at random (MAR);
#                Multivariate Imputation by Chained Equations
#            - 'KNN', only for numerical variables; k-nearest neighbor
#                imputation (k=4) which weights samples using the mean squared
#                difference on features for which two rows both have observed
#                data
#            - 'RAND', 'MF': both for numerical and categorical variables;
#                replace missing values by randomly selected value in the 
#                variable domain or by the most frequent value in the variable
#                domain respectively
#            - 'MEAN', 'MEDIAN': only for numerical variables; replace missing
#                values by mean or median of the numerical variable respectvely
#            - or 'DROP' remove the row with at least one missing value

import learn2clean.imputation.imputer as imp

# replace missing values by the most frequent ones in the training and testing datasets

imp1 = imp.Imputer(gpsu_encoded.copy(),strategy='MF', verbose=True).transform()

imp2 = imp.Imputer(gpsu_encoded.copy(),strategy='MEDIAN', verbose=True).transform()
imp1['train'].head()

>>Imputation 
* For train dataset
Before imputation:
Total 54256 missing values in ['Translated_Review', 'Sentiment_Polarity', 'Sentiment_Subjectivity']
- 36168 numerical missing values in ['Sentiment_Polarity', 'Sentiment_Subjectivity']
- 18088 non-numerical missing values in ['Translated_Review']
Most frequent value for  App is: CBS Sports App - Scores, News, Stats & Watch Live
Most frequent value for  Translated_Review is: Good
Most frequent value for  Sentiment is: 3
Most frequent value for  Sentiment_Polarity is: 0.0
Most frequent value for  Sentiment_Subjectivity is: 0.0
After imputation:
Total 0 missing values
- 0 numerical missing values
- 0 non-numerical missing values
* For test dataset
Before imputation:
Total 26338 missing values in ['Translated_Review', 'Sentiment_Polarity', 'Sentiment_Subjectivity']
- 17558 numerical missing values in ['Sentiment_Polarity', 'Sentiment_Subjectivity']
- 8780 non-numerical missing values in ['Translated_Review']
Most frequent value for  App 

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
2854,ANA,problem remember login ID password .,1,0.0,0.0
14556,BeyondMenu Food Delivery,Love ease simplicity!,2,0.625,0.6
19582,"CBS Sports App - Scores, News, Stats & Watch Live","It's good, works. 50% time bugs show everything going (mostly MLB play play). Other that, gets j...",2,0.358333,0.491667
20330,CNN Breaking US & World News,The works well. Too bad President would call CNN fake media enemy people. That authoritarian cou...,0,-0.066667,0.805556
15369,"Bleacher Report: sports news, scores, & highlights",Good,3,0.0,0.0


## - Detect outliers and remove them

In [9]:
# >> Examples for outlier detection and removal
# Available strategies =
#            'ZS': detects outliers using the robust Zscore as a function
#            of median and median absolute deviation (MAD)
#            'IQR': detects outliers using Q1 and Q3 +/- 1.5*InterQuartile Range
#            'LOF': detects outliers using Local Outlier Factor

                
import learn2clean.outlier_detection.outlier_detector as out

#to remove rows having 30% and more ZSB-based outling values among the numerical variables
out1=out.Outlier_detector(gpsu_encoded.copy(), strategy='ZSB', threshold = 0.3, verbose=True)
out1.transform()

#to remove rows having at least one IQR-based outlying value using threshold '-1'
out2=out.Outlier_detector(gpsu_encoded.copy(), strategy='IQR', threshold = -1, verbose=False)
out2.transform()

#to remove rows having 40% and more ZSB-based outling values among the numerical variables; 
# since LOF requires non missing values, rows with NaN are also removed
out3=out.Outlier_detector(gpsu_encoded.copy(), strategy='LOF', threshold = .4, verbose=False)
out3.transform()['train'].head()



>>Outlier detection and removal:
* For train dataset
0 outlying rows have been removed:
* For test dataset
0 outlying rows have been removed:
Outlier detection and removal done -- CPU time: 0.09380102157592773 seconds


>>Outlier detection and removal:
* For train dataset
43077 outlying rows have been removed
* For test dataset
21218 outlying rows have been removed
Outlier detection and removal done -- CPU time: 0.06182408332824707 seconds


>>Outlier detection and removal:
* For train dataset
LOF requires no missing values, so missing values have been removed using DROP.
40 outlying rows have been removed
* For test dataset
LOF requires no missing values, so missing values have been removed using DROP.
40 outlying rows have been removed
Outlier detection and removal done -- CPU time: 0.4606969356536865 seconds



Unnamed: 0,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,App,Translated_Review
6216,2,0.155556,0.688889,Alto's Adventure,"The game awesome, filled ""must have"" criterias make good game. Graphics simply breathtaking, bac..."
29674,2,1.0,1.0,Cougar Dating Life : Date Older Women Sugar Mummy,Its awesome!!!
29363,2,0.15,0.375,Cooking Madness - A Chef's Restaurant Games,like every game way many commercials way many ads trying get buy extra stuff need
40362,2,0.380556,0.611111,Enterprise Rent-A-Car,"Easy, quick user friendly. Does needed."
49676,2,0.408929,0.883929,"Free TV Shows App:News, TV Series, Episode, Movies",I started using right now. I'm pretty impressed I've seen far.


## - Detect duplicates and remove them

In [7]:
# >> Examples for duplicate detection and removal
# House dataset has no duplicate anyway
# Available strategies =
#        'ED':  exact duplicate detection/removal or
#        'AD':  for aproximate duplicate records detection and removal
#        based on Jaccard similarity 


# import the Duplicate_detector class
import learn2clean.duplicate_detection.duplicate_detector as dup

#Remove exact duplicates with 'ED' strategy of the Duplicate_detector class

dup1 = dup.Duplicate_detector(gpsu_encoded, strategy='ED', verbose=True).transform()

dup1['train'].head()

#Remove approximate duplicates with thresholding Jaccard similarity 
# using 'AD'strategy of the Duplicate_detector class
dup2 = dup.Duplicate_detector(gpsu_encoded, strategy='AD', threshold = .6, verbose=True).transform()

dup2['train']


>>Duplicate detection and removal:
* For train dataset
Metric is not considered for 'ED'.
Initial number of rows: 43077
After deduplication: Number of rows: 22229
* For test dataset
Metric is not considered for 'ED'.
Initial number of rows: 21218
After deduplication: Number of rows: 12088
Deduplication done -- CPU time: 0.027705907821655273 seconds


>>Duplicate detection and removal:
* For train dataset
Metric is not considered for 'AD'.
Number of duplicate rows removed: 10
* For test dataset
Metric is not considered for 'AD'.
Number of duplicate rows removed: 7
Deduplication done -- CPU time: 1.6808409690856934 seconds



Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
47636,Flow Free,"Well, amazing simple game. Definitely addicting like puzzles. IN every pack beaten including one...",2,0.240000,0.491429
24091,Capital OneÂ® Mobile,,3,,
42174,FINAL FANTASY BRAVE EXVIUS,I've played since started. It impossible beat everything unless TMRs never get unless fork thous...,0,-0.186667,0.473333
51567,GPS Speedometer and Odometer,Works great,2,0.800000,0.750000
23313,"Candy Camera - selfie, beauty camera, photo editor",Amazing app..thanks a lot,2,0.600000,0.900000
16631,BlueJeans for Android,,3,,
44369,Farm Heroes Saga,,3,,
16802,Booking.com Travel Deals,They canceled booking without informing,1,0.000000,0.000000
41299,Evie Launcher,"I'm blown away! After lag battery life issues Nova, I turned Evie Launcher fill home screen need...",2,0.100000,0.514286
41048,Events High - Meet Your City!,,3,,


## - Detect inconsistencies

In [10]:
# >> Examples for inconsistency detection 
# Available consistency checking strategies :
#            - 'CC': checks whether the data satisfy the constraints
#                specified in a 'file_name'_constraint.tdda stored in 'save' directory
#            - 'PC': checks whether the data satisfy the patterns
#                specified in 'file_name'_patterns.txt stored in 'save' directory

# import the Consistency_checker class                
import learn2clean.consistency_checking.consistency_checker as cc
          
# discover the constraints from the input (train) dataset and store them in a file entitled 'gpsu'_constraint.tdda in the 'save' directory
#cc.constraint_discovery(read_dataset('gpsu'), file_name='gpsu')

# discover the patterns from the input (train) dataset and store them in a file entitled 'gpsu'_patterns.txt in the 'save' directory
#cc.pattern_discovery(read_dataset('gpsu'), file_name='gpsu')

# detect pattern violations with respect to a given file of patterns entitled 'gpsu'_constraint.tdda" stored in the 'save' directory
#cc.Consistency_checker(gpsu_encoded.copy(), strategy='CC', file_name='gpsu_example',verbose=False).transform()

# detect pattern violations with respect to a given file of patterns entitled 'gpsu'_patterns.txt" stored in the 'save' directory
# with too strong patterns resulting in an empty dataframe fro the training set
print("Original size of traning set", len(gpsu_encoded['train']))
p1= cc.Consistency_checker(gpsu_encoded.copy(), strategy='PC', file_name='gpsu_example', verbose=True).transform()
print("After pattern checksing",len(p1['train']))
gpsu_encoded['train']

Original size of traning set 43077
>>Consistency checking
* For train dataset
Patterns:
         col  num        pattern
0        App    0  '^[A-Za-z]+$'
1  Sentiment    0    '^[1-2]+$^'

Number of pattern violations on variable ' App 'for pattern# 0 : 39810
Indexes of rows to be removed: []
****** 3267 39810
* For test dataset
Patterns:
         col  num        pattern
0        App    0  '^[A-Za-z]+$'
1  Sentiment    0    '^[1-2]+$^'

Number of pattern violations on variable ' App 'for pattern# 0 : 19605
Indexes of rows to be removed: []
****** 1613 19605
Consistency checking done -- CPU time: 0.17340803146362305 seconds
After pattern checksing 3267


Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
2854,ANA,problem remember login ID password .,1,0.000000,0.000000
14556,BeyondMenu Food Delivery,Love ease simplicity!,2,0.625000,0.600000
19582,"CBS Sports App - Scores, News, Stats & Watch Live","It's good, works. 50% time bugs show everything going (mostly MLB play play). Other that, gets j...",2,0.358333,0.491667
20330,CNN Breaking US & World News,The works well. Too bad President would call CNN fake media enemy people. That authoritarian cou...,0,-0.066667,0.805556
15369,"Bleacher Report: sports news, scores, & highlights",,3,,
42823,"Face Filter, Selfie Editor - Sweet Camera",Do make mistake downloading it. It takes forever load watch 30 secs videos filters. Dear program...,1,0.000000,0.000000
26023,Chictopia,,3,,
12855,Baseball Boy!,Its actually embarrassing hard game shoves ads throat. It's even good game throwing ads left rig...,0,-0.000541,0.288853
19421,CBS - Full Episodes & Live TV,,3,,
35257,Dog Run - Pet Dog Simulator,SO THIS GAME IS TOTALLY AMAZING AND I REALLY LOVE DOGS. ESPECIALLY WHEN THEY ARE FURY AND I WISH...,0,-0.030000,0.745000


## - Select features

In [19]:
# >> Examples for Feature selection
# Available strategies=
#           'MR': using a default threshold on the missing ratio per variable,
#            i.e., variables with 20% (by default) and more missing values
#            are removed
#            'LC': detects pairs of linearly correlated variables and remove one
#            'VAR': uses threshold on the variance
#            'Tree': uses decision tree classification as model for feature
#                selection given the target set for classification task
#                'SVC': uses linear SVC as model for feature selection given
#                 the target set for classification task
#            'WR': uses the selectKbest (k=10) and Chi2 for feature selection
#                given the target set for classification task
#            'L1': uses Lasso L1 for feature selection given the target set for
#                regression task
#            'IMP': uses Random Forest regression for feature selection given
#                the target set for regression task

                
import learn2clean.feature_selection.feature_selector as fs

#Available strategies for feature selection 
#        'MR': using a default threshold on the missing ratio per variable, i.e., variables
#                with 20% (by default) and more missing values are removed
#        'LC': detects pairs of linearly correlated variables and remove one
#        'VAR': uses threshold on the variance
#        'Tree': uses decision tree classification as model for feature selection given the target set for classification task
#        'SVC': uses linear SVC as model for feature selection given the target set for classification task
#        'WR': uses the selectKbest (k=10) and Chi2 for feature selection given the target set for classification task
#        'L1': uses Lasso L1 for feature selection given the target set for regression task
#        'IMP': uses Random Forest regression for feature selection given the target set for regression task

fs.Feature_selector(dataset = gpsu_encoded.copy(), strategy= 'MR', threshold=0.1, exclude=None, verbose=True).transform()

fs.Feature_selector(dataset = gpsu_encoded.copy(), strategy= 'LC', threshold=0.2,  exclude=None, verbose=True).transform()

fs.Feature_selector(dataset = gpsu_encoded.copy(), strategy= 'L1',  exclude= None, threshold=.7,verbose=True).transform()

fs.Feature_selector(dataset = gpsu_encoded.copy(), strategy= 'IMP', exclude = 'Sentiment',verbose=True, threshold=.4).transform()

fs.Feature_selector(dataset = gpsu_encoded.copy(), strategy= 'Tree',  exclude='Sentiment',verbose=True).transform()

fs.Feature_selector(dataset = gpsu_encoded.copy(), strategy= 'WR', exclude= 'Sentiment', verbose=True).transform()

fs.Feature_selector(dataset = gpsu_encoded.copy(), strategy= 'WR', exclude= 'Sentiment', verbose=True).transform()

fs.Feature_selector(dataset = gpsu_encoded.copy(), strategy= 'SVC',  exclude='Sentiment').transform()

fs.Feature_selector(dataset = gpsu_encoded.copy(), strategy= 'VAR',  exclude=None).transform()

fs.Feature_selector(dataset = gpsu_encoded.copy(), strategy= 'VAR',  exclude='Sentiment').transform()




>>Feature selection 
Before feature selection:
5 features 
Apply MR feature selection with missing threshold= 0.1
                        missing_fraction
Translated_Review               0.043103
Sentiment_Polarity              0.042877
Sentiment_Subjectivity          0.042877
App                             0.000000
Sentiment                       0.000000
0 features with greater than 0.10 missing values.

List of variables to be removed : []
List of variables to be keep
['Translated_Review', 'App', 'Sentiment_Polarity', 'Sentiment', 'Sentiment_Subjectivity']
After feature selection:
5 features remain
['App', 'Sentiment_Polarity', 'Translated_Review', 'Sentiment', 'Sentiment_Subjectivity']
Feature selection done -- CPU time: 0.01365208625793457 seconds


>>Feature selection 
Before feature selection:
5 features 
Apply LC feature selection with threshold= 0.2
Correlation matrix
                        Sentiment  Sentiment_Polarity  Sentiment_Subjectivity
Sentiment                1.000

{'train':        Sentiment  Sentiment_Polarity  Sentiment_Subjectivity
 12307          0           -0.320000                0.420000
 41643          3                 NaN                     NaN
 43080          0           -0.180556                0.722222
 34341          0           -0.100000                0.562500
 30008          3                 NaN                     NaN
 3118           2            0.577778                0.727778
 57633          2            0.093750                0.472917
 9637           3                 NaN                     NaN
 59036          1            0.000000                0.000000
 27988          2            0.100000                0.562500
 8511           1            0.000000                0.000000
 4707           3                 NaN                     NaN
 20465          2            0.066667                0.550000
 54407          2            0.433333                0.833333
 21186          3                 NaN                     NaN

## >> Classification 

In [13]:
import learn2clean.classification.classifier as cl
#output is accuracy of classification for k=10 cross-validation and execution time 
#plus a detailed classification report if verbose = True

Cl1 = cl.Classifier(dataset = gpsu_encoded.copy(),target = 'Sentiment',strategy = 'LDA', verbose = True).transform()

Cl2 = cl.Classifier(dataset = gpsu_encoded,target = 'Sentiment',strategy = 'NB',verbose = False).transform()



>>Classification task
{'mean_fit_time': array([0.00579399]), 'std_fit_time': array([0.00082295]), 'mean_score_time': array([0.00062704]), 'std_score_time': array([0.00012987]), 'params': [{}], 'split0_test_score': array([0.93063693]), 'split1_test_score': array([0.93252693]), 'split2_test_score': array([0.9221172]), 'split3_test_score': array([0.92117202]), 'mean_test_score': array([0.92661374]), 'std_test_score': array([0.00502452]), 'rank_test_score': array([1], dtype=int32), 'split0_train_score': array([0.92628064]), 'split1_train_score': array([0.92546153]), 'split2_train_score': array([0.92710433]), 'split3_train_score': array([0.92886845]), 'mean_train_score': array([0.92692874]), 'std_train_score': array([0.00126155])}

Accuracy of LDA result for 10 cross-validation : 0.926613741612324

Classification done -- CPU time: 0.05927395820617676 seconds

>>Classification task
Accuracy of Naive Naive Bayes classification for 10 cross-validation : 0.9697571118041773

Classification done

## >> Regression

In [11]:
import learn2clean.regression.regressor as rg
# output is MSE and computation time, with regression summary if verbose = True
 
    
rg1 = rg.Regressor(dataset = gpsu_encoded,target = 'Sentiment',strategy= 'LASSO', verbose = True).transform()

rg3 = rg.Regressor(dataset = gpsu_encoded,target = 'Sentiment',strategy= 'OLS',verbose = True).transform()

rg2 = rg.Regressor(dataset = gpsu_encoded,target = 'Sentiment',strategy= 'MARS',verbose = True).transform()



>>Regression task
MSE values of cross validation
[[5.27306573e-01 5.41007665e-01 5.31604921e-01 5.34189732e-01
  5.34285549e-01 5.29979682e-01 5.20422726e-01 5.32916849e-01
  5.29568719e-01 5.39058098e-01]
 [3.66185120e-01 3.75699767e-01 3.69170084e-01 3.70965092e-01
  3.71031632e-01 3.68041446e-01 3.61404671e-01 3.70081145e-01
  3.67756055e-01 3.74345901e-01]
 [2.34358477e-01 2.40447851e-01 2.36268854e-01 2.37417659e-01
  2.37460244e-01 2.35546525e-01 2.31298989e-01 2.36851933e-01
  2.35363875e-01 2.39581377e-01]
 [9.15462800e-02 9.39249418e-02 9.22925210e-02 9.27412729e-02
  9.27579079e-02 9.20103614e-02 9.03511677e-02 9.25202862e-02
  9.19390137e-02 9.35864753e-02]
 [2.28865700e-02 2.34812355e-02 2.30731303e-02 2.31853182e-02
  2.31894770e-02 2.30025904e-02 2.25877919e-02 2.31300715e-02
  2.29847534e-02 2.33966188e-02]
 [3.66185120e-03 3.75699767e-03 3.69170084e-03 3.70965092e-03
  3.71031632e-03 3.68041446e-03 3.61404671e-03 3.70081145e-03
  3.67756055e-03 3.74345901e-03]
 [9.1546

## >> Clustering

In [3]:
import learn2clean.clustering.clusterer as ct
# clustering is applied to one dataset (i.e., the training set if two datasets are given in the path)
# output is silhouette, best k, and computation time, plus the training dataset with cluster IDs

ct.Clusterer(dataset = gpsu_encoded,strategy= 'KMEANS', verbose=True).transform()
#ct.Clusterer(dataset = gpsu_encoded,strategy='HCA', verbose = True).transform()
#ct.Clusterer(dataset = gpsu_encoded,strategy='HCA', metric= 'euclidean', verbose = True).transform()
#ct.Clusterer(dataset = gpsu_encoded,strategy='HCA', metric= 'cosine', verbose = True).transform()
#ct.Clusterer(dataset = gpsu_encoded,strategy='HCA', metric= 'cityblock', verbose = True).transform()



>>Clustering task
Note: The clustering is applied on the training dataset only.
Best silhouette = 0.6267  for k= 2
Quality of clustering 0.6267
Labels distribution:
1    8337
0    8322
Name: cluster_ID, dtype: int64
Clustering done -- CPU time: 63.12605404853821 seconds


{'quality_metric': 0.6267,
 'result': {'train':        Sentiment_Polarity  Sentiment_Subjectivity  New_ID  cluster_ID
  0                0.675919            6.791192e-01       1           1
  1                0.388889            1.000000e-07       2           1
  3                0.933333            7.896552e-01       3           1
  6                0.081633            5.967453e-01       4           1
  9                0.388889            1.000000e-07       5           1
  10               0.888889            9.259259e-01       6           1
  11               0.388889            1.000000e-07       7           1
  12               0.388889            1.000000e-07       8           1
  16               0.758303            8.227331e-01       9           1
  18               0.698387            6.396516e-01      10           1
  19               0.388889            1.000000e-07      11           1
  20               0.778635            6.491863e-01      12           1
  21              

## - Create your own pipeline

In [13]:
# create your preprocessing pipeline for classification

import learn2clean.loading.reader as rd 
import learn2clean.normalization.normalizer as nl 
import learn2clean.feature_selection.feature_selector as fs
import learn2clean.duplicate_detection.duplicate_detector as dd
import learn2clean.outlier_detection.outlier_detector as od
import learn2clean.imputation.imputer as imp
import learn2clean.classification.classifier as cl

d_enc = rd.Reader(sep=',',verbose=True, encoding=True) 
gpsu  = ["../datasets/googleplaystore_reviews.csv"]
gpsu_encoded = d_enc.train_test_split(gpsu, 'Sentiment')

# replace numerical missing values by median
d1 = imp.Imputer(dataset=gpsu_encoded, strategy = 'MEDIAN',verbose=False).transform()
# decima scaling for numerical variables
d2 = nl.Normalizer(dataset=d1, strategy='DS', exclude = 'Sentiment', verbose=False).transform()
# eliminate 20 LOF outliers
d3 = od.Outlier_detector(dataset=d2, strategy='LOF', threshold= 0.2,verbose=False).transform()

# classify with LDA
cl.Classifier(dataset=d3,strategy = 'LDA', target = 'Sentiment', verbose =True).transform()



Reading csv : googleplaystore_reviews.csv ...
Reading data ...
CPU time: 0.27039003372192383 seconds
Profiling datasets
                Attribute     Type  Num. Missing Values  Num. Unique Values             Sknewness  Kurtosis
0      Sentiment_Polarity  float64              26863.0              6196.0  -0.10457655084633158  0.646756
1  Sentiment_Subjectivity  float64              26863.0              4531.0   -0.3063336025424886 -0.282853
2                     App   object                  0.0              1074.0                   N/A       N/A
3       Translated_Review   object              26868.0             27995.0                   N/A       N/A
4               Sentiment   object              26863.0                 4.0                   N/A       N/A

> Number of categorical features in the training set: 3
> Number of numerical features in the training set: 2
> Number of data samples : 64295

> Top sparse features (% missing values on dataset set):
Translated_Review         41.

{'quality_metric': 0.6394541536988269}

# Learn2clean data preprocessing pipeline

### Classification with Learn2Clean

In [14]:
import learn2clean.qlearning.qlearner as ql
# Learn2clean finds the best strategy 'ZS -> ED -> NB'for maximal accuracy : 0.0.6408668730650154 for NB
# in 4.58 seconds
# The best strategy is stored in EOF of 'gpsu_example_results.txt' in 'save' directory as
# ('gpsu_example', 'learn2clean', 'NB', 'Sentiment', None, 'ZS -> ED -> NB', 'accuracy', 0.6408668730650154, 4.58355188369751)
l2c_c1assification1=ql.Qlearner(dataset = gpsu_encoded,goal='NB',target_goal='Sentiment',
                                target_prepare=None, file_name = 'gpsu_example', verbose = False)
l2c_c1assification1.learn2clean()

Start Learn2Clean
Learn2Clean - Pipeline construction -- CPU time: 0.13373112678527832 seconds
=== Start Pipeline Execution ===


Strategy# 0 : Greedy traversal for starting state DS
DS -> IQR -> NB

Start pipeline
-------------
>>Normalization 
* For train dataset
... train dataset
* For test dataset
... test dataset
Normalization done -- CPU time: 0.03691911697387695 seconds


>>Outlier detection and removal:
* For train dataset
0 outlying rows have been removed
* For test dataset
0 outlying rows have been removed
Outlier detection and removal done -- CPU time: 0.035826921463012695 seconds


>>Classification task
Accuracy of Naive Naive Bayes classification for 10 cross-validation : 0.6394541536988269

Classification done -- CPU time: 0.06967592239379883 seconds
End Pipeline CPU time: 0.14260268211364746 seconds


Strategy# 1 : Greedy traversal for starting state MM
MM -> AD -> NB

Start pipeline
-------------
>>Normalization 
* For train dataset
... train dataset
* For test dataset


Accuracy of Naive Naive Bayes classification for 10 cross-validation : 0.6408668730650154

Classification done -- CPU time: 0.06667399406433105 seconds
End Pipeline CPU time: 0.08294820785522461 seconds


Strategy# 12 : Greedy traversal for starting state AD
AD -> NB

Start pipeline
-------------

>>Duplicate detection and removal:
* For train dataset
Metric is not considered for 'AD'.
Number of duplicate rows removed: 7
* For test dataset
Metric is not considered for 'AD'.
Number of duplicate rows removed: 3
Deduplication done -- CPU time: 0.886221170425415 seconds


>>Classification task
Accuracy of Naive Naive Bayes classification for 10 cross-validation : 0.6395425423627328

Classification done -- CPU time: 0.06791400909423828 seconds
End Pipeline CPU time: 0.954207181930542 seconds

Start pipeline
-------------

>>Classification task
Accuracy of Naive Naive Bayes classification for 10 cross-validation : 0.6394541536988269

Classification done -- CPU time: 0.0760650634765625 second

## Random data preprocessing pipelines

In [10]:
import learn2clean.loading.reader as rd 
import learn2clean.qlearning.qlearner as ql

d_enc = rd.Reader(sep=',',verbose=False, encoding=True) 
gpsu  = ["../datasets/googleplaystore_reviews.csv"]
gpsu_encoded = d_enc.train_test_split(gpsu, 'Sentiment')

# the results of random cleaning are stored in 'gpsu_example'_results_file.txt in 'save' directory
# appended to the EOF 
# random pipeline for LDA classification
random1=ql.Qlearner(gpsu_encoded.copy(),goal='LDA',target_goal='Sentiment',target_prepare=None, verbose = False)
random1.random_cleaning('gpsu_example')

gpsu_encoded['train']




--------------------------
Random cleaning strategy:
 LC -> ZSB -> ED -> LDA
--------------------------

Start pipeline
-------------

>>Feature selection 
Before feature selection:
5 features 
Apply LC feature selection with threshold= 0.3
0 features with linear correlation greater than 0.30.

List of correlated variables to be removed : []
After feature selection:
5 features remain
['Sentiment_Polarity', 'Translated_Review', 'App', 'Sentiment', 'Sentiment_Subjectivity']
Feature selection done -- CPU time: 0.017850160598754883 seconds


>>Outlier detection and removal:
* For train dataset
0 outlying rows have been removed:
* For test dataset
0 outlying rows have been removed:
Outlier detection and removal done -- CPU time: 0.06990385055541992 seconds


>>Duplicate detection and removal:
* For train dataset
Metric is not considered for 'ED'.
Initial number of rows: 43077
After deduplication: Number of rows: 22180
* For test dataset
Metric is not considered for 'ED'.
Initial number of

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
43353,Fair: A New Way To Own A Car,,,,
38028,EMT-B Pocket Prep,,,,
46171,Firefox Browser fast & private,,,,
49297,Free Dating Hook Up Messenger,Try out! It's completely fun provides constant array free date babes. You gotta give try worked ...,Positive,0.327881,0.463540
14051,Best Car Wallpapers,,,,
59741,Hangouts,Good,Positive,0.700000,0.600000
58266,HBO NOW: Stream TV & Movies,,,,
974,4 in a Row,,,,
56608,Google Photos,This awesome. I things before. Im new play it. I enjoying every minute. Thank Google,Positive,0.545455,0.684848
27276,ClassDojo,,,,


In [11]:
# no preprocessing: results appended to the EOF 'gpsu_example'_results.txt 

no_prep1=ql.Qlearner(gpsu_encoded.copy(),goal='CART',target_goal='Sentiment',target_prepare=None, verbose = False)
no_prep1.no_prep('gpsu_example')

no_prep2=ql.Qlearner(gpsu_encoded.copy(),goal='LDA',target_goal='Sentiment',target_prepare=None, verbose = False)
no_prep2.no_prep('gpsu_example')

no_prep3=ql.Qlearner(gpsu_encoded.copy(),goal='NB',target_goal='Sentiment',target_prepare=None, verbose = False)
no_prep3.no_prep('gpsu_example')


Start pipeline
-------------

>>Classification task
Avg accuracy of CART classification for 10 cross-validation : 0.9987655795802971

Classification done -- CPU time: 163.18577790260315 seconds
End Pipeline CPU time: 163.2061710357666 seconds

Start pipeline
-------------

>>Classification task

Accuracy of LDA result for 10 cross-validation : 0.9286027157249234

Classification done -- CPU time: 0.19683003425598145 seconds
End Pipeline CPU time: 0.2148118019104004 seconds

Start pipeline
-------------

>>Classification task
Accuracy of Naive Naive Bayes classification for 10 cross-validation : 0.9681439891689563

Classification done -- CPU time: 0.4236910343170166 seconds
End Pipeline CPU time: 0.4411659240722656 seconds
