In [79]:
import pandas as pd
import numpy as np

from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (
    train_test_split, StratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV
)
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier

from os import cpu_count

n_jobs = max(cpu_count()-1, 1)

In [2]:
train = pd.read_csv('../data/modulbank/train.csv', sep = '\t', index_col=0)
test =pd.read_csv('../data/modulbank/test.csv', sep = '\t', index_col=0)
train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,336,337,338,339,340,341,342,343,344,345
0,1,1,0,0,0,0,0,0.090909,0,0,...,0.221395,0,1,0,0,0.222222,1,1,1,1
1,1,1,0,0,1,0,0,0.090909,0,0,...,0.241508,0,1,0,0,0.111111,1,1,1,0
2,0,1,0,0,1,0,0,0.090909,0,0,...,0.123067,0,1,0,0,0.444444,1,1,1,1
3,0,1,0,0,1,0,0,0.136364,0,0,...,0.296065,0,0,1,0,0.222222,1,1,1,0
4,0,1,0,0,1,0,0,0.136364,0,0,...,0.178956,0,0,1,0,0.111111,1,1,1,1


In [3]:
target = train['0']

In [4]:
# drop constant columns
unique_value = train.nunique()
one_value_col = unique_value[unique_value == 1].index
unique_value_test = test.nunique()
one_value_col_test = unique_value[unique_value == 1].index
train = train.drop(one_value_col, axis=1, errors='ignore')
test = test.drop(one_value_col_test, axis=1, errors='ignore')

In [5]:
df = pd.concat([train, test])

In [6]:
def get_categorical_col(df, k = 0.9, n = 10):
    #get categorical values
    categ_col = []
    row_num = df.shape[0]
    unique_value = df.nunique()
    columns = unique_value[unique_value <= n].index
    for c in columns:
        if df[c].value_counts().iloc[0] < row_num * k:
            categ_col.append(c)
    return categ_col

In [7]:
categ_col =  get_categorical_col(df, k = 0.9, n = 10)
num_col = list(set(df.columns) - set(categ_col))
categ_col.remove('0')

In [8]:
var_col = df[num_col].var(axis = 0)
num_col_filtered = list(var_col[var_col > 0.1].index)
len(num_col_filtered)

16

In [10]:
import pandas_profiling as pp
import warnings
warnings.filterwarnings('ignore')
pp.ProfileReport(train[categ_col])

0,1
Number of variables,38
Number of observations,30500
Total Missing (%),0.0%
Total size in memory,9.1 MiB
Average record size in memory,312.0 B

0,1
Numeric,37
Categorical,0
Date,0
Text (Unique),0
Rejected,1

0,1
Distinct count,5
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.58752
Minimum,0
Maximum,1
Zeros (%),14.3%

0,1
Minimum,0.0
5-th percentile,0.0
Q1,0.5
Median,0.5
Q3,0.75
95-th percentile,1.0
Maximum,1.0
Range,1.0
Interquartile range,0.25

0,1
Standard deviation,0.2964
Coef of variation,0.5045
Kurtosis,-0.16156
Mean,0.58752
MAD,0.2349
Skewness,-0.64662
Sum,17919
Variance,0.087854
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0.5,11488,37.7%,
0.75,9780,32.1%,
1.0,4832,15.8%,
0.0,4367,14.3%,
0.25,33,0.1%,

Value,Count,Frequency (%),Unnamed: 3
0.0,4367,14.3%,
0.25,33,0.1%,
0.5,11488,37.7%,
0.75,9780,32.1%,
1.0,4832,15.8%,

Value,Count,Frequency (%),Unnamed: 3
0.0,4367,14.3%,
0.25,33,0.1%,
0.5,11488,37.7%,
0.75,9780,32.1%,
1.0,4832,15.8%,

0,1
Distinct count,5
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.54456
Minimum,0
Maximum,1
Zeros (%),0.0%

0,1
Minimum,0.0
5-th percentile,0.5
Q1,0.5
Median,0.5
Q3,0.5
95-th percentile,0.75
Maximum,1.0
Range,1.0
Interquartile range,0.0

0,1
Standard deviation,0.11484
Coef of variation,0.21088
Kurtosis,4.1839
Mean,0.54456
MAD,0.079883
Skewness,1.8366
Sum,16609
Variance,0.013187
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0.5,24965,81.9%,
0.75,4573,15.0%,
1.0,612,2.0%,
0.25,339,1.1%,
0.0,11,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0.0,11,0.0%,
0.25,339,1.1%,
0.5,24965,81.9%,
0.75,4573,15.0%,
1.0,612,2.0%,

Value,Count,Frequency (%),Unnamed: 3
0.0,11,0.0%,
0.25,339,1.1%,
0.5,24965,81.9%,
0.75,4573,15.0%,
1.0,612,2.0%,

0,1
Distinct count,10
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.040882
Minimum,0
Maximum,0.096774
Zeros (%),0.0%

0,1
Minimum,0.0
5-th percentile,0.010753
Q1,0.021505
Median,0.032258
Q3,0.053763
95-th percentile,0.096774
Maximum,0.096774
Range,0.096774
Interquartile range,0.032258

0,1
Standard deviation,0.024364
Coef of variation,0.59595
Kurtosis,-0.14161
Mean,0.040882
MAD,0.020385
Skewness,0.84388
Sum,1246.9
Variance,0.00059358
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0.021505376,7822,25.6%,
0.032258065,6287,20.6%,
0.053763441,5958,19.5%,
0.010752688,3473,11.4%,
0.096774194,2258,7.4%,
0.064516129,1507,4.9%,
0.075268817,1347,4.4%,
0.043010753,1324,4.3%,
0.086021505,513,1.7%,
0.0,11,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0.0,11,0.0%,
0.010752688,3473,11.4%,
0.021505376,7822,25.6%,
0.032258065,6287,20.6%,
0.043010753,1324,4.3%,

Value,Count,Frequency (%),Unnamed: 3
0.053763441,5958,19.5%,
0.064516129,1507,4.9%,
0.075268817,1347,4.4%,
0.086021505,513,1.7%,
0.096774194,2258,7.4%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.19275
Minimum,0
Maximum,1
Zeros (%),80.7%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.39447
Coef of variation,2.0465
Kurtosis,0.427
Mean,0.19275
MAD,0.3112
Skewness,1.5579
Sum,5879
Variance,0.15561
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,24621,80.7%,
1,5879,19.3%,

Value,Count,Frequency (%),Unnamed: 3
0,24621,80.7%,
1,5879,19.3%,

Value,Count,Frequency (%),Unnamed: 3
0,24621,80.7%,
1,5879,19.3%,

0,1
Correlation,0.95576

0,1
Distinct count,5
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.89432
Minimum,0
Maximum,1
Zeros (%),0.6%

0,1
Minimum,0.0
5-th percentile,0.23319
Q1,1.0
Median,1.0
Q3,1.0
95-th percentile,1.0
Maximum,1.0
Range,1.0
Interquartile range,0.0

0,1
Standard deviation,0.26796
Coef of variation,0.29963
Kurtosis,2.7505
Mean,0.89432
MAD,0.18277
Skewness,-2.1621
Sum,27277
Variance,0.071805
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
1.0,26374,86.5%,
0.233191725,3862,12.7%,
0.0,181,0.6%,
0.0260542,82,0.3%,
0.003846547,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0.0,181,0.6%,
0.003846547,1,0.0%,
0.0260542,82,0.3%,
0.233191725,3862,12.7%,
1.0,26374,86.5%,

Value,Count,Frequency (%),Unnamed: 3
0.0,181,0.6%,
0.003846547,1,0.0%,
0.0260542,82,0.3%,
0.233191725,3862,12.7%,
1.0,26374,86.5%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.3258
Minimum,0
Maximum,1
Zeros (%),67.4%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,1
95-th percentile,1
Maximum,1
Range,1
Interquartile range,1

0,1
Standard deviation,0.46868
Coef of variation,1.4385
Kurtosis,-1.4475
Mean,0.3258
MAD,0.43931
Skewness,0.7434
Sum,9937
Variance,0.21966
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,20563,67.4%,
1,9937,32.6%,

Value,Count,Frequency (%),Unnamed: 3
0,20563,67.4%,
1,9937,32.6%,

Value,Count,Frequency (%),Unnamed: 3
0,20563,67.4%,
1,9937,32.6%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.26482
Minimum,0
Maximum,1
Zeros (%),73.5%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,1
95-th percentile,1
Maximum,1
Range,1
Interquartile range,1

0,1
Standard deviation,0.44124
Coef of variation,1.6662
Kurtosis,-0.86358
Mean,0.26482
MAD,0.38938
Skewness,1.0661
Sum,8077
Variance,0.1947
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,22423,73.5%,
1,8077,26.5%,

Value,Count,Frequency (%),Unnamed: 3
0,22423,73.5%,
1,8077,26.5%,

Value,Count,Frequency (%),Unnamed: 3
0,22423,73.5%,
1,8077,26.5%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.24561
Minimum,0
Maximum,1
Zeros (%),75.4%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.43045
Coef of variation,1.7526
Kurtosis,-0.60278
Mean,0.24561
MAD,0.37057
Skewness,1.1821
Sum,7491
Variance,0.18529
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,23009,75.4%,
1,7491,24.6%,

Value,Count,Frequency (%),Unnamed: 3
0,23009,75.4%,
1,7491,24.6%,

Value,Count,Frequency (%),Unnamed: 3
0,23009,75.4%,
1,7491,24.6%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.12036
Minimum,0
Maximum,1
Zeros (%),88.0%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.32539
Coef of variation,2.7034
Kurtosis,3.446
Mean,0.12036
MAD,0.21175
Skewness,2.3336
Sum,3671
Variance,0.10588
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,26829,88.0%,
1,3671,12.0%,

Value,Count,Frequency (%),Unnamed: 3
0,26829,88.0%,
1,3671,12.0%,

Value,Count,Frequency (%),Unnamed: 3
0,26829,88.0%,
1,3671,12.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.84544
Minimum,0
Maximum,1
Zeros (%),15.5%

0,1
Minimum,0
5-th percentile,0
Q1,1
Median,1
Q3,1
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.36149
Coef of variation,0.42757
Kurtosis,1.6534
Mean,0.84544
MAD,0.26134
Skewness,-1.9114
Sum,25786
Variance,0.13067
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
1,25786,84.5%,
0,4714,15.5%,

Value,Count,Frequency (%),Unnamed: 3
0,4714,15.5%,
1,25786,84.5%,

Value,Count,Frequency (%),Unnamed: 3
0,4714,15.5%,
1,25786,84.5%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.37374
Minimum,0
Maximum,1
Zeros (%),62.6%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,1
95-th percentile,1
Maximum,1
Range,1
Interquartile range,1

0,1
Standard deviation,0.4838
Coef of variation,1.2945
Kurtosis,-1.7276
Mean,0.37374
MAD,0.46812
Skewness,0.52199
Sum,11399
Variance,0.23407
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,19101,62.6%,
1,11399,37.4%,

Value,Count,Frequency (%),Unnamed: 3
0,19101,62.6%,
1,11399,37.4%,

Value,Count,Frequency (%),Unnamed: 3
0,19101,62.6%,
1,11399,37.4%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.56449
Minimum,0
Maximum,1
Zeros (%),43.6%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,1
Q3,1
95-th percentile,1
Maximum,1
Range,1
Interquartile range,1

0,1
Standard deviation,0.49583
Coef of variation,0.87837
Kurtosis,-1.9324
Mean,0.56449
MAD,0.49168
Skewness,-0.26015
Sum,17217
Variance,0.24585
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
1,17217,56.4%,
0,13283,43.6%,

Value,Count,Frequency (%),Unnamed: 3
0,13283,43.6%,
1,17217,56.4%,

Value,Count,Frequency (%),Unnamed: 3
0,13283,43.6%,
1,17217,56.4%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.48967
Minimum,0
Maximum,1
Zeros (%),51.0%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,1
95-th percentile,1
Maximum,1
Range,1
Interquartile range,1

0,1
Standard deviation,0.4999
Coef of variation,1.0209
Kurtosis,-1.9984
Mean,0.48967
MAD,0.49979
Skewness,0.041322
Sum,14935
Variance,0.2499
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,15565,51.0%,
1,14935,49.0%,

Value,Count,Frequency (%),Unnamed: 3
0,15565,51.0%,
1,14935,49.0%,

Value,Count,Frequency (%),Unnamed: 3
0,15565,51.0%,
1,14935,49.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.40161
Minimum,0
Maximum,1
Zeros (%),59.8%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,1
95-th percentile,1
Maximum,1
Range,1
Interquartile range,1

0,1
Standard deviation,0.49023
Coef of variation,1.2207
Kurtosis,-1.839
Mean,0.40161
MAD,0.48064
Skewness,0.40144
Sum,12249
Variance,0.24033
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,18251,59.8%,
1,12249,40.2%,

Value,Count,Frequency (%),Unnamed: 3
0,18251,59.8%,
1,12249,40.2%,

Value,Count,Frequency (%),Unnamed: 3
0,18251,59.8%,
1,12249,40.2%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.12049
Minimum,0
Maximum,1
Zeros (%),88.0%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.32554
Coef of variation,2.7018
Kurtosis,3.4371
Mean,0.12049
MAD,0.21195
Skewness,2.3317
Sum,3675
Variance,0.10598
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,26825,88.0%,
1,3675,12.0%,

Value,Count,Frequency (%),Unnamed: 3
0,26825,88.0%,
1,3675,12.0%,

Value,Count,Frequency (%),Unnamed: 3
0,26825,88.0%,
1,3675,12.0%,

0,1
Distinct count,10
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.085858
Minimum,0
Maximum,1
Zeros (%),71.1%

0,1
Minimum,0.0
5-th percentile,0.0
Q1,0.0
Median,0.0
Q3,0.11111
95-th percentile,0.44444
Maximum,1.0
Range,1.0
Interquartile range,0.11111

0,1
Standard deviation,0.16588
Coef of variation,1.932
Kurtosis,5.0351
Mean,0.085858
MAD,0.12217
Skewness,2.239
Sum,2618.7
Variance,0.027515
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0.0,21699,71.1%,
0.222222222,3367,11.0%,
0.111111111,2119,6.9%,
0.444444444,1983,6.5%,
0.333333333,485,1.6%,
0.777777778,356,1.2%,
0.555555556,281,0.9%,
0.666666667,138,0.5%,
0.888888889,45,0.1%,
1.0,27,0.1%,

Value,Count,Frequency (%),Unnamed: 3
0.0,21699,71.1%,
0.111111111,2119,6.9%,
0.222222222,3367,11.0%,
0.333333333,485,1.6%,
0.444444444,1983,6.5%,

Value,Count,Frequency (%),Unnamed: 3
0.555555556,281,0.9%,
0.666666667,138,0.5%,
0.777777778,356,1.2%,
0.888888889,45,0.1%,
1.0,27,0.1%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.62623
Minimum,0
Maximum,1
Zeros (%),37.4%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,1
Q3,1
95-th percentile,1
Maximum,1
Range,1
Interquartile range,1

0,1
Standard deviation,0.48381
Coef of variation,0.77258
Kurtosis,-1.7278
Mean,0.62623
MAD,0.46813
Skewness,-0.52185
Sum,19100
Variance,0.23407
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
1,19100,62.6%,
0,11400,37.4%,

Value,Count,Frequency (%),Unnamed: 3
0,11400,37.4%,
1,19100,62.6%,

Value,Count,Frequency (%),Unnamed: 3
0,11400,37.4%,
1,19100,62.6%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.16485
Minimum,0
Maximum,1
Zeros (%),83.5%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.37105
Coef of variation,2.2508
Kurtosis,1.2638
Mean,0.16485
MAD,0.27535
Skewness,1.8066
Sum,5028
Variance,0.13768
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,25472,83.5%,
1,5028,16.5%,

Value,Count,Frequency (%),Unnamed: 3
0,25472,83.5%,
1,5028,16.5%,

Value,Count,Frequency (%),Unnamed: 3
0,25472,83.5%,
1,5028,16.5%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.11584
Minimum,0
Maximum,1
Zeros (%),88.4%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.32003
Coef of variation,2.7628
Kurtosis,3.7647
Mean,0.11584
MAD,0.20484
Skewness,2.4009
Sum,3533
Variance,0.10242
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,26967,88.4%,
1,3533,11.6%,

Value,Count,Frequency (%),Unnamed: 3
0,26967,88.4%,
1,3533,11.6%,

Value,Count,Frequency (%),Unnamed: 3
0,26967,88.4%,
1,3533,11.6%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.43767
Minimum,0
Maximum,1
Zeros (%),56.2%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,1
95-th percentile,1
Maximum,1
Range,1
Interquartile range,1

0,1
Standard deviation,0.49611
Coef of variation,1.1335
Kurtosis,-1.937
Mean,0.43767
MAD,0.49223
Skewness,0.25128
Sum,13349
Variance,0.24612
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,17151,56.2%,
1,13349,43.8%,

Value,Count,Frequency (%),Unnamed: 3
0,17151,56.2%,
1,13349,43.8%,

Value,Count,Frequency (%),Unnamed: 3
0,17151,56.2%,
1,13349,43.8%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.10521
Minimum,0
Maximum,1
Zeros (%),89.5%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.30683
Coef of variation,2.9163
Kurtosis,4.6231
Mean,0.10521
MAD,0.18829
Skewness,2.5735
Sum,3209
Variance,0.094146
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,27291,89.5%,
1,3209,10.5%,

Value,Count,Frequency (%),Unnamed: 3
0,27291,89.5%,
1,3209,10.5%,

Value,Count,Frequency (%),Unnamed: 3
0,27291,89.5%,
1,3209,10.5%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.12964
Minimum,0
Maximum,1
Zeros (%),87.0%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.33591
Coef of variation,2.5911
Kurtosis,2.8633
Mean,0.12964
MAD,0.22567
Skewness,2.2053
Sum,3954
Variance,0.11284
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,26546,87.0%,
1,3954,13.0%,

Value,Count,Frequency (%),Unnamed: 3
0,26546,87.0%,
1,3954,13.0%,

Value,Count,Frequency (%),Unnamed: 3
0,26546,87.0%,
1,3954,13.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.13023
Minimum,0
Maximum,1
Zeros (%),87.0%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.33656
Coef of variation,2.5844
Kurtosis,2.8291
Mean,0.13023
MAD,0.22654
Skewness,2.1975
Sum,3972
Variance,0.11327
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,26528,87.0%,
1,3972,13.0%,

Value,Count,Frequency (%),Unnamed: 3
0,26528,87.0%,
1,3972,13.0%,

Value,Count,Frequency (%),Unnamed: 3
0,26528,87.0%,
1,3972,13.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.17833
Minimum,0
Maximum,1
Zeros (%),82.2%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.38279
Coef of variation,2.1466
Kurtosis,0.82501
Mean,0.17833
MAD,0.29305
Skewness,1.6808
Sum,5439
Variance,0.14653
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,25061,82.2%,
1,5439,17.8%,

Value,Count,Frequency (%),Unnamed: 3
0,25061,82.2%,
1,5439,17.8%,

Value,Count,Frequency (%),Unnamed: 3
0,25061,82.2%,
1,5439,17.8%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.11426
Minimum,0
Maximum,1
Zeros (%),88.6%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.31813
Coef of variation,2.7842
Kurtosis,3.8816
Mean,0.11426
MAD,0.20241
Skewness,2.4252
Sum,3485
Variance,0.10121
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,27015,88.6%,
1,3485,11.4%,

Value,Count,Frequency (%),Unnamed: 3
0,27015,88.6%,
1,3485,11.4%,

Value,Count,Frequency (%),Unnamed: 3
0,27015,88.6%,
1,3485,11.4%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.10816
Minimum,0
Maximum,1
Zeros (%),89.2%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.31059
Coef of variation,2.8715
Kurtosis,4.3674
Mean,0.10816
MAD,0.19293
Skewness,2.5233
Sum,3299
Variance,0.096468
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,27201,89.2%,
1,3299,10.8%,

Value,Count,Frequency (%),Unnamed: 3
0,27201,89.2%,
1,3299,10.8%,

Value,Count,Frequency (%),Unnamed: 3
0,27201,89.2%,
1,3299,10.8%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.12734
Minimum,0
Maximum,1
Zeros (%),87.3%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.33336
Coef of variation,2.6178
Kurtosis,2.9993
Mean,0.12734
MAD,0.22226
Skewness,2.2359
Sum,3884
Variance,0.11113
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,26616,87.3%,
1,3884,12.7%,

Value,Count,Frequency (%),Unnamed: 3
0,26616,87.3%,
1,3884,12.7%,

Value,Count,Frequency (%),Unnamed: 3
0,26616,87.3%,
1,3884,12.7%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.16708
Minimum,0
Maximum,1
Zeros (%),83.3%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.37306
Coef of variation,2.2328
Kurtosis,1.1861
Mean,0.16708
MAD,0.27833
Skewness,1.7849
Sum,5096
Variance,0.13917
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,25404,83.3%,
1,5096,16.7%,

Value,Count,Frequency (%),Unnamed: 3
0,25404,83.3%,
1,5096,16.7%,

Value,Count,Frequency (%),Unnamed: 3
0,25404,83.3%,
1,5096,16.7%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.22918
Minimum,0
Maximum,1
Zeros (%),77.1%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.42031
Coef of variation,1.834
Kurtosis,-0.33916
Mean,0.22918
MAD,0.35331
Skewness,1.2887
Sum,6990
Variance,0.17666
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,23510,77.1%,
1,6990,22.9%,

Value,Count,Frequency (%),Unnamed: 3
0,23510,77.1%,
1,6990,22.9%,

Value,Count,Frequency (%),Unnamed: 3
0,23510,77.1%,
1,6990,22.9%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.25298
Minimum,0
Maximum,1
Zeros (%),74.7%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,1
95-th percentile,1
Maximum,1
Range,1
Interquartile range,1

0,1
Standard deviation,0.43473
Coef of variation,1.7184
Kurtosis,-0.70844
Mean,0.25298
MAD,0.37797
Skewness,1.1365
Sum,7716
Variance,0.18899
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,22784,74.7%,
1,7716,25.3%,

Value,Count,Frequency (%),Unnamed: 3
0,22784,74.7%,
1,7716,25.3%,

Value,Count,Frequency (%),Unnamed: 3
0,22784,74.7%,
1,7716,25.3%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.35125
Minimum,0
Maximum,1
Zeros (%),64.9%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,1
95-th percentile,1
Maximum,1
Range,1
Interquartile range,1

0,1
Standard deviation,0.47737
Coef of variation,1.3591
Kurtosis,-1.6116
Mean,0.35125
MAD,0.45574
Skewness,0.62327
Sum,10713
Variance,0.22788
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,19787,64.9%,
1,10713,35.1%,

Value,Count,Frequency (%),Unnamed: 3
0,19787,64.9%,
1,10713,35.1%,

Value,Count,Frequency (%),Unnamed: 3
0,19787,64.9%,
1,10713,35.1%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.11413
Minimum,0
Maximum,1
Zeros (%),88.6%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.31798
Coef of variation,2.7861
Kurtosis,3.8915
Mean,0.11413
MAD,0.20221
Skewness,2.4272
Sum,3481
Variance,0.10111
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,27019,88.6%,
1,3481,11.4%,

Value,Count,Frequency (%),Unnamed: 3
0,27019,88.6%,
1,3481,11.4%,

Value,Count,Frequency (%),Unnamed: 3
0,27019,88.6%,
1,3481,11.4%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.17174
Minimum,0
Maximum,1
Zeros (%),82.8%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.37716
Coef of variation,2.1961
Kurtosis,1.0305
Mean,0.17174
MAD,0.28449
Skewness,1.7408
Sum,5238
Variance,0.14225
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,25262,82.8%,
1,5238,17.2%,

Value,Count,Frequency (%),Unnamed: 3
0,25262,82.8%,
1,5238,17.2%,

Value,Count,Frequency (%),Unnamed: 3
0,25262,82.8%,
1,5238,17.2%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.20951
Minimum,0
Maximum,1
Zeros (%),79.0%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.40696
Coef of variation,1.9425
Kurtosis,0.038321
Mean,0.20951
MAD,0.33123
Skewness,1.4277
Sum,6390
Variance,0.16562
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,24110,79.0%,
1,6390,21.0%,

Value,Count,Frequency (%),Unnamed: 3
0,24110,79.0%,
1,6390,21.0%,

Value,Count,Frequency (%),Unnamed: 3
0,24110,79.0%,
1,6390,21.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.66741
Minimum,0
Maximum,1
Zeros (%),33.3%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,1
Q3,1
95-th percentile,1
Maximum,1
Range,1
Interquartile range,1

0,1
Standard deviation,0.47115
Coef of variation,0.70594
Kurtosis,-1.495
Mean,0.66741
MAD,0.44395
Skewness,-0.71069
Sum,20356
Variance,0.22198
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
1,20356,66.7%,
0,10144,33.3%,

Value,Count,Frequency (%),Unnamed: 3
0,10144,33.3%,
1,20356,66.7%,

Value,Count,Frequency (%),Unnamed: 3
0,10144,33.3%,
1,20356,66.7%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.80118
Minimum,0
Maximum,1
Zeros (%),19.9%

0,1
Minimum,0
5-th percentile,0
Q1,1
Median,1
Q3,1
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.39912
Coef of variation,0.49816
Kurtosis,0.27808
Mean,0.80118
MAD,0.31858
Skewness,-1.5093
Sum,24436
Variance,0.1593
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
1,24436,80.1%,
0,6064,19.9%,

Value,Count,Frequency (%),Unnamed: 3
0,6064,19.9%,
1,24436,80.1%,

Value,Count,Frequency (%),Unnamed: 3
0,6064,19.9%,
1,24436,80.1%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.84797
Minimum,0
Maximum,1
Zeros (%),15.2%

0,1
Minimum,0
5-th percentile,0
Q1,1
Median,1
Q3,1
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.35906
Coef of variation,0.42343
Kurtosis,1.7573
Mean,0.84797
MAD,0.25784
Skewness,-1.9383
Sum,25863
Variance,0.12892
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
1,25863,84.8%,
0,4637,15.2%,

Value,Count,Frequency (%),Unnamed: 3
0,4637,15.2%,
1,25863,84.8%,

Value,Count,Frequency (%),Unnamed: 3
0,4637,15.2%,
1,25863,84.8%,

Unnamed: 0,4,34,36,39,41,45,46,47,48,49,51,57,60,61,62,63,64,65,95,96,97,117,124,127,147,148,212,216,217,218,219,222,298,332,337,338,341,345
0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,1,1,1,1,0.75,0.5,0.021505,0,0,1.0,0,0,1,0,1,0,1,0,1,0.222222,1
1,1,0,0,0,0,1,0,1,0,0,0,1,1,0,0,0,0,0,1,1,1,1.0,0.5,0.053763,0,0,1.0,0,0,1,0,1,0,0,0,1,0.111111,0
2,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,1,0,1,1,1,0.75,0.25,0.064516,0,0,1.0,1,0,0,0,1,0,1,0,1,0.444444,1
3,1,0,0,0,0,1,1,0,0,0,1,1,0,1,1,1,1,0,0,1,1,0.75,0.5,0.010753,0,0,1.0,0,1,0,0,1,0,0,0,0,0.222222,0
4,1,1,0,0,1,0,0,0,0,0,0,0,1,1,1,1,0,0,1,1,1,0.75,0.5,0.053763,0,0,1.0,1,0,0,0,1,0,0,0,0,0.111111,1


In [27]:
import pandas_profiling as pp
import warnings
warnings.filterwarnings('ignore')
pp.ProfileReport(test[categ_col])

0,1
Number of variables,38
Number of observations,4166
Total Missing (%),0.0%
Total size in memory,1.2 MiB
Average record size in memory,312.0 B

0,1
Numeric,37
Categorical,0
Date,0
Text (Unique),0
Rejected,1

0,1
Distinct count,5
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.58323
Minimum,0
Maximum,1
Zeros (%),14.7%

0,1
Minimum,0.0
5-th percentile,0.0
Q1,0.5
Median,0.5
Q3,0.75
95-th percentile,1.0
Maximum,1.0
Range,1.0
Interquartile range,0.25

0,1
Standard deviation,0.29687
Coef of variation,0.50901
Kurtosis,-0.18434
Mean,0.58323
MAD,0.23477
Skewness,-0.6448
Sum,2429.8
Variance,0.088134
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0.5,1566,37.6%,
0.75,1348,32.4%,
1.0,634,15.2%,
0.0,611,14.7%,
0.25,7,0.2%,

Value,Count,Frequency (%),Unnamed: 3
0.0,611,14.7%,
0.25,7,0.2%,
0.5,1566,37.6%,
0.75,1348,32.4%,
1.0,634,15.2%,

Value,Count,Frequency (%),Unnamed: 3
0.0,611,14.7%,
0.25,7,0.2%,
0.5,1566,37.6%,
0.75,1348,32.4%,
1.0,634,15.2%,

0,1
Distinct count,5
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.54633
Minimum,0
Maximum,1
Zeros (%),0.0%

0,1
Minimum,0.0
5-th percentile,0.5
Q1,0.5
Median,0.5
Q3,0.5
95-th percentile,0.75
Maximum,1.0
Range,1.0
Interquartile range,0.0

0,1
Standard deviation,0.11781
Coef of variation,0.21564
Kurtosis,3.9639
Mean,0.54633
MAD,0.082811
Skewness,1.7704
Sum,2276
Variance,0.01388
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0.5,3380,81.1%,
0.75,642,15.4%,
1.0,92,2.2%,
0.25,50,1.2%,
0.0,2,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0.0,2,0.0%,
0.25,50,1.2%,
0.5,3380,81.1%,
0.75,642,15.4%,
1.0,92,2.2%,

Value,Count,Frequency (%),Unnamed: 3
0.0,2,0.0%,
0.25,50,1.2%,
0.5,3380,81.1%,
0.75,642,15.4%,
1.0,92,2.2%,

0,1
Distinct count,10
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.040257
Minimum,0
Maximum,0.096774
Zeros (%),0.0%

0,1
Minimum,0.0
5-th percentile,0.010753
Q1,0.021505
Median,0.032258
Q3,0.053763
95-th percentile,0.096774
Maximum,0.096774
Range,0.096774
Interquartile range,0.032258

0,1
Standard deviation,0.024381
Coef of variation,0.60564
Kurtosis,-0.12804
Mean,0.040257
MAD,0.020414
Skewness,0.85888
Sum,167.71
Variance,0.00059444
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0.021505376,1104,26.5%,
0.032258065,822,19.7%,
0.053763441,794,19.1%,
0.010752688,514,12.3%,
0.096774194,291,7.0%,
0.064516129,191,4.6%,
0.075268817,190,4.6%,
0.043010753,180,4.3%,
0.086021505,78,1.9%,
0.0,2,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0.0,2,0.0%,
0.010752688,514,12.3%,
0.021505376,1104,26.5%,
0.032258065,822,19.7%,
0.043010753,180,4.3%,

Value,Count,Frequency (%),Unnamed: 3
0.053763441,794,19.1%,
0.064516129,191,4.6%,
0.075268817,190,4.6%,
0.086021505,78,1.9%,
0.096774194,291,7.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.19467
Minimum,0
Maximum,1
Zeros (%),80.5%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.39599
Coef of variation,2.0342
Kurtosis,0.38049
Mean,0.19467
MAD,0.31355
Skewness,1.5428
Sum,811
Variance,0.15681
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,3355,80.5%,
1,811,19.5%,

Value,Count,Frequency (%),Unnamed: 3
0,3355,80.5%,
1,811,19.5%,

Value,Count,Frequency (%),Unnamed: 3
0,3355,80.5%,
1,811,19.5%,

0,1
Correlation,0.9567

0,1
Distinct count,4
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.89431
Minimum,0
Maximum,1
Zeros (%),0.6%

0,1
Minimum,0.0
5-th percentile,0.23319
Q1,1.0
Median,1.0
Q3,1.0
95-th percentile,1.0
Maximum,1.0
Range,1.0
Interquartile range,0.0

0,1
Standard deviation,0.26755
Coef of variation,0.29917
Kurtosis,2.7161
Mean,0.89431
MAD,0.18272
Skewness,-2.1555
Sum,3725.7
Variance,0.071583
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
1.0,3601,86.4%,
0.233191725,534,12.8%,
0.0,25,0.6%,
0.0260542,6,0.1%,

Value,Count,Frequency (%),Unnamed: 3
0.0,25,0.6%,
0.0260542,6,0.1%,
0.233191725,534,12.8%,
1.0,3601,86.4%,

Value,Count,Frequency (%),Unnamed: 3
0.0,25,0.6%,
0.0260542,6,0.1%,
0.233191725,534,12.8%,
1.0,3601,86.4%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.32597
Minimum,0
Maximum,1
Zeros (%),67.4%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,1
95-th percentile,1
Maximum,1
Range,1
Interquartile range,1

0,1
Standard deviation,0.46879
Coef of variation,1.4381
Kurtosis,-1.4489
Mean,0.32597
MAD,0.43943
Skewness,0.74281
Sum,1358
Variance,0.21977
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,2808,67.4%,
1,1358,32.6%,

Value,Count,Frequency (%),Unnamed: 3
0,2808,67.4%,
1,1358,32.6%,

Value,Count,Frequency (%),Unnamed: 3
0,2808,67.4%,
1,1358,32.6%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.26452
Minimum,0
Maximum,1
Zeros (%),73.5%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,1
95-th percentile,1
Maximum,1
Range,1
Interquartile range,1

0,1
Standard deviation,0.44113
Coef of variation,1.6677
Kurtosis,-0.85953
Mean,0.26452
MAD,0.3891
Skewness,1.0681
Sum,1102
Variance,0.1946
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,3064,73.5%,
1,1102,26.5%,

Value,Count,Frequency (%),Unnamed: 3
0,3064,73.5%,
1,1102,26.5%,

Value,Count,Frequency (%),Unnamed: 3
0,3064,73.5%,
1,1102,26.5%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.24676
Minimum,0
Maximum,1
Zeros (%),75.3%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.43118
Coef of variation,1.7474
Kurtosis,-0.61918
Mean,0.24676
MAD,0.37174
Skewness,1.1752
Sum,1028
Variance,0.18591
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,3138,75.3%,
1,1028,24.7%,

Value,Count,Frequency (%),Unnamed: 3
0,3138,75.3%,
1,1028,24.7%,

Value,Count,Frequency (%),Unnamed: 3
0,3138,75.3%,
1,1028,24.7%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.11906
Minimum,0
Maximum,1
Zeros (%),88.1%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.3239
Coef of variation,2.7205
Kurtosis,3.54
Mean,0.11906
MAD,0.20977
Skewness,2.3534
Sum,496
Variance,0.10491
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,3670,88.1%,
1,496,11.9%,

Value,Count,Frequency (%),Unnamed: 3
0,3670,88.1%,
1,496,11.9%,

Value,Count,Frequency (%),Unnamed: 3
0,3670,88.1%,
1,496,11.9%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.84157
Minimum,0
Maximum,1
Zeros (%),15.8%

0,1
Minimum,0
5-th percentile,0
Q1,1
Median,1
Q3,1
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.36518
Coef of variation,0.43393
Kurtosis,1.5036
Mean,0.84157
MAD,0.26665
Skewness,-1.8716
Sum,3506
Variance,0.13336
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
1,3506,84.2%,
0,660,15.8%,

Value,Count,Frequency (%),Unnamed: 3
0,660,15.8%,
1,3506,84.2%,

Value,Count,Frequency (%),Unnamed: 3
0,660,15.8%,
1,3506,84.2%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.36654
Minimum,0
Maximum,1
Zeros (%),63.3%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,1
95-th percentile,1
Maximum,1
Range,1
Interquartile range,1

0,1
Standard deviation,0.48192
Coef of variation,1.3148
Kurtosis,-1.6937
Mean,0.36654
MAD,0.46438
Skewness,0.55414
Sum,1527
Variance,0.23224
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,2639,63.3%,
1,1527,36.7%,

Value,Count,Frequency (%),Unnamed: 3
0,2639,63.3%,
1,1527,36.7%,

Value,Count,Frequency (%),Unnamed: 3
0,2639,63.3%,
1,1527,36.7%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.56001
Minimum,0
Maximum,1
Zeros (%),44.0%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,1
Q3,1
95-th percentile,1
Maximum,1
Range,1
Interquartile range,1

0,1
Standard deviation,0.49645
Coef of variation,0.88649
Kurtosis,-1.9424
Mean,0.56001
MAD,0.4928
Skewness,-0.24187
Sum,2333
Variance,0.24646
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
1,2333,56.0%,
0,1833,44.0%,

Value,Count,Frequency (%),Unnamed: 3
0,1833,44.0%,
1,2333,56.0%,

Value,Count,Frequency (%),Unnamed: 3
0,1833,44.0%,
1,2333,56.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.49184
Minimum,0
Maximum,1
Zeros (%),50.8%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,1
95-th percentile,1
Maximum,1
Range,1
Interquartile range,1

0,1
Standard deviation,0.49999
Coef of variation,1.0166
Kurtosis,-1.9999
Mean,0.49184
MAD,0.49987
Skewness,0.032661
Sum,2049
Variance,0.24999
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,2117,50.8%,
1,2049,49.2%,

Value,Count,Frequency (%),Unnamed: 3
0,2117,50.8%,
1,2049,49.2%,

Value,Count,Frequency (%),Unnamed: 3
0,2117,50.8%,
1,2049,49.2%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.40158
Minimum,0
Maximum,1
Zeros (%),59.8%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,1
95-th percentile,1
Maximum,1
Range,1
Interquartile range,1

0,1
Standard deviation,0.49028
Coef of variation,1.2209
Kurtosis,-1.8396
Mean,0.40158
MAD,0.48063
Skewness,0.40166
Sum,1673
Variance,0.24037
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,2493,59.8%,
1,1673,40.2%,

Value,Count,Frequency (%),Unnamed: 3
0,2493,59.8%,
1,1673,40.2%,

Value,Count,Frequency (%),Unnamed: 3
0,2493,59.8%,
1,1673,40.2%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.12698
Minimum,0
Maximum,1
Zeros (%),87.3%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.33299
Coef of variation,2.6224
Kurtosis,3.0258
Mean,0.12698
MAD,0.22171
Skewness,2.2415
Sum,529
Variance,0.11088
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,3637,87.3%,
1,529,12.7%,

Value,Count,Frequency (%),Unnamed: 3
0,3637,87.3%,
1,529,12.7%,

Value,Count,Frequency (%),Unnamed: 3
0,3637,87.3%,
1,529,12.7%,

0,1
Distinct count,10
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.088868
Minimum,0
Maximum,1
Zeros (%),70.2%

0,1
Minimum,0.0
5-th percentile,0.0
Q1,0.0
Median,0.0
Q3,0.11111
95-th percentile,0.44444
Maximum,1.0
Range,1.0
Interquartile range,0.11111

0,1
Standard deviation,0.16769
Coef of variation,1.887
Kurtosis,4.4174
Mean,0.088868
MAD,0.1247
Skewness,2.1424
Sum,370.22
Variance,0.028121
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0.0,2923,70.2%,
0.222222222,468,11.2%,
0.111111111,305,7.3%,
0.444444444,293,7.0%,
0.333333333,55,1.3%,
0.777777778,49,1.2%,
0.555555556,46,1.1%,
0.666666667,18,0.4%,
0.888888889,8,0.2%,
1.0,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0.0,2923,70.2%,
0.111111111,305,7.3%,
0.222222222,468,11.2%,
0.333333333,55,1.3%,
0.444444444,293,7.0%,

Value,Count,Frequency (%),Unnamed: 3
0.555555556,46,1.1%,
0.666666667,18,0.4%,
0.777777778,49,1.2%,
0.888888889,8,0.2%,
1.0,1,0.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.61762
Minimum,0
Maximum,1
Zeros (%),38.2%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,1
Q3,1
95-th percentile,1
Maximum,1
Range,1
Interquartile range,1

0,1
Standard deviation,0.48603
Coef of variation,0.78694
Kurtosis,-1.7664
Mean,0.61762
MAD,0.47233
Skewness,-0.48423
Sum,2573
Variance,0.23622
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
1,2573,61.8%,
0,1593,38.2%,

Value,Count,Frequency (%),Unnamed: 3
0,1593,38.2%,
1,2573,61.8%,

Value,Count,Frequency (%),Unnamed: 3
0,1593,38.2%,
1,2573,61.8%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.16083
Minimum,0
Maximum,1
Zeros (%),83.9%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.36741
Coef of variation,2.2845
Kurtosis,1.4127
Mean,0.16083
MAD,0.26992
Skewness,1.8472
Sum,670
Variance,0.13499
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,3496,83.9%,
1,670,16.1%,

Value,Count,Frequency (%),Unnamed: 3
0,3496,83.9%,
1,670,16.1%,

Value,Count,Frequency (%),Unnamed: 3
0,3496,83.9%,
1,670,16.1%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.11762
Minimum,0
Maximum,1
Zeros (%),88.2%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.32219
Coef of variation,2.7393
Kurtosis,3.6411
Mean,0.11762
MAD,0.20757
Skewness,2.3747
Sum,490
Variance,0.10381
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,3676,88.2%,
1,490,11.8%,

Value,Count,Frequency (%),Unnamed: 3
0,3676,88.2%,
1,490,11.8%,

Value,Count,Frequency (%),Unnamed: 3
0,3676,88.2%,
1,490,11.8%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.44575
Minimum,0
Maximum,1
Zeros (%),55.4%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,1
95-th percentile,1
Maximum,1
Range,1
Interquartile range,1

0,1
Standard deviation,0.49711
Coef of variation,1.1152
Kurtosis,-1.9533
Mean,0.44575
MAD,0.49411
Skewness,0.21836
Sum,1857
Variance,0.24712
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,2309,55.4%,
1,1857,44.6%,

Value,Count,Frequency (%),Unnamed: 3
0,2309,55.4%,
1,1857,44.6%,

Value,Count,Frequency (%),Unnamed: 3
0,2309,55.4%,
1,1857,44.6%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.11162
Minimum,0
Maximum,1
Zeros (%),88.8%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.31493
Coef of variation,2.8215
Kurtosis,4.0911
Mean,0.11162
MAD,0.19832
Skewness,2.4676
Sum,465
Variance,0.099183
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,3701,88.8%,
1,465,11.2%,

Value,Count,Frequency (%),Unnamed: 3
0,3701,88.8%,
1,465,11.2%,

Value,Count,Frequency (%),Unnamed: 3
0,3701,88.8%,
1,465,11.2%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.12674
Minimum,0
Maximum,1
Zeros (%),87.3%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.33272
Coef of variation,2.6252
Kurtosis,3.0404
Mean,0.12674
MAD,0.22135
Skewness,2.2448
Sum,528
Variance,0.1107
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,3638,87.3%,
1,528,12.7%,

Value,Count,Frequency (%),Unnamed: 3
0,3638,87.3%,
1,528,12.7%,

Value,Count,Frequency (%),Unnamed: 3
0,3638,87.3%,
1,528,12.7%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.13082
Minimum,0
Maximum,1
Zeros (%),86.9%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.33724
Coef of variation,2.5779
Kurtosis,2.7993
Mean,0.13082
MAD,0.22741
Skewness,2.1904
Sum,545
Variance,0.11373
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,3621,86.9%,
1,545,13.1%,

Value,Count,Frequency (%),Unnamed: 3
0,3621,86.9%,
1,545,13.1%,

Value,Count,Frequency (%),Unnamed: 3
0,3621,86.9%,
1,545,13.1%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.18075
Minimum,0
Maximum,1
Zeros (%),81.9%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.38486
Coef of variation,2.1292
Kurtosis,0.75551
Mean,0.18075
MAD,0.29616
Skewness,1.6599
Sum,753
Variance,0.14811
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,3413,81.9%,
1,753,18.1%,

Value,Count,Frequency (%),Unnamed: 3
0,3413,81.9%,
1,753,18.1%,

Value,Count,Frequency (%),Unnamed: 3
0,3413,81.9%,
1,753,18.1%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.1217
Minimum,0
Maximum,1
Zeros (%),87.8%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.32698
Coef of variation,2.6868
Kurtosis,3.361
Mean,0.1217
MAD,0.21378
Skewness,2.315
Sum,507
Variance,0.10691
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,3659,87.8%,
1,507,12.2%,

Value,Count,Frequency (%),Unnamed: 3
0,3659,87.8%,
1,507,12.2%,

Value,Count,Frequency (%),Unnamed: 3
0,3659,87.8%,
1,507,12.2%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.11114
Minimum,0
Maximum,1
Zeros (%),88.9%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.31434
Coef of variation,2.8284
Kurtosis,4.1293
Mean,0.11114
MAD,0.19757
Skewness,2.4753
Sum,463
Variance,0.09881
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,3703,88.9%,
1,463,11.1%,

Value,Count,Frequency (%),Unnamed: 3
0,3703,88.9%,
1,463,11.1%,

Value,Count,Frequency (%),Unnamed: 3
0,3703,88.9%,
1,463,11.1%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.12746
Minimum,0
Maximum,1
Zeros (%),87.3%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.33353
Coef of variation,2.6167
Kurtosis,2.9967
Mean,0.12746
MAD,0.22243
Skewness,2.235
Sum,531
Variance,0.11124
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,3635,87.3%,
1,531,12.7%,

Value,Count,Frequency (%),Unnamed: 3
0,3635,87.3%,
1,531,12.7%,

Value,Count,Frequency (%),Unnamed: 3
0,3635,87.3%,
1,531,12.7%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.16683
Minimum,0
Maximum,1
Zeros (%),83.3%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.37287
Coef of variation,2.235
Kurtosis,1.1974
Mean,0.16683
MAD,0.27799
Skewness,1.788
Sum,695
Variance,0.13903
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,3471,83.3%,
1,695,16.7%,

Value,Count,Frequency (%),Unnamed: 3
0,3471,83.3%,
1,695,16.7%,

Value,Count,Frequency (%),Unnamed: 3
0,3471,83.3%,
1,695,16.7%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.23452
Minimum,0
Maximum,1
Zeros (%),76.5%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.42375
Coef of variation,1.8069
Kurtosis,-0.42864
Mean,0.23452
MAD,0.35904
Skewness,1.2536
Sum,977
Variance,0.17956
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,3189,76.5%,
1,977,23.5%,

Value,Count,Frequency (%),Unnamed: 3
0,3189,76.5%,
1,977,23.5%,

Value,Count,Frequency (%),Unnamed: 3
0,3189,76.5%,
1,977,23.5%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.25348
Minimum,0
Maximum,1
Zeros (%),74.7%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,1
95-th percentile,1
Maximum,1
Range,1
Interquartile range,1

0,1
Standard deviation,0.43506
Coef of variation,1.7163
Kurtosis,-0.71479
Mean,0.25348
MAD,0.37846
Skewness,1.1338
Sum,1056
Variance,0.18927
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,3110,74.7%,
1,1056,25.3%,

Value,Count,Frequency (%),Unnamed: 3
0,3110,74.7%,
1,1056,25.3%,

Value,Count,Frequency (%),Unnamed: 3
0,3110,74.7%,
1,1056,25.3%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.35286
Minimum,0
Maximum,1
Zeros (%),64.7%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,1
95-th percentile,1
Maximum,1
Range,1
Interquartile range,1

0,1
Standard deviation,0.47792
Coef of variation,1.3544
Kurtosis,-1.6212
Mean,0.35286
MAD,0.4567
Skewness,0.61607
Sum,1470
Variance,0.2284
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,2696,64.7%,
1,1470,35.3%,

Value,Count,Frequency (%),Unnamed: 3
0,2696,64.7%,
1,1470,35.3%,

Value,Count,Frequency (%),Unnamed: 3
0,2696,64.7%,
1,1470,35.3%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.11426
Minimum,0
Maximum,1
Zeros (%),88.6%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.31816
Coef of variation,2.7846
Kurtosis,3.8872
Mean,0.11426
MAD,0.20241
Skewness,2.426
Sum,476
Variance,0.10123
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,3690,88.6%,
1,476,11.4%,

Value,Count,Frequency (%),Unnamed: 3
0,3690,88.6%,
1,476,11.4%,

Value,Count,Frequency (%),Unnamed: 3
0,3690,88.6%,
1,476,11.4%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.16707
Minimum,0
Maximum,1
Zeros (%),83.3%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.37308
Coef of variation,2.2331
Kurtosis,1.1891
Mean,0.16707
MAD,0.27831
Skewness,1.7856
Sum,696
Variance,0.13919
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,3470,83.3%,
1,696,16.7%,

Value,Count,Frequency (%),Unnamed: 3
0,3470,83.3%,
1,696,16.7%,

Value,Count,Frequency (%),Unnamed: 3
0,3470,83.3%,
1,696,16.7%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.20787
Minimum,0
Maximum,1
Zeros (%),79.2%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.40583
Coef of variation,1.9523
Kurtosis,0.074577
Mean,0.20787
MAD,0.32932
Skewness,1.4403
Sum,866
Variance,0.1647
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,3300,79.2%,
1,866,20.8%,

Value,Count,Frequency (%),Unnamed: 3
0,3300,79.2%,
1,866,20.8%,

Value,Count,Frequency (%),Unnamed: 3
0,3300,79.2%,
1,866,20.8%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.65867
Minimum,0
Maximum,1
Zeros (%),34.1%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,1
Q3,1
95-th percentile,1
Maximum,1
Range,1
Interquartile range,1

0,1
Standard deviation,0.47421
Coef of variation,0.71996
Kurtosis,-1.5525
Mean,0.65867
MAD,0.44965
Skewness,-0.66949
Sum,2744
Variance,0.22488
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
1,2744,65.9%,
0,1422,34.1%,

Value,Count,Frequency (%),Unnamed: 3
0,1422,34.1%,
1,2744,65.9%,

Value,Count,Frequency (%),Unnamed: 3
0,1422,34.1%,
1,2744,65.9%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.79789
Minimum,0
Maximum,1
Zeros (%),20.2%

0,1
Minimum,0
5-th percentile,0
Q1,1
Median,1
Q3,1
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.40162
Coef of variation,0.50336
Kurtosis,0.20274
Mean,0.79789
MAD,0.32253
Skewness,-1.4841
Sum,3324
Variance,0.1613
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
1,3324,79.8%,
0,842,20.2%,

Value,Count,Frequency (%),Unnamed: 3
0,842,20.2%,
1,3324,79.8%,

Value,Count,Frequency (%),Unnamed: 3
0,842,20.2%,
1,3324,79.8%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.84253
Minimum,0
Maximum,1
Zeros (%),15.7%

0,1
Minimum,0
5-th percentile,0
Q1,1
Median,1
Q3,1
95-th percentile,1
Maximum,1
Range,1
Interquartile range,0

0,1
Standard deviation,0.36428
Coef of variation,0.43236
Kurtosis,1.5408
Mean,0.84253
MAD,0.26534
Skewness,-1.8815
Sum,3510
Variance,0.1327
Memory size,65.1 KiB

Value,Count,Frequency (%),Unnamed: 3
1,3510,84.3%,
0,656,15.7%,

Value,Count,Frequency (%),Unnamed: 3
0,656,15.7%,
1,3510,84.3%,

Value,Count,Frequency (%),Unnamed: 3
0,656,15.7%,
1,3510,84.3%,

Unnamed: 0,4,34,36,39,41,45,46,47,48,49,51,57,60,61,62,63,64,65,95,96,97,117,124,127,147,148,212,216,217,218,219,222,298,332,337,338,341,345
0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,1,1,0.5,0.5,0.075269,1,0,1.0,0,0,1,0,1,0,1,0,1,0.222222,1
1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0.5,0.5,0.021505,0,1,1.0,0,1,0,0,1,0,0,0,1,0.0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0.5,0.5,0.010753,0,0,1.0,0,1,0,0,1,0,1,0,1,0.222222,0
3,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.5,0.021505,0,1,1.0,0,0,1,0,1,0,0,0,1,0.0,0
4,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,0.5,0.5,0.053763,0,0,1.0,0,1,0,0,1,0,0,0,0,0.0,1


In [43]:
import pandas_profiling as pp
import warnings
warnings.filterwarnings('ignore')
pp.ProfileReport(train[num_col])

0,1
Number of variables,16
Number of observations,30500
Total Missing (%),0.0%
Total size in memory,5.2 MiB
Average record size in memory,179.0 B

0,1
Numeric,12
Categorical,0
Date,0
Text (Unique),0
Rejected,4

0,1
Distinct count,307
Unique (%),1.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.56975
Minimum,0
Maximum,1
Zeros (%),0.0%

0,1
Minimum,0.0
5-th percentile,0.0031511
Q1,0.35257
Median,0.3897
Q3,1.0
95-th percentile,1.0
Maximum,1.0
Range,1.0
Interquartile range,0.64743

0,1
Standard deviation,0.39599
Coef of variation,0.69502
Kurtosis,-1.6597
Mean,0.56975
MAD,0.3742
Skewness,-0.018124
Sum,17377
Variance,0.15681
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
1.0,13263,43.5%,
0.352568116,4994,16.4%,
0.389701843,4903,16.1%,
0.146644114,1672,5.5%,
0.08913837,1339,4.4%,
0.068744943,661,2.2%,
0.017735534,270,0.9%,
0.014807421,191,0.6%,
0.015217918,184,0.6%,
0.011562576,168,0.6%,

Value,Count,Frequency (%),Unnamed: 3
0.0,2,0.0%,
1.477e-06,2,0.0%,
2.953e-06,3,0.0%,
3.692e-06,1,0.0%,
5.168e-06,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0.505921566,1,0.0%,
0.50740371,1,0.0%,
0.544569185,2,0.0%,
0.676284058,1,0.0%,
1.0,13263,43.5%,

0,1
Distinct count,583
Unique (%),1.9%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.34591
Minimum,0
Maximum,1
Zeros (%),0.1%

0,1
Minimum,0.0
5-th percentile,0.0013335
Q1,0.079698
Median,0.20775
Q3,0.4877
95-th percentile,1.0
Maximum,1.0
Range,1.0
Interquartile range,0.408

0,1
Standard deviation,0.34727
Coef of variation,1.0039
Kurtosis,-0.64537
Mean,0.34591
MAD,0.28754
Skewness,0.91511
Sum,10550
Variance,0.1206
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
1.0,3409,11.2%,
0.912367239,3057,10.0%,
0.438066394,1510,5.0%,
0.487696522,1394,4.6%,
0.390839104,1346,4.4%,
0.364148114,1304,4.3%,
0.310529982,1214,4.0%,
0.226460152,729,2.4%,
0.25756717,704,2.3%,
0.207745169,674,2.2%,

Value,Count,Frequency (%),Unnamed: 3
0.0,34,0.1%,
2.952e-06,19,0.1%,
5.904e-06,20,0.1%,
8.856e-06,10,0.0%,
1.1808e-05,17,0.1%,

Value,Count,Frequency (%),Unnamed: 3
0.611448611,1,0.0%,
0.695419552,1,0.0%,
0.912367239,3057,10.0%,
0.956183619,2,0.0%,
1.0,3409,11.2%,

0,1
Distinct count,361
Unique (%),1.2%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.47097
Minimum,0
Maximum,1
Zeros (%),14.3%

0,1
Minimum,0.0
5-th percentile,0.0
Q1,0.087759
Median,0.40298
Q3,0.96106
95-th percentile,0.96905
Maximum,1.0
Range,1.0
Interquartile range,0.8733

0,1
Standard deviation,0.37106
Coef of variation,0.78785
Kurtosis,-1.4371
Mean,0.47097
MAD,0.32292
Skewness,0.19093
Sum,14365
Variance,0.13768
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0.0,4367,14.3%,
0.969052386,4162,13.6%,
0.961057179,3326,10.9%,
0.393594084,1471,4.8%,
0.503075261,1116,3.7%,
1.0,847,2.8%,
0.442269859,785,2.6%,
0.18019252,661,2.2%,
0.087758559,568,1.9%,
0.624349525,482,1.6%,

Value,Count,Frequency (%),Unnamed: 3
0.0,4367,14.3%,
1.573e-06,24,0.1%,
3.145e-06,59,0.2%,
4.718e-06,83,0.3%,
6.29e-06,50,0.2%,

Value,Count,Frequency (%),Unnamed: 3
0.876631005,271,0.9%,
0.947502756,362,1.2%,
0.961057179,3326,10.9%,
0.969052386,4162,13.6%,
1.0,847,2.8%,

0,1
Distinct count,274
Unique (%),0.9%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.78228
Minimum,0
Maximum,1
Zeros (%),14.3%

0,1
Minimum,0.0
5-th percentile,0.0
Q1,0.81811
Median,0.93189
Q3,0.99701
95-th percentile,0.99775
Maximum,1.0
Range,1.0
Interquartile range,0.17889

0,1
Standard deviation,0.34688
Coef of variation,0.44342
Kurtosis,1.0062
Mean,0.78228
MAD,0.25778
Skewness,-1.6697
Sum,23860
Variance,0.12032
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0.0,4367,14.3%,
0.997754491,4162,13.6%,
0.997005988,3326,10.9%,
0.930389222,1493,4.9%,
0.949101796,1116,3.7%,
1.0,847,2.8%,
0.939371257,785,2.6%,
0.872005988,661,2.2%,
0.818113772,568,1.9%,
0.905688623,564,1.8%,

Value,Count,Frequency (%),Unnamed: 3
0.0,4367,14.3%,
0.051646707,24,0.1%,
0.082335329,59,0.2%,
0.104041916,83,0.3%,
0.120508982,50,0.2%,

Value,Count,Frequency (%),Unnamed: 3
0.990269461,271,0.9%,
0.996257485,362,1.2%,
0.997005988,3326,10.9%,
0.997754491,4162,13.6%,
1.0,847,2.8%,

0,1
Distinct count,859
Unique (%),2.8%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.36684
Minimum,0
Maximum,1
Zeros (%),33.3%

0,1
Minimum,0.0
5-th percentile,0.0
Q1,0.0
Median,0.23077
Q3,0.66848
95-th percentile,1.0
Maximum,1.0
Range,1.0
Interquartile range,0.66848

0,1
Standard deviation,0.3831
Coef of variation,1.0443
Kurtosis,-1.1801
Mean,0.36684
MAD,0.33746
Skewness,0.60792
Sum,11189
Variance,0.14677
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0.0,10144,33.3%,
1.0,5530,18.1%,
0.5,1426,4.7%,
0.333333333,1018,3.3%,
0.666666667,657,2.2%,
0.25,648,2.1%,
0.2,584,1.9%,
0.4,423,1.4%,
0.166666667,408,1.3%,
0.75,401,1.3%,

Value,Count,Frequency (%),Unnamed: 3
0.0,10144,33.3%,
0.001607717,1,0.0%,
0.002673797,1,0.0%,
0.003003003,1,0.0%,
0.003262643,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0.977272727,1,0.0%,
0.981132075,1,0.0%,
0.981818182,1,0.0%,
0.985915493,1,0.0%,
1.0,5530,18.1%,

0,1
Distinct count,80
Unique (%),0.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.54468
Minimum,0
Maximum,1
Zeros (%),0.6%

0,1
Minimum,0.0
5-th percentile,0.035668
Q1,0.087175
Median,0.30462
Q3,1.0
95-th percentile,1.0
Maximum,1.0
Range,1.0
Interquartile range,0.91283

0,1
Standard deviation,0.44669
Coef of variation,0.8201
Kurtosis,-1.9386
Mean,0.54468
MAD,0.44174
Skewness,0.0020838
Sum,16613
Variance,0.19953
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
1.0,14795,48.5%,
0.304618539,2348,7.7%,
0.112240254,1027,3.4%,
0.087174614,906,3.0%,
0.10764422,758,2.5%,
0.137851016,696,2.3%,
0.2275562,596,2.0%,
0.076937311,447,1.5%,
0.106348929,441,1.4%,
0.11120002,436,1.4%,

Value,Count,Frequency (%),Unnamed: 3
0.0,181,0.6%,
0.000190043,1,0.0%,
0.005951339,7,0.0%,
0.00680153,2,0.0%,
0.007801755,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0.137851016,696,2.3%,
0.153584557,326,1.1%,
0.2275562,596,2.0%,
0.304618539,2348,7.7%,
1.0,14795,48.5%,

0,1
Distinct count,11
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.69441
Minimum,0
Maximum,1
Zeros (%),0.6%

0,1
Minimum,0.0
5-th percentile,0.20247
Q1,0.44661
Median,0.61263
Q3,1.0
95-th percentile,1.0
Maximum,1.0
Range,1.0
Interquartile range,0.55339

0,1
Standard deviation,0.32004
Coef of variation,0.46089
Kurtosis,-1.3936
Mean,0.69441
MAD,0.29581
Skewness,-0.36796
Sum,21179
Variance,0.10243
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
1.0,14762,48.4%,
0.5833333,3400,11.1%,
0.235026,2735,9.0%,
0.4466146,2544,8.3%,
0.4889323,2461,8.1%,
0.6126302,1510,5.0%,
0.202474,1013,3.3%,
0.2057292,803,2.6%,
0.1178385,572,1.9%,
0.1699219,524,1.7%,

Value,Count,Frequency (%),Unnamed: 3
0.0,176,0.6%,
0.1178385,572,1.9%,
0.1699219,524,1.7%,
0.202474,1013,3.3%,
0.2057292,803,2.6%,

Value,Count,Frequency (%),Unnamed: 3
0.4466146,2544,8.3%,
0.4889323,2461,8.1%,
0.5833333,3400,11.1%,
0.6126302,1510,5.0%,
1.0,14762,48.4%,

0,1
Distinct count,12
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.68542
Minimum,0
Maximum,1
Zeros (%),1.8%

0,1
Minimum,0.0
5-th percentile,0.033314
Q1,0.34158
Median,0.8526
Q3,1.0
95-th percentile,1.0
Maximum,1.0
Range,1.0
Interquartile range,0.65842

0,1
Standard deviation,0.35662
Coef of variation,0.52029
Kurtosis,-1.1498
Mean,0.68542
MAD,0.32427
Skewness,-0.59631
Sum,20905
Variance,0.12718
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
1.0,14762,48.4%,
0.3415823,3400,11.1%,
0.0787639,2544,8.3%,
0.4488797,2461,8.1%,
0.5901081,2049,6.7%,
0.8525991,1510,5.0%,
0.6529206,1013,3.3%,
0.4127147,803,2.6%,
0.0154354,686,2.2%,
0.0333138,572,1.9%,

Value,Count,Frequency (%),Unnamed: 3
0.0,541,1.8%,
0.0154354,686,2.2%,
0.0333138,572,1.9%,
0.0787639,2544,8.3%,
0.3415823,3400,11.1%,

Value,Count,Frequency (%),Unnamed: 3
0.5901081,2049,6.7%,
0.6529206,1013,3.3%,
0.8525991,1510,5.0%,
0.9929259,159,0.5%,
1.0,14762,48.4%,

0,1
Correlation,0.98569

0,1
Correlation,0.94371

0,1
Distinct count,85
Unique (%),0.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.42603
Minimum,0
Maximum,1
Zeros (%),0.1%

0,1
Minimum,0.0
5-th percentile,0.0010673
Q1,0.005871
Median,0.013474
Q3,1.0
95-th percentile,1.0
Maximum,1.0
Range,1.0
Interquartile range,0.99413

0,1
Standard deviation,0.4792
Coef of variation,1.1248
Kurtosis,-1.8836
Mean,0.42603
MAD,0.4724
Skewness,0.2936
Sum,12994
Variance,0.22963
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
1.0,11443,37.5%,
0.7786997,1804,5.9%,
0.0346268,1156,3.8%,
0.0081693,1083,3.6%,
0.0032203,830,2.7%,
0.0123577,812,2.7%,
0.0045947,721,2.4%,
0.0086548,505,1.7%,
0.0033836,497,1.6%,
0.0152522,491,1.6%,

Value,Count,Frequency (%),Unnamed: 3
0.0,19,0.1%,
3.71e-05,1,0.0%,
5.03e-05,68,0.2%,
5.08e-05,17,0.1%,
0.0001259,25,0.1%,

Value,Count,Frequency (%),Unnamed: 3
0.0273809,6,0.0%,
0.0346268,1156,3.8%,
0.1026372,11,0.0%,
0.7786997,1804,5.9%,
1.0,11443,37.5%,

0,1
Correlation,0.99639

0,1
Correlation,0.99101

0,1
Distinct count,1996
Unique (%),6.5%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.15835
Minimum,0
Maximum,1
Zeros (%),77.1%

0,1
Minimum,0.0
5-th percentile,0.0
Q1,0.0
Median,0.0
Q3,0.0
95-th percentile,0.99705
Maximum,1.0
Range,1.0
Interquartile range,0.0

0,1
Standard deviation,0.32739
Coef of variation,2.0675
Kurtosis,1.5859
Mean,0.15835
MAD,0.2462
Skewness,1.813
Sum,4829.6
Variance,0.10718
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0.0,23510,77.1%,
1.0,1421,4.7%,
0.70711,169,0.6%,
0.89443,130,0.4%,
0.44721,87,0.3%,
0.94868,85,0.3%,
0.97014,85,0.3%,
0.57735,78,0.3%,
0.40825,50,0.2%,
0.8165,49,0.2%,

Value,Count,Frequency (%),Unnamed: 3
0.0,23510,77.1%,
0.01351,1,0.0%,
0.01537,1,0.0%,
0.01542,1,0.0%,
0.01603,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0.99906,1,0.0%,
0.99913,3,0.0%,
0.99926,1,0.0%,
0.99959,1,0.0%,
1.0,1421,4.7%,

0,1
Distinct count,2430
Unique (%),8.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.21998
Minimum,0
Maximum,1
Zeros (%),64.9%

0,1
Minimum,0.0
5-th percentile,0.0
Q1,0.0
Median,0.0
Q3,0.40825
95-th percentile,1.0
Maximum,1.0
Range,1.0
Interquartile range,0.40825

0,1
Standard deviation,0.34997
Coef of variation,1.591
Kurtosis,-0.011976
Mean,0.21998
MAD,0.29326
Skewness,1.2669
Sum,6709.2
Variance,0.12248
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0.0,19787,64.9%,
1.0,1762,5.8%,
0.70711,519,1.7%,
0.89443,250,0.8%,
0.57735,208,0.7%,
0.44721,198,0.6%,
0.40825,149,0.5%,
0.94868,145,0.5%,
0.66667,122,0.4%,
0.5,108,0.4%,

Value,Count,Frequency (%),Unnamed: 3
0.0,19787,64.9%,
0.02745,1,0.0%,
0.02841,1,0.0%,
0.02877,1,0.0%,
0.03107,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0.99973,1,0.0%,
0.99982,1,0.0%,
0.99983,1,0.0%,
0.9999,1,0.0%,
1.0,1762,5.8%,

0,1
Distinct count,66
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.67198
Minimum,0
Maximum,1
Zeros (%),0.0%

0,1
Minimum,0.0
5-th percentile,0.013477
Q1,0.32221
Median,1.0
Q3,1.0
95-th percentile,1.0
Maximum,1.0
Range,1.0
Interquartile range,0.67779

0,1
Standard deviation,0.40793
Coef of variation,0.60705
Kurtosis,-1.5561
Mean,0.67198
MAD,0.3912
Skewness,-0.52742
Sum,20495
Variance,0.1664
Memory size,476.6 KiB

Value,Count,Frequency (%),Unnamed: 3
1.0,18187,59.6%,
0.322212967,5994,19.7%,
0.110331917,1766,5.8%,
0.075385,1538,5.0%,
0.041581124,733,2.4%,
0.019329773,370,1.2%,
0.014342969,302,1.0%,
0.01347706,244,0.8%,
0.009526054,185,0.6%,
0.009616309,166,0.5%,

Value,Count,Frequency (%),Unnamed: 3
0.0,2,0.0%,
5.4152e-05,4,0.0%,
8.7069e-05,4,0.0%,
0.000213424,5,0.0%,
0.000254835,8,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0.520790562,3,0.0%,
0.5376925,4,0.0%,
0.555165959,1,0.0%,
0.661106483,3,0.0%,
1.0,18187,59.6%,

Unnamed: 0,119,325,99,319,100,101,88,315,314,118,86,324,13,333,214,335
0,0.997754,0.004595,0.322213,0.332109,0.352568,0.089094,0.13484,0.341582,0.583333,0.969052,0.80904,0.34737,0.461538,0.29779,0.137851,0.178935
1,0.91991,0.012358,0.322213,0.189719,0.352568,0.102546,0.0,0.44888,0.488932,0.341784,0.46291,0.256125,0.1875,0.207799,0.107644,0.110012
2,0.919162,0.000696,0.110332,0.076295,0.146644,0.000756,0.0,0.078764,0.446615,0.33998,0.0,0.083892,0.2,0.061374,0.035668,0.025786
3,0.997006,0.005871,1.0,0.168095,1.0,0.204746,0.27617,0.341582,0.583333,0.961057,0.0,0.326102,0.0,0.203917,0.107169,0.116122
4,1.0,0.007445,0.322213,0.086495,0.352568,0.147612,0.14744,0.341582,0.583333,1.0,0.95837,0.119391,0.65,0.079048,0.041699,0.039169


In [9]:
# Find High correlated features
corr_matrix = df.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
high_cor_col = [column for column in upper.columns if any(upper[column] > 0.96)]
high_cor_col

['2',
 '9',
 '122',
 '146',
 '153',
 '165',
 '180',
 '325',
 '326',
 '329',
 '331',
 '333',
 '334',
 '335']

In [10]:
def write_to_submission_file(predicted, id_col, out_file, col = ['_ID_', '_VAL_']):
    predicted = pd.DataFrame(predicted, columns=['_VAL_'], index=id_col)
    predicted['_ID_'] = predicted.index
    predicted.to_csv(out_file, index=False, sep = ',', columns = col)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(train.drop(['0'],  axis=1, errors='ignore'), target,
                                                    test_size=0.2, stratify= target)

In [12]:
%%time
# train at all features
params = {"C": [0.01, 0.05, 0.1, 1, 10]}
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lr = LogisticRegression( class_weight = 'balanced')
gs = GridSearchCV(
    estimator=lr,  
    param_grid=params,  
    cv=kf,  
    error_score=1,  
    scoring='roc_auc',  
    n_jobs=n_jobs,
    verbose=2,
)
gs.fit( X= X_train, y= y_train)
best_score = gs.best_score_
best_lr = gs.best_estimator_
print(best_score)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=3)]: Done  25 out of  25 | elapsed:  1.8min finished


0.72862566674
Wall time: 1min 56s


In [16]:
y_pred = best_lr.predict(X_test)
score_valid = roc_auc_score(y_pred, y_test)
score_valid

0.59733978978461355

In [14]:
#drop corr value
df = df.drop(high_cor_col,axis=1, errors='ignore')
train_clean = df[df['0'].notnull()]
test_clean = df[df['0'].isnull()]
target_clean = df[df['0'].notnull()]['0']
train_clean = train_clean.drop(['0'],  axis=1, errors='ignore')
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(train_clean, target_clean,
                                                    test_size=0.2, stratify= target_clean)
print(train.shape)

(30500, 342)


In [15]:
%%time
# train without high correlated feature
params = {"C": [0.01, 0.05, 0.1, 1, 10]}
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lr = LogisticRegression( class_weight = 'balanced')
gs = GridSearchCV(
    estimator=lr,  
    param_grid=params,  
    cv=kf,  
    error_score=1,  
    scoring='roc_auc',  
    n_jobs=n_jobs,
    verbose=2,
)
gs.fit( X= X_train_clean, y= y_train_clean)
best_score = gs.best_score_
best_lr_clean = gs.best_estimator_
print(best_score)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=3)]: Done  25 out of  25 | elapsed:  2.5min finished


0.726587720026
Wall time: 2min 34s


In [16]:
y_pred_clean = best_lr_clean.predict(X_test_clean)
score_valid = roc_auc_score(y_pred_clean, y_test_clean)
score_valid

0.60070948111145062

In [17]:
def get_dummies(df, categ_col):
    # et dummies from categorical values
    for c in categ_col:
        records = pd.get_dummies(df[c], prefix=c)
        df.drop(c, axis=1, inplace=True)
        df = pd.concat([df, records], axis=1)
    return df

In [18]:
#code categorical values
df = pd.concat([train, test])
df = get_dummies(df,categ_col)
print(df.shape)


(34666, 405)


In [19]:
train_code = df[df['0'].notnull()]
test_code = df[df['0'].isnull()]
target_code = df[df['0'].notnull()]['0']
train_code = train_code.drop(['0'],  axis=1, errors='ignore')
test_code = test_code.drop(['0'], axis=1, errors='ignore')
X_train_code, X_test_code, y_train_code, y_test_code = train_test_split(train_code, target_code,
                                                    test_size=0.2, stratify= target_code)

In [20]:
%%time
# train at all features + coded categorical features
params = {"C": [0.01, 0.05, 0.1, 1],
           "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]}
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
#SVM = SVC(probability=True, kernel = 'rbf', C = 1, gamma =  0.01)
lr = LogisticRegression( class_weight = 'balanced')
gs = GridSearchCV(
    estimator=lr,  
    param_grid=params,  
    cv=kf,  
    error_score=1,  
    scoring='roc_auc',  
    n_jobs=n_jobs,
    verbose=2,
)
gs.fit( X= X_train_code, y= y_train_code)
best_score = gs.best_score_
best_lr_code = gs.best_estimator_
print(best_score)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:  4.7min
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed: 14.7min finished


0.725298048657
Wall time: 14min 49s


In [28]:
y_pred_code = best_lr_code.predict(X_test_code)
score_valid = roc_auc_score(y_pred_code, y_test_code)
score_valid

0.60425373972098084

In [21]:
best_lr_code.fit(train_code, target_code )

LogisticRegression(C=0.05, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [23]:
y_log_pred =  best_lr_code.predict(test_code)
write_to_submission_file(y_log_pred.astype(int), test_code.index, 'log_submission.csv' )

In [34]:
# get hiqh variance feature
df = pd.concat([train, test])
print(df.shape)
var_col = df[num_col].var(axis = 0)
num_col_filtered = list(var_col[var_col > 0.025].index)
print(len(num_col_filtered))
to_drop = list(set(num_col) - set(num_col_filtered))

(34666, 342)
86


In [35]:
df = df.drop(to_drop, axis = 1)
train_var = df[df['0'].notnull()]
test_var = df[df['0'].isnull()]
target_var = df[df['0'].notnull()]['0']
train_var = train_var.drop(['0'],  axis=1, errors='ignore')
X_train_var, X_test_var, y_train_var, y_test_var = train_test_split(train_var, target_var, test_size=0.2, stratify= target_var)
        

In [36]:
%%time
# train at features with high variance
params = {"C": [ 0.05, 0.1, 1, 2, 4],
           "solver": ["newton-cg", "lbfgs", "liblinear"]}
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
#SVM = SVC(probability=True, kernel = 'rbf', C = 1, gamma =  0.01)
lr = LogisticRegression( class_weight = 'balanced')
gs = GridSearchCV(
    estimator=lr,  
    param_grid=params,  
    cv=kf,  
    error_score=1,  
    scoring='roc_auc',  
    n_jobs=n_jobs,
    verbose=2,
)
gs.fit( X= X_train_var, y= y_train_var)
best_score = gs.best_score_
best_lr_var = gs.best_estimator_
print(best_score)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:  1.7min
[Parallel(n_jobs=3)]: Done  75 out of  75 | elapsed:  3.3min finished


0.720332661255
Wall time: 3min 22s


In [37]:
y_pred_var = best_lr_var.predict(X_test_var)
score_valid = roc_auc_score(y_pred_var, y_test_var)
score_valid

0.5981979001485549

In [41]:
%%time
# train bagging at logreg
parameters = {'max_features': [0.5, 0.7, 0.9, 1.], 'max_samples': [0.5, 0.7, 0.9, 1.], "base_estimator__C": [0.05, 0.1, 1]}
skf = StratifiedKFold(shuffle=True, random_state=42)
lg = LogisticRegression(class_weight= 'balanced',random_state=42)
bg = BaggingClassifier(lg, random_state=42, n_estimators=100)
r_grid_search = RandomizedSearchCV(bg, parameters, scoring ='roc_auc', n_iter=10, cv=skf, random_state=1, n_jobs = n_jobs)
r_grid_search = r_grid_search.fit(train_code, target_code)
print(r_grid_search.best_score_)

0.728769187951
Wall time: 1h 42min 36s


In [52]:
best_bglg = r_grid_search.best_estimator_

In [43]:
y_bag_log_pred =  best_bglg.predict(test_code)
write_to_submission_file(y_bag_log_pred.astype(int), test_code.index, 'log_bag_submission.csv' )

In [50]:
%%time
parameters = {'max_features': ['log2',20, 30, 40],
              'min_samples_leaf': [ 3, 5], 
              'max_depth': [20, 25, 30],
               'oob_score': [True]}
etc = ExtraTreesClassifier(n_estimators=300, random_state=42, n_jobs= n_jobs, bootstrap= True)
etcv = GridSearchCV(etc, parameters, n_jobs=-1, cv=skf, verbose=1, scoring='roc_auc')
etcv.fit(X_train_code, y_train_code)
print(etcv.best_estimator_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 12.7min
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed: 22.7min finished


ExtraTreesClassifier(bootstrap=True, class_weight=None, criterion='gini',
           max_depth=30, max_features=20, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=3, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=3,
           oob_score=True, random_state=42, verbose=0, warm_start=False)
Wall time: 23min 19s


In [49]:
etcv.best_score_

0.73331790819055587

In [65]:
# train tuned model for ensemble
lg = LogisticRegression(class_weight= 'balanced',random_state=42, C=0.1)
bglg = BaggingClassifier(lg, random_state=42, n_estimators=400, bootstrap=True, max_features=0.9,
         max_samples=0.9,  n_jobs=n_jobs)
etc = ExtraTreesClassifier(n_estimators=400, random_state=42, n_jobs= n_jobs, bootstrap= True,
                           class_weight='balanced',max_depth=30, max_features=20, 
                           min_samples_leaf=3, min_samples_split=2,
                           oob_score=True)

In [66]:
%%time
bglg.fit(X_train_code, y_train_code)
etc.fit(X_train_code, y_train_code)

Wall time: 19min 41s


In [81]:
y_bglg_pred = bglg.predict_proba(X_test_code)
y_etc_pred = etc.predict_proba(X_test_code)
y_pred = [(etc_p[1]*0.4 + bglg_p[1]*0.6) for etc_p, bglg_p in zip(y_etc_pred, y_bglg_pred)]
y_pred = [ round(x) for x in y_pred]
print(roc_auc_score(y_test_code, y_pred))

0.681123909378


In [86]:
print('Bagging LogReg: ',roc_auc_score(y_test_code, bglg.predict(X_test_code)))

Bagging LogReg:  0.677340788694


In [88]:
print('Extra DT: ',roc_auc_score(y_test_code, etc.predict(X_test_code)))

Extra DT:  0.653590380931


In [None]:
bglg.fit(train_code, target_code)
etc.fit(train_code, target_code)
y_bglg_pred = bglg.predict_proba(test_code)
y_etc_pred = etc.predict_proba(test_code)
y_pred = [(etc_p[1]*0.4 + bglg_p[1]*0.6) for etc_p, bglg_p in zip(y_etc_pred, y_bglg_pred)]
y_pred = [ round(x) for x in y_pred]

In [62]:
write_to_submission_file(np.asarray(y_pred).astype(int), test_code.index, 'simple_ensemble_submission.csv' )
write_to_submission_file(np.asarray(etc.predict(test_code)).astype(int), test_code.index, 'etc_submission.csv' )
write_to_submission_file(np.asarray(bglg.predict(test_code)).astype(int), test_code.index, 'bglg_submission.csv' )