In [586]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer, StandardScaler
from sklearn.model_selection import train_test_split
import pickle
from sklearn.decomposition import PCA

In [587]:
pd.set_option('display.max_columns', None)  # or 1000
# pd.set_option('display.max_rows', None)  # or 1000

In [588]:
# Number of equal-width bins each numeric feature is cut into (passed to
# pd.cut as `bins` in the per-dataset "Features" cells).
nbin = 5
# When True the binned features are one-hot encoded with pd.get_dummies and
# the output pickle's file name gets a '-1hot' suffix.
onthot = True

# Iris

In [589]:
# The file is read with header=None, so the column names are supplied here;
# the last column ('class') is the target.
column_names = ['sepal.length', 'sepal.width', 'petal.length', 'petal.width', 'class']

iris = pd.read_csv('dataset/iris.data', sep=",", header=None, names=column_names, engine='python')

In [590]:
iris.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [591]:
# iris = iris[iris['class'] != 'Iris-virginica']

In [592]:
iris['class'] = iris['class'].astype('category')

In [593]:
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
sepal.length    150 non-null float64
sepal.width     150 non-null float64
petal.length    150 non-null float64
petal.width     150 non-null float64
class           150 non-null category
dtypes: category(1), float64(4)
memory usage: 5.1 KB


In [594]:
iris.describe()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [595]:
iris.select_dtypes('category').describe()

Unnamed: 0,class
count,150
unique,3
top,Iris-virginica
freq,50


In [596]:
# Correlate the numeric features only. `iris` also holds the categorical
# 'class' column; older pandas silently skipped it, but pandas >= 2.0 no
# longer drops non-numeric columns in corr() and fails on the string labels.
display(iris.select_dtypes(include='number').corr())

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
sepal.length,1.0,-0.109369,0.871754,0.817954
sepal.width,-0.109369,1.0,-0.420516,-0.356544
petal.length,0.871754,-0.420516,1.0,0.962757
petal.width,0.817954,-0.356544,0.962757,1.0


In [597]:
# Features
iris_data = iris.drop(columns = ['class'])
iris_label = iris['class']

iris_non_cat_col = iris_data.select_dtypes(exclude = 'category').columns
for c in iris_non_cat_col:
    iris_data[c] = pd.cut(iris_data[c], bins=nbin, labels=range(nbin))
    
iris_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
sepal.length    150 non-null category
sepal.width     150 non-null category
petal.length    150 non-null category
petal.width     150 non-null category
dtypes: category(4)
memory usage: 1.2 KB


In [598]:
iris_data[iris_data.isnull().any(axis=1)]

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width


In [599]:
# One-hot encode the binned features when requested; otherwise the
# categorical bin columns are kept as they are.
# The dummy dtype is pinned to uint8 so the saved matrix holds 0/1 integers
# on every pandas version (pandas >= 2.0 defaults to booleans instead).
if onthot:
    iris_data = pd.get_dummies(iris_data, dtype='uint8')

In [600]:
iris_data.head()

Unnamed: 0,sepal.length_0,sepal.length_1,sepal.length_2,sepal.length_3,sepal.length_4,sepal.width_0,sepal.width_1,sepal.width_2,sepal.width_3,sepal.width_4,petal.length_0,petal.length_1,petal.length_2,petal.length_3,petal.length_4,petal.width_0,petal.width_1,petal.width_2,petal.width_3,petal.width_4
0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0
1,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0
2,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0
3,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0
4,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0


In [601]:
iris_label.head()

0    Iris-setosa
1    Iris-setosa
2    Iris-setosa
3    Iris-setosa
4    Iris-setosa
Name: class, dtype: category
Categories (3, object): [Iris-setosa, Iris-versicolor, Iris-virginica]

In [602]:
iris_label.cat.codes

0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Length: 150, dtype: int8

In [603]:
iris_data.shape

(150, 20)

In [604]:
# Persist a (features, labels) tuple: the processed feature frame plus the
# class labels converted to their integer category codes. The file name
# records the bin count and whether one-hot encoding was applied.
with open('dataset/iris-bin{}{}.pkl'.format(nbin,'-1hot' if onthot else ''), 'wb') as f:
    pickle.dump((iris_data, pd.Series(iris_label.cat.codes, name='class')), f)

# Anuran

In [605]:
name = 'anuran'

In [606]:
data = pd.read_csv('dataset/anuran.csv', sep=",", header=0, engine='python')

In [607]:
data.shape

(7195, 26)

In [608]:
data.head()

Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,MFCCs_11,MFCCs_12,MFCCs_13,MFCCs_14,MFCCs_15,MFCCs_16,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22,Family,Genus,Species,RecordID
0,1.0,0.152936,-0.105586,0.200722,0.317201,0.260764,0.100945,-0.150063,-0.171128,0.124676,0.188654,-0.075622,-0.156436,0.082245,0.135752,-0.024017,-0.108351,-0.077623,-0.009568,0.057684,0.11868,0.014038,Leptodactylidae,Adenomera,AdenomeraAndre,1
1,1.0,0.171534,-0.098975,0.268425,0.338672,0.268353,0.060835,-0.222475,-0.207693,0.170883,0.270958,-0.095004,-0.254341,0.022786,0.16332,0.012022,-0.090974,-0.05651,-0.035303,0.02014,0.082263,0.029056,Leptodactylidae,Adenomera,AdenomeraAndre,1
2,1.0,0.152317,-0.082973,0.287128,0.276014,0.189867,0.008714,-0.242234,-0.219153,0.232538,0.266064,-0.072827,-0.237384,0.050791,0.207338,0.083536,-0.050691,-0.02359,-0.066722,-0.025083,0.099108,0.077162,Leptodactylidae,Adenomera,AdenomeraAndre,1
3,1.0,0.224392,0.118985,0.329432,0.372088,0.361005,0.015501,-0.194347,-0.098181,0.270375,0.267279,-0.162258,-0.317084,-0.011567,0.100413,-0.050224,-0.136009,-0.177037,-0.130498,-0.054766,-0.018691,0.023954,Leptodactylidae,Adenomera,AdenomeraAndre,1
4,1.0,0.087817,-0.068345,0.306967,0.330923,0.249144,0.006884,-0.265423,-0.1727,0.266434,0.332695,-0.100749,-0.298524,0.037439,0.219153,0.062837,-0.048885,-0.053074,-0.08855,-0.031346,0.10861,0.079244,Leptodactylidae,Adenomera,AdenomeraAndre,1


In [609]:
data.columns

Index(['MFCCs_ 1', 'MFCCs_ 2', 'MFCCs_ 3', 'MFCCs_ 4', 'MFCCs_ 5', 'MFCCs_ 6',
       'MFCCs_ 7', 'MFCCs_ 8', 'MFCCs_ 9', 'MFCCs_10', 'MFCCs_11', 'MFCCs_12',
       'MFCCs_13', 'MFCCs_14', 'MFCCs_15', 'MFCCs_16', 'MFCCs_17', 'MFCCs_18',
       'MFCCs_19', 'MFCCs_20', 'MFCCs_21', 'MFCCs_22', 'Family', 'Genus',
       'Species', 'RecordID'],
      dtype='object')

In [610]:
# Use family as target: drop the other label / bookkeeping columns, which
# leaves 'Family' as the last column, and rename that column to 'class'.
data = data.drop(columns=['Genus', 'Species', 'RecordID'])
# Rename through the pandas API rather than writing into
# `data.columns.values`: that array backs the (immutable) column Index and
# must not be modified in place.
data = data.rename(columns={data.columns[-1]: 'class'})
data.head()

Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,MFCCs_11,MFCCs_12,MFCCs_13,MFCCs_14,MFCCs_15,MFCCs_16,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22,class
0,1.0,0.152936,-0.105586,0.200722,0.317201,0.260764,0.100945,-0.150063,-0.171128,0.124676,0.188654,-0.075622,-0.156436,0.082245,0.135752,-0.024017,-0.108351,-0.077623,-0.009568,0.057684,0.11868,0.014038,Leptodactylidae
1,1.0,0.171534,-0.098975,0.268425,0.338672,0.268353,0.060835,-0.222475,-0.207693,0.170883,0.270958,-0.095004,-0.254341,0.022786,0.16332,0.012022,-0.090974,-0.05651,-0.035303,0.02014,0.082263,0.029056,Leptodactylidae
2,1.0,0.152317,-0.082973,0.287128,0.276014,0.189867,0.008714,-0.242234,-0.219153,0.232538,0.266064,-0.072827,-0.237384,0.050791,0.207338,0.083536,-0.050691,-0.02359,-0.066722,-0.025083,0.099108,0.077162,Leptodactylidae
3,1.0,0.224392,0.118985,0.329432,0.372088,0.361005,0.015501,-0.194347,-0.098181,0.270375,0.267279,-0.162258,-0.317084,-0.011567,0.100413,-0.050224,-0.136009,-0.177037,-0.130498,-0.054766,-0.018691,0.023954,Leptodactylidae
4,1.0,0.087817,-0.068345,0.306967,0.330923,0.249144,0.006884,-0.265423,-0.1727,0.266434,0.332695,-0.100749,-0.298524,0.037439,0.219153,0.062837,-0.048885,-0.053074,-0.08855,-0.031346,0.10861,0.079244,Leptodactylidae


In [611]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7195 entries, 0 to 7194
Data columns (total 23 columns):
MFCCs_ 1    7195 non-null float64
MFCCs_ 2    7195 non-null float64
MFCCs_ 3    7195 non-null float64
MFCCs_ 4    7195 non-null float64
MFCCs_ 5    7195 non-null float64
MFCCs_ 6    7195 non-null float64
MFCCs_ 7    7195 non-null float64
MFCCs_ 8    7195 non-null float64
MFCCs_ 9    7195 non-null float64
MFCCs_10    7195 non-null float64
MFCCs_11    7195 non-null float64
MFCCs_12    7195 non-null float64
MFCCs_13    7195 non-null float64
MFCCs_14    7195 non-null float64
MFCCs_15    7195 non-null float64
MFCCs_16    7195 non-null float64
MFCCs_17    7195 non-null float64
MFCCs_18    7195 non-null float64
MFCCs_19    7195 non-null float64
MFCCs_20    7195 non-null float64
MFCCs_21    7195 non-null float64
MFCCs_22    7195 non-null float64
class       7195 non-null object
dtypes: float64(22), object(1)
memory usage: 1.3+ MB


In [612]:
data.describe()

Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,MFCCs_11,MFCCs_12,MFCCs_13,MFCCs_14,MFCCs_15,MFCCs_16,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22
count,7195.0,7195.0,7195.0,7195.0,7195.0,7195.0,7195.0,7195.0,7195.0,7195.0,7195.0,7195.0,7195.0,7195.0,7195.0,7195.0,7195.0,7195.0,7195.0,7195.0,7195.0,7195.0
mean,0.989885,0.323584,0.311224,0.445997,0.127046,0.097939,-0.001397,-0.00037,0.128213,0.055998,-0.115682,0.043371,0.150945,-0.039244,-0.101748,0.042062,0.08868,0.007755,-0.049474,-0.053244,0.037313,0.087567
std,0.069016,0.218653,0.263527,0.160328,0.162722,0.120412,0.171404,0.116302,0.179008,0.127099,0.186792,0.155983,0.20688,0.152515,0.187618,0.119915,0.138055,0.084733,0.082546,0.094181,0.07947,0.123442
min,-0.251179,-0.673025,-0.436028,-0.472676,-0.636012,-0.410417,-0.538982,-0.576506,-0.587313,-0.952266,-0.901989,-0.799441,-0.644116,-0.59038,-0.717156,-0.498675,-0.42148,-0.759322,-0.680745,-0.361649,-0.430812,-0.379304
25%,1.0,0.165945,0.138445,0.336737,0.051717,0.012581,-0.125737,-0.063109,0.004648,-0.001132,-0.26986,-0.033931,-0.002859,-0.13298,-0.255929,-0.019549,-0.001764,-0.042122,-0.106079,-0.120971,-0.01762,0.000533
50%,1.0,0.302184,0.274626,0.481463,0.161361,0.072079,-0.05263,0.013265,0.189317,0.063478,-0.153322,0.051054,0.196921,-0.050715,-0.143259,0.041081,0.112769,0.01182,-0.052626,-0.05518,0.031274,0.105373
75%,1.0,0.466566,0.430695,0.559861,0.222592,0.175957,0.08558,0.075108,0.265395,0.117725,0.026689,0.132432,0.324589,0.039157,0.017348,0.107046,0.201932,0.061889,0.006321,0.001342,0.089619,0.194819
max,1.0,1.0,1.0,1.0,0.752246,0.96424,1.0,0.551762,0.738033,0.522768,0.523033,0.690889,0.94571,0.575749,0.668924,0.6707,0.681157,0.614064,0.574209,0.467831,0.389797,0.432207


In [613]:
# Missing values
# for i,j in zip(data.columns,(data.values.astype(str) == '?').sum(axis = 0)):
#     if j > 0:
#         print(str(i) + ': ' + str(j) + ' records')

In [614]:
# data = data.replace('?',np.NaN)

In [615]:
# Setting all the numerical columns
# is_num = data.describe().loc['unique'].values > 3
# for col in data.iloc[:,is_num].columns:
#     data[col] = pd.to_numeric(data[col])

In [616]:
# Cast to 'category' every column that describe() does not list among its
# (numeric) summary columns.
for col in data.columns.difference(data.describe().columns):
    data[col] = data[col].astype('category')

In [617]:
data.select_dtypes('category').describe()

Unnamed: 0,class
count,7195
unique,4
top,Leptodactylidae
freq,4420


In [618]:
# Correlate the numeric features only. `data` also holds the categorical
# 'class' column; older pandas silently skipped it, but pandas >= 2.0 no
# longer drops non-numeric columns in corr() and fails on the string labels.
display(data.select_dtypes(include='number').corr())

Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,MFCCs_11,MFCCs_12,MFCCs_13,MFCCs_14,MFCCs_15,MFCCs_16,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22
MFCCs_ 1,1.0,-0.264768,-0.308624,0.021448,0.188768,-0.218619,-0.0712,0.037503,-0.000191,0.049657,-0.022288,-0.050154,0.072449,0.01405,-0.097434,0.010451,0.115724,-0.002011,-0.144825,-0.085469,0.072363,0.079368
MFCCs_ 2,-0.264768,1.0,0.611001,-0.042378,-0.420759,0.047503,0.227932,0.232423,-0.040515,-0.226083,0.05926,0.13333,-0.165818,-0.006654,0.270655,-0.055596,-0.269448,0.05002,0.197618,0.129981,-0.036238,-0.201407
MFCCs_ 3,-0.308624,0.611001,1.0,0.029403,-0.756089,0.180192,0.435171,0.035504,-0.187901,-0.188196,0.209527,0.247255,-0.183667,-0.090214,0.209107,-0.024501,-0.19159,0.065077,0.229409,0.338848,-0.055124,-0.405719
MFCCs_ 4,0.021448,-0.042378,0.029403,1.0,0.173574,-0.611927,-0.399321,0.320716,0.445693,-0.103668,-0.45067,0.308691,0.515422,-0.366775,-0.539008,0.250611,0.55294,0.10767,-0.383491,-0.413988,0.233089,0.463047
MFCCs_ 5,0.188768,-0.420759,-0.756089,0.173574,1.0,-0.116983,-0.698757,-0.068843,0.376587,0.150765,-0.239089,-0.071414,0.16971,-0.111034,-0.193764,0.195593,0.180462,-0.136097,-0.154481,-0.2916,-0.015469,0.365276
MFCCs_ 6,-0.218619,0.047503,0.180192,-0.611927,-0.116983,1.0,0.322164,-0.60354,-0.288668,0.184581,0.308249,-0.146011,-0.363535,0.161294,0.401258,-0.065702,-0.444974,-0.225529,0.354775,0.456391,-0.174788,-0.443431
MFCCs_ 7,-0.0712,0.227932,0.435171,-0.399321,-0.698757,0.322164,1.0,-0.116446,-0.812516,0.093195,0.61504,-0.328114,-0.467855,0.487558,0.423671,-0.455578,-0.401475,0.110193,0.233688,0.417053,-0.026791,-0.499355
MFCCs_ 8,0.037503,0.232423,0.035504,0.320716,-0.068843,-0.60354,-0.116446,1.0,0.342841,-0.666405,-0.335232,0.420723,0.226465,-0.261781,-0.145883,0.108638,0.171899,0.12692,-0.238918,-0.285313,0.189337,0.246922
MFCCs_ 9,-0.000191,-0.040515,-0.187901,0.445693,0.376587,-0.288668,-0.812516,0.342841,1.0,-0.223024,-0.852813,0.453657,0.670163,-0.551479,-0.540109,0.497591,0.486216,-0.09975,-0.267932,-0.434309,0.040679,0.520056
MFCCs_10,0.049657,-0.226083,-0.188196,-0.103668,0.150765,0.184581,0.093195,-0.666405,-0.223024,1.0,0.25927,-0.750621,-0.094546,0.516843,-0.008003,-0.270764,0.011631,0.036676,0.071435,0.021804,-0.163389,-0.005355


In [619]:
# Separate the target from the predictors.
data_label = data['class']
data_data = data.drop(columns=['class'])

# Replace every non-categorical predictor by its bin index (0 .. nbin-1)
# over `nbin` equal-width intervals of that column's observed range.
data_non_cat_col = data_data.select_dtypes(exclude='category').columns
for feature in data_non_cat_col:
    binned = pd.cut(data_data[feature], bins=nbin, labels=range(nbin))
    data_data[feature] = binned

data_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7195 entries, 0 to 7194
Data columns (total 22 columns):
MFCCs_ 1    7195 non-null category
MFCCs_ 2    7195 non-null category
MFCCs_ 3    7195 non-null category
MFCCs_ 4    7195 non-null category
MFCCs_ 5    7195 non-null category
MFCCs_ 6    7195 non-null category
MFCCs_ 7    7195 non-null category
MFCCs_ 8    7195 non-null category
MFCCs_ 9    7195 non-null category
MFCCs_10    7195 non-null category
MFCCs_11    7195 non-null category
MFCCs_12    7195 non-null category
MFCCs_13    7195 non-null category
MFCCs_14    7195 non-null category
MFCCs_15    7195 non-null category
MFCCs_16    7195 non-null category
MFCCs_17    7195 non-null category
MFCCs_18    7195 non-null category
MFCCs_19    7195 non-null category
MFCCs_20    7195 non-null category
MFCCs_21    7195 non-null category
MFCCs_22    7195 non-null category
dtypes: category(22)
memory usage: 157.5 KB


In [620]:
data_data[data_data.isnull().any(axis=1)]

Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,MFCCs_11,MFCCs_12,MFCCs_13,MFCCs_14,MFCCs_15,MFCCs_16,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22


In [621]:
# One-hot encode the binned features when requested; otherwise the
# categorical bin columns are kept as they are.
# The dummy dtype is pinned to uint8 so the saved matrix holds 0/1 integers
# on every pandas version (pandas >= 2.0 defaults to booleans instead).
if onthot:
    data_data = pd.get_dummies(data_data, dtype='uint8')

# Persist a (features, labels) tuple; labels are the integer category codes
# of the class column. The file name records the dataset name, the bin count
# and whether one-hot encoding was applied.
with open('dataset/{}-bin{}{}.pkl'.format(name,nbin,'-1hot' if onthot else ''), 'wb') as f:
    pickle.dump((data_data, pd.Series(data_label.cat.codes, name='class')), f)

# Avila

In [622]:
name = 'avila'

In [623]:
# The dataset comes as two header-less files (avila-tr.txt / avila-ts.txt);
# read both and stack them into a single frame with a fresh 0..n-1 row index.
data1 = pd.read_csv('dataset/avila-tr.txt', sep=",", header=None, engine='python')
data2 = pd.read_csv('dataset/avila-ts.txt', sep=",", header=None, engine='python')
data = pd.concat([data1, data2], ignore_index=True)

In [624]:
data.shape

(20867, 11)

In [625]:
data.columns = ['intercolumnar_distance', 'upper_margin', 'lower_margin', 'exploitation', 'row_number',
                'modular_ratio', 'interlinear_spacing', 'weight', 'peak_number', 'modular_ratio_interlinear_spacing', 'class']

In [626]:
data.head()

Unnamed: 0,intercolumnar_distance,upper_margin,lower_margin,exploitation,row_number,modular_ratio,interlinear_spacing,weight,peak_number,modular_ratio_interlinear_spacing,class
0,0.266074,-0.16562,0.32098,0.483299,0.17234,0.273364,0.371178,0.929823,0.251173,0.159345,A
1,0.130292,0.870736,-3.210528,0.062493,0.261718,1.43606,1.46594,0.636203,0.282354,0.515587,A
2,-0.116585,0.069915,0.068476,-0.783147,0.261718,0.439463,-0.081827,-0.888236,-0.123005,0.582939,A
3,0.031541,0.2976,-3.210528,-0.58359,-0.721442,-0.307984,0.710932,1.051693,0.594169,-0.533994,A
4,0.229043,0.807926,-0.052442,0.082634,0.261718,0.14879,0.635431,0.051062,0.032902,-0.086652,F


In [627]:
# Restrict the data to classes A, E, F and I.
# Take an explicit copy: a later cell assigns a re-typed 'class' column back
# into `data`, and doing that on a boolean-mask slice of the original frame
# triggers SettingWithCopyWarning (the write target would be ambiguous).
data = data[data['class'].isin(['A','E','F','I'])].copy()
data.shape

(16348, 11)

In [628]:
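# No target renaming is needed for Avila: the last column was already named 'class' when the column names were assigned above. The lines below are leftovers from other datasets.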
#data = data[[c for c in data.columns if c not in {'Genus','Species','RecordID'}]]
#data = data.drop(columns = ['B','C','D','G','H','W','X','Y'])
# new_cols = data.columns.values
# new_cols[-1] = 'class'
# data.columns = new_cols
# data.head()

In [629]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16348 entries, 0 to 20865
Data columns (total 11 columns):
intercolumnar_distance               16348 non-null float64
upper_margin                         16348 non-null float64
lower_margin                         16348 non-null float64
exploitation                         16348 non-null float64
row_number                           16348 non-null float64
modular_ratio                        16348 non-null float64
interlinear_spacing                  16348 non-null float64
weight                               16348 non-null float64
peak_number                          16348 non-null float64
modular_ratio_interlinear_spacing    16348 non-null float64
class                                16348 non-null object
dtypes: float64(10), object(1)
memory usage: 1.5+ MB


In [630]:
data.describe()

Unnamed: 0,intercolumnar_distance,upper_margin,lower_margin,exploitation,row_number,modular_ratio,interlinear_spacing,weight,peak_number,modular_ratio_interlinear_spacing
count,16348.0,16348.0,16348.0,16348.0,16348.0,16348.0,16348.0,16348.0,16348.0,16348.0
mean,0.042533,0.001368,-0.027875,-0.054004,-0.011441,0.006576,0.015014,-0.03566,-0.116727,-0.016203
std,1.011743,3.168384,1.055251,0.960523,1.022469,1.052959,1.184643,0.999404,0.999673,0.95397
min,-3.498799,-2.426761,-3.210528,-5.440122,-4.922215,-7.450257,-11.935457,-4.164819,-5.486218,-6.719324
25%,-0.079554,-0.267686,0.050694,-0.545248,0.17234,-0.557133,-0.044076,-0.572192,-0.43482,-0.471243
50%,0.09326,-0.087108,0.19295,0.08708,0.261718,-0.058835,0.220177,0.081418,0.001721,-0.021873
75%,0.241386,0.179832,0.328093,0.580183,0.261718,0.522513,0.446679,0.609716,0.375899,0.479421
max,9.943651,386.0,50.0,2.974359,1.066121,53.0,83.0,13.173081,44.0,11.911338


In [631]:
# Missing values
# for i,j in zip(data.columns,(data.values.astype(str) == '?').sum(axis = 0)):
#     if j > 0:
#         print(str(i) + ': ' + str(j) + ' records')

In [632]:
# data = data.replace('?',np.NaN)

In [633]:
# Setting all the numerical columns
# is_num = data.describe().loc['unique'].values > 3
# for col in data.iloc[:,is_num].columns:
#     data[col] = pd.to_numeric(data[col])

In [634]:
# Cast to 'category' every column that describe() does not list among its
# (numeric) summary columns.
for col in data.columns.difference(data.describe().columns):
    data[col] = data[col].astype('category')

In [635]:
data.select_dtypes('category').describe()

Unnamed: 0,class
count,16348
unique,4
top,A
freq,8572


In [636]:
# Correlate the numeric features only. `data` also holds the categorical
# 'class' column; older pandas silently skipped it, but pandas >= 2.0 no
# longer drops non-numeric columns in corr() and fails on the string labels.
display(data.select_dtypes(include='number').corr())

Unnamed: 0,intercolumnar_distance,upper_margin,lower_margin,exploitation,row_number,modular_ratio,interlinear_spacing,weight,peak_number,modular_ratio_interlinear_spacing
intercolumnar_distance,1.0,-0.042751,0.032575,0.004166,0.448045,-0.032858,-0.036857,-0.048611,0.124827,0.030052
upper_margin,-0.042751,1.0,0.350797,0.002339,-0.067304,0.354345,0.517365,0.037797,0.28689,-0.023834
lower_margin,0.032575,0.350797,1.0,0.158222,-0.003429,0.065504,0.223952,0.045629,0.132527,-0.06656
exploitation,0.004166,0.002339,0.158222,1.0,0.101526,0.234069,0.096522,0.32332,0.215945,0.280302
row_number,0.448045,-0.067304,-0.003429,0.101526,1.0,0.064126,0.02521,-0.082638,0.306538,0.182162
modular_ratio,-0.032858,0.354345,0.065504,0.234069,0.064126,1.0,0.459212,-0.053702,0.17266,0.783573
interlinear_spacing,-0.036857,0.517365,0.223952,0.096522,0.02521,0.459212,1.0,0.03917,0.219869,0.341714
weight,-0.048611,0.037797,0.045629,0.32332,-0.082638,-0.053702,0.03917,1.0,0.480267,-0.021022
peak_number,0.124827,0.28689,0.132527,0.215945,0.306538,0.17266,0.219869,0.480267,1.0,0.194751
modular_ratio_interlinear_spacing,0.030052,-0.023834,-0.06656,0.280302,0.182162,0.783573,0.341714,-0.021022,0.194751,1.0


In [637]:
# Separate the target from the predictors.
data_label = data['class']
data_data = data.drop(columns=['class'])

# Replace every non-categorical predictor by its bin index (0 .. nbin-1)
# over `nbin` equal-width intervals of that column's observed range.
data_non_cat_col = data_data.select_dtypes(exclude='category').columns
for feature in data_non_cat_col:
    binned = pd.cut(data_data[feature], bins=nbin, labels=range(nbin))
    data_data[feature] = binned

data_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16348 entries, 0 to 20865
Data columns (total 10 columns):
intercolumnar_distance               16348 non-null category
upper_margin                         16348 non-null category
lower_margin                         16348 non-null category
exploitation                         16348 non-null category
row_number                           16348 non-null category
modular_ratio                        16348 non-null category
interlinear_spacing                  16348 non-null category
weight                               16348 non-null category
peak_number                          16348 non-null category
modular_ratio_interlinear_spacing    16348 non-null category
dtypes: category(10)
memory usage: 288.6 KB


In [638]:
data_data[data_data.isnull().any(axis=1)]

Unnamed: 0,intercolumnar_distance,upper_margin,lower_margin,exploitation,row_number,modular_ratio,interlinear_spacing,weight,peak_number,modular_ratio_interlinear_spacing


In [639]:
# One-hot encode the binned features when requested; otherwise the
# categorical bin columns are kept as they are.
# The dummy dtype is pinned to uint8 so the saved matrix holds 0/1 integers
# on every pandas version (pandas >= 2.0 defaults to booleans instead).
if onthot:
    data_data = pd.get_dummies(data_data, dtype='uint8')

# Persist a (features, labels) tuple; labels are the integer category codes
# of the class column. The file name records the dataset name, the bin count
# and whether one-hot encoding was applied.
with open('dataset/{}-bin{}{}.pkl'.format(name,nbin,'-1hot' if onthot else ''), 'wb') as f:
    pickle.dump((data_data, pd.Series(data_label.cat.codes, name='class')), f)

# Cardio

In [640]:
name = 'cardio'

In [641]:
# header=0 takes the column names from the first line of the file;
# skiprows=[1] then skips the line directly below it (file line index 1).
data = pd.read_csv('dataset/CTG.csv', sep=",", header=0, skiprows=[1], engine='python')

In [642]:
data.shape

(2129, 40)

In [643]:
data.head()

Unnamed: 0,FileName,Date,SegFile,b,e,LBE,LB,AC,FM,UC,ASTV,MSTV,ALTV,MLTV,DL,DS,DP,DR,Width,Min,Max,Nmax,Nzeros,Mode,Mean,Median,Variance,Tendency,A,B,C,D,E,AD,DE,LD,FS,SUSP,CLASS,NSP
0,Variab10.txt,12/1/96,CTG0001.txt,240.0,357.0,120.0,120.0,0.0,0.0,0.0,73.0,0.5,43.0,2.4,0.0,0.0,0.0,0.0,64.0,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,9.0,2.0
1,Fmcs_1.txt,5/3/96,CTG0002.txt,5.0,632.0,132.0,132.0,4.0,0.0,4.0,17.0,2.1,0.0,10.4,2.0,0.0,0.0,0.0,130.0,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,6.0,1.0
2,Fmcs_1.txt,5/3/96,CTG0003.txt,177.0,779.0,133.0,133.0,2.0,0.0,5.0,16.0,2.1,0.0,13.4,2.0,0.0,0.0,0.0,130.0,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,6.0,1.0
3,Fmcs_1.txt,5/3/96,CTG0004.txt,411.0,1192.0,134.0,134.0,2.0,0.0,6.0,16.0,2.4,0.0,23.0,2.0,0.0,0.0,0.0,117.0,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,6.0,1.0
4,Fmcs_1.txt,5/3/96,CTG0005.txt,533.0,1147.0,132.0,132.0,4.0,0.0,5.0,16.0,2.4,0.0,19.9,0.0,0.0,0.0,0.0,117.0,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0


In [644]:
data.columns

Index(['FileName', 'Date', 'SegFile', 'b', 'e', 'LBE', 'LB', 'AC', 'FM', 'UC',
       'ASTV', 'MSTV', 'ALTV', 'MLTV', 'DL', 'DS', 'DP', 'DR', 'Width', 'Min',
       'Max', 'Nmax', 'Nzeros', 'Mode', 'Mean', 'Median', 'Variance',
       'Tendency', 'A', 'B', 'C', 'D', 'E', 'AD', 'DE', 'LD', 'FS', 'SUSP',
       'CLASS', 'NSP'],
      dtype='object')

In [645]:
# Use NSP as target: drop the columns that are not used as features, which
# leaves 'NSP' as the last column, and rename that column to 'class'.
data = data.drop(columns=['FileName', 'Date', 'SegFile', 'b', 'e', 'LBE', 'DR',
                          'A', 'B', 'C', 'D', 'E', 'AD', 'DE', 'LD', 'FS', 'SUSP', 'CLASS'])
# Rename through the pandas API rather than writing into
# `data.columns.values`: that array backs the (immutable) column Index and
# must not be modified in place.
data = data.rename(columns={data.columns[-1]: 'class'})
print(data.shape)
data.head()

(2129, 22)


Unnamed: 0,LB,AC,FM,UC,ASTV,MSTV,ALTV,MLTV,DL,DS,DP,Width,Min,Max,Nmax,Nzeros,Mode,Mean,Median,Variance,Tendency,class
0,120.0,0.0,0.0,0.0,73.0,0.5,43.0,2.4,0.0,0.0,0.0,64.0,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0,2.0
1,132.0,4.0,0.0,4.0,17.0,2.1,0.0,10.4,2.0,0.0,0.0,130.0,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0,1.0
2,133.0,2.0,0.0,5.0,16.0,2.1,0.0,13.4,2.0,0.0,0.0,130.0,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0,1.0
3,134.0,2.0,0.0,6.0,16.0,2.4,0.0,23.0,2.0,0.0,0.0,117.0,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0,1.0
4,132.0,4.0,0.0,5.0,16.0,2.4,0.0,19.9,0.0,0.0,0.0,117.0,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0,1.0


In [646]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2129 entries, 0 to 2128
Data columns (total 22 columns):
LB          2126 non-null float64
AC          2126 non-null float64
FM          2127 non-null float64
UC          2127 non-null float64
ASTV        2127 non-null float64
MSTV        2127 non-null float64
ALTV        2127 non-null float64
MLTV        2127 non-null float64
DL          2128 non-null float64
DS          2128 non-null float64
DP          2128 non-null float64
Width       2126 non-null float64
Min         2126 non-null float64
Max         2126 non-null float64
Nmax        2126 non-null float64
Nzeros      2126 non-null float64
Mode        2126 non-null float64
Mean        2126 non-null float64
Median      2126 non-null float64
Variance    2126 non-null float64
Tendency    2126 non-null float64
class       2126 non-null float64
dtypes: float64(22)
memory usage: 366.0 KB


In [647]:
data.describe()

Unnamed: 0,LB,AC,FM,UC,ASTV,MSTV,ALTV,MLTV,DL,DS,DP,Width,Min,Max,Nmax,Nzeros,Mode,Mean,Median,Variance,Tendency,class
count,2126.0,2126.0,2127.0,2127.0,2127.0,2127.0,2127.0,2127.0,2128.0,2128.0,2128.0,2126.0,2126.0,2126.0,2126.0,2126.0,2126.0,2126.0,2126.0,2126.0,2126.0,2126.0
mean,133.303857,2.722484,7.503056,3.669017,47.008933,1.335449,9.884814,8.207616,1.576128,0.003759,0.12782,70.445908,93.579492,164.0254,4.068203,0.323612,137.452023,134.610536,138.09031,18.80809,0.32032,1.304327
std,9.840844,3.56085,39.030452,2.877148,17.210648,0.891543,18.476534,5.701926,2.517794,0.061213,0.471687,38.955693,29.560212,17.944183,2.949386,0.706059,16.381289,15.593596,14.466589,28.977636,0.610829,0.614377
min,106.0,0.0,0.0,0.0,12.0,0.2,0.0,0.0,0.0,0.0,0.0,3.0,50.0,122.0,0.0,0.0,60.0,73.0,77.0,0.0,-1.0,1.0
25%,126.0,0.0,0.0,1.0,32.0,0.7,0.0,4.6,0.0,0.0,0.0,37.0,67.0,152.0,2.0,0.0,129.0,125.0,129.0,2.0,0.0,1.0
50%,133.0,1.0,0.0,3.0,49.0,1.2,0.0,7.4,0.0,0.0,0.0,67.5,93.0,162.0,3.0,0.0,139.0,136.0,139.0,7.0,0.0,1.0
75%,140.0,4.0,2.0,5.0,61.0,1.7,11.0,10.8,3.0,0.0,0.0,100.0,120.0,174.0,6.0,0.0,148.0,145.0,148.0,24.0,1.0,1.0
max,160.0,26.0,564.0,23.0,87.0,7.0,91.0,50.7,16.0,1.0,4.0,180.0,159.0,238.0,18.0,10.0,187.0,182.0,186.0,269.0,1.0,3.0


In [648]:
# Missing values
# for i,j in zip(data.columns,(data.values.astype(str) == '?').sum(axis = 0)):
#     if j > 0:
#         print(str(i) + ': ' + str(j) + ' records')

In [649]:
# Report how many rows contain at least one missing value, then discard them.
print(data.isnull().any(axis=1).sum())
data = data.dropna()

3


In [650]:
# data = data.replace('?',np.NaN)

In [651]:
# Setting all the numerical columns
# is_num = data.describe().loc['unique'].values > 3
# for col in data.iloc[:,is_num].columns:
#     data[col] = pd.to_numeric(data[col])

In [652]:
# The target was read as a float column, so describe() would count it as
# numeric; cast it to 'category' explicitly instead of detecting it.
data['class'] = data['class'].astype('category')

In [653]:
data.select_dtypes('category').describe()

Unnamed: 0,class
count,2126.0
unique,3.0
top,1.0
freq,1655.0


In [654]:
data['class'].value_counts() / len(data)

1.0    0.778457
2.0    0.138758
3.0    0.082785
Name: class, dtype: float64

In [655]:
# Correlate the numeric features only, leaving out the categorical 'class'
# target. Older pandas skipped it implicitly; pandas >= 2.0 no longer drops
# non-numeric columns in corr(), so the selection is made explicit here.
display(data.select_dtypes(include='number').corr())

Unnamed: 0,LB,AC,FM,UC,ASTV,MSTV,ALTV,MLTV,DL,DS,DP,Width,Min,Max,Nmax,Nzeros,Mode,Mean,Median,Variance,Tendency
LB,1.0,-0.077967,-0.033305,-0.127722,0.30557,-0.279607,0.28563,-0.032091,-0.155908,-0.053518,-0.117133,-0.147679,0.361619,0.27511,-0.113933,-0.004745,0.708993,0.723121,0.789246,-0.133938,0.293503
AC,-0.077967,1.0,0.053956,0.2121,-0.197985,0.191378,-0.345162,-0.128901,-0.01135,-0.039341,-0.110317,0.291516,-0.157447,0.373493,0.191475,-0.01143,0.221693,0.236811,0.250729,0.111981,0.032235
FM,-0.033305,0.053956,1.0,-0.036626,-0.116461,0.138061,-0.072214,0.021031,0.076283,-0.010328,0.205337,0.171448,-0.158422,0.111228,0.183904,-0.014632,-0.037492,-0.085718,-0.056281,0.177793,-6.9e-05
UC,-0.127722,0.2121,-0.036626,1.0,-0.079651,0.232687,-0.247513,-0.10659,0.339484,0.018405,0.152395,0.15299,-0.112597,0.146645,0.095512,0.022701,-0.077695,-0.169739,-0.111041,0.184188,-0.060995
ASTV,0.30557,-0.197985,-0.116461,-0.079651,1.0,-0.430705,0.459413,-0.315105,-0.03704,0.033949,0.053677,-0.260463,0.275378,-0.111806,-0.167561,-0.149296,0.058363,0.074554,0.11996,-0.146434,-0.005748
MSTV,-0.279607,0.191378,0.138061,0.232687,-0.430705,1.0,-0.470259,0.073892,0.516901,0.03413,0.287548,0.660847,-0.622569,0.409072,0.50143,0.266183,-0.307586,-0.445401,-0.336109,0.555852,-0.06614
ALTV,0.28563,-0.345162,-0.072214,-0.247513,0.459413,-0.470259,1.0,-0.171114,-0.266298,-0.03077,-0.140133,-0.451297,0.422834,-0.283183,-0.279301,-0.121784,0.165211,0.222321,0.18648,-0.281536,0.042481
MLTV,-0.032091,-0.128901,0.021031,-0.10659,-0.315105,0.073892,-0.171114,1.0,-0.251442,-0.037667,-0.230849,0.110942,-0.144976,0.002023,0.056357,0.123869,0.072071,0.137813,0.063228,-0.164079,0.153093
DL,-0.155908,-0.01135,0.076283,0.339484,-0.03704,0.516901,-0.266298,-0.251442,1.0,0.121617,0.294068,0.520656,-0.539125,0.242187,0.39386,0.208487,-0.315463,-0.509446,-0.353878,0.51513,-0.000687
DS,-0.053518,-0.039341,-0.010328,0.018405,0.033949,0.03413,-0.03077,-0.037667,0.121617,1.0,0.019766,0.04488,-0.071974,-0.021135,0.007024,0.043441,-0.215161,-0.158673,-0.160451,0.136421,-0.070483


In [656]:
# Separate the target from the predictors.
data_label = data['class']
data_data = data.drop(columns=['class'])

# Replace every non-categorical predictor by its bin index (0 .. nbin-1)
# over `nbin` equal-width intervals of that column's observed range.
data_non_cat_col = data_data.select_dtypes(exclude='category').columns
for feature in data_non_cat_col:
    binned = pd.cut(data_data[feature], bins=nbin, labels=range(nbin))
    data_data[feature] = binned

data_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2126 entries, 0 to 2125
Data columns (total 21 columns):
LB          2126 non-null category
AC          2126 non-null category
FM          2126 non-null category
UC          2126 non-null category
ASTV        2126 non-null category
MSTV        2126 non-null category
ALTV        2126 non-null category
MLTV        2126 non-null category
DL          2126 non-null category
DS          2126 non-null category
DP          2126 non-null category
Width       2126 non-null category
Min         2126 non-null category
Max         2126 non-null category
Nmax        2126 non-null category
Nzeros      2126 non-null category
Mode        2126 non-null category
Mean        2126 non-null category
Median      2126 non-null category
Variance    2126 non-null category
Tendency    2126 non-null category
dtypes: category(21)
memory usage: 62.8 KB


In [657]:
data_data[data_data.isnull().any(axis=1)]

Unnamed: 0,LB,AC,FM,UC,ASTV,MSTV,ALTV,MLTV,DL,DS,DP,Width,Min,Max,Nmax,Nzeros,Mode,Mean,Median,Variance,Tendency


In [658]:
# One-hot encode the binned features when requested; otherwise the
# categorical bin columns are kept as they are.
# The dummy dtype is pinned to uint8 so the saved matrix holds 0/1 integers
# on every pandas version (pandas >= 2.0 defaults to booleans instead).
if onthot:
    data_data = pd.get_dummies(data_data, dtype='uint8')

# Persist a (features, labels) tuple; labels are the integer category codes
# of the class column. The file name records the dataset name, the bin count
# and whether one-hot encoding was applied.
with open('dataset/{}-bin{}{}.pkl'.format(name,nbin,'-1hot' if onthot else ''), 'wb') as f:
    pickle.dump((data_data, pd.Series(data_label.cat.codes, name='class')), f)

# Contraceptive

In [659]:
name = 'contracept'

In [660]:
data = pd.read_csv('dataset/cmc.data', sep=",", header=None, engine='python')

In [661]:
data.shape

(1473, 10)

In [662]:
data.columns = ['age', 'education', 'husband_education', 'nchildren', 'religion', 'working',
                'husband_occupation', 'living index', 'media_exposure', 'class']

In [663]:
data.head()

Unnamed: 0,age,education,husband_education,nchildren,religion,working,husband_occupation,living index,media_exposure,class
0,24,2,3,3,1,1,2,3,0,1
1,45,1,3,10,1,1,3,4,0,1
2,43,2,3,7,1,1,3,4,0,1
3,42,3,2,9,1,1,3,3,0,1
4,36,3,3,8,1,1,3,2,0,1


In [664]:
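# No column dropping or target renaming is needed here: the last column was already named 'class' when the column names were assigned above. The lines below are leftovers from other datasets.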
# data = data[data['class'].isin(['A','E','F','I'])]
#data = data[[c for c in data.columns if c not in {'Genus','Species','RecordID'}]]
# data = data.drop(columns = ['FileName', 'Date', 'SegFile', 'b', 'e', 'LBE', 'DR', 
#                             'A', 'B', 'C', 'D', 'E', 'AD', 'DE', 'LD', 'FS', 'SUSP', 'CLASS'])
# new_cols = data.columns.values
# new_cols[-1] = 'class'
# data.columns = new_cols
# print(data.shape)
# data.head()

In [665]:
# Per-column dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1473 entries, 0 to 1472
Data columns (total 10 columns):
age                   1473 non-null int64
education             1473 non-null int64
husband_education     1473 non-null int64
nchildren             1473 non-null int64
religion              1473 non-null int64
working               1473 non-null int64
husband_occupation    1473 non-null int64
living index          1473 non-null int64
media_exposure        1473 non-null int64
class                 1473 non-null int64
dtypes: int64(10)
memory usage: 115.2 KB


In [666]:
# Summary statistics; the coded columns have not been cast to category yet,
# so they are summarised as plain integers here too.
data.describe()

Unnamed: 0,age,education,husband_education,nchildren,religion,working,husband_occupation,living index,media_exposure,class
count,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0
mean,32.538357,2.958588,3.429735,3.261371,0.850645,0.749491,2.137814,3.133741,0.073999,1.919891
std,8.227245,1.014994,0.816349,2.358549,0.356559,0.433453,0.864857,0.976161,0.261858,0.876376
min,16.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
25%,26.0,2.0,3.0,1.0,1.0,0.0,1.0,3.0,0.0,1.0
50%,32.0,3.0,4.0,3.0,1.0,1.0,2.0,3.0,0.0,2.0
75%,39.0,4.0,4.0,4.0,1.0,1.0,3.0,4.0,0.0,3.0
max,49.0,4.0,4.0,16.0,1.0,1.0,4.0,4.0,1.0,3.0


In [667]:
# Missing values
# for i,j in zip(data.columns,(data.values.astype(str) == '?').sum(axis = 0)):
#     if j > 0:
#         print(str(i) + ': ' + str(j) + ' records')

In [668]:
# data = data.replace('?',np.NaN)

In [669]:
# Setting all the numerical columns
# is_num = data.describe().loc['unique'].values > 3
# for col in data.iloc[:,is_num].columns:
#     data[col] = pd.to_numeric(data[col])

In [670]:
# Cast the coded columns (target included) to the pandas 'category' dtype.
# The list is explicit because these columns are stored as integers and so
# cannot be told apart from numeric ones by dtype; 'age' and 'nchildren' are
# the only columns left numeric.
categorical_columns = [
    'education', 'husband_education', 'religion', 'working',
    'husband_occupation', 'living index', 'media_exposure', 'class',
]
for col in categorical_columns:
    data[col] = data[col].astype('category')

In [671]:
data.select_dtypes('category').describe()

Unnamed: 0,education,husband_education,religion,working,husband_occupation,living index,media_exposure,class
count,1473,1473,1473,1473,1473,1473,1473,1473
unique,4,4,2,2,4,4,2,3
top,4,4,1,1,3,4,0,1
freq,577,899,1253,1104,585,684,1364,629


In [672]:
# Pairwise correlation between the numeric columns only (here: age, nchildren).
# The numeric columns are selected explicitly rather than relying on corr()
# silently dropping non-numeric ones: that implicit behaviour changed in
# pandas 2.0 (numeric_only now defaults to False), while select_dtypes gives
# the same numeric-only matrix on old and new pandas versions.
display(data.select_dtypes(include='number').corr())

Unnamed: 0,age,nchildren
age,1.0,0.540126
nchildren,0.540126,1.0


In [673]:
# Features
data_data = data.drop(columns = ['class'])
data_label = data['class']

data_non_cat_col = data_data.select_dtypes(exclude = 'category').columns
for c in data_non_cat_col:
    data_data[c] = pd.cut(data_data[c], bins=nbin, labels=range(nbin))
    
data_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1473 entries, 0 to 1472
Data columns (total 9 columns):
age                   1473 non-null category
education             1473 non-null category
husband_education     1473 non-null category
nchildren             1473 non-null category
religion              1473 non-null category
working               1473 non-null category
husband_occupation    1473 non-null category
living index          1473 non-null category
media_exposure        1473 non-null category
dtypes: category(9)
memory usage: 14.4 KB


In [674]:
# Show every row that has a null in at least one feature column.
rows_with_null = data_data.isnull().any(axis=1)
data_data[rows_with_null]

Unnamed: 0,age,education,husband_education,nchildren,religion,working,husband_occupation,living index,media_exposure


In [675]:
# When the one-hot flag is set, expand the categorical features into indicator
# columns; otherwise the binned categorical frame is saved unchanged.
if onthot:
    data_data = pd.get_dummies(data_data)

# File name pattern: dataset/<name>-bin<nbin>[-1hot].pkl
onehot_suffix = '-1hot' if onthot else ''
pickle_path = 'dataset/{}-bin{}{}.pkl'.format(name, nbin, onehot_suffix)

# Persist a (features, labels) tuple; labels are the integer category codes
# of the target, stored as a Series named 'class'.
with open(pickle_path, 'wb') as f:
    label_codes = pd.Series(data_label.cat.codes, name='class')
    pickle.dump((data_data, label_codes), f)