In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import os
from nltk.tokenize import RegexpTokenizer 
from collections import Counter
import arff
from bs4 import BeautifulSoup
import subprocess
import random
from mdlp.discretization import MDLP



In [2]:
# get label
def get_label(file):
    with open(file, 'r') as f:
        lines = f.read()
        soup = BeautifulSoup(lines, 'lxml')
        n_label = len(soup.find_all('label'))
        
        label_list = []
        for label in soup.find_all('label'): 
            label_list.append(label.attrs['name'])
        
    return n_label, label_list

def get_data_label(filePath, dataset):


    # get label
    file = os.path.join(filePath+dataset,dataset+'.xml')
    n_label, label_list = get_label(file)

    # get X, y
    file = os.path.join(filePath,dataset,dataset+'.csv')
    data = pd.read_csv(file)
    X = data.iloc[:,:-n_label]
    y = data.iloc[:,-n_label:]
    
    return X,y

def calculate_LAIM(interval, feature_data, label):
    # discrete
    feature_data = pd.cut(feature_data, bins = interval, include_lowest=True)
    
    quanta_matrix = pd.concat([feature_data, label],axis=1).groupby(feature_data.name).sum()
    
    max_r = quanta_matrix.max(axis=1)**2
    
    M_r = quanta_matrix.sum(axis=1)
    
    total_pos = y.values.sum()
    
    n = len(interval) - 1
    
    LAIM = (max_r/M_r).sum() / (n*total_pos)
    
    return LAIM

def perform_LAIM(data, label):
    
    data_dis = pd.DataFrame()
    for i in range(data.shape[1]):
        
        # step 1: intialization
        attr = data.iloc[:,i]
        print("Performing attribute:",attr.name)
        max_value = attr.max()
        min_value = attr.min()
        sort_attr = attr.sort_values(ascending=True)
        
        # form B, the set of candidate interval boundaries, 
        # with min, max and 
        # all the mid points of all the adjacent pairs
        B = set()
        for i in range(len(sort_attr)):
            if i == 0:
                B.add(sort_attr[i])
                B.add((sort_attr[i] + sort_attr[i+1])/2)
            elif i == len(sort_attr)-1:
                B.add(sort_attr[i])
                break
            else:
                B.add((sort_attr[i] + sort_attr[i+1])/2) 
        len_b = len(B)
        
        D_i = [min_value, max_value] # initial interval
        globalLAIM = 0
        
        if len(B) == 2:
            feature_data = pd.cut(list(attr), bins = [min_value, max_value/2, max_value], include_lowest=True).codes
            data_dis = pd.concat([data_dis, pd.Series(feature_data)],axis=1)
            print("bins:",D_i)
        
        else:
            # step 2: Discretization
            nIntervals = 1
            while True:
                midpoint_LAIM = []
                for b_j in B:

                    D = list(D_i)
                    # Add a midpoint, b_j, into C which is not still in D_i
                    if b_j not in D:
                        D.append(b_j)
                        D.sort()
                        # calculate LAIM value
                        LAIM = calculate_LAIM(D, attr, label)
                        midpoint_LAIM.append((b_j, LAIM))       
                # Accept the midpoint, bj, with highest value of LAIM
                midpoint_max = max(midpoint_LAIM, key=lambda x: x[1])

                if midpoint_max[1] > globalLAIM:
                    D_i.append(midpoint_max[0])
                    D_i.sort()
                    B.remove(midpoint_max[0])
                    globalLAIM = midpoint_max[1]

                else:
                    feature_data = pd.cut(list(attr), bins = D_i, include_lowest=True).codes
                    data_dis = pd.concat([data_dis, pd.Series(feature_data)],axis=1)
                    print("nInterval:",nIntervals, "bins:",D_i)
                    break

                nIntervals += 1
                if len(D_i) == len_b:
                    feature_data = pd.cut(list(attr), bins = D_i, include_lowest=True).codes
                    data_dis = pd.concat([data_dis, pd.Series(feature_data)],axis=1)
                    print("nInterval:",nIntervals, "bins:",D_i)
                    break
    
    
    data_dis.columns = data.columns
    
    return data_dis

# small datasets

In [3]:

filePath = '/Volumes/Samsung_T5/research/data/small_datasets/'
for dataset in ['emotions','yeast','scene']:
    X, y = get_data_label(filePath, dataset)
    X_dis = perform_LAIM(X, y)
    y.to_csv(os.path.join(filePath,dataset,'y.csv'), index=False)
    X_dis.to_csv(os.path.join(filePath,dataset,'X.csv'), index=False)



Performing attribute: Mean_Acc1298_Mean_Mem40_Centroid
nInterval: 2 bins: [0.010201, 0.085428000000000004, 0.195412]
Performing attribute: Mean_Acc1298_Mean_Mem40_Rolloff
nInterval: 2 bins: [0.038286000000000001, 0.177256, 0.69827700000000004]
Performing attribute: Mean_Acc1298_Mean_Mem40_Flux
nInterval: 2 bins: [0.070932000000000009, 0.088305500000000009, 0.15945999999999999]
Performing attribute: Mean_Acc1298_Mean_Mem40_MFCC_0
nInterval: 2 bins: [-99.090801999999996, -70.85131100000001, -56.297651999999999]
Performing attribute: Mean_Acc1298_Mean_Mem40_MFCC_1
nInterval: 2 bins: [0.051473999999999999, 3.6759235000000001, 12.069754]
Performing attribute: Mean_Acc1298_Mean_Mem40_MFCC_2
nInterval: 2 bins: [-2.277307, 2.7672185000000002, 3.9108730000000005]
Performing attribute: Mean_Acc1298_Mean_Mem40_MFCC_3
nInterval: 2 bins: [-0.60460900000000006, 1.365394, 4.3823699999999999]
Performing attribute: Mean_Acc1298_Mean_Mem40_MFCC_4
nInterval: 2 bins: [-1.143864, -0.74528649999999996, 2.25

nInterval: 2 bins: [0.0, 0.40480850000000002, 1.598533]
Performing attribute: BH_LowPeakBPM
nInterval: 2 bins: [0.0, 69.0, 115.0]
Performing attribute: BH_HighPeakAmp
nInterval: 2 bins: [0.0, 0.227072, 1.762948]
Performing attribute: BH_HighPeakBPM
nInterval: 2 bins: [0.0, 138.0, 237.0]
Performing attribute: BH_HighLowRatio
nInterval: 2 bins: [0.0, 2.0, 3.0]
Performing attribute: BHSUM1
nInterval: 2 bins: [0.0, 0.58176549999999994, 1.7951279999999998]
Performing attribute: BHSUM2
nInterval: 2 bins: [0.0, 0.6421095, 1.7977650000000001]
Performing attribute: BHSUM3
nInterval: 2 bins: [0.0, 1.1753045, 3.4228989999999997]
Performing attribute: Att1
nInterval: 2 bins: [-0.37114599999999998, 0.27813300000000002, 0.52027200000000007]
Performing attribute: Att2
nInterval: 2 bins: [-0.472632, -0.186832, 0.61411400000000005]
Performing attribute: Att3
nInterval: 2 bins: [-0.33919499999999997, -0.19344349999999999, 0.35324099999999997]
Performing attribute: Att4
nInterval: 2 bins: [-0.467945, -0.

nInterval: 2 bins: [-0.35533799999999999, 0.076647500000000007, 0.43108000000000002]
Performing attribute: Att72
nInterval: 2 bins: [-0.31593699999999997, -0.073661499999999991, 0.34933200000000003]
Performing attribute: Att73
nInterval: 2 bins: [-0.44144499999999998, -0.086526000000000006, 0.30306900000000003]
Performing attribute: Att74
nInterval: 2 bins: [-0.46675299999999997, -0.100383, 0.39474499999999996]
Performing attribute: Att75
nInterval: 2 bins: [-0.38469399999999998, 0.25953799999999999, 0.49895699999999998]
Performing attribute: Att76
nInterval: 2 bins: [-0.38152600000000003, -0.058276499999999995, 0.33700799999999997]
Performing attribute: Att77
nInterval: 2 bins: [-0.339814, -0.1850175, 0.41170800000000002]
Performing attribute: Att78
nInterval: 2 bins: [-0.31355500000000003, -0.11172899999999999, 0.32793600000000001]
Performing attribute: Att79
nInterval: 2 bins: [-0.34031400000000001, -0.096530500000000005, 0.46023100000000006]
Performing attribute: Att80
nInterval: 2

nInterval: 2 bins: [0.0, 0.089613499999999999, 1.0]
Performing attribute: Att60
nInterval: 2 bins: [0.0, 0.081787500000000013, 1.0]
Performing attribute: Att61
nInterval: 2 bins: [0.0, 0.064680000000000001, 1.0]
Performing attribute: Att62
nInterval: 2 bins: [0.0, 0.066834000000000005, 1.0]
Performing attribute: Att63
nInterval: 2 bins: [0.0, 0.079201500000000008, 1.0]
Performing attribute: Att64
nInterval: 2 bins: [0.0, 0.026053, 1.0]
Performing attribute: Att65
nInterval: 2 bins: [0.0, 0.021614500000000002, 1.0]
Performing attribute: Att66
nInterval: 2 bins: [0.0, 0.017016, 1.0]
Performing attribute: Att67
nInterval: 2 bins: [0.0, 0.021541000000000001, 1.0]
Performing attribute: Att68
nInterval: 2 bins: [0.0, 0.0209295, 1.0]
Performing attribute: Att69
nInterval: 2 bins: [0.0, 0.025481, 1.0]
Performing attribute: Att70
nInterval: 2 bins: [0.0, 0.029378500000000002, 1.0]
Performing attribute: Att71
nInterval: 2 bins: [0.0, 0.038176000000000002, 1.0]
Performing attribute: Att72
nInterv

nInterval: 2 bins: [0.0, 0.041326000000000002, 1.0]
Performing attribute: Att166
nInterval: 2 bins: [0.0, 0.050896999999999998, 1.0]
Performing attribute: Att167
nInterval: 2 bins: [6.0000000000000002e-06, 0.027363499999999999, 1.0]
Performing attribute: Att168
nInterval: 2 bins: [0.0, 0.025412999999999998, 1.0]
Performing attribute: Att169
nInterval: 2 bins: [0.0, 0.026495500000000002, 1.0]
Performing attribute: Att170
nInterval: 2 bins: [0.0, 0.042990499999999994, 1.0]
Performing attribute: Att171
nInterval: 2 bins: [0.0, 0.025457500000000001, 1.0]
Performing attribute: Att172
nInterval: 2 bins: [0.0, 0.030185, 1.0]
Performing attribute: Att173
nInterval: 2 bins: [9.9999999999999995e-07, 0.0269555, 1.0]
Performing attribute: Att174
nInterval: 2 bins: [9.0000000000000002e-06, 0.019901499999999999, 1.0]
Performing attribute: Att175
nInterval: 2 bins: [0.0, 0.036983000000000002, 1.0]
Performing attribute: Att176
nInterval: 2 bins: [0.0, 0.0324365, 1.0]
Performing attribute: Att177
nInte

nInterval: 2 bins: [0.0, 0.14156150000000001, 1.0]
Performing attribute: Att269
nInterval: 2 bins: [0.00026200000000000003, 0.16023950000000001, 1.0]
Performing attribute: Att270
nInterval: 2 bins: [0.000154, 0.12940299999999999, 1.0]
Performing attribute: Att271
nInterval: 2 bins: [7.0000000000000007e-06, 0.1197925, 1.0]
Performing attribute: Att272
nInterval: 2 bins: [0.0, 0.1409745, 1.0]
Performing attribute: Att273
nInterval: 2 bins: [0.0, 0.12997150000000002, 1.0]
Performing attribute: Att274
nInterval: 2 bins: [0.0, 0.098815500000000001, 1.0]
Performing attribute: Att275
nInterval: 2 bins: [0.0, 0.1416085, 1.0]
Performing attribute: Att276
nInterval: 2 bins: [0.0, 0.13175450000000002, 1.0]
Performing attribute: Att277
nInterval: 2 bins: [3.0000000000000001e-06, 0.10399499999999999, 1.0]
Performing attribute: Att278
nInterval: 2 bins: [0.0, 0.13715650000000001, 1.0]
Performing attribute: Att279
nInterval: 2 bins: [0.0, 0.14238699999999999, 1.0]
Performing attribute: Att280
nInterv

# large dataset

In [4]:
def read_data(dataPath):
    # input: '/Volumes/Samsung_T5/research/data/ABC_news_data/obesity/'
    # read data
    data = pd.read_csv(os.path.join(dataPath,'X_1500.csv'))
    label = pd.read_csv(os.path.join(dataPath,'y.csv'))
    return data,label

filePath = '/Volumes/Samsung_T5/research/data/large_datasets/'
for dataset in ['rcv1subset5']:
    print(filePath+dataset)
    X, y = read_data(filePath+dataset)
    X_dis = perform_LAIM(X, y)
    # y.to_csv(os.path.join(filePath,dataset,'y.csv'), index=False)
    X_dis.to_csv(os.path.join(filePath,dataset,'X_dis_1500.csv'), index=False)

/Volumes/Samsung_T5/research/data/large_datasets/rcv1subset5
Performing attribute: Att19
bins: [0.0, 0.21142800000000006]
Performing attribute: Att25
bins: [0.0, 0.28212800000000005]
Performing attribute: Att28
nInterval: 2 bins: [0.0, 0.07748, 0.398938]
Performing attribute: Att30
nInterval: 2 bins: [0.0, 0.0992375, 0.276069]
Performing attribute: Att31
nInterval: 2 bins: [0.0, 0.09, 0.390205]
Performing attribute: Att32
nInterval: 2 bins: [0.0, 0.1199825, 0.37455]
Performing attribute: Att46
bins: [0.0, 0.173209]
Performing attribute: Att87
bins: [0.0, 0.147427]
Performing attribute: Att274
nInterval: 2 bins: [0.0, 0.05404, 0.409281]
Performing attribute: Att305
nInterval: 2 bins: [0.0, 0.2297655, 0.460596]
Performing attribute: Att370
nInterval: 2 bins: [0.0, 0.093041, 0.336285]
Performing attribute: Att430
bins: [0.0, 0.100762]
Performing attribute: Att467
bins: [0.0, 0.641472]
Performing attribute: Att478
nInterval: 2 bins: [0.0, 0.042794, 0.198296]
Performing attribute: Att501
nI

nInterval: 2 bins: [0.0, 0.0648225, 0.459796]
Performing attribute: Att4087
nInterval: 2 bins: [0.0, 0.068899, 0.304468]
Performing attribute: Att4108
nInterval: 2 bins: [0.0, 0.2410065, 0.482013]
Performing attribute: Att4124
nInterval: 2 bins: [0.0, 0.085171, 0.683297]
Performing attribute: Att4249
nInterval: 2 bins: [0.0, 0.10816700000000003, 0.3377]
Performing attribute: Att4273
nInterval: 2 bins: [0.0, 0.11004000000000003, 0.434562]
Performing attribute: Att4469
bins: [0.0, 0.28939000000000004]
Performing attribute: Att4545
bins: [0.0, 0.668955]
Performing attribute: Att4574
nInterval: 2 bins: [0.0, 0.127123, 0.254246]
Performing attribute: Att4592
bins: [0.0, 0.184962]
Performing attribute: Att4665
nInterval: 2 bins: [0.0, 0.027964, 0.384828]
Performing attribute: Att4760
nInterval: 2 bins: [0.0, 0.34832399999999997, 0.6966479999999999]
Performing attribute: Att4805
nInterval: 2 bins: [0.0, 0.156285, 0.31257]
Performing attribute: Att4847
bins: [0.0, 0.148999]
Performing attribut

nInterval: 2 bins: [0.0, 0.050166, 0.325291]
Performing attribute: Att7716
bins: [0.0, 0.373802]
Performing attribute: Att7807
bins: [0.0, 0.297316]
Performing attribute: Att7808
nInterval: 2 bins: [0.0, 0.1446695, 0.289339]
Performing attribute: Att7852
nInterval: 2 bins: [0.0, 0.089076, 0.587753]
Performing attribute: Att7949
nInterval: 2 bins: [0.0, 0.126845, 0.286851]
Performing attribute: Att7970
nInterval: 2 bins: [0.0, 0.133602, 0.360929]
Performing attribute: Att8001
bins: [0.0, 0.55799]
Performing attribute: Att8005
bins: [0.0, 0.164679]
Performing attribute: Att8042
nInterval: 2 bins: [0.0, 0.0536595, 0.22391]
Performing attribute: Att8051
nInterval: 2 bins: [0.0, 0.039080500000000004, 0.173993]
Performing attribute: Att8071
bins: [0.0, 0.45396]
Performing attribute: Att8085
bins: [0.0, 0.123324]
Performing attribute: Att8104
nInterval: 2 bins: [0.0, 0.053441, 0.106882]
Performing attribute: Att8117
nInterval: 2 bins: [0.0, 0.0363995, 0.192239]
Performing attribute: Att8140
n

nInterval: 2 bins: [0.0, 0.0725905, 0.18288]
Performing attribute: Att10630
nInterval: 2 bins: [0.0, 0.1978375, 0.427219]
Performing attribute: Att10681
nInterval: 2 bins: [0.0, 0.0489575, 0.189873]
Performing attribute: Att10766
bins: [0.0, 0.112066]
Performing attribute: Att10782
bins: [0.0, 0.125808]
Performing attribute: Att10844
bins: [0.0, 0.13910899999999998]
Performing attribute: Att10908
bins: [0.0, 0.287966]
Performing attribute: Att10998
nInterval: 2 bins: [0.0, 0.1814085, 0.362817]
Performing attribute: Att11012
nInterval: 2 bins: [0.0, 0.0612545, 0.516129]
Performing attribute: Att11053
bins: [0.0, 0.127995]
Performing attribute: Att11058
bins: [0.0, 0.227498]
Performing attribute: Att11066
nInterval: 2 bins: [0.0, 0.0817165, 0.175932]
Performing attribute: Att11141
nInterval: 2 bins: [0.0, 0.084727, 0.282168]
Performing attribute: Att11158
nInterval: 2 bins: [0.0, 0.0472075, 0.29168]
Performing attribute: Att11206
bins: [0.0, 0.124411]
Performing attribute: Att11247
nInte

nInterval: 2 bins: [0.0, 0.0906475, 0.421994]
Performing attribute: Att13749
bins: [0.0, 0.189533]
Performing attribute: Att13767
nInterval: 2 bins: [0.0, 0.06287000000000001, 0.400088]
Performing attribute: Att13768
bins: [0.0, 0.121721]
Performing attribute: Att13789
nInterval: 2 bins: [0.0, 0.050481, 0.247043]
Performing attribute: Att13824
bins: [0.0, 0.401276]
Performing attribute: Att13943
bins: [0.0, 0.095231]
Performing attribute: Att13946
bins: [0.0, 0.27534200000000003]
Performing attribute: Att13961
nInterval: 2 bins: [0.0, 0.059824, 0.311871]
Performing attribute: Att14046
nInterval: 2 bins: [0.0, 0.1923505, 0.384701]
Performing attribute: Att14051
nInterval: 2 bins: [0.0, 0.115207, 0.4415520000000001]
Performing attribute: Att14067
nInterval: 2 bins: [0.0, 0.031723, 0.216938]
Performing attribute: Att14070
nInterval: 2 bins: [0.0, 0.0325955, 0.31516900000000003]
Performing attribute: Att14090
bins: [0.0, 0.232511]
Performing attribute: Att14099
nInterval: 2 bins: [0.0, 0.1

nInterval: 2 bins: [0.0, 0.05767, 0.154887]
Performing attribute: Att17590
nInterval: 2 bins: [0.0, 0.064368, 0.349638]
Performing attribute: Att17591
nInterval: 2 bins: [0.0, 0.0483595, 0.117991]
Performing attribute: Att17644
bins: [0.0, 0.170875]
Performing attribute: Att17666
bins: [0.0, 0.489883]
Performing attribute: Att17677
bins: [0.0, 0.263216]
Performing attribute: Att17685
nInterval: 2 bins: [0.0, 0.222519, 0.445038]
Performing attribute: Att17745
bins: [0.0, 0.477685]
Performing attribute: Att17827
bins: [0.0, 0.4798270000000001]
Performing attribute: Att17864
nInterval: 2 bins: [0.0, 0.0679845, 0.381103]
Performing attribute: Att17922
nInterval: 2 bins: [0.0, 0.045014, 0.140934]
Performing attribute: Att18009
nInterval: 2 bins: [0.0, 0.1351765, 0.270353]
Performing attribute: Att18038
nInterval: 2 bins: [0.0, 0.0690915, 0.364975]
Performing attribute: Att18179
bins: [0.0, 0.279738]
Performing attribute: Att18185
nInterval: 2 bins: [0.0, 0.050911, 0.188518]
Performing attri

nInterval: 2 bins: [0.0, 0.13177850000000002, 0.26355700000000004]
Performing attribute: Att21988
nInterval: 2 bins: [0.0, 0.082843, 0.310221]
Performing attribute: Att22027
bins: [0.0, 0.116348]
Performing attribute: Att22074
nInterval: 2 bins: [0.0, 0.0177775, 0.21969]
Performing attribute: Att22081
nInterval: 2 bins: [0.0, 0.0554405, 0.110881]
Performing attribute: Att22139
bins: [0.0, 0.225906]
Performing attribute: Att22146
nInterval: 2 bins: [0.0, 0.102306, 0.367361]
Performing attribute: Att22151
bins: [0.0, 0.295313]
Performing attribute: Att22185
nInterval: 2 bins: [0.0, 0.1303875, 0.269902]
Performing attribute: Att22215
nInterval: 2 bins: [0.0, 0.044354000000000005, 0.46045]
Performing attribute: Att22260
bins: [0.0, 0.198609]
Performing attribute: Att22272
nInterval: 2 bins: [0.0, 0.0504285, 0.26049]
Performing attribute: Att22278
bins: [0.0, 0.101481]
Performing attribute: Att22291
nInterval: 2 bins: [0.0, 0.054363, 0.427591]
Performing attribute: Att22431
nInterval: 2 bin

bins: [0.0, 0.16055]
Performing attribute: Att26339
nInterval: 2 bins: [0.0, 0.162477, 0.345714]
Performing attribute: Att26383
nInterval: 2 bins: [0.0, 0.1145235, 0.229047]
Performing attribute: Att26388
nInterval: 2 bins: [0.0, 0.184626, 0.460879]
Performing attribute: Att26395
nInterval: 2 bins: [0.0, 0.1217825, 0.5454180000000001]
Performing attribute: Att26436
bins: [0.0, 0.345827]
Performing attribute: Att26443
nInterval: 2 bins: [0.0, 0.083259, 0.166518]
Performing attribute: Att26462
nInterval: 2 bins: [0.0, 0.123878, 0.326348]
Performing attribute: Att26474
nInterval: 2 bins: [0.0, 0.1287465, 0.323252]
Performing attribute: Att26482
nInterval: 2 bins: [0.0, 0.0854705, 0.170941]
Performing attribute: Att26552
bins: [0.0, 0.323148]
Performing attribute: Att26564
nInterval: 2 bins: [0.0, 0.0638765, 0.216716]
Performing attribute: Att26566
nInterval: 2 bins: [0.0, 0.0812635, 0.184381]
Performing attribute: Att26575
nInterval: 2 bins: [0.0, 0.0413925, 0.324267]
Performing attribute

nInterval: 2 bins: [0.0, 0.090904, 0.413384]
Performing attribute: Att29460
bins: [0.0, 0.1195]
Performing attribute: Att29545
nInterval: 2 bins: [0.0, 0.11413800000000003, 0.346075]
Performing attribute: Att29561
nInterval: 2 bins: [0.0, 0.0695905, 0.370611]
Performing attribute: Att29563
bins: [0.0, 0.322831]
Performing attribute: Att29569
nInterval: 2 bins: [0.0, 0.208573, 0.496507]
Performing attribute: Att29590
bins: [0.0, 0.333283]
Performing attribute: Att29640
nInterval: 2 bins: [0.0, 0.055885, 0.295525]
Performing attribute: Att29643
nInterval: 2 bins: [0.0, 0.053893, 0.124391]
Performing attribute: Att29670
bins: [0.0, 0.307662]
Performing attribute: Att29700
bins: [0.0, 0.211454]
Performing attribute: Att29740
nInterval: 2 bins: [0.0, 0.085974, 0.357833]
Performing attribute: Att29755
nInterval: 2 bins: [0.0, 0.082646, 0.5149699999999999]
Performing attribute: Att29756
bins: [0.0, 0.332731]
Performing attribute: Att29765
nInterval: 2 bins: [0.0, 0.094363, 0.233739]
Performin

nInterval: 2 bins: [0.0, 0.0724595, 0.158363]
Performing attribute: Att33012
nInterval: 2 bins: [0.0, 0.0585525, 0.118608]
Performing attribute: Att33013
nInterval: 2 bins: [0.0, 0.0585525, 0.118608]
Performing attribute: Att33039
nInterval: 2 bins: [0.0, 0.248398, 0.496796]
Performing attribute: Att33070
nInterval: 2 bins: [0.0, 0.2771015, 0.554203]
Performing attribute: Att33105
nInterval: 2 bins: [0.0, 0.066052, 0.34267]
Performing attribute: Att33191
nInterval: 2 bins: [0.0, 0.0677365, 0.376464]
Performing attribute: Att33312
nInterval: 2 bins: [0.0, 0.1872655, 0.374531]
Performing attribute: Att33361
nInterval: 2 bins: [0.0, 0.0410625, 0.13910899999999998]
Performing attribute: Att33373
nInterval: 2 bins: [0.0, 0.1560605, 0.312121]
Performing attribute: Att33375
nInterval: 2 bins: [0.0, 0.0712585, 0.354006]
Performing attribute: Att33514
nInterval: 2 bins: [0.0, 0.031943, 0.16975]
Performing attribute: Att33529
bins: [0.0, 0.45493]
Performing attribute: Att33567
nInterval: 2 bins:

bins: [0.0, 0.175103]
Performing attribute: Att36960
bins: [0.0, 0.734356]
Performing attribute: Att36975
nInterval: 2 bins: [0.0, 0.041704000000000005, 0.285957]
Performing attribute: Att37010
bins: [0.0, 0.103067]
Performing attribute: Att37050
nInterval: 2 bins: [0.0, 0.070388, 0.628105]
Performing attribute: Att37107
nInterval: 2 bins: [0.0, 0.042354, 0.319985]
Performing attribute: Att37114
bins: [0.0, 0.365615]
Performing attribute: Att37134
bins: [0.0, 0.090395]
Performing attribute: Att37167
bins: [0.0, 0.229108]
Performing attribute: Att37195
nInterval: 2 bins: [0.0, 0.1408165, 0.281633]
Performing attribute: Att37227
bins: [0.0, 0.183565]
Performing attribute: Att37287
bins: [0.0, 0.499754]
Performing attribute: Att37349
bins: [0.0, 0.225219]
Performing attribute: Att37451
bins: [0.0, 0.341685]
Performing attribute: Att37470
nInterval: 2 bins: [0.0, 0.0240205, 0.252203]
Performing attribute: Att37541
nInterval: 2 bins: [0.0, 0.0451975, 0.115084]
Performing attribute: Att37544

bins: [0.0, 0.415142]
Performing attribute: Att41062
bins: [0.0, 0.12599000000000002]
Performing attribute: Att41069
nInterval: 2 bins: [0.0, 0.0741655, 0.472775]
Performing attribute: Att41093
nInterval: 2 bins: [0.0, 0.20496950000000005, 0.5477390000000001]
Performing attribute: Att41113
bins: [0.0, 0.260008]
Performing attribute: Att41147
nInterval: 2 bins: [0.0, 0.0362005, 0.26160700000000003]
Performing attribute: Att41158
nInterval: 2 bins: [0.0, 0.067662, 0.146322]
Performing attribute: Att41207
nInterval: 2 bins: [0.0, 0.0763925, 0.567954]
Performing attribute: Att41248
nInterval: 2 bins: [0.0, 0.1419205, 0.43893]
Performing attribute: Att41253
bins: [0.0, 0.102968]
Performing attribute: Att41289
bins: [0.0, 0.614975]
Performing attribute: Att41297
nInterval: 2 bins: [0.0, 0.1703935, 0.360016]
Performing attribute: Att41346
nInterval: 2 bins: [0.0, 0.1512215, 0.31188]
Performing attribute: Att41361
bins: [0.0, 0.397663]
Performing attribute: Att41387
nInterval: 2 bins: [0.0, 0.

bins: [0.0, 0.379488]
Performing attribute: Att44992
nInterval: 2 bins: [0.0, 0.0223255, 0.223013]
Performing attribute: Att45040
bins: [0.0, 0.490283]
Performing attribute: Att45048
bins: [0.0, 0.138897]
Performing attribute: Att45058
bins: [0.0, 0.328043]
Performing attribute: Att45162
nInterval: 2 bins: [0.0, 0.148029, 0.296058]
Performing attribute: Att45166
nInterval: 2 bins: [0.0, 0.049289, 0.390236]
Performing attribute: Att45179
nInterval: 2 bins: [0.0, 0.030327, 0.273136]
Performing attribute: Att45196
bins: [0.0, 0.086001]
Performing attribute: Att45199
nInterval: 2 bins: [0.0, 0.0731135, 0.541792]
Performing attribute: Att45256
nInterval: 2 bins: [0.0, 0.0500775, 0.24187600000000006]
Performing attribute: Att45260
nInterval: 2 bins: [0.0, 0.054604, 0.33926300000000004]
Performing attribute: Att45327
nInterval: 2 bins: [0.0, 0.05441, 0.266903]
Performing attribute: Att45328
nInterval: 2 bins: [0.0, 0.0681745, 0.246056]
Performing attribute: Att45335
nInterval: 2 bins: [0.0, 0

In [3]:
def perform_LAIM(data, label):
    
    data_dis = pd.DataFrame()
    for i in range(data.shape[1]):
        index = random.sample(range(data.shape[0]), 2000)
        attr_long = data.iloc[:,i]
        attr = data.iloc[index,i].reset_index(drop=True)
        # step 1: intialization
        #attr = data.iloc[:,i]
        print("Performing attribute:",attr.name)
        max_value = attr.max()
        min_value = attr.min()
        sort_attr = attr.sort_values(ascending=True)
        
        # form B, the set of candidate interval boundaries, 
        # with min, max and 
        # all the mid points of all the adjacent pairs
        B = set()
        for i in range(len(sort_attr)):
            if i == 0:
                B.add(sort_attr[i])
                B.add((sort_attr[i] + sort_attr[i+1])/2)
            elif i == len(sort_attr)-1:
                B.add(sort_attr[i])
                break
            else:
                B.add((sort_attr[i] + sort_attr[i+1])/2) 
        len_b = len(B)
        
        D_i = [min_value, max_value] # initial interval
        globalLAIM = 0
        
        if len(B) == 2:
            feature_data = pd.cut(list(attr_long), bins = [min_value, max_value/2,max_value], include_lowest=True).codes
            data_dis = pd.concat([data_dis, pd.Series(feature_data)],axis=1)
        
        else:
            # step 2: Discretization
            nIntervals = 1
            while True:
                midpoint_LAIM = []
                for b_j in B:

                    D = list(D_i)
                    # Add a midpoint, b_j, into C which is not still in D_i
                    if b_j not in D:
                        D.append(b_j)
                        D.sort()
                        # calculate LAIM value
                        LAIM = calculate_LAIM(D, attr, label)
                        midpoint_LAIM.append((b_j, LAIM))       
                # Accept the midpoint, bj, with highest value of LAIM
                midpoint_max = max(midpoint_LAIM, key=lambda x: x[1])

                if midpoint_max[1] > globalLAIM:
                    D_i.append(midpoint_max[0])
                    D_i.sort()
                    B.remove(midpoint_max[0])
                    globalLAIM = midpoint_max[1]

                else:
                    feature_data = pd.cut(list(attr_long), bins = D_i, include_lowest=True).codes
                    data_dis = pd.concat([data_dis, pd.Series(feature_data)],axis=1)
                    print("nInterval:",nIntervals)
                    break

                nIntervals += 1
                if len(D_i) == len_b:
                    feature_data = pd.cut(list(attr_long), bins = D_i, include_lowest=True).codes
                    data_dis = pd.concat([data_dis, pd.Series(feature_data)],axis=1)
                    print("nInterval:",nIntervals)
                    break
    
    
    data_dis.columns = data.columns
    
    return data_dis

def read_data(dataPath):
    # input: '/Volumes/Samsung_T5/research/data/ABC_news_data/obesity/'
    # read data
    data = pd.read_csv(os.path.join(dataPath,'X.csv'))
    label = pd.read_csv(os.path.join(dataPath,'y.csv'))
    return data,label

filePath = '/Volumes/Samsung_T5/research/data/large_datasets/'
for dataset in ["mediamill"]:
    print(filePath+dataset)
    X, y = read_data(filePath+dataset)
    X_dis = perform_LAIM(X, y)
    # y.to_csv(os.path.join(filePath,dataset,'y.csv'), index=False)
    X_dis.to_csv(os.path.join(filePath,dataset,'X_dis_1500.csv'), index=False)

/Volumes/Samsung_T5/research/data/large_datasets/mediamill
Performing attribute: Att1
nInterval: 2
Performing attribute: Att2
nInterval: 2
Performing attribute: Att3
nInterval: 2
Performing attribute: Att4
nInterval: 2
Performing attribute: Att5
nInterval: 2
Performing attribute: Att6
nInterval: 2
Performing attribute: Att7
nInterval: 2
Performing attribute: Att8
nInterval: 2
Performing attribute: Att9
nInterval: 2
Performing attribute: Att10
nInterval: 2
Performing attribute: Att11
nInterval: 2
Performing attribute: Att12
nInterval: 2
Performing attribute: Att13
nInterval: 2
Performing attribute: Att14
nInterval: 2
Performing attribute: Att15
nInterval: 2
Performing attribute: Att16
nInterval: 2
Performing attribute: Att17
nInterval: 2
Performing attribute: Att18
nInterval: 2
Performing attribute: Att19
nInterval: 2
Performing attribute: Att20
nInterval: 2
Performing attribute: Att21
nInterval: 2
Performing attribute: Att22
nInterval: 2
Performing attribute: Att23
nInterval: 2
Perform

In [6]:
X_dis.describe().loc["min",:].min()

0.0