In [1]:
from EAExperiment import EAExperiment
import pandas as pd
import functools
from skfeature.function.statistical_based import CFS
import numpy as np
import math

In [2]:
# Experiment configuration shared by the cells below.
data = pd.read_csv('data.csv')  # input table; its columns are the candidate features and must include `target`
window = 12  # NOTE(review): not referenced by any later cell shown here (the evaluated window is each individual's last gene) — confirm whether it is still needed
horizont = 6  # number of future target steps appended as output columns by create_propositional_table
target = 'PESO_7'  # column of `data` whose future values are scored against
n_gen = 50  # passed to start_search_ga; presumably the number of generations — TODO confirm
pop_size = 50  # passed to start_search_ga; presumably the population size — TODO confirm
max_window_size = 6  # passed to start_search_ga; presumably the upper bound of the window gene — TODO confirm

In [3]:
def create_propositional_table_dataframe(df, w, target, h):
    """Return an empty DataFrame that only carries the propositional column layout.

    Columns are ordered as: every column of ``df`` suffixed with ``_t-<lag>``
    for lag = w, w-1, ..., 1, followed by ``<target>_t<step>`` for each name in
    ``target`` and step = 0, ..., h-1.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; only its column names are used.
    w : int
        Number of lagged steps.
    target : iterable of str
        Target column names (a bare string is iterated character by character).
    h : int
        Number of future steps per target.

    Returns
    -------
    pandas.DataFrame
        A frame with the generated columns and no rows.
    """
    lagged_names = [name + "_t-"+str(lag) for lag in range(w, 0, -1) for name in df.columns]
    future_names = [name+"_t"+str(step) for name in target for step in range(h)]
    return pd.DataFrame(columns=lagged_names + future_names)
    
def create_propositional_table(df, w, h, target):
    """Flatten a table into sliding-window rows followed by future target values.

    Row ``i`` holds the ``w`` consecutive rows of ``df`` starting at position
    ``i`` (flattened row by row, oldest first), followed, for every target
    column, by its next ``h`` values. ``len(df) - w - h + 1`` rows are
    produced; when that number is not positive the result has the expected
    columns and no rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table, one row per time step.
    w : int
        Window length (number of past rows per output row).
    h : int
        Horizon (number of future target values per output row).
    target : str or iterable of str
        Target column name(s) of ``df``. A single string is treated as one
        column name instead of being iterated character by character.

    Returns
    -------
    pandas.DataFrame
        Columns ``<col>_t-<lag>`` for lag = w..1 over all columns of ``df``,
        then ``<target>_t<step>`` for step = 0..h-1 per target.
    """
    if isinstance(target, str):
        target = [target]

    columns = [name + "_t-" + str(lag) for lag in range(w, 0, -1) for name in df.columns]
    columns.extend(name + "_t" + str(step) for name in target for step in range(h))

    # Collect the rows first and build the frame once: assigning through
    # ``dataframe.loc[i] = row`` re-allocates the frame on every iteration.
    rows = []
    for start in range(len(df) - w - h + 1):
        split = start + w
        pieces = [df.iloc[start:split].values.reshape(-1)]
        pieces.extend(df[name].iloc[split:split + h].values for name in target)
        rows.append(np.concatenate(pieces))

    return pd.DataFrame(rows, columns=columns)

In [4]:
def merit_calculation(X, y):
    """
    This function calculates the merit of X given class labels y, where
    merits = (k * rcf)/sqrt(k+k*(k-1)*rff)
    rcf = (1/k)*sum(su(fi,y)) for all fi in X
    rff = (1/(k*(k-1)))*sum(su(fi,fj)) for all fi and fj in X

    In the code below ``rcf`` and ``rff`` hold the un-normalised sums
    (``k * rcf`` and ``k*(k-1) * rff`` in the notation above), so the
    returned value is ``rcf / sqrt(k + rff)``.

    Input
    ----------
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels

    Output
    ----------
    merits: {float}
        merit of a feature subset X; not finite when ``n_features + rff`` is
        zero or negative (NumPy then reports a RuntimeWarning on the division)
    """
    # This notebook never imports su_calculation at module level, so the
    # original body raised NameError when called. skfeature is already a
    # dependency of this notebook (see the CFS import).
    from skfeature.utility.mutual_information import su_calculation

    _n_samples, n_features = X.shape
    rcf = 0  # sum of su(fi, y) over all features
    rff = 0  # sum of su(fi, fj) over unordered pairs i < j
    for i in range(n_features):
        fi = X[:, i]
        rcf += su_calculation(fi, y)
        for j in range(i + 1, n_features):
            rff += su_calculation(fi, X[:, j])
    # Each unordered pair stands for both (i, j) and (j, i).
    rff *= 2
    merits = rcf / np.sqrt(n_features + rff)
    return merits

In [12]:
def _evaluation_func(data, horizont, target, create_propositional_table, individuals):
    """Score every individual by its mean CFS merit over the forecast horizon.

    Each individual is an array whose last entry is the window size and whose
    remaining entries flag which columns of ``data`` are selected.

    Parameters
    ----------
    data : pandas.DataFrame
        Full input table; must contain the ``target`` column.
    horizont : int
        Number of future target steps; the last ``horizont`` columns of the
        propositional table are treated as outputs.
    target : str
        Name of the column to forecast.
    create_propositional_table : callable
        Called as ``create_propositional_table(filtered_data, window, horizont, [target])``.
    individuals : iterable
        Individuals to evaluate. Each one is modified in place: the gene at
        the target column's position is set to 1.

    Returns
    -------
    list of float
        One fitness per individual: the sum of ``merit / horizon`` over the
        output columns, skipping merits that are NaN (so an individual whose
        merits are all NaN scores 0).

    Raises
    ------
    ValueError
        If ``target`` is not a column of ``data``.
    """
    # The target position is the same for every individual; look it up once.
    target_idx = list(data.columns).index(target)

    fitnesses = []
    for individual in individuals:
        # The target column is always kept so it can provide the output columns.
        individual[target_idx] = 1
        filtered_data = data.loc[:, individual[:-1].astype(bool)]

        window = individual[-1]
        propositional_filtered = create_propositional_table(filtered_data, window, horizont, [target])
        X = propositional_filtered.iloc[:, :-horizont]
        Y = propositional_filtered.iloc[:, -horizont:]

        n_outputs = len(Y.columns)
        fitness = 0
        for h in Y.columns:
            merits = CFS.merit_calculation(X.values, Y[h].values)
            if not math.isnan(merits):
                fitness += merits/n_outputs
        fitnesses.append(fitness)

    return fitnesses
    
# Pre-bind the fixed experiment inputs (positionally, in the order
# _evaluation_func declares them) so callers only supply `individuals`.
evaluation_func = functools.partial(
    _evaluation_func,
    data,
    horizont,
    target,
    create_propositional_table,
)

In [13]:
# Run the evolutionary search using the pre-bound evaluation function.
eaExperiment = EAExperiment(evaluation_func)
# `pop` and `fits` are indexed by the same position in the cells below
# (pop[max_idx] / fits[max_idx]); presumably the final population and its
# fitness values — TODO confirm against EAExperiment.
pop, fits = eaExperiment.start_search_ga(n_gen, pop_size, max_window_size, data)



-- Generation 1 --


  merits = rcf / np.sqrt(n_features + rff)


  Min 0.0
  Max 1.0047273783444914
  Avg 0.9824392029663558
  Std 0.14035039947481093
-- Generation 2 --


  merits = rcf / np.sqrt(n_features + rff)


  Min 0.0
  Max 1.0047273783444914
  Avg 0.9830072286311686
  Std 0.14043140118400577
-- Generation 3 --
  Min 1.0019479163932987
  Max 1.0060865635297953
  Avg 1.00373301268186
  Std 0.0007680632103442906
-- Generation 4 --
  Min 1.0022475188706979
  Max 1.006113142083123
  Avg 1.004189207609929
  Std 0.0007581274446030183
-- Generation 5 --


  merits = rcf / np.sqrt(n_features + rff)


  Min 0.0
  Max 1.0063638721015433
  Avg 0.9847834957424897
  Std 0.14068487802564272
-- Generation 6 --


  merits = rcf / np.sqrt(n_features + rff)


  Min 0.0
  Max 1.0063638721015433
  Avg 0.9852310642937477
  Std 0.14074892340861808
-- Generation 7 --


  merits = rcf / np.sqrt(n_features + rff)


  Min 0.0
  Max 1.006870054947356
  Avg 0.9856738579963406
  Std 0.14081121386365064
-- Generation 8 --
  Min 1.0038480421644116
  Max 1.006870054947356
  Avg 1.0060274986525923
  Std 0.00048777542445152826
-- Generation 9 --
  Min 1.0041598661350404
  Max 1.0071611003970153
  Avg 1.0062122560640956
  Std 0.0005515901621127733
-- Generation 10 --
  Min 1.003888454224924
  Max 1.0076257926382288
  Avg 1.006433378488101
  Std 0.0006127351516282274
-- Generation 11 --
  Min 1.0057318435009281
  Max 1.0076509809629717
  Avg 1.0068034228398113
  Std 0.0004751454616035846
-- Generation 12 --


  merits = rcf / np.sqrt(n_features + rff)
  merits = rcf / np.sqrt(n_features + rff)


  Min 0.0
  Max 1.0081643839546504
  Avg 0.9667908254184607
  Std 0.1973465858066745
-- Generation 13 --


  merits = rcf / np.sqrt(n_features + rff)
  merits = rcf / np.sqrt(n_features + rff)


  Min 0.0
  Max 1.0093509340021611
  Avg 0.9672069920778654
  Std 0.1974313948598403
-- Generation 14 --
  Min 1.0059919794150238
  Max 1.0090794136195762
  Avg 1.0077412888210793
  Std 0.0005825293602154161
-- Generation 15 --


  merits = rcf / np.sqrt(n_features + rff)


  Min 0.0
  Max 1.0090794136195762
  Avg 0.9877670134053056
  Std 0.1411112918551815
-- Generation 16 --


  merits = rcf / np.sqrt(n_features + rff)


  Min 0.0
  Max 1.009296291074243
  Avg 0.9880530665467119
  Std 0.14115256112785757
-- Generation 17 --
  Min 1.006278400243963
  Max 1.0107762110591623
  Avg 1.008561819060166
  Std 0.0008140344214101398
-- Generation 18 --
  Min 1.0075637704562739
  Max 1.0107762110591623
  Avg 1.0091320711901985
  Std 0.0006323170777025353
-- Generation 19 --
  Min 1.0074369597107748
  Max 1.0110595149221362
  Avg 1.009598319641441
  Std 0.0006745907218557277
-- Generation 20 --


  merits = rcf / np.sqrt(n_features + rff)
  merits = rcf / np.sqrt(n_features + rff)
  merits = rcf / np.sqrt(n_features + rff)


  Min 0.0
  Max 1.0122420689560632
  Avg 0.9494527518815599
  Std 0.23987721842615722
-- Generation 21 --
  Min 1.0088089538722798
  Max 1.0122483597496874
  Avg 1.0105559036173772
  Std 0.0007645568023727357
-- Generation 22 --
  Min 1.006747520194306
  Max 1.0133693219446562
  Avg 1.010872165412499
  Std 0.0011565256876057016
-- Generation 23 --
  Min 1.008663917759365
  Max 1.0144911091109863
  Avg 1.0115479892494739
  Std 0.0009525433702766627
-- Generation 24 --


  merits = rcf / np.sqrt(n_features + rff)


  Min 0.0
  Max 1.0151009159474769
  Avg 0.9920065400554574
  Std 0.1417218657475124
-- Generation 25 --
  Min 1.0085895626806771
  Max 1.0160214069804132
  Avg 1.0130143249730867
  Std 0.0013785526616596757
-- Generation 26 --


  merits = rcf / np.sqrt(n_features + rff)


  Min 0.0
  Max 1.0160214069804132
  Avg 0.9932261549000035
  Std 0.14189896954675263
-- Generation 27 --
  Min 1.0103892546211648
  Max 1.0164300655007728
  Avg 1.0145456350939572
  Std 0.001283193212628447
-- Generation 28 --
  Min 1.0105442598818881
  Max 1.016630480166994
  Avg 1.0151948270546394
  Std 0.0014279559772260992
-- Generation 29 --
  Min 1.0096668462762663
  Max 1.0171545728835851
  Avg 1.015669768184099
  Std 0.0014170228046591704
-- Generation 30 --
  Min 1.0097060587787299
  Max 1.0171545728835851
  Avg 1.0154207250989975
  Std 0.0018348304979451436
-- Generation 31 --
  Min 1.0116276642917537
  Max 1.0181109056614952
  Avg 1.0158370876846459
  Std 0.0015479501125676876
-- Generation 32 --


  merits = rcf / np.sqrt(n_features + rff)


  Min 0.0
  Max 1.0181109056614952
  Avg 0.9955273580346342
  Std 0.14223096764600426
-- Generation 33 --


  merits = rcf / np.sqrt(n_features + rff)


  Min 0.0
  Max 1.01874077629385
  Avg 0.9959010554979092
  Std 0.14228321067526223
-- Generation 34 --
  Min 1.0135449992987953
  Max 1.019082432452909
  Avg 1.017221199481373
  Std 0.0012675336321858926
-- Generation 35 --
  Min 1.0136601446891307
  Max 1.0196758335539033
  Avg 1.0177517797998734
  Std 0.001380668392447622
-- Generation 36 --


  merits = rcf / np.sqrt(n_features + rff)
  merits = rcf / np.sqrt(n_features + rff)


  Min 0.0
  Max 1.021358373507332
  Avg 0.9769596184216245
  Std 0.1994337006331339
-- Generation 37 --
  Min 1.011055876064408
  Max 1.021358373507332
  Avg 1.0183121681373133
  Std 0.0021154539888631673
-- Generation 38 --
  Min 1.011303639829278
  Max 1.022749998112081
  Avg 1.018874740508954
  Std 0.0019337440191414764
-- Generation 39 --


  merits = rcf / np.sqrt(n_features + rff)


  Min 0.0
  Max 1.022749998112081
  Avg 0.9980737788735098
  Std 0.14261211265177834
-- Generation 40 --
  Min 1.0120489542733313
  Max 1.0246406531926673
  Avg 1.0198873851110932
  Std 0.002588297936997435
-- Generation 41 --


  merits = rcf / np.sqrt(n_features + rff)


  Min 0.0
  Max 1.0246406531926673
  Avg 1.0009272382606549
  Std 0.1430025018918224
-- Generation 42 --


  merits = rcf / np.sqrt(n_features + rff)


  Min 0.0
  Max 1.0258284146013834
  Avg 1.0014986179059604
  Std 0.1430965886843372
-- Generation 43 --
  Min 1.010809153436736
  Max 1.027010529413281
  Avg 1.0218679626788338
  Std 0.003347250280752222
-- Generation 44 --
  Min 1.0122157108388976
  Max 1.029056296983937
  Avg 1.023378617314773
  Std 0.0032116502917490628
-- Generation 45 --
  Min 1.0157858258548813
  Max 1.029056296983937
  Avg 1.0243558205271595
  Std 0.0030278279751180044
-- Generation 46 --
  Min 1.0122911860025172
  Max 1.0303685471521247
  Avg 1.0256300013479487
  Std 0.0032430790959577776
-- Generation 47 --
  Min 1.0157191837796884
  Max 1.0333232344606393
  Avg 1.0261254742103956
  Std 0.0034181155717733923
-- Generation 48 --
  Min 1.015767412669448
  Max 1.0333232344606393
  Avg 1.0266809040429976
  Std 0.004242178609242094
-- Generation 49 --


  merits = rcf / np.sqrt(n_features + rff)


  Min 0.0
  Max 1.0333232344606393
  Avg 0.9858024172627898
  Std 0.20128378203169964
-- Generation 50 --
  Min 1.0157598074296414
  Max 1.0333232344606393
  Avg 1.028689189101224
  Std 0.0035515508921129093


In [7]:
# Position of the first individual with the highest fitness.
# NOTE(review): this cell's execution count (7) is lower than the search
# cell's (13), and the stored value (6.22...) is above the Max logged by that
# run (about 1.03), so the stored output presumably predates the current
# search — re-run this cell after the search.
best_fitness = max(fits)
max_idx = fits.index(best_fitness)
fits[max_idx]

6.225572245031367

In [8]:
# Turn the feature genes of the best individual (everything but the last
# entry, which is the window size) into a boolean column mask.
feature_mask = [bool(gene) for gene in pop[max_idx][:-1]]
columns = data.loc[:, feature_mask].columns

In [9]:
# Sum of the feature genes of the best individual (the last entry, the
# window size, is excluded); with 0/1 genes this is the number of selected columns.
sum(pop[max_idx][:-1])

14

In [10]:
# Report the selected column names and the window-size gene of the best individual.
print(f"Variables: {columns.values!s}")
print(f"Window size: {pop[max_idx][-1]!s}")

Variables: ['NUM_OPERACIONES_1' 'NUM_OPERACIONES_5' 'NUM_OPERACIONES_8'
 'NUM_OPERACIONES_9' 'NUM_OPERACIONES_11' 'NUM_OPERACIONES_13'
 'NUM_OPERACIONES_14' 'NUM_OPERACIONES_15' 'NUM_OPERACIONES_18'
 'NUM_OPERACIONES_22' 'w_med' 'hr' 'w_racha_dir' 'PIB']
Window size: 5


In [11]:
# Join the selected names with ',' as the separator; the result has no
# opening or closing quote of its own.
"','".join(columns.values)

"NUM_OPERACIONES_1','NUM_OPERACIONES_5','NUM_OPERACIONES_8','NUM_OPERACIONES_9','NUM_OPERACIONES_11','NUM_OPERACIONES_13','NUM_OPERACIONES_14','NUM_OPERACIONES_15','NUM_OPERACIONES_18','NUM_OPERACIONES_22','w_med','hr','w_racha_dir','PIB"