In [1]:
from EAExperiment import EAExperiment
import pandas as pd
import functools
from skfeature.function.statistical_based import CFS
import numpy as np
import math

In [2]:
# Input table; its columns are the candidate features and include the target column.
data = pd.read_csv('data.csv')
# NOTE(review): `window` is not referenced by any cell visible in this notebook —
# the window length actually used is the last gene of each individual. Confirm
# whether this constant is still needed.
window = 6
# Forecast horizon: how many future values of the target are appended to each
# propositional row (passed as `h` to create_propositional_table).
horizont = 3
# Name of the column of `data` to forecast.
target = 'PESO_7'
# Search settings handed to EAExperiment.start_search_ga further down.
# Presumably number of generations, population size and the upper bound for the
# window gene — EAExperiment is not shown here, TODO confirm.
n_gen = 2
pop_size = 12
max_window_size = 6

In [3]:
def create_propositional_table_dataframe(df, w, target, h):
    """Return an empty DataFrame that only carries the propositional-table header.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; only its column labels are read (they must be strings).
    w : int
        Window length. One group of lagged columns ``<col>_t-<lag>`` is created
        per lag, from ``w`` down to 1.
    target : iterable of str
        Names of the target variables.
    h : int
        Horizon. Each target contributes columns ``<target>_t0`` ..
        ``<target>_t<h-1>``.

    Returns
    -------
    pandas.DataFrame
        A frame with no rows whose columns are the lagged names followed by the
        target names.
    """
    lagged_names = [
        name + "_t-" + str(lag)
        for lag in range(w, 0, -1)
        for name in df.columns
    ]
    future_names = [
        name + "_t" + str(step)
        for name in target
        for step in range(h)
    ]
    return pd.DataFrame(columns=lagged_names + future_names)
    
def create_propositional_table(df, w, h, target):
    """Flatten a row-ordered frame into one row per sliding window.

    For every start position ``i`` the ``w`` consecutive rows ``i .. i+w-1`` of
    ``df`` are flattened row by row, and for each target the ``h`` values that
    follow the window (rows ``i+w .. i+w+h-1``) are appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data with string column labels; rows are taken in their
        existing order.
    w : int
        Window length (number of past rows per output row).
    h : int
        Horizon (number of future target values per output row).
    target : str or iterable of str
        Column name(s) of ``df`` to forecast. A bare string is treated as a
        single target instead of being iterated character by character.

    Returns
    -------
    pandas.DataFrame
        ``len(df) - w - h + 1`` rows (none if that is not positive). Columns are
        ``<col>_t-<lag>`` for lag ``w .. 1`` followed by ``<target>_t<step>``
        for step ``0 .. h-1``.
    """
    # Iterating a plain string would yield its characters, not the column name.
    if isinstance(target, str):
        target = [target]

    columns = [
        name + "_t-" + str(lag)
        for lag in range(w, 0, -1)
        for name in df.columns
    ]
    columns.extend(
        name + "_t" + str(step)
        for name in target
        for step in range(h)
    )

    # Pull the raw arrays out once; slicing them is cheaper than repeated .iloc.
    values = df.values
    target_values = [df[name].values for name in target]

    rows = []
    for start in range(len(df) - w - h + 1):
        stop = start + w
        pieces = [values[start:stop].ravel()]
        pieces.extend(series[stop:stop + h] for series in target_values)
        rows.append(np.concatenate(pieces))

    # Build the frame in one go: enlarging a DataFrame row by row with .loc
    # re-allocates on every insertion.
    return pd.DataFrame(rows, columns=columns)

In [4]:
def merit_calculation(X, y):
    """
    This function calculates the merit of X given class labels y, where
    merits = (k * rcf)/sqrt(k+k*(k-1)*rff)
    rcf = (1/k)*sum(su(fi,y)) for all fi in X
    rff = (1/(k*(k-1)))*sum(su(fi,fj)) for all fi and fj in X

    The code works with the un-normalised sums directly: the numerator is the
    sum of su(fi, y) and the term under the square root is k plus twice the sum
    of su(fi, fj) over the unordered feature pairs.

    Input
    ----------
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels

    Output
    ----------
    merits: {float}
        merit of a feature subset X
    """
    # Imported here because the notebook's import cell only brings in the CFS
    # module; without this, su_calculation is an undefined name and the
    # function raises NameError on its first call.
    from skfeature.utility.mutual_information import su_calculation

    n_samples, n_features = X.shape
    rff = 0
    rcf = 0
    for i in range(n_features):
        fi = X[:, i]
        rcf += su_calculation(fi, y)
        # Visit each unordered pair (i, j) with j > i exactly once.
        for j in range(i + 1, n_features):
            rff += su_calculation(fi, X[:, j])
    # Each pair was counted once; the formula sums over ordered pairs.
    rff *= 2
    merits = rcf / np.sqrt(n_features + rff)
    return merits

In [5]:
def _evaluation_func(data, horizont, target, create_propositional_table, individuals):
    """Score every individual by the summed CFS merit of its feature subset.

    Each individual is indexed like a numpy array: all genes except the last
    are used as a boolean column mask over ``data`` and the last gene is the
    window length.

    Parameters
    ----------
    data : pandas.DataFrame
        Full input table.
    horizont : int
        Number of future target values per propositional row; one merit is
        computed per horizon step.
    target : str
        Name of the target column in ``data``.
    create_propositional_table : callable
        Called as ``create_propositional_table(frame, window, horizont, [target])``
        and expected to return a frame whose last ``horizont`` columns are the
        target steps.
    individuals : iterable
        Individuals to evaluate. Each one is modified in place: the gene at the
        target column's position is set to 1.

    Returns
    -------
    list
        One fitness per individual, in input order: the sum over horizon steps
        of ``CFS.merit_calculation``, skipping NaN merits.

    Raises
    ------
    ValueError
        If ``target`` is not a column of ``data``.
    """
    # The target's position does not depend on the individual: look it up once.
    target_idx = list(data.columns).index(target)

    fitnesses = []
    for individual in individuals:
        # Always keep the target's own column selected. This writes into the
        # caller's individual — presumably intended so the search keeps that
        # gene on; NOTE(review): confirm against EAExperiment.
        individual[target_idx] = 1
        filtered_data = data.loc[:, individual[:-1].astype(bool)]

        window_size = individual[-1]
        propositional_filtered = create_propositional_table(
            filtered_data, window_size, horizont, [target]
        )
        # The trailing `horizont` columns are the targets, everything before
        # them are the lagged predictors.
        X = propositional_filtered.iloc[:, :-horizont]
        Y = propositional_filtered.iloc[:, -horizont:]

        fitness = 0
        for h in Y.columns:
            merits = CFS.merit_calculation(X.values, Y[h].values)
            # A NaN merit contributes nothing instead of poisoning the sum.
            if not math.isnan(merits):
                fitness += merits
        fitnesses.append(fitness)

    return fitnesses
    
# Pre-bind the dataset, horizon, target name and table builder so the resulting
# callable only takes the list of individuals to score.
evaluation_func = functools.partial(_evaluation_func, data, horizont, target, create_propositional_table)

In [6]:
# Run the search using the pre-bound fitness function.
# NOTE(review): EAExperiment is project code not shown in this notebook. Judging
# by how the results are used in the following cells, `pop` is a sequence of
# individuals and `fits` a list of fitness values aligned with it — confirm.
eaExperiment = EAExperiment(evaluation_func)
pop, fits = eaExperiment.start_search_ga(n_gen, pop_size, max_window_size, data)

37
37
34
36
37
41
34
36
41
36
39
43
12
12
-- Generation 1 --
34
34
37
41
38
39
6
6
  Min 3.0057221539138324
  Max 3.0125138921071906
  Avg 3.0102341299313866
  Std 0.001848285165105795
-- Generation 2 --
34
33
34
38
34
35
37
39
41
37
10
10
  Min 3.0078965039341736
  Max 3.0132086239774813
  Avg 3.011634231542308
  Std 0.0014968423704878589


In [12]:
# Position of the highest fitness in `fits` (the first one on ties), then
# display that fitness value.
max_idx = fits.index(max(fits))
fits[max_idx]

3.0132086239774813

In [20]:
# Decode the best individual: every gene except the last is turned into a
# boolean and used as a column mask over `data`.
columns = data.loc[:, list(map(bool,pop[max_idx][:-1]))].columns

In [23]:
# Show the selected column names together with the last gene of the best
# individual (the gene _evaluation_func uses as the window length).
columns.values, pop[max_idx][-1]

(array(['PESO_1', 'NUM_OPERACIONES_1', 'VALOR_2', 'PESO_4', 'VALOR_4',
        'NUM_OPERACIONES_4', 'VALOR_5', 'NUM_OPERACIONES_5',
        'NUM_OPERACIONES_6', 'NUM_OPERACIONES_7', 'PESO_8',
        'NUM_OPERACIONES_8', 'VALOR_10', 'PESO_11', 'VALOR_11',
        'NUM_OPERACIONES_11', 'PESO_12', 'VALOR_12', 'NUM_OPERACIONES_12',
        'VALOR_13', 'PESO_14', 'VALOR_14', 'NUM_OPERACIONES_14',
        'NUM_OPERACIONES_15', 'VALOR_17', 'NUM_OPERACIONES_18', 'PESO_19',
        'NUM_OPERACIONES_19', 'VALOR_21', 'NUM_OPERACIONES_21', 'VALOR_22',
        'VALOR_23', 'w_med', 'hr', 'e', 'w_racha_dir'], dtype=object), 4)