# Ibovespa prediction / Feature Selection

In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
%matplotlib inline

n_jobs = 15 # number of parallel jobs to run in parallel

## Loading Dataset

In [2]:
data = pd.read_csv('dataset2.csv', index_col=0)
data.index = data.index.astype('datetime64[ns]')
date_index = pd.date_range(start=data.index.min(), end=data.index.max(), freq='D')
data = data.reindex(date_index)

## Checking for empty values

In [3]:
(data.isnull().sum() > 0).sum()

0

There are no empty values in the dataset

# Feature Selection

## Genetic Filter

In [4]:
import pygad
from sklearn.feature_selection import mutual_info_regression
from scipy.stats import pearsonr
from sklearn.feature_selection import f_regression

def genetic_filter(X, y):
    def f(i, j):
        mi = mutual_info_regression(i.reshape(-1,1), j)[0]
        F, _ = f_regression(i.reshape(-1, 1), j)
        corr, _ = pearsonr(i, j)

        return mi + F[0] + abs(corr)

    f_values = {}
    ncols = X.shape[1]

    results = Parallel(n_jobs=n_jobs)(delayed(f)(X[:,i], X[:,j]) for i in range(ncols - 1) for j in range(i + 1, ncols))

    index = 0
    for i in range(ncols - 1):
        for j in range(i + 1, ncols):
            f_values[(i, j)] = results[index]
            f_values[(j, i)] = results[index]
            index += 1

    results = Parallel(n_jobs=n_jobs)(delayed(f)(y, X[:,i]) for i in range(ncols))
    index = 0
    for i in range(ncols):
        f_values[('target', i)] = results[index]
        index += 1
        
    def fitness_func(solution, solution_idx):
        idx_selected = np.nonzero(solution)[0]

        f_features_target = 0
        for idx in idx_selected:
            f_features_target += f_values[('target', idx)]

        f_features = 0
        for i in range(len(idx_selected) - 1):
            for j in range(i + 1, len(idx_selected)):
                f_features += f_values[(idx_selected[i], idx_selected[j])]

        return f_features_target - f_features

    ga = pygad.GA(
        num_parents_mating=4,
        keep_parents=3,
        sol_per_pop=100,
        num_generations=500,
        num_genes=ncols,
        crossover_type='two_points',
        mutation_type='random',
        mutation_probability=0.001,
        parent_selection_type='rws',
        gene_space=(0, 1),
        fitness_func=fitness_func,
        parallel_processing=['thread', 15]
    )

    ga.run()
    
    return ga

# Model training/validation

## Train/Test split function definition

In [5]:
def train_test_split(X, y, train_size=0.8):
    nrows = X.shape[0]
    sep = math.ceil(nrows * train_size)
    
    return X[:sep,:], y[:sep], X[sep:,:], y[sep:]

## SVM

In [6]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, accuracy_score, roc_curve, auc
from sklearn.preprocessing import StandardScaler

target = 'IBOV_Direction'
X = data.drop([target], axis=1).values
y = data[target].values
X_train, y_train, X_test, y_test = train_test_split(X, y)

def run_svm(X_train_, y_train_, X_test_, y_test_):
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('SVM', SVC(
            kernel='poly',
            degree=1
        ))
    ])

    pipe.fit(X_train_, y_train_)
    y_pred = pipe.predict(X_test_)
    
    fpr, tpr, _ = roc_curve(y_test_, y_pred)
    return auc(fpr, tpr), accuracy_score(y_test_, y_pred)

results = Parallel(n_jobs=n_jobs)(delayed(run_svm)(X_train, y_train, X_test, y_test) for i in range(50))
aucs, accs = zip(*results)
print(f'SVM AUC: {np.mean(aucs)} +- {np.std(aucs)}')
print(f'SVM Accuracy: {np.mean(accs)} +- {np.std(accs)}')

SVM AUC: 0.6427935440104021 +- 1.1102230246251565e-16
SVM Accuracy: 0.6434262948207171 +- 0.0


In [7]:
ga = genetic_filter(X, y)

In [8]:
filter_selected = np.nonzero(ga.best_solution()[0])[0]

In [9]:
X_train_fs = X_train[:,filter_selected]
X_test_fs = X_test[:,filter_selected]

results = Parallel(n_jobs=n_jobs)(delayed(run_svm)(X_train_fs, y_train, X_test_fs, y_test) for i in range(50))
aucs, accs = zip(*results)
print(f'SVM AUC: {np.mean(aucs)} +- {np.std(aucs)}')
print(f'SVM Accuracy: {np.mean(accs)} +- {np.std(accs)}')

SVM AUC: 0.6427935440104021 +- 1.1102230246251565e-16
SVM Accuracy: 0.6434262948207171 +- 0.0


## ANN - MLP

In [10]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, roc_curve, auc
from itertools import combinations_with_replacement
from joblib import Parallel, delayed

target = 'IBOV_Direction'
X = data.drop([target], axis=1).values
y = data[target].values
X_train, y_train, X_test, y_test = train_test_split(X, y)

n_layers = np.arange(2) + 1
n_neurons = np.arange(0, 35, 5) + 5
epochs = np.arange(50, 500, 50) + 50
combinations = []

for layers in n_layers:
    combinations.extend(combinations_with_replacement(n_neurons, int(layers)))
    
def run_mlp(X_train, y_train, X_test, y_test, layer_sizes, epochs):
    aucs = []
    accs = []
    
    for i in range(50):
        pipe = Pipeline([
            ('scaler', StandardScaler()),
            ('MLP', MLPClassifier(
                hidden_layer_sizes=layer_sizes,
                solver='adam',
                max_iter=epochs,
                activation='tanh',
                random_state=15
            ))
        ])

        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)
        fpr, tpr, _ = roc_curve(y_test, y_pred)
        aucs.append(auc(fpr, tpr))
        accs.append(accuracy_score(y_test, y_pred))
    
    return epochs, layer_sizes, aucs, accs
    
results = Parallel(n_jobs=n_jobs)(delayed(run_mlp)(
    X_train,
    y_train,
    X_test,
    y_test,
    ls,
    ep
) for ep in epochs for ls in combinations)

In [11]:
data = []
for result in results:
    data.append([
        result[0],
        result[1],
        f'{np.mean(result[2])} +- {np.std(result[2])}',
        f'{np.mean(result[3])} +- {np.std(result[3])}'
    ])
    
summary = pd.DataFrame(data, columns=['Epochs', 'Hidden Layer Sizes', 'AUC', 'Accuracy'])
summary.sort_values(['Accuracy', 'AUC'], ascending=False)

Unnamed: 0,Epochs,Hidden Layer Sizes,AUC,Accuracy
214,400,"(25,)",0.893378300073492 +- 1.1102230246251565e-16,0.8930942895086321 +- 0.0
249,450,"(25,)",0.893378300073492 +- 1.1102230246251565e-16,0.8930942895086321 +- 0.0
284,500,"(25,)",0.893378300073492 +- 1.1102230246251565e-16,0.8930942895086321 +- 0.0
248,450,"(20,)",0.8891489767652213 +- 0.0,0.8897742363877821 +- 1.1102230246251565e-16
283,500,"(20,)",0.8891489767652213 +- 0.0,0.8897742363877821 +- 1.1102230246251565e-16
...,...,...,...,...
1,100,"(10,)",0.6331424190174685 +- 0.0,0.6401062416998671 +- 1.1102230246251565e-16
2,100,"(15,)",0.6232722313302053 +- 0.0,0.6367861885790171 +- 1.1102230246251565e-16
34,100,"(35, 35)",0.6458268556730172 +- 1.1102230246251565e-16,0.6314741035856573 +- 1.1102230246251565e-16
23,100,"(15, 30)",0.6402866894680309 +- 0.0,0.6261620185922974 +- 1.1102230246251565e-16


# Ensemble A

In [12]:
#Todo