# Ibovespa prediction / Feature Selection

In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
%matplotlib inline

n_jobs = 15 # number of parallel jobs to run in parallel

## Loading Dataset

In [2]:
data = pd.read_csv('dataset2.csv', index_col=0)
data.index = data.index.astype('datetime64[ns]')
date_index = pd.date_range(start=data.index.min(), end=data.index.max(), freq='D')
data = data.reindex(date_index)

In [3]:
data['IBOV_Direction'] = [1 if x >= 0 else 0 for x in data['IBOV_Close'].diff()]

In [4]:
lag_features = [
    'IBOV_Direction',
    'IBOV_Open',
    'IBOV_High',
    'IBOV_Low',
    'IBOV_Close',
    'IBOV_Volume'
]

wma_features = [
    'IBOV_Open',
    'IBOV_High',
    'IBOV_Low',
    'IBOV_Close'
]

for feature in lag_features:
    for i in range(1, 8):
        data[f'{feature}_L{i}'] = data[feature].shift(periods=i)
        
for feature in wma_features:    
    data[f'{feature}_WMA30'] = data[feature].shift(periods=1).rolling(
        window=30,
        center=False
    ).apply(lambda x: np.sum(np.arange(1, 31) * x) / np.sum(np.arange(1, 31)), raw=False)
    
data = data.iloc[30:,:]
data.drop([
    'IBOV_Volume',
    'Crude_Oil_Close',
    'Gold_Close',
    'Nasdaq_Close',
    'Dow_Jones_Close',
    'S&P500_Close'
], axis=1, inplace=True)

## Checking for empty values

In [5]:
(data.isnull().sum() > 0).sum()

0

There are no empty values in the dataset

# Feature Selection

## Genetic Filter

In [6]:
import pygad
from sklearn.feature_selection import mutual_info_regression
from scipy.stats import pearsonr
from sklearn.feature_selection import f_regression

def genetic_filter(X, y):
    def f(i, j):
        mi = mutual_info_regression(i.reshape(-1,1), j)[0]
        F, _ = f_regression(i.reshape(-1, 1), j)
        corr, _ = pearsonr(i, j)

        return mi + F[0] + abs(corr)

    f_values = {}
    ncols = X.shape[1]

    results = Parallel(n_jobs=n_jobs)(delayed(f)(X[:,i], X[:,j]) for i in range(ncols - 1) for j in range(i + 1, ncols))

    index = 0
    for i in range(ncols - 1):
        for j in range(i + 1, ncols):
            f_values[(i, j)] = results[index]
            f_values[(j, i)] = results[index]
            index += 1

    results = Parallel(n_jobs=n_jobs)(delayed(f)(y, X[:,i]) for i in range(ncols))
    index = 0
    for i in range(ncols):
        f_values[('target', i)] = results[index]
        index += 1
        
    def fitness_func(solution, solution_idx):
        idx_selected = np.nonzero(solution)[0]

        f_features_target = 0
        for idx in idx_selected:
            f_features_target += f_values[('target', idx)]

        f_features = 0
        for i in range(len(idx_selected) - 1):
            for j in range(i + 1, len(idx_selected)):
                f_features += f_values[(idx_selected[i], idx_selected[j])]

        return f_features_target - f_features

    ga = pygad.GA(
        num_parents_mating=4,
        keep_parents=3,
        sol_per_pop=100,
        num_generations=500,
        num_genes=ncols,
        crossover_type='two_points',
        mutation_type='random',
        mutation_probability=0.001,
        parent_selection_type='rws',
        gene_space=(0, 1),
        fitness_func=fitness_func,
        parallel_processing=['thread', 15]
    )

    ga.run()
    
    return ga

# Model training/validation

## Train/Test split function definition

In [7]:
def train_test_split(X, y, train_size=0.8):
    nrows = X.shape[0]
    sep = math.ceil(nrows * train_size)
    
    return X[:sep,:], y[:sep], X[sep:,:], y[sep:]

## SVM

In [8]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, accuracy_score, roc_curve, auc
from sklearn.preprocessing import StandardScaler

target = 'IBOV_Direction'
X = data.drop([
    target,
    'IBOV_High',
    'IBOV_Low',
    'IBOV_Close'
], axis=1).values
y = data[target].values
X_train, y_train, X_test, y_test = train_test_split(X, y)

def run_svm(X_train_, y_train_, X_test_, y_test_):
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('SVM', SVC(
            kernel='poly',
            degree=1
        ))
    ])

    pipe.fit(X_train_, y_train_)
    y_pred = pipe.predict(X_test_)
    
    fpr, tpr, _ = roc_curve(y_test_, y_pred)
    return auc(fpr, tpr), accuracy_score(y_test_, y_pred)

results = Parallel(n_jobs=n_jobs)(delayed(run_svm)(X_train, y_train, X_test, y_test) for i in range(50))
aucs, accs = zip(*results)
print(f'SVM AUC: {np.mean(aucs)} +- {np.std(aucs)}')
print(f'SVM Accuracy: {np.mean(accs)} +- {np.std(accs)}')

SVM AUC: 0.6428329173357074 +- 1.1102230246251565e-16
SVM Accuracy: 0.6434262948207171 +- 0.0


In [9]:
ga = genetic_filter(X, y)

In [10]:
filter_selected = np.nonzero(ga.best_solution()[0])[0]

In [11]:
X_train_fs = X_train[:,filter_selected]
X_test_fs = X_test[:,filter_selected]

results = Parallel(n_jobs=n_jobs)(delayed(run_svm)(X_train_fs, y_train, X_test_fs, y_test) for i in range(50))
aucs, accs = zip(*results)
print(f'SVM AUC: {np.mean(aucs)} +- {np.std(aucs)}')
print(f'SVM Accuracy: {np.mean(accs)} +- {np.std(accs)}')

SVM AUC: 0.56233340222466 +- 0.0
SVM Accuracy: 0.5630810092961487 +- 1.1102230246251565e-16


## ANN - MLP

In [12]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, roc_curve, auc
from itertools import combinations_with_replacement
from joblib import Parallel, delayed

target = 'IBOV_Direction'
X = data.drop([
    target,
    'IBOV_High',
    'IBOV_Low',
    'IBOV_Close'
], axis=1).values
y = data[target].values
X_train, y_train, X_test, y_test = train_test_split(X, y)

n_layers = np.arange(2) + 1
n_neurons = np.arange(0, 35, 5) + 5
epochs = [500]
combinations = []

for layers in n_layers:
    combinations.extend(combinations_with_replacement(n_neurons, int(layers)))
    
def run_mlp(X_train, y_train, X_test, y_test, layer_sizes, epochs):
    aucs = []
    accs = []
    
    for i in range(50):
        pipe = Pipeline([
            ('scaler', StandardScaler()),
            ('MLP', MLPClassifier(
                hidden_layer_sizes=layer_sizes,
                solver='adam',
                max_iter=epochs,
                activation='tanh'
            ))
        ])

        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)
        fpr, tpr, _ = roc_curve(y_test, y_pred)
        aucs.append(auc(fpr, tpr))
        accs.append(accuracy_score(y_test, y_pred))
    
    return epochs, layer_sizes, aucs, accs
    
results = Parallel(n_jobs=n_jobs)(delayed(run_mlp)(
    X_train,
    y_train,
    X_test,
    y_test,
    ls,
    ep
) for ep in epochs for ls in combinations)



In [13]:
data = []
for result in results:
    data.append([
        result[0],
        result[1],
        f'{np.mean(result[2])} +- {np.std(result[2])}',
        f'{np.mean(result[3])} +- {np.std(result[3])}'
    ])
    
summary = pd.DataFrame(data, columns=['Epochs', 'Hidden Layer Sizes', 'AUC', 'Accuracy'])
summary.sort_values(['Accuracy', 'AUC'], ascending=False)

Unnamed: 0,Epochs,Hidden Layer Sizes,AUC,Accuracy
8,500,"(5, 10)",0.6352290106675569 +- 0.006883879068387828,0.6372111553784859 +- 0.006605167041842615
11,500,"(5, 25)",0.6343552566731848 +- 0.008035835034513827,0.6363081009296149 +- 0.007270282616540219
7,500,"(5, 5)",0.6325443686639852 +- 0.009406688735247115,0.63535192563081 +- 0.008044520912022516
10,500,"(5, 20)",0.6326198841565848 +- 0.01142897408617064,0.635152722443559 +- 0.010148906596761802
0,500,"(5,)",0.6324695950779797 +- 0.013377278823596846,0.6349402390438247 +- 0.011184028535433719
12,500,"(5, 30)",0.6312482578443374 +- 0.011733527795064948,0.6343160690571049 +- 0.009927382989601389
13,500,"(5, 35)",0.6313954203165821 +- 0.010495543112280627,0.6337450199203186 +- 0.0095894629469875
9,500,"(5, 15)",0.6308841936800157 +- 0.008985076877636533,0.6333864541832669 +- 0.007959978125688272
1,500,"(10,)",0.6314573165238496 +- 0.008336824119995598,0.6330544488711819 +- 0.008327328619714115
2,500,"(15,)",0.6311565793866553 +- 0.012644883963616453,0.6312616201859229 +- 0.0136995160066769


In [14]:
ga = genetic_filter(X, y)

In [15]:
filter_selected = np.nonzero(ga.best_solution()[0])[0]

In [16]:
X_train_fs = X_train[:,filter_selected]
X_test_fs = X_test[:,filter_selected]

results = Parallel(n_jobs=n_jobs)(delayed(run_mlp)(
    X_train_fs,
    y_train,
    X_test_fs,
    y_test,
    ls,
    ep
) for ep in epochs for ls in combinations)

In [17]:
data = []
for result in results:
    data.append([
        result[0],
        result[1],
        f'{np.mean(result[2])} +- {np.std(result[2])}',
        f'{np.mean(result[3])} +- {np.std(result[3])}'
    ])
    
summary = pd.DataFrame(data, columns=['Epochs', 'Hidden Layer Sizes', 'AUC', 'Accuracy'])
summary.sort_values(['Accuracy', 'AUC'], ascending=False)

Unnamed: 0,Epochs,Hidden Layer Sizes,AUC,Accuracy
9,500,"(5, 15)",0.6401816611288639 +- 0.0043402994171005726,0.6411686586985391 +- 0.003957414190106594
1,500,"(10,)",0.6398923705505477 +- 0.005576386452107,0.6409163346613544 +- 0.005717112193855502
7,500,"(5, 5)",0.6397044032156353 +- 0.005060374887903802,0.6405577689243027 +- 0.004862154035084402
0,500,"(5,)",0.6390522673195445 +- 0.0070334399604160165,0.6399468791500663 +- 0.006659414159560076
13,500,"(5, 35)",0.638438816552996 +- 0.008724384039054456,0.639734395750332 +- 0.00742524220853333
11,500,"(5, 25)",0.6384053777629396 +- 0.008756387248272485,0.6395883134130146 +- 0.00769025216607155
12,500,"(5, 30)",0.6380418965252274 +- 0.006707329061464426,0.6393359893758299 +- 0.005852883594734675
2,500,"(15,)",0.6371750316635135 +- 0.00875553428403704,0.6385657370517929 +- 0.00787175003008163
10,500,"(5, 20)",0.636117125853855 +- 0.0069997312666110105,0.6377290836653386 +- 0.00594812696766
3,500,"(20,)",0.6367278209673402 +- 0.0076295524650608825,0.6375166002656042 +- 0.008321354013627652


# Ensemble A

In [18]:
#Todo