In [20]:
import scipy.io


def carregar_dados(filepath: str):
    """Load the spectral dataset stored in a MATLAB ``.mat`` file.

    The file is read with ``scipy.io.loadmat`` and the arrays used downstream
    are gathered into a plain dictionary. A short summary of the array shapes
    is printed before returning.

    Args:
        filepath: Path to the ``.mat`` file.

    Returns:
        dict with the keys ``X_cal``, ``y_cal``, ``X_test``, ``y_test``,
        ``X_val`` and ``wavenumbers`` (in that order). The ``X_*`` entries are
        copies of the loaded matrices; the ``y_*`` entries and ``wavenumbers``
        are flattened to 1-D with ``ravel()``.

    Raises:
        KeyError: If one of the expected variables is missing from the file.
    """
    mat_contents = scipy.io.loadmat(filepath)

    # (output key, variable name inside the .mat file, flatten to 1-D?)
    field_table = (
        ('X_cal', 'inputCalibration', False),
        ('y_cal', 'targetCalibration', True),
        ('X_test', 'inputTest', False),
        ('y_test', 'targetTest', True),
        ('X_val', 'inputValidation', False),
        # The axis values are stored exactly as read. An nm -> cm-1 conversion
        # (10**7 / value) is currently disabled.
        # NOTE(review): the summary below labels them "cm-1" -- confirm the
        # unit actually stored in the file.
        ('wavenumbers', 'wl', True),
    )

    data = {
        out_key: (mat_contents[mat_key].ravel() if flatten
                  else mat_contents[mat_key].copy())
        for out_key, mat_key, flatten in field_table
    }

    summary_lines = (
        "Dados carregados com sucesso.",
        f"  Calibração (X, y): {data['X_cal'].shape}, {data['y_cal'].shape}",
        f"  Teste (X, y):      {data['X_test'].shape}, {data['y_test'].shape}",
        f"  Validação (X):   {data['X_val'].shape}",
        f"  Números de onda (cm-1): {data['wavenumbers'].shape}, de {data['wavenumbers'].min():.2f} a {data['wavenumbers'].max():.2f}",
    )
    for line in summary_lines:
        print(line)

    return data



In [21]:
# Location of the MATLAB data file; a relative path, so it is resolved
# against the notebook's current working directory.
mat_file_path = '2012/ShootOut2012MATLAB/ShootOut2012MATLAB.mat'


# Load every split into a single dict (the loader also prints a shape summary).
dados = carregar_dados(mat_file_path)


Dados carregados com sucesso.
  Calibração (X, y): (89, 372), (89,)
  Teste (X, y):      (72, 372), (72,)
  Validação (X):   (67, 372)
  Números de onda (cm-1): (372,), de 952.42 a 1309.33


In [22]:
import pandas as pd


def _as_feature_frame(matrix):
    """Wrap a 2-D matrix in a DataFrame whose columns are ``feature_<i>``.

    Args:
        matrix: 2-D array-like exposing ``.shape``; one column per feature.

    Returns:
        pandas.DataFrame with columns ``feature_0 .. feature_{n-1}``, where
        ``n`` is ``matrix.shape[1]``.
    """
    column_names = [f'feature_{i}' for i in range(matrix.shape[1])]
    return pd.DataFrame(matrix, columns=column_names)


# Predictor matrices become DataFrames with named columns, built by one helper
# instead of three copies of the same comprehension. Targets become named
# Series. The spectral axis stays a plain array.
x_cal = _as_feature_frame(dados['X_cal'])
y_cal = pd.Series(dados['y_cal'], name='target')
x_test = _as_feature_frame(dados['X_test'])
y_test = pd.Series(dados['y_test'], name='target')
x_val = _as_feature_frame(dados['X_val'])
wavenumbers = dados['wavenumbers']

In [41]:
from meta_heuristics_fs import AntColonyOptimizationFS
from sklearn.linear_model import LinearRegression
"""
Machine Learning Parameters
columns_list : Column names present in x_train_dataframe and x_test which will be used as input list for searching best list of features.

data_dict : X and Y training and test data provided in dictionary format. Below is example of 5 fold cross validation data with keys. {0:{'x_train':x_train_dataframe,'y_train':y_train_array,'x_test':x_test_dataframe,'y_test':y_test_array},
1:{'x_train':x_train_dataframe,'y_train':y_train_array,'x_test':x_test_dataframe,'y_test':y_test_array},
2:{'x_train':x_train_dataframe,'y_train':y_train_array,'x_test':x_test_dataframe,'y_test':y_test_array},
3:{'x_train':x_train_dataframe,'y_train':y_train_array,'x_test':x_test_dataframe,'y_test':y_test_array},
4:{'x_train':x_train_dataframe,'y_train':y_train_array,'x_test':x_test_dataframe,'y_test':y_test_array}}
If you only have train and test data and do not wish to do cross validation, use above dictionary format, with only one key.

use_validation_data : Whether you want to use validation data as a boolean True or False. Default value is True. If false, user need not provide x_validation_dataframe and y_validation_dataframe

x_validation_dataframe : DataFrame containing the features of the validation dataset

y_validation_dataframe : dataframe containing dependent variable of validation dataset

model : Model object. It should have .fit and .predict attribute

cost_function_improvement : Whether the objective is to increase or decrease the cost during subsequent iterations. For regression it should be 'decrease' and for classification it should be 'increase'

cost_function : Cost function for finding cost between actual and predicted values, depending on regression or classification problem. cost function should accept 'actual' and 'predicted' as arrays and return cost for the both.

average : Averaging to be used. This is useful for classification metrics such as 'f1_score', 'jaccard_score', 'fbeta_score', 'precision_score', 'recall_score' and 'roc_auc_score' when dependent variable is multi-class

Ant Colony Optimization Parameters
iterations : Number of times ant colony optimization will search for solutions. Default is 100

N_ants : Number of ants in each iteration. Default is 100.

run_time : Number of minutes to run the algorithm. This is checked between iterations: at the start of each generation, it is checked whether the run time has exceeded the allotted time.
If the run time exceeds the provided limit, the best result from the iterations executed so far is given as output.
Default is 2 hours, i.e. 120 minutes.

evaporation_rate : Evaporation rate. Values are between 0 and 1. If it is too large, chances are higher to find global optima, but computationally expensive. If it is low, chances of finding global optima are less. Default is kept as 0.9

Q : Pheromone update coefficient. Value between 0 and 1. It affects the convergence speed. If it is large, ACO will get stuck at local optima. Default is kept as 0.2

Output
best_columns : List object with list of column names which gives best performance for the model. These features can be used for training and saving models separately by the user.


"""

def cost_function(actual, predicted):
    """Return the mean squared error between observed and predicted values.

    Args:
        actual: Observed values. Must support element-wise subtraction with
            ``predicted`` (e.g. a NumPy array or pandas Series).
        predicted: Predicted values, same length as ``actual``.

    Returns:
        The mean of the squared element-wise differences, as computed by the
        ``.mean()`` method of the intermediate result.
    """
    squared_residuals = (actual - predicted) ** 2
    return squared_residuals.mean()

# Candidate feature names offered to the search: one per column of x_cal.
aco_candidate_columns = [f'feature_{i}' for i in range(x_cal.shape[1])]

# A single train/test "fold" (no cross-validation): fit on the calibration
# split and score candidate subsets on the test split.
# NOTE(review): the test split is therefore what drives feature selection --
# confirm this is intended, since it is then no longer an independent hold-out.
aco_single_fold = {
    'x_train': x_cal,
    'y_train': y_cal,
    'x_test': x_test,
    'y_test': y_test,
}

# NOTE(review): no random seed is passed here; check whether the library
# exposes one if reproducible runs are needed.
aco = AntColonyOptimizationFS(
    columns_list=aco_candidate_columns,
    data_dict={0: aco_single_fold},
    use_validation_data=False,
    model=LinearRegression(),
    cost_function_improvement='decrease',  # lower MSE is better
    cost_function=cost_function,
    average=None,
    iterations=100,
    N_ants=1000,
    run_time=1,  # in minutes
    evaporation_rate=0.5,
    Q=0.2,
)

In [42]:
# Run the ant-colony search and keep what it returns.
# NOTE(review): presumably the list of best-performing column names -- confirm
# against the library's documentation.
features = aco.GetBestFeatures()

Best combined performance on test and validation data for iteration 0: 0.4979209681054345
Best combined performance on test and validation data for iteration 1: 0.433978420202741
Best combined performance on test and validation data for iteration 2: 0.3178403922806912
Best combined performance on test and validation data for iteration 3: 0.29458939216973196
Best combined performance on test and validation data for iteration 4: 0.2753668901295895
Best combined performance on test and validation data for iteration 5: 0.25207428037295165
Best combined performance on test and validation data for iteration 6: 0.24498420117609557
Best combined performance on test and validation data for iteration 7: 0.2400973556407217
Best combined performance on test and validation data for iteration 8: 0.23506136576583814
Best combined performance on test and validation data for iteration 9: 0.23060956982010428
Best combined performance on test and validation data for iteration 10: 0.2296254075287146
Best 

KeyboardInterrupt: 