## Setup

In [1]:
import sys, os

# Make the project's ``src`` directory (one level above the working directory)
# importable. The membership check keeps this cell idempotent: re-running it
# no longer prepends a duplicate entry to ``sys.path`` every time.
_src_dir = os.path.abspath(os.path.join('..', 'src'))
if _src_dir not in sys.path:
    sys.path.insert(0, _src_dir)

In [2]:
from dotenv import load_dotenv

# Load environment variables from the .env file located one directory above
# the current working directory.
load_dotenv(dotenv_path='../.env')

True

In [3]:
import concurrent.futures
import json
from pathlib import Path
import time
from pprint import pprint
from pymongo import MongoClient
from datetime import datetime

import matplotlib.pyplot as plt
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection

import autosklearn.classification

from AutoSklearn.with_ensemble import with_ensemble_experiment
from AutoSklearn.without_ensemble import without_ensemble_experiment

  from pkg_resources import parse_version  # type: ignore


In [4]:
def get_mongo_connection():
    """Return the ``Experimentos`` collection of the ``TCC`` MongoDB database.

    The connection string is read from the ``MONGODB_CONNECTION_STRING``
    environment variable. The ``MongoClient`` is created on the first call and
    reused afterwards (as long as the connection string does not change), so
    repeated calls do not leave a trail of open clients behind.

    Returns:
        The ``Experimentos`` collection object of the ``TCC`` database.

    Raises:
        RuntimeError: If ``MONGODB_CONNECTION_STRING`` is unset or empty.
            Without this check ``MongoClient(None)`` would silently target a
            MongoDB on localhost instead of the intended server.
    """
    connection_string = os.getenv('MONGODB_CONNECTION_STRING')
    if not connection_string:
        raise RuntimeError(
            "Variável de ambiente MONGODB_CONNECTION_STRING não definida; "
            "verifique o arquivo .env"
        )

    # Cache is stored on the function object as (connection_string, client).
    cached = getattr(get_mongo_connection, '_cached_client', None)
    if cached is None or cached[0] != connection_string:
        if cached is not None:
            # The connection string changed (e.g. .env was edited and reloaded):
            # release the stale client before replacing it.
            cached[1].close()
        cached = (connection_string, MongoClient(connection_string))
        get_mongo_connection._cached_client = cached

    client = cached[1]
    return client['TCC']['Experimentos']

def _extract_simple_params(step):
    """Return the subset of ``step.get_params()`` made of plain values.

    Ints, floats, strings, booleans and ``None`` are kept as they are. A dict
    value is reduced to its plain-valued entries and dropped entirely when
    none remain. Any other value is replaced by its type name. A step without
    ``get_params``, or one whose parameters cannot be read, yields ``{}``.
    """
    simple_types = (int, float, str, bool, type(None))
    try:
        if not hasattr(step, 'get_params'):
            return {}

        simple_params = {}
        for key, value in step.get_params().items():
            if isinstance(value, simple_types):
                simple_params[key] = value
            elif isinstance(value, dict):
                # For dictionaries keep only the plain-valued entries.
                simple_dict = {
                    k: v for k, v in value.items() if isinstance(v, simple_types)
                }
                if simple_dict:
                    simple_params[key] = simple_dict
            else:
                # Complex objects are represented by their type name only.
                simple_params[key] = str(type(value).__name__)
        return simple_params
    except Exception:
        # Best effort: a step with unreadable parameters is still reported,
        # just without parameters. ``Exception`` (not a bare ``except``) so
        # KeyboardInterrupt / SystemExit are never swallowed.
        return {}


def serialize_model_info(automl_model):
    """Extract a MongoDB-serialisable description of a fitted AutoSklearn model.

    Args:
        automl_model: Object exposing ``get_models_with_weights()``, which must
            return a sized iterable of ``(weight, pipeline)`` pairs; each
            pipeline must expose ``steps`` as ``(name, step)`` pairs.

    Returns:
        dict: ``{'total_models': int, 'models': [...]}`` where every model entry
        holds its ``weight`` plus ``classifier``, ``data_preprocessor`` and
        ``feature_preprocessor`` sub-dicts (``algorithm`` class name and plain
        ``parameters``). Steps with any other name are ignored. If anything
        fails, ``{'error': <message>}`` is returned instead of raising.
    """
    tracked_steps = ('classifier', 'data_preprocessor', 'feature_preprocessor')
    try:
        # Query the ensemble once and reuse it for both the count and the loop.
        models_with_weights = automl_model.get_models_with_weights()
        model_info = {
            'total_models': len(models_with_weights),
            'models': []
        }

        # Detailed information for every model in the ensemble.
        for weight, pipeline in models_with_weights:
            model_detail = {
                'weight': float(weight),
                'classifier': {},
                'data_preprocessor': {},
                'feature_preprocessor': {}
            }

            for name, step in pipeline.steps:
                # Steps exposing a ``choice`` attribute are described by that
                # chosen component rather than by the wrapper itself.
                real_step = step.choice if hasattr(step, "choice") else step

                if name in tracked_steps:
                    model_detail[name] = {
                        'algorithm': real_step.__class__.__name__,
                        'parameters': _extract_simple_params(real_step)
                    }

            model_info['models'].append(model_detail)

        return model_info

    except Exception as e:
        return {'error': f'Erro ao processar modelo: {str(e)}'}

## Dataset and Experiments Parameters

In [5]:
# Experiment configuration: which OpenML dataset to use, the time budget handed
# to each experiment, and the per-dataset scratch directory.
dataset_name = 'iris'
time_limit = 1 * 60  # number of minutes * 60 (presumably seconds; confirm in the experiment helpers)
tmp_path = Path('../results/tmp', dataset_name)

# Download version 1 of the dataset from OpenML as pandas objects.
X, y = sklearn.datasets.fetch_openml(
    name=dataset_name, version=1, return_X_y=True, as_frame=True
)

# Hold out a test split; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X,
    y,
    random_state=1,
)

## Experiments Functions

In [6]:
# Wall-clock start of the whole run; the elapsed time is reported after both
# experiments have finished.
start_time = time.time()

# Filled by process_result_mongodb: experiment name -> fitted model, accuracy
# and test-set predictions, kept in memory for later analysis.
results = {}

def process_result_mongodb(experiment_name, automl_model, dataset_name):
    """Evaluate a fitted model on the test split and save the outcome to MongoDB.

    Reads the notebook-level ``X_test`` / ``y_test`` globals and writes into the
    notebook-level ``results`` dict, so those must exist before this is called.

    Args:
        experiment_name: Label of the experiment. The stored ``with_ensemble``
            flag is True only when this equals ``"COM ensemble"``.
        automl_model: Fitted model exposing ``predict``; also passed to
            ``serialize_model_info``.
        dataset_name: Dataset name recorded in the stored document.

    Side effects:
        Prints progress, inserts one document through ``get_mongo_connection``
        (a failure there is printed, not raised), and stores the model, accuracy
        and predictions under ``results[experiment_name]``.
    """
    # Local import: the notebook's import cell only brings in ``datetime``.
    from datetime import timezone

    print(f"{experiment_name} concluído!")

    predictions = automl_model.predict(X_test)
    accuracy = sklearn.metrics.accuracy_score(y_test, predictions)

    # Weighted averages so the multi-class scores account for class support.
    precision = sklearn.metrics.precision_score(y_test, predictions, average='weighted')
    recall = sklearn.metrics.recall_score(y_test, predictions, average='weighted')
    f1 = sklearn.metrics.f1_score(y_test, predictions, average='weighted')

    experiment_document = {
        # Timezone-aware UTC timestamp: PyMongo treats naive datetimes as UTC,
        # so a naive local ``datetime.now()`` would be stored shifted by the
        # local UTC offset.
        'timestamp': datetime.now(timezone.utc),
        'dataset': dataset_name,
        'with_ensemble': experiment_name == "COM ensemble",
        'metrics': {
            'accuracy': float(accuracy),
            'precision': float(precision),
            'recall': float(recall),
            'f1_score': float(f1)
        },
        'model': serialize_model_info(automl_model),
    }

    # Persist to MongoDB; a storage failure must not discard the in-memory result.
    try:
        collection = get_mongo_connection()
        result = collection.insert_one(experiment_document)
        print(f"✅ Salvo no MongoDB: {result.inserted_id}")
    except Exception as e:
        print(f"❌ Erro MongoDB: {e}")

    # Keep a local copy for later analysis in the notebook.
    results[experiment_name] = {
        'model': automl_model,
        'accuracy': accuracy,
        'predictions': predictions
    }

# Run both experiments concurrently and process each one as soon as it finishes.
# Every future is handled independently: a failure in one experiment is recorded
# and reported, but the other experiment is still evaluated and saved. The first
# recorded failure is re-raised once the pool has shut down.
experiment_errors = {}

with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    print("Iniciando experimento COM ensemble...")
    future_with = executor.submit(
        with_ensemble_experiment,
        X_train, y_train, time_limit, dataset_name, tmp_path / 'with_ensemble'
    )

    print("Iniciando experimento SEM ensemble...")
    future_without = executor.submit(
        without_ensemble_experiment,
        X_train, y_train, time_limit, dataset_name, tmp_path / 'without_ensemble'
    )

    for future in concurrent.futures.as_completed([future_with, future_without]):
        finished_label = "COM ensemble" if future is future_with else "SEM ensemble"
        try:
            fitted_model = future.result()
            # Keep the notebook-level names so later cells can use the models.
            if future is future_with:
                automl_with_ensemble = fitted_model
            else:
                automl_without_ensemble = fitted_model
            process_result_mongodb(finished_label, fitted_model, dataset_name)
        except Exception as exc:
            experiment_errors[finished_label] = exc
            print(f"❌ {finished_label} falhou: {exc}")

if experiment_errors:
    # Surface the failure with its original type, as before, but only after
    # every experiment had the chance to be processed.
    raise next(iter(experiment_errors.values()))

total_time = time.time() - start_time
print(f"Todos os experimentos concluídos em {total_time:.2f} segundos")

Iniciando experimento COM ensemble...
Iniciando experimento SEM ensemble...




COM ensemble concluído!
❌ Erro MongoDB: SSL handshake failed: ac-wvjp0kv-shard-00-02.qc5bvbn.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1147) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms),SSL handshake failed: ac-wvjp0kv-shard-00-01.qc5bvbn.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1147) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms),SSL handshake failed: ac-wvjp0kv-shard-00-00.qc5bvbn.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1147) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 687ef191b772851e67661be7, topology_type: ReplicaSetNoPrimary, servers: [<ServerDescription ('ac-wvjp0kv-shard-00-00.qc5bvbn.mongodb.net', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('SSL handshake failed:

## Explaining the models

## Lista de próximas etapas:

- Verificar as informações dos modelos de pré-processamento

- Testar com outros datasets (mnist e wine quality dataset)

- Testar o imdb com mais tempo para estourar o teto

- Melhorar a estrutura para salvar os resultados dos experimentos com mais métricas e os gráficos

### Outros tópicos

- Pesquisar sobre SHAP e interpretabilidade
- Começar a usar o TPOT ou H2O