# Experimentos: Baseline com Rede Neural Clássica (MLP)

Este notebook estabelece baselines de performance com um modelo de Rede Neural Clássica (MLP) treinado após seleção de features, em dois datasets: Breast Cancer Wisconsin (Diagnostic) e Student Performance.

# Dataset = Breast Cancer Wisconsin (Diagnostic)

In [1]:
from sklearn.datasets import load_breast_cancer 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier  # Importando MLP
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd

## Loading data and initial analysis

In [2]:
# Load the feature matrix and the target vector in a single call.
X, y = load_breast_cancer(return_X_y=True)

# Report the overall shape of the data as (samples, features).
print(f"Dimensões dos dados: {X.shape}")

# Show how many samples belong to each class (0 and 1).
print("\nDistribuição das classes:")
# np.unique with return_counts yields the distinct labels and their frequencies.
unique, counts = np.unique(y, return_counts=True)
class_lines = [
    f"Classe {class_label}: {count} ocorrências"
    for class_label, count in zip(unique, counts)
]
print("\n".join(class_lines))


Dimensões dos dados: (569, 30)

Distribuição das classes:
Classe 0: 212 ocorrências
Classe 1: 357 ocorrências


## Split data into training and testing sets

In [3]:
# Hold out 25% of the samples for testing; `stratify=y` keeps the class
# proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

print(f"Dimensões do conjunto de treino: {X_train.shape}")
print(f"Dimensões do conjunto de teste: {X_test.shape}")


Dimensões do conjunto de treino: (426, 30)
Dimensões do conjunto de teste: (143, 30)


## Applying feature selection by correlation (30 -> 5 features)

In [4]:

# Hard-coded list of the five features kept for this experiment.
# NOTE(review): the correlation ranking that produced this list is not computed
# in this cell -- confirm it was derived from the training split only.
top_features_bc = [
    'worst concave points',
    'mean concave points',
    'worst perimeter',
    'worst radius',
    'mean perimeter',
]

# X was loaded as a bare array, so the column labels are fetched from the
# dataset bunch and attached to both splits.
feature_names = load_breast_cancer().feature_names
df_X_train, df_X_test = (
    pd.DataFrame(split, columns=feature_names) for split in (X_train, X_test)
)

# Keep only the selected columns, in the order given by top_features_bc.
X_train_selected_bc = df_X_train.loc[:, top_features_bc]
X_test_selected_bc = df_X_test.loc[:, top_features_bc]

print("Seleção de Features Aplicada (Breast Cancer)")
print("Novo formato do X_train:", X_train_selected_bc.shape)

Seleção de Features Aplicada (Breast Cancer)
Novo formato do X_train: (426, 5)


## Normalize data

In [5]:
# Standardize the selected features: the scaler learns its statistics from the
# training split only and the same transformation is then applied to both splits.
scaler_bc = StandardScaler().fit(X_train_selected_bc)
X_train_scaled_bc = scaler_bc.transform(X_train_selected_bc)
X_test_scaled_bc = scaler_bc.transform(X_test_selected_bc)
print("\nDados normalizados")






Dados normalizados


## Training MLP model

In [6]:
# Fit an MLP with two hidden layers (100 and 50 units) on the scaled training
# data; `fit` returns the estimator itself, so it can be chained.
mlp_breast = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    max_iter=1000,
    random_state=42,
).fit(X_train_scaled_bc, y_train)
print("\nMLP com feature selection treinado com sucesso!")


MLP com feature selection treinado com sucesso!


## Predictions

In [7]:
# Predict the held-out test split with the trained MLP.
mlp_predictions_breast = mlp_breast.predict(X_test_scaled_bc)

# Per-class precision / recall / F1, labelled with the dataset's class names.
print("\n--- Relatório de Classificação para o MLP com 5 Features (Breast Cancer) ---")
breast_report = classification_report(
    y_test,
    mlp_predictions_breast,
    target_names=load_breast_cancer().target_names,
)
print(breast_report)


--- Relatório de Classificação para o MLP com 5 Features (Breast Cancer) ---
              precision    recall  f1-score   support

   malignant       0.96      0.92      0.94        53
      benign       0.96      0.98      0.97        90

    accuracy                           0.96       143
   macro avg       0.96      0.95      0.95       143
weighted avg       0.96      0.96      0.96       143



# Dataset = Student Performance

In [None]:
from imblearn.over_sampling import SMOTE


def run_mlp_feature_selection(filepath, subject_name, n_features=5, separator=','):
    """Train and evaluate an MLP on the most target-correlated features of a student dataset.

    Steps performed:
      1. Read the CSV and build a binary target from ``G3`` (1 when ``G3 >= 10``, else 0).
      2. Drop ``G1``/``G2``/``G3`` from the inputs and one-hot encode what remains.
      3. Split into train/test (25% test, stratified on the target).
      4. Rank features by absolute correlation with the target, using the
         training split only, and keep the ``n_features`` best.
      5. Standardize the kept features (scaler fitted on the training split).
      6. Resample the training split with SMOTE.
      7. Fit the MLP and print a classification report for the test split.

    Args:
        filepath: Path of the CSV file; must contain the columns G1, G2 and G3.
        subject_name: Label used only in the printed header.
        n_features: Number of top-ranked features to keep (must be >= 1).
        separator: Field delimiter passed to ``pandas.read_csv``.

    Returns:
        None. All results are printed.

    Raises:
        ValueError: If ``n_features`` is smaller than 1.
    """
    if n_features < 1:
        raise ValueError(f"n_features must be >= 1, got {n_features}")

    print(f" Iniciando processamento: MLP com Feature Selection para conjunto de:({subject_name})    ")

    # Loading and pre-processing
    df = pd.read_csv(filepath, sep=separator)
    # All three grade columns are removed from the inputs; only G3 defines the target.
    X_features = df.drop(columns=['G1', 'G2', 'G3'])
    y_final = np.where(df['G3'] >= 10, 1, 0)
    X_final = pd.get_dummies(X_features, drop_first=True)

    # Train/test split
    X_train_full, X_test_full, y_train, y_test = train_test_split(
        X_final, y_final, test_size=0.25, random_state=42, stratify=y_final
    )

    # Feature selection based ONLY on the training split
    train_df = X_train_full.copy()
    train_df["aprovado"] = y_train
    correlation = (
        train_df.corr(numeric_only=True)["aprovado"]
        # Remove the target by label: a positional slice would keep the target
        # (and drop a real feature) whenever a feature is perfectly correlated
        # with it and happens to sort first.
        .drop(labels="aprovado")
        .abs()
        # Constant columns produce NaN correlations; they carry no ranking signal.
        .dropna()
        .sort_values(ascending=False)
    )
    top_features = correlation.head(n_features).index.tolist()
    print(f"Top {n_features} features selecionadas: {top_features}")

    X_train_selected = X_train_full[top_features]
    X_test_selected = X_test_full[top_features]

    # Scaling and SMOTE (on the already-selected features)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)
    X_test_scaled = scaler.transform(X_test_selected)

    # Only the training split is resampled; the test split is left untouched.
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

    # MLP training and evaluation
    mlp_model = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)
    mlp_model.fit(X_train_resampled, y_train_resampled)

    predictions = mlp_model.predict(X_test_scaled)

    print("\n  Relatório de Classificação  ")
    print(classification_report(y_test, predictions, target_names=['Reprovado', 'Aprovado']))

## Execution

In [9]:

path_portugues = '../data/student-por.csv'
path_matematica = '../data/student-mat.csv'

# One entry per experiment: (CSV path, subject label, field delimiter).
student_experiments = [
    (path_portugues, "Português", ","),
    (path_matematica, "Math", ";"),
]

for position, (csv_path, subject, delimiter) in enumerate(student_experiments):
    # Visual divider between consecutive reports (not before the first one).
    if position > 0:
        print("\n" + "="*50 + "\n")
    run_mlp_feature_selection(filepath=csv_path, subject_name=subject, separator=delimiter)

--- Iniciando Experimento: MLP com Seleção de 5 Features (Português) ---
Top 5 features selecionadas: ['failures', 'higher_yes', 'school_MS', 'Medu', 'studytime']

--- Relatório de Classificação ---
              precision    recall  f1-score   support

   Reprovado       0.27      0.48      0.35        25
    Aprovado       0.89      0.77      0.82       138

    accuracy                           0.72       163
   macro avg       0.58      0.62      0.59       163
weighted avg       0.80      0.72      0.75       163



--- Iniciando Experimento: MLP com Seleção de 5 Features (Math) ---
Top 5 features selecionadas: ['failures', 'higher_yes', 'goout', 'age', 'schoolsup_yes']

--- Relatório de Classificação ---
              precision    recall  f1-score   support

   Reprovado       0.41      0.27      0.33        33
    Aprovado       0.69      0.80      0.74        66

    accuracy                           0.63        99
   macro avg       0.55      0.54      0.53        99
weighte