# Experimentos: Baseline com Random Forest

Este notebook estabelece baselines de performance usando um modelo de Random Forest com seleção de features clássica (por correlação) em diferentes datasets.

# Dataset = Breast Cancer Wisconsin (Diagnostic)

In [10]:
from sklearn.datasets import load_breast_cancer 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier  # Importando Random Forest
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd

## Loading data and initial analysis

In [11]:
# Load the feature matrix and the label vector of the dataset
X, y = load_breast_cancer(return_X_y=True)

# Report the shape of the feature matrix
print(f"Dimensões dos dados: {X.shape}")


# Report how many samples fall into each class (0 and 1)
print("\nDistribuição das classes:")
# np.unique returns the distinct labels together with their occurrence counts
unique, counts = np.unique(y, return_counts=True)
for idx in range(len(unique)):
    class_label, count = unique[idx], counts[idx]
    print(f"Classe {class_label}: {count} ocorrências")


Dimensões dos dados: (569, 30)

Distribuição das classes:
Classe 0: 212 ocorrências
Classe 1: 357 ocorrências


## Split data into training and testing sets

In [12]:
# Hold out 25% of the samples for testing with a fixed seed.
# stratify=y keeps the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

print(f"Dimensões do conjunto de treino: {X_train.shape}")
print(f"Dimensões do conjunto de teste: {X_test.shape}")


Dimensões do conjunto de treino: (426, 30)
Dimensões do conjunto de teste: (143, 30)


## Applying feature selection by correlation (30 -> 5 features)

In [13]:

# NOTE(review): this list is hard-coded; presumably it comes from an earlier
# correlation analysis - confirm it was derived from the training split only.
top_features_bc = [
    'worst concave points',
    'mean concave points',
    'worst perimeter',
    'worst radius',
    'mean perimeter',
]

# Wrap both splits in DataFrames so that columns can be picked by name
feature_names = load_breast_cancer().feature_names
df_X_train, df_X_test = (
    pd.DataFrame(split_values, columns=feature_names)
    for split_values in (X_train, X_test)
)

# Keep only the selected columns, in the order listed above
X_train_selected_bc = df_X_train.loc[:, top_features_bc]
X_test_selected_bc = df_X_test.loc[:, top_features_bc]

print("Seleção de Features Aplicada (Breast Cancer)")
print("Novo formato do X_train:", X_train_selected_bc.shape)

Seleção de Features Aplicada (Breast Cancer)
Novo formato do X_train: (426, 5)


## Normalize data

In [14]:
# Standardize the selected features: the scaler is fitted on the training
# split only, and the same fitted transform is then applied to both splits.
scaler_bc = StandardScaler()
scaler_bc.fit(X_train_selected_bc)
X_train_scaled_bc = scaler_bc.transform(X_train_selected_bc)
X_test_scaled_bc = scaler_bc.transform(X_test_selected_bc)
print("\nDados normalizados")






Dados normalizados


## Training Random Forest model

In [15]:
# Fit a 100-tree Random Forest (fixed seed) on the standardized training data.
# fit() returns the estimator itself, so construction and fitting are chained.
rf_model_bc = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train_scaled_bc, y_train)

print("\nModelo treinado com Random Forest e feature selection")


Modelo treinado com Random Forest e feature selection


## Predictions

In [16]:
# Predict the labels of the held-out test set with the fitted Random Forest
rf_predictions_bc = rf_model_bc.predict(X_test_scaled_bc)

# Evaluate the predictions against the true test labels
report_bc = classification_report(y_test, rf_predictions_bc)
print("\nRelatório de Classificação com 5 features(bc):")
print(report_bc)


Relatório de Classificação com 5 features(bc):
              precision    recall  f1-score   support

           0       0.94      0.91      0.92        53
           1       0.95      0.97      0.96        90

    accuracy                           0.94       143
   macro avg       0.94      0.94      0.94       143
weighted avg       0.94      0.94      0.94       143



# Dataset = Student Performance

In [17]:
from imblearn.over_sampling import SMOTE


def run_rf_feature_selection(filepath, subject_name, n_features=5, separator=','):
    """Train and evaluate a Random Forest with correlation-based feature selection.

    Steps, in order:
      1. Read the CSV and build a binary target: 1 when ``G3 >= 10``, else 0.
      2. Drop ``G1``, ``G2`` and ``G3`` from the features and one-hot encode
         the rest (``drop_first=True``).
      3. Stratified 75/25 train/test split (``random_state=42``).
      4. Rank features by absolute Pearson correlation with the target,
         computed on the training split only, and keep the best ``n_features``.
      5. Standardize (scaler fitted on the training split), oversample the
         training split with SMOTE, fit a 100-tree Random Forest.
      6. Print the selected features and the test-set classification report.

    Args:
        filepath: Path of the CSV file; it must contain G1, G2 and G3 columns.
        subject_name: Label used only in the progress message.
        n_features: Number of top-correlated features to keep (>= 1).
        separator: Field separator passed to ``pd.read_csv``.

    Returns:
        None. Results are printed.

    Raises:
        ValueError: If ``n_features`` is smaller than 1.
    """
    if n_features < 1:
        raise ValueError(f"n_features must be >= 1, got {n_features}")

    print(f"   Iniciando processamento para Random Forest com Feature Selection no dataset de ({subject_name})    ")

    # Loading and pre-processing
    df = pd.read_csv(filepath, sep=separator)
    X_features = df.drop(columns=['G1', 'G2', 'G3'])
    y_final = np.where(df['G3'] >= 10, 1, 0)
    X_final = pd.get_dummies(X_features, drop_first=True)

    # Split FIRST, so that feature selection never sees the test rows
    X_train_full, X_test_full, y_train, y_test = train_test_split(
        X_final, y_final, test_size=0.25, random_state=42, stratify=y_final
    )

    # Feature selection based ONLY on the training split
    train_df = X_train_full.copy()
    train_df["aprovado"] = y_train
    correlation = (
        train_df.corr(numeric_only=True)["aprovado"]
        # Remove the target by label instead of assuming it sorts first:
        # a feature tied at |r| = 1 could otherwise take its place.
        .drop(labels="aprovado")
        .abs()
        # Columns that are constant in the training split have NaN correlation
        # and must never be picked.
        .dropna()
        .sort_values(ascending=False)
    )
    top_features = correlation.head(n_features).index.tolist()
    print(f"Top {n_features} features selecionadas: {top_features}")

    X_train_selected = X_train_full[top_features]
    X_test_selected = X_test_full[top_features]

    # Standardization and SMOTE (on the already selected features)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)
    X_test_scaled = scaler.transform(X_test_selected)

    # Oversampling is applied to the training split only
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Random Forest training and evaluation
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train_resampled, y_train_resampled)

    predictions = rf_model.predict(X_test_scaled)

    print("\n Relatório de Classificação ")
    print(classification_report(y_test, predictions, target_names=['Reprovado', 'Aprovado']))

## Execution

In [18]:

# Locations of the two Student Performance CSV files
path_portugues = '../data/student-por.csv'
path_matematica = '../data/student-mat.csv'

# Portuguese dataset: read with a comma as the field separator
run_rf_feature_selection(path_portugues, "Português", separator=",")

# Visual divider between the two reports
print("\n" + "="*50 + "\n")

# Math dataset: read with a semicolon as the field separator
run_rf_feature_selection(path_matematica, "Math", separator=";")

   Iniciando processamento para Random Forest com Feature Selection no dataset de (Português)    
Top 5 features selecionadas: ['failures', 'higher_yes', 'school_MS', 'Medu', 'studytime']

 Relatório de Classificação 
              precision    recall  f1-score   support

   Reprovado       0.33      0.40      0.36        25
    Aprovado       0.89      0.86      0.87       138

    accuracy                           0.79       163
   macro avg       0.61      0.63      0.62       163
weighted avg       0.80      0.79      0.79       163



   Iniciando processamento para Random Forest com Feature Selection no dataset de (Math)    
Top 5 features selecionadas: ['failures', 'higher_yes', 'goout', 'age', 'schoolsup_yes']

 Relatório de Classificação 
              precision    recall  f1-score   support

   Reprovado       0.50      0.33      0.40        33
    Aprovado       0.71      0.83      0.77        66

    accuracy                           0.67        99
   macro avg       0.61