# Download dos Arquivos

## Conjunto de Teste

In [1]:
# Download link for the test-set archive (a zip hosted on Google Cloud Storage).
# NOTE(review): the query string carries `Expires=1750180799` and a `Signature`
# parameter, so this looks like a time-limited signed URL -- it will presumably
# stop working after that timestamp and has to be regenerated before a re-run.
url = "https://storage.googleapis.com/kaggle-competitions-data/kaggle-v2/105175/12693976/bundle/archive.zip?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1750180799&Signature=iZC%2FUZrRGa3lCrMjoRFA2A2z3FHAk4liH8mgu2fA49lsP9RyRkkuQJP1%2BIYzrPoZlKAO%2FKymLIl9YJjLjbTQJlNjSNE9KEj%2Bp5W1oPduAMMMuExewCcini2kItdsOwOcC104YJ425P%2FCQa9h3uIrrKFt%2B6bpB1RX4Xtp26Dcq6%2B4qGJ26O9ad6UThXaBF0Kwvz2kDYugaaUX3DB%2FhoebkqIs%2FpGUTgWgUoSBMO48njHNdT%2FAdydQNLU2uq571kRwrlJozglM0gYjDGmvNVTxlu0IoIxyqwH%2BGtfn2jHXvQqXZU4f36vf5MaYeVXsuUyRJ24tO6MUMi%2B7h9RCozAVnQ%3D%3D&response-content-disposition=attachment%3B+filename%3Destudo-de-caso-bcc2025-1.zip"

In [None]:
import requests
import zipfile
import io

# Fetch the test-set archive. The timeout keeps the cell from hanging forever
# on a stalled connection.
response = requests.get(url, timeout=60)

# Fail loudly on an HTTP error (for example an expired or denied link) instead
# of handing an error page to ZipFile and getting a confusing BadZipFile.
response.raise_for_status()

# Unpack the archive straight from memory into the ./test directory.
with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    z.extractall('test')

## Conjunto Amostral

In [3]:
# Google Drive direct-download link for the labelled samples; the next cell
# saves the response body as samples.csv.
# Note: this rebinds `url`, which previously pointed at the test-set archive,
# so the download cells only work when executed in order.
url = "https://drive.usercontent.google.com/download?id=1Fu369-ojLT6qHVSu-LWtyQ3BVYGTIYpd&export=download&authuser=0"

In [4]:
import requests
import io

# Fetch the samples file. The timeout keeps the cell from hanging forever on
# a stalled connection.
csv_response = requests.get(url, timeout=60)

# Abort on an HTTP error so that an error page is never written to disk and
# later parsed as if it were the dataset.
csv_response.raise_for_status()

# Persist the raw bytes next to the notebook.
with open("samples.csv", "wb") as f:
    f.write(csv_response.content)

# Treinamento do Modelo

## Carregamento dos Dados

In [5]:
import pandas as pd

# Load the downloaded CSV into a DataFrame with pandas' default parsing options.
samples = pd.read_csv("samples.csv")

# Bare last expression: the notebook renders the frame as the cell output.
samples

Unnamed: 0,peso,altura,imc,idade,pas,pad,ppa,b2,sopro,fc,hda1,hda2,sexo,motivo1,motivo2,pc
0,25.0,110,21.0,4.25,100.0,60.0,Normal,Normal,Ausente,72.0,Assintomático,,M,Parecer cardiológico,Cirurgia,Normal
1,19.3,105,18.0,3.44,100.0,60.0,Normal,Normal,Sistólico,80.0,Assintomático,,F,Suspeita de cardiopatia,Sopro,Anormal
2,41.0,123,27.0,7.43,100.0,60.0,Normal,Normal,Ausente,80.0,,,M,Suspeita de cardiopatia,Dor precordial,Normal
3,28.0,108,24.0,4.42,90.0,60.0,Normal,Normal,Sistólico,100.0,Ganho de peso,,F,Suspeita de cardiopatia,Sopro,Anormal
4,55.0,172,19.0,14.16,110.0,80.0,Normal,Normal,Sistólico,80.0,,,M,Suspeita de cardiopatia,Dispnéia,Anormal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6287,51.5,153,22.0,12.86,100.0,60.0,Normal,Normal,Ausente,74.0,Assintomático,,M,Parecer cardiológico,,Normal
6288,42.0,142,21.0,2.79,105.0,65.0,Pre-Hipertensão PAD,Normal,Ausente,78.0,Dispneia,,F,Parecer cardiológico,,Normal
6289,13.9,95,15.0,3.87,90.0,65.0,Pre-Hipertensão PAD,Normal,Ausente,92.0,Assintomático,,M,Parecer cardiológico,Cirurgia,Normal
6290,20.0,124,13.0,7.94,100.0,70.0,Normal,Normal,Sistólico,100.0,Assintomático,,F,Parecer cardiológico,,Anormal


## Pré-processamento dos Dados

In [10]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
def get_preprocessor(X):
  """Build a column transformer for ``X`` and fit it on ``X``.

  Numeric columns are standardised with ``StandardScaler``; text and
  categorical columns are one-hot encoded into a dense array. Categories
  that were not seen during fitting are ignored at transform time
  (``handle_unknown='ignore'``) instead of raising.

  Args:
    X: pandas DataFrame holding the feature columns.

  Returns:
    The column transformer, already fitted on ``X``.
  """
  # "number" selects every numeric dtype (int32, float32, ...), not just
  # int64/float64, so such columns are no longer left out of the transformer.
  colunas_numericas = X.select_dtypes(include='number').columns
  # Treat pandas `category` columns like plain object (string) columns.
  colunas_categoricas = X.select_dtypes(include=['object', 'category']).columns

  return make_column_transformer(
    (StandardScaler(), colunas_numericas),
    (OneHotEncoder(sparse_output=False, handle_unknown='ignore'), colunas_categoricas)
  ).fit(X)

In [None]:
# Encode the target as 0/1. The guard keeps the cell idempotent: once `pc` is
# numeric, mapping it a second time would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(samples['pc']):
    samples['pc'] = samples['pc'].map({'Normal': 0, 'Anormal': 1})

# Features are every column except the target `pc` and the `sopro` column.
X = samples.drop(columns=["pc", "sopro"])
y = samples["pc"]

In [None]:
# Build the column transformer; get_preprocessor already fits it on X before
# returning, so `preprocessor` is ready for `.transform`.
preprocessor = get_preprocessor(X)

In [None]:
# The transformer returns a plain NumPy array, which has neither `.sample`
# nor an index. Wrap the result in a DataFrame that keeps the original row
# index (so rows can still be matched against `y`) and the generated feature
# names.
# Note: this rebinds `X` from raw features to encoded features, so the cell
# must not be executed twice without re-creating the raw `X` first.
X = pd.DataFrame(
    preprocessor.transform(X),
    index=X.index,
    columns=preprocessor.get_feature_names_out(),
)

In [None]:
# Draw a reproducible 10% calibration subset. The sample is taken on `y`
# (a pandas Series) and converted to row positions, so the matching feature
# rows can be selected whether `X` is a DataFrame or a NumPy array -- the
# latter has no `.sample()` / `.index` and would raise AttributeError.
y_calib = y.sample(frac=0.1, random_state=42)
calib_pos = y.index.get_indexer(y_calib.index)
X_calib = X.iloc[calib_pos] if isinstance(X, pd.DataFrame) else X[calib_pos]

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.model_selection import KFold

from scipy.stats import uniform, randint

In [None]:
# Base model whose hyper-parameters are tuned below; the fixed seed makes each
# candidate forest reproducible.
estimator = RandomForestClassifier(n_jobs=-1, random_state=42)

# Search space: lists are sampled uniformly, scipy distributions are drawn from.
param_distributions = dict(
    criterion=["gini", "entropy"],
    max_depth=randint(5, 100),
    max_features=uniform(0, 1),
    min_samples_split=randint(2, 40),
    min_samples_leaf=randint(1, 10),
    class_weight=[None, "balanced", "balanced_subsample"],
)

# Successive-halving random search scored by F1 with shuffled 5-fold CV.
# refit=False: no final model is trained on the whole calibration set here.
hyperparams = HalvingRandomSearchCV(
    estimator=estimator,
    param_distributions=param_distributions,
    factor=2,
    scoring='f1',
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    refit=False,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

hyperparams.fit(X_calib, y_calib)

## Avaliação do Modelo

In [11]:
from sklearn.metrics import precision_score, recall_score

In [12]:
def calc_precision(estimator, X, y):
  """Return the precision of ``estimator``'s predictions on ``X`` against ``y``.

  Args:
    estimator: fitted model exposing ``predict``.
    X: features passed to ``estimator.predict``.
    y: true labels compared with the predictions.
  """
  return precision_score(y, estimator.predict(X))

def calc_recall(estimator, X, y):
  """Return the recall of ``estimator``'s predictions on ``X`` against ``y``.

  Args:
    estimator: fitted model exposing ``predict``.
    X: features passed to ``estimator.predict``.
    y: true labels compared with the predictions.
  """
  return recall_score(y, estimator.predict(X))