# CardioIA - Pipeline de Treinamento no Colab
Este notebook automatiza o download e a preparação do dataset, além do treinamento e da exportação dos modelos do projeto CardioIA.

In [None]:
import tensorflow as tf
from IPython.display import HTML, display

# Ask TensorFlow which GPU devices are visible to this runtime.
gpus = tf.config.list_physical_devices('GPU')
if not gpus:
    # No GPU visible: render a red HTML banner explaining how to switch the runtime type.
    display(HTML("<h2 style='color: red;'>ALERTA: GPU não detectada! Vá em Runtime -&gt; Change runtime type -&gt; GPU (T4).</h2>"))
else:
    print(f"GPUs detectadas: {gpus}")

In [None]:
# Install the third-party packages that the cells of this notebook import or invoke.
%pip install tensorflow pandas python-dotenv kaggle tqdm matplotlib

## Configuração da API Kaggle
Informe as credenciais via variáveis de ambiente (.env com `KAGGLE_USERNAME` e `KAGGLE_KEY`) ou faça upload do arquivo `kaggle.json`.

In [None]:
import json
import os
from pathlib import Path

from dotenv import load_dotenv
from google.colab import files

# Load variables from a .env file (if any) so os.getenv can see them below.
load_dotenv()

# The token file lives at ~/.kaggle/kaggle.json.
kaggle_dir = Path.home() / '.kaggle'
kaggle_dir.mkdir(exist_ok=True)
token_path = kaggle_dir / 'kaggle.json'

username = os.getenv('KAGGLE_USERNAME')
key = os.getenv('KAGGLE_KEY')

if username and key:
    # Environment credentials take priority: (re)write the token file from them.
    token_path.write_text(json.dumps({'username': username, 'key': key}), encoding='utf-8')
    print('Credenciais carregadas a partir do .env.')
elif token_path.exists():
    # A token from a previous run is reused as-is.
    print('kaggle.json já estava presente. Pulando upload.')
else:
    # Last resort: ask the user to upload the token through the Colab file picker.
    print('Faça upload do arquivo kaggle.json exportado das suas credenciais Kaggle.')
    uploaded = files.upload()
    if 'kaggle.json' not in uploaded:
        raise RuntimeError('kaggle.json não foi enviado. Tente novamente.')
    token_path.write_bytes(uploaded['kaggle.json'])
    print('kaggle.json recebido com sucesso.')

# Restrict the token file to owner read/write only.
os.chmod(token_path, 0o600)
print('Kaggle API configurada.')

## ETL: Download, Preparação e Organização do Dataset

In [None]:
from pathlib import Path

# Scratch directory that receives the downloaded archive and its extracted contents.
TEMP_DIR = Path('temp_data')
TEMP_DIR.mkdir(parents=True, exist_ok=True)
zip_path = TEMP_DIR / 'nih-chest-x-ray-14-224x224-resized.zip'

# Only download when the archive is not already on disk.
if not zip_path.exists():
    !kaggle datasets download -d xhlulu/nih-chest-x-ray-14-224x224-resized -p temp_data
else:
    print('Arquivo zip já existe. Pulando download...')

# Extract quietly (-q), overwriting existing files without prompting (-o), into temp_data.
!unzip -q -o temp_data/nih-chest-x-ray-14-224x224-resized.zip -d temp_data
print('Extração concluída.')

In [None]:
from pathlib import Path
import shutil

import pandas as pd
from tqdm.auto import tqdm

TEMP_DIR = Path('temp_data')
DATA_DIR = Path('data')
TRAIN_DIR = DATA_DIR / 'train'
VAL_DIR = DATA_DIR / 'validation'

# Upper bound on the number of images sampled for each class.
MAX_PER_CLASS = 1000

# Locate the metadata CSV anywhere under the extraction directory; fail with an
# actionable message instead of a bare StopIteration when it is missing.
metadata_path = next(TEMP_DIR.rglob('Data_Entry_2017.csv'), None)
if metadata_path is None:
    raise FileNotFoundError(
        f'Data_Entry_2017.csv não encontrado em {TEMP_DIR}. '
        'Execute a célula de download/extração antes desta.'
    )
df = pd.read_csv(metadata_path)

# Positive class: any row whose label string mentions Cardiomegaly (possibly alongside
# other findings). Literal match (regex=False); rows with a missing label count as
# non-matches (na=False) so the boolean mask never contains NaN.
cardio_df = df[df['Finding Labels'].str.contains('Cardiomegaly', regex=False, na=False)].copy()
# Negative class: rows labelled exactly 'No Finding'.
normal_df = df[df['Finding Labels'] == 'No Finding'].copy()

cardio_target = min(MAX_PER_CLASS, len(cardio_df))
normal_target = min(MAX_PER_CLASS, len(normal_df))

if cardio_target < MAX_PER_CLASS:
    print(f'Aviso: apenas {cardio_target} imagens de Cardiomegalia disponíveis.')
if normal_target < MAX_PER_CLASS:
    print(f'Aviso: apenas {normal_target} imagens Normais disponíveis.')

# Fixed seed keeps the sampled subset (and therefore the later split) reproducible.
cardio_df = cardio_df.sample(n=cardio_target, random_state=42).reset_index(drop=True)
normal_df = normal_df.sample(n=normal_target, random_state=42).reset_index(drop=True)

def split_dataframe(dataframe, train_frac=0.8):
    """Split *dataframe* by position into a leading and a trailing part.

    Args:
        dataframe: Table to split. Rows keep their current order; no shuffling
            happens here.
        train_frac: Fraction of the rows assigned to the first part.

    Returns:
        A ``(train, validation)`` tuple holding the first
        ``int(len(dataframe) * train_frac)`` rows and the remaining rows.
    """
    corte = int(len(dataframe) * train_frac)
    return dataframe.iloc[:corte], dataframe.iloc[corte:]

cardio_train, cardio_val = split_dataframe(cardio_df)
normal_train, normal_val = split_dataframe(normal_df)

# First-level directories of the extraction, sorted so the lookup order does not
# depend on the unspecified order of iterdir().
dataset_root_candidates = sorted(p for p in TEMP_DIR.iterdir() if p.is_dir())
dataset_root = dataset_root_candidates[0] if dataset_root_candidates else TEMP_DIR

# 1) Preferred layout: a folder with a well-known name directly under any
#    first-level directory, or under TEMP_DIR itself (checked last).
images_dir = None
for root in [*dataset_root_candidates, TEMP_DIR]:
    for name in ['images_224', 'images']:
        candidate = root / name
        if candidate.is_dir():
            images_dir = candidate
            break
    if images_dir is not None:
        break

# 2) Fallback: any directory whose name contains "image" and that directly holds
#    at least one .png/.jpg file. Searching from TEMP_DIR (rather than from a single
#    first-level directory) also covers first-level folders that hold the images
#    themselves, because rglob never yields its own starting directory.
if images_dir is None:
    for candidate in TEMP_DIR.rglob('*'):
        if candidate.is_dir() and 'image' in candidate.name.lower():
            sample_file = next(candidate.glob('*.png'), None) or next(candidate.glob('*.jpg'), None)
            if sample_file:
                images_dir = candidate
                break

if images_dir is None:
    raise RuntimeError('Diretório de imagens não encontrado após extração.')

print(f'Imagens localizadas em: {images_dir}')

# Rebuild the train/validation folders from scratch so files left by an earlier
# run are not mixed into the new dataset.
for subset_dir in (TRAIN_DIR, VAL_DIR):
    if subset_dir.exists():
        shutil.rmtree(subset_dir)
    subset_dir.mkdir(parents=True, exist_ok=True)

def copiar_imagens(dataframe, subset, label):
    """Copy the images listed in *dataframe* into ``DATA_DIR / subset / label``.

    Args:
        dataframe: Table with an ``'Image Index'`` column of file names, resolved
            relative to the module-level ``images_dir``.
        subset: Name of the subset folder (e.g. ``'train'``).
        label: Name of the class folder inside the subset.

    Returns:
        Number of files actually copied. Listed files that do not exist in
        ``images_dir`` are reported through ``tqdm.write`` and not counted.
    """
    pasta_destino = DATA_DIR / subset / label
    pasta_destino.mkdir(parents=True, exist_ok=True)

    copiadas = 0
    barra = tqdm(dataframe['Image Index'], desc=f'{subset}/{label}', unit='img')
    for nome_arquivo in barra:
        arquivo_origem = images_dir / nome_arquivo
        if not arquivo_origem.exists():
            # tqdm.write prints without breaking the progress bar rendering.
            tqdm.write(f'Arquivo ausente: {nome_arquivo}')
            continue
        shutil.copy2(arquivo_origem, pasta_destino / nome_arquivo)
        copiadas += 1
    return copiadas

print('Organizando dataset...')
# Keep the per-folder counts returned by copiar_imagens: listed files may be missing
# on disk, so the sampling targets are not a reliable summary of what was copied.
cardio_train_ok = copiar_imagens(cardio_train, 'train', 'cardiomegaly')
normal_train_ok = copiar_imagens(normal_train, 'train', 'normal')
cardio_val_ok = copiar_imagens(cardio_val, 'validation', 'cardiomegaly')
normal_val_ok = copiar_imagens(normal_val, 'validation', 'normal')

cardio_copiadas = cardio_train_ok + cardio_val_ok
normal_copiadas = normal_train_ok + normal_val_ok

# Do not throw away the extracted data when a class ended up empty: keeping
# TEMP_DIR avoids a full re-download while the problem is investigated.
if cardio_copiadas == 0 or normal_copiadas == 0:
    raise RuntimeError(
        'Nenhuma imagem foi copiada para ao menos uma das classes '
        f'(Cardiomegalia: {cardio_copiadas}, Normal: {normal_copiadas}). '
        f'O diretório {TEMP_DIR} foi mantido para diagnóstico.'
    )

shutil.rmtree(TEMP_DIR, ignore_errors=True)
print(f'Dataset pronto: {cardio_copiadas} Cardiomegalia, {normal_copiadas} Normal.')

In [None]:
from pathlib import Path

# Report how many directory entries ended up in each subset/class folder
# (a folder that does not exist is reported as 0).
DATA_DIR = Path('data')
for subset in ('train', 'validation'):
    for label in ('cardiomegaly', 'normal'):
        destino = DATA_DIR / subset / label
        if destino.exists():
            count = len(list(destino.glob('*')))
        else:
            count = 0
        print(f'{subset}/{label}: {count} imagens')

## Treinamento: ResNet50 (Transfer Learning) vs CNN Simples

In [None]:
import sys
from pathlib import Path

# The project sources are expected in ./src relative to the notebook's working directory.
PROJECT_ROOT = Path.cwd()
SRC_DIR = PROJECT_ROOT / 'src'

if SRC_DIR.exists():
    # Make the modules in src importable by name, without adding the entry twice on re-runs.
    if str(SRC_DIR) not in sys.path:
        sys.path.append(str(SRC_DIR))
else:
    raise FileNotFoundError(f'Diretório src não encontrado em {SRC_DIR}. Faça upload do código-fonte do projeto.')

import data_preprocessing
import model_resnet
import model_simple_cnn
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

In [None]:
from pathlib import Path

# Input data location and output folder for the saved models.
DATA_BASE = Path('data')
MODELS_DIR = Path('models')
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Train/validation generators come from the project's own preprocessing module.
train_gen_resnet, val_gen_resnet = data_preprocessing.configurar_geradores(
    diretorio_base=DATA_BASE, batch_size=32, target_size=(224, 224)
)

resnet_model = model_resnet.construir_modelo(learning_rate=1e-4)

# All three callbacks watch the validation loss.
resnet_callbacks = [
    # Stop after 10 epochs without improvement and roll back to the best weights.
    EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True,
    ),
    # Keep only the best-scoring model on disk.
    ModelCheckpoint(
        filepath=str(MODELS_DIR / 'best_model.h5'),
        monitor='val_loss',
        save_best_only=True,
    ),
    # Multiply the learning rate by 0.2 after 5 stagnant epochs, never going below 1e-7.
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.2,
        patience=5,
        min_lr=1e-7,
    ),
]

history_resnet = resnet_model.fit(
    train_gen_resnet,
    validation_data=val_gen_resnet,
    epochs=20,
    callbacks=resnet_callbacks,
    verbose=1,
)

In [None]:
# Fresh generators for the baseline, built with the same settings as the ResNet run.
train_gen_cnn, val_gen_cnn = data_preprocessing.configurar_geradores(
    diretorio_base=DATA_BASE, batch_size=32, target_size=(224, 224)
)

cnn_model = model_simple_cnn.construir_modelo(learning_rate=1e-3)

# All three callbacks watch the validation loss.
cnn_callbacks = [
    # Stop after 10 epochs without improvement and roll back to the best weights.
    EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True,
    ),
    # Keep only the best-scoring baseline on disk.
    ModelCheckpoint(
        filepath=str(MODELS_DIR / 'cnn_baseline_best.h5'),
        monitor='val_loss',
        save_best_only=True,
    ),
    # Multiply the learning rate by 0.2 after 5 stagnant epochs, never going below 1e-7.
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.2,
        patience=5,
        min_lr=1e-7,
    ),
]

history_cnn = cnn_model.fit(
    train_gen_cnn,
    validation_data=val_gen_cnn,
    epochs=20,
    callbacks=cnn_callbacks,
    verbose=1,
)

# Also persist the model as it stands once training has finished.
cnn_model.save(MODELS_DIR / 'cnn_baseline_final.h5')

In [None]:
import matplotlib.pyplot as plt

def plot_history(history, titulo):
    """Plot training and validation curves for loss and accuracy, one figure per metric.

    Args:
        history: Object exposing a ``history`` mapping from metric name to its
            per-epoch values (in this notebook, the value returned by ``fit``).
        titulo: Model name used as the prefix of each figure title.

    A metric that is absent from the mapping is reported and skipped instead of
    raising ``KeyError``; the validation curve is drawn only when the matching
    ``val_<metric>`` series exists.
    """
    metric_map = {'loss': 'Loss', 'accuracy': 'Acurácia'}
    registros = history.history
    for metric, label in metric_map.items():
        if metric not in registros:
            # The model may have been compiled without this metric.
            print(f"Métrica '{metric}' ausente no histórico de {titulo}; gráfico ignorado.")
            continue

        plt.figure(figsize=(7, 4))
        plt.plot(registros[metric], label='Treino')
        val_key = f'val_{metric}'
        if val_key in registros:
            plt.plot(registros[val_key], label='Validação')
        plt.title(f'{titulo} - {label}')
        plt.xlabel('Épocas')
        plt.ylabel(label)
        plt.legend()
        plt.grid(True, linestyle='--', alpha=0.3)
        plt.show()

# Draw the curves for both training runs so they can be compared.
plot_history(history_resnet, 'ResNet50')
plot_history(history_cnn, 'CNN Simples')

## Exportação dos Modelos

In [None]:
import shutil
from pathlib import Path

from google.colab import files

# Make sure the models folder exists so the archive step also works when this
# cell runs on its own.
MODELS_DIR = Path('models')
MODELS_DIR.mkdir(exist_ok=True)

# Zip the contents of the models folder into cardioia_models.zip in the working
# directory, then hand the file to the browser for download.
archive_path = shutil.make_archive(
    base_name='cardioia_models',
    format='zip',
    root_dir=MODELS_DIR,
)
print(f'Arquivo gerado: {archive_path}')
files.download(archive_path)