In [None]:
#RSNA DATASET

# Install required packages (if not preinstalled)
!pip install -q pydicom opencv-python-headless

import os
import cv2
import torch
import numpy as np
import pydicom
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from torchvision.models import vit_b_16
from sklearn.metrics import classification_report, confusion_matrix

# ✅ Pick the compute device: CUDA when torch reports it available, else CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# ✅ Paths — every dataset location is a direct child of base_path
base_path = '/kaggle/input/rsna-pneumonia-detection-challenge'
(
    train_img_path,
    test_img_path,
    labels_csv_path,
    sample_submission_path,
) = (
    os.path.join(base_path, leaf)
    for leaf in (
        'stage_2_train_images',
        'stage_2_test_images',
        'stage_2_train_labels.csv',
        'stage_2_sample_submission.csv',
    )
)

# ✅ Load train labels
def load_labels(path):
    """Build a ``patientId -> binary label`` lookup from a labels CSV.

    The CSV may hold several rows for the same patient. A patient is
    labelled 1 when at least one of its rows has ``Target == 1`` and 0
    otherwise, so the result does not depend on row order.

    Args:
        path: Anything ``pandas.read_csv`` accepts (a file path or a
            file-like object). The CSV must provide ``patientId`` and
            ``Target`` columns.

    Returns:
        dict mapping each patient id to the Python int 0 or 1, keyed in
        order of first appearance in the CSV.
    """
    df = pd.read_csv(path)
    # Vectorised replacement for a per-row ``iterrows`` loop: flag positive
    # rows, then collapse them per patient.
    is_positive = df['Target'].eq(1)
    per_patient = is_positive.groupby(df['patientId'], sort=False).any()
    # ``tolist`` converts NumPy scalars to plain Python ints / strings.
    return dict(zip(per_patient.index.tolist(), per_patient.astype(int).tolist()))

# Build the patientId -> label lookup once from the training-labels CSV;
# it is handed to the training dataset further down.
labels_dict = load_labels(labels_csv_path)

# ✅ Image preprocessing (contrast scaling -> hist eq -> CLAHE -> denoise)
def preprocess_image(image):
    """Run the fixed enhancement pipeline on one image and return the result.

    Steps, applied in this order:
      1. ``cv2.convertScaleAbs`` with gain 1.5 and no offset.
      2. Global histogram equalisation.
      3. CLAHE with clip limit 2.0.
      4. Non-local-means denoising with ``h=10``.

    Args:
        image: Input array; the caller in this file passes a single-channel
            uint8 array (assumed — the function itself does not check).

    Returns:
        The processed image as returned by ``cv2.fastNlMeansDenoising``.
    """
    clahe = cv2.createCLAHE(clipLimit=2.0)

    boosted = cv2.convertScaleAbs(image, alpha=1.5, beta=0)
    equalized = cv2.equalizeHist(boosted)
    locally_equalized = clahe.apply(equalized)
    return cv2.fastNlMeansDenoising(locally_equalized, h=10)

# ✅ Dataset for train/test
class RSNADataset(Dataset):
    """Dataset over a directory of ``.dcm`` files, yielding 224x224 RGB images.

    Each item is read with pydicom, converted to 8-bit, passed through
    ``preprocess_image``, resized to 224x224 and expanded to three channels.

    Args:
        img_dir: Directory that is scanned (non-recursively) for ``.dcm`` files.
        labels_dict: Optional mapping ``patient id -> label``. When it is not
            ``None`` (an empty dict included) items are ``(image, label)``,
            with label 0 for ids missing from the mapping. When it is
            ``None`` items are ``(image, patient_id)`` — the test-set mode.
        transform: Optional callable applied to the RGB array.
        limit: Optional cap; when truthy only the first ``limit`` file names
            (in sorted order) are kept.
    """

    _SUFFIX = ".dcm"

    def __init__(self, img_dir, labels_dict=None, transform=None, limit=None):
        self.img_dir = img_dir
        # Sorted so that ``limit`` always selects the same files.
        self.image_names = sorted(
            f for f in os.listdir(img_dir) if f.endswith(self._SUFFIX)
        )
        if limit:
            self.image_names = self.image_names[:limit]
        self.labels_dict = labels_dict
        self.transform = transform

    def __len__(self):
        """Return the number of DICOM files served by this dataset."""
        return len(self.image_names)

    @staticmethod
    def _to_uint8(pixels):
        """Convert a pixel array to uint8 without wrap-around.

        uint8 input is copied unchanged. Any other dtype is min-max scaled to
        0..255, because a bare ``astype(np.uint8)`` would wrap values above
        255 modulo 256. A constant image maps to all zeros.
        """
        if pixels.dtype == np.uint8:
            return pixels.astype(np.uint8)
        scaled = pixels.astype(np.float32)
        lo, hi = scaled.min(), scaled.max()
        if hi <= lo:
            return np.zeros(scaled.shape, dtype=np.uint8)
        return ((scaled - lo) / (hi - lo) * 255.0).round().astype(np.uint8)

    def __getitem__(self, idx):
        """Load, preprocess and return the item at position ``idx``."""
        fname = self.image_names[idx]
        # Strip only the trailing extension; every name ends with it because
        # of the filter in __init__.
        patient_id = fname[: -len(self._SUFFIX)]

        dcm = pydicom.dcmread(os.path.join(self.img_dir, fname))
        image = self._to_uint8(dcm.pixel_array)
        image = preprocess_image(image)
        image = cv2.resize(image, (224, 224))
        image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)

        if self.transform is not None:
            image = self.transform(image)

        # ``is not None`` rather than truthiness: an empty labels mapping must
        # still produce (image, label) pairs, not test-style (image, id) pairs.
        if self.labels_dict is not None:
            return image, self.labels_dict.get(patient_id, 0)
        return image, patient_id  # for test set

# ✅ Transforms
# ToTensor first, then per-channel normalisation with mean 0.5 and std 0.5
# on each of the three channels.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ]
)

# ✅ Datasets and Loaders
# Training set: labelled, capped at the first 3000 files.
train_dataset = RSNADataset(
    img_dir=train_img_path,
    labels_dict=labels_dict,
    transform=transform,
    limit=3000,
)
# Test set: no labels (items carry the patient id instead), first 1000 files.
test_dataset = RSNADataset(
    img_dir=test_img_path,
    transform=transform,
    limit=1000,
)

# Only the training loader shuffles; the test loader keeps file order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, num_workers=2, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, num_workers=2, shuffle=False)

# ✅ Model
# ViT-B/16 initialised from the ImageNet-1k checkpoint. `pretrained=True` is
# deprecated since torchvision 0.13; `weights="IMAGENET1K_V1"` selects the
# same checkpoint through the current API.
model = vit_b_16(weights="IMAGENET1K_V1")
# Swap the classification head for a freshly initialised 2-class layer.
model.heads.head = torch.nn.Linear(model.heads.head.in_features, 2)
model = model.to(device)

# ✅ Loss & Optimizer
# All model parameters are optimised (nothing is frozen).
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

# ✅ Training
# Runs `epochs` passes over train_loader, showing the running mean loss and
# accuracy for the current epoch on the progress bar.
epochs = 3
for epoch in range(epochs):
    model.train()
    # Per-epoch accumulators: summed per-sample loss, correct predictions,
    # and samples seen so far.
    running_loss, correct, total = 0.0, 0, 0
    loop = tqdm(train_loader, desc=f"Epoch [{epoch+1}/{epochs}]")

    for images, labels in loop:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        _, preds = torch.max(outputs, 1)
        # criterion returns the batch MEAN; weight it by the batch size so
        # that dividing by `total` below gives a true per-sample average
        # (previously the displayed loss was too small by ~batch_size).
        running_loss += loss.item() * labels.size(0)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

        loop.set_postfix(loss=running_loss/total, acc=100.*correct/total)

# ✅ Evaluate on train data
# Collect the true and predicted class for every training sample with
# gradients disabled, then print sklearn's per-class report.
model.eval()
y_true = []
y_pred = []

with torch.no_grad():
    for images, labels in tqdm(train_loader, desc="Evaluating"):
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        # Index of the largest logit per row is the predicted class.
        preds = torch.max(outputs, 1)[1]
        y_true.extend(list(labels.cpu().numpy()))
        y_pred.extend(list(preds.cpu().numpy()))

print("Classification Report:\n", classification_report(y_true, y_pred))

# ✅ Predict on test set and create submission
# For every test image, store the softmax probability of class 1 next to its
# patient id, then write both columns to submission.csv.
model.eval()
ids = []
predictions = []

with torch.no_grad():
    for images, patient_ids in tqdm(test_loader, desc="Predicting on Test Set"):
        images = images.to(device)
        outputs = model(images)
        probs = torch.softmax(outputs, dim=1)
        # Column 1 holds the probability assigned to class 1.
        preds = probs[:, 1].cpu().numpy()
        ids.extend(patient_ids)
        predictions.extend(preds)

submission_columns = {'patientId': ids, 'Target': predictions}
submission_df = pd.DataFrame(submission_columns)
submission_df.to_csv("submission.csv", index=False)
print("✅ submission.csv created.")
