In [None]:
import pandas as pd
import os
import shutil
from sklearn.model_selection import train_test_split

# Inputs: the metadata CSV and the folder that holds the source images
csv_file_path = 'skin_cancer/dataset/train-metadata.csv'
image_dir = 'skin_cancer/dataset/train-image/image/'

# Outputs: one image folder per split
train_dir = 'skin_cancer/dataset/train/'
val_dir = 'skin_cancer/dataset/val/'

# Make sure both split folders are present (a no-op when they already exist)
for split_dir in (train_dir, val_dir):
    os.makedirs(split_dir, exist_ok=True)

In [None]:
# Root folder of the dataset; the split cell writes train.csv and val.csv here
base_dir = 'skin_cancer/dataset/'

In [None]:
# Preview the first rows of the validation split.
# NOTE: val_df is created by the train_test_split cell further down, so on a
# fresh kernel that cell has to be executed before this one.
val_df.head()

In [None]:
# Add the image file name (ISIC id + '.jpg') as its own column.
# NOTE: train_df is created by the train_test_split cell further down, so on a
# fresh kernel that cell has to be executed before this one.
# assign() builds a new frame instead of writing a column into the frame
# returned by train_test_split, which can raise pandas' SettingWithCopyWarning.
train_df = train_df.assign(image_name=train_df['isic_id'] + '.jpg')

In [None]:
# Load the metadata table
data = pd.read_csv(csv_file_path)

# Split the data into a 60% train and 40% validation set.
# When the 'target' label column is present, stratify on it so both splits keep
# the same class ratio; with a rare class, a purely random split can leave one
# side with too few examples of it. random_state keeps the split reproducible.
# NOTE(review): stratifying changes which rows land in each split compared with
# earlier unstratified runs; regenerate any artefacts built from the old split.
stratify_labels = data['target'] if 'target' in data.columns else None
train_df, val_df = train_test_split(
    data,
    test_size=0.4,
    random_state=42,
    stratify=stratify_labels,
)

# Save the new datasets to CSV files next to the image folders
# (base_dir is defined in the earlier configuration cell)
train_csv_path = os.path.join(base_dir, 'train.csv')
val_csv_path = os.path.join(base_dir, 'val.csv')
train_df.to_csv(train_csv_path, index=False)
val_df.to_csv(val_csv_path, index=False)

In [None]:




# Function to copy images into a split folder
def move_images(df, source_dir, target_dir):
    """Copy the image of every row in ``df`` from ``source_dir`` to ``target_dir``.

    Despite its name, this function copies the files (``shutil.copy``); the
    originals stay in ``source_dir``.

    Args:
        df: Table with an ``isic_id`` column. Each id followed by ``.jpg`` is
            used as the image file name.
        source_dir: Folder that contains the source images.
        target_dir: Folder that receives the copies. It is created when it does
            not exist yet.

    Returns:
        None. Ids whose image is missing from ``source_dir`` are reported with a
        printed warning and skipped.
    """
    # shutil.copy fails if the destination folder is missing, so create it up front
    os.makedirs(target_dir, exist_ok=True)

    for isic_id in df['isic_id']:
        # f-string formatting also tolerates non-string ids (e.g. NaN from an empty cell)
        image_name = f"{isic_id}.jpg"
        source_path = os.path.join(source_dir, image_name)
        target_path = os.path.join(target_dir, image_name)
        if os.path.exists(source_path):
            shutil.copy(source_path, target_path)
        else:
            print(f"Warning: {source_path} does not exist and cannot be copied.")

# Copy every split's images into that split's own folder (train first, then val)
for split_df, split_dir in ((train_df, train_dir), (val_df, val_dir)):
    move_images(split_df, image_dir, split_dir)

# Report where the results ended up
print(f"Train dataset and images saved to {train_dir}")
print(f"Validation dataset and images saved to {val_dir}")


In [None]:
import pandas as pd

# Metadata CSV whose label balance is inspected.
# NOTE: this rebinds csv_file_path and data, which the earlier split cells also
# use with a different path; re-run the configuration cell before re-running them.
csv_file_path = 'dataset/dump/train-metadata.csv'

# Read the table
data = pd.read_csv(csv_file_path)

# Count all rows and the rows of each class in the 'target' column
target_values = data['target']
count_total = len(target_values)
count_zeros = target_values.eq(0).sum()
count_ones = target_values.eq(1).sum()

# Express each class count as a percentage of all rows
percentage_zeros = count_zeros / count_total * 100
percentage_ones = count_ones / count_total * 100

# Show the class shares with two decimals
print(f"Percentage of 0s: {percentage_zeros:.2f}%")
print(f"Percentage of 1s: {percentage_ones:.2f}%")


In [None]:
import torch
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import pandas as pd
import h5py
import numpy as np
import io  # Ensure io is imported for handling byte streams
import logging, os
import torch.nn as nn
import torch.nn.functional as F

# Dataset wrapper that serves encoded images stored in an HDF5 file
class HDF5Dataset(Dataset):
    """Dataset over the top-level entries of an HDF5 file.

    Every entry is read as encoded image bytes, decoded with PIL into an RGB
    image and returned together with the entry's key.
    """

    def __init__(self, hdf5_file, transform=None):
        """
        Args:
            hdf5_file (string): Path to the HDF5 file with images.
            transform (callable, optional): Applied to each decoded image
                before it is returned.
        """
        self.hdf5_file = hdf5_file
        self.transform = transform
        # The file is opened read-only here and stays open until close() is called
        self.file = h5py.File(hdf5_file, 'r')
        # Snapshot of the top-level keys; their order defines the sample index
        self.keys = [*self.file.keys()]

    def __len__(self):
        """Return the number of top-level entries in the file."""
        return len(self.keys)

    def __getitem__(self, idx):
        """Return ``(image, key)`` for the entry at position ``idx``."""
        key = self.keys[idx]
        raw_bytes = self.file[key][()]
        # Decode through PIL so torchvision transforms receive a PIL image
        picture = Image.open(io.BytesIO(raw_bytes)).convert('RGB')
        if not self.transform:
            return picture, key
        return self.transform(picture), key

    def close(self):
        """Close the underlying HDF5 file if there is a usable handle."""
        handle = self.file
        if handle:
            handle.close()

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Preprocessing applied to every image before it reaches the model
transform = transforms.Compose([
    transforms.Resize((64, 64)),  # Resize to 64x64 — NOTE(review): presumably the resolution the checkpoint was trained at; confirm
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # ImageNet channel means / stds
])

# Create the dataset over the test images stored in the HDF5 file
hdf5_file = 'dataset/dump/test-image.hdf5'
dataset = HDF5Dataset(hdf5_file=hdf5_file, transform=transform)

# Create the DataLoader; shuffle=False so batches follow the dataset's own order
test_loader = DataLoader(dataset, batch_size=64, shuffle=False)



# Model setup: ResNet-101 created without pretrained weights; the trained
# weights are expected to come from the checkpoint loaded below.
model = models.resnet101()
num_ftrs = model.fc.in_features

# Replace the final fully connected layer with a 2-output classifier head
model.fc = nn.Linear(num_ftrs, 2)  # Assuming 2 classes (benign and malignant)
model = model.to(device)


# mlp = MLP(input_size=3, hidden_size=32, output_size=16)
# model = CombinedModel(mlp=mlp, n_classes=2, train_resnet=params['train_resnet']).to(device)


# Load existing model if available
model_saved_path = "checkpoint/07 July 14:01-partial_auc_resnet101.pt"
if os.path.exists(model_saved_path):
    # map_location remaps tensors that were saved on another device (e.g. a
    # checkpoint written on a GPU and opened on a CPU-only machine) onto the
    # device selected for inference, instead of failing while deserialising.
    model.load_state_dict(torch.load(model_saved_path, map_location=device))
    logging.info(f'Model loaded from {model_saved_path}')
else:
    # Without the checkpoint the network keeps its random initialisation, so
    # make that visible instead of silently producing meaningless predictions.
    logging.warning(
        f'Checkpoint {model_saved_path} not found; '
        'predictions will come from randomly initialised weights'
    )

# Switch to evaluation mode before inference
model.eval()

# Perform inference
predictions = []
image_ids = []

try:
    # Gradients are not needed for prediction
    with torch.no_grad():
        # ids holds the HDF5 keys the dataset returns alongside each image
        for images, ids in test_loader:
            images = images.to(device)
            outputs = model(images)
            probabilities = F.softmax(outputs, dim=1)
            # Extract the probability of class 1 for each image
            class_one_prob = probabilities[:, 1].cpu().numpy()
            predictions.extend(class_one_prob)
            image_ids.extend(ids)
finally:
    # Cleanup dataset: close the HDF5 file even when inference fails, so the
    # handle does not stay open in the kernel
    dataset.close()

# One row per image: its id and the predicted probability of class 1
df = pd.DataFrame({
    'isic_id': image_ids,
    'target': predictions
})
df.to_csv('sample_submission.csv', index=False)
print("Predictions saved to sample_submission.csv")


In [None]:
# Show the collected class-1 probabilities
print(predictions)