In [11]:
import os
import time
import logging
import hashlib
import pickle
import multiprocessing

import pandas as pd
import numpy as np
import torch
from PIL import Image
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from imblearn.over_sampling import RandomOverSampler

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

logging.basicConfig(level=logging.DEBUG,
                    format='(asctime)s - %(name)s - %(levelname)s - %(message)s',
                    handlers=[logging.StreamHandler()])

# Data paths and parameters
METADATA_PATH = '..\\Data\\train-metadata.csv'
IMAGE_DIR = '..\\Data\\lanczos_train_image\\image'
CACHE_DIR ='../dataset_cache'
BATCH_SIZE = 32
OVERSAMPLING_RATIO = 0.1 # .1 = 1:10, .2 = 1:5
    
# Due to different datatype, need to separate features for processing
FEATURES = ['age_approx', 'sex', 'anatom_site_general', 'clin_size_long_diam_mm', 'image_type', 'tbp_tile_type']
NUMERIC_FEATURES = ['age_approx', 'clin_size_long_diam_mm']
CATEGORICAL_FEATURES = ['sex', 'anatom_site_general', 'image_type', 'tbp_tile_type']

def load_and_preprocess_metadata(metadata_path):
    """Loads and preprocesses the metadata."""
    df = pd.read_csv(metadata_path, low_memory=False)

    if 'isic_id' not in df.columns:
        raise ValueError("CSV file does not contain 'isic_id' column")

    df['image_filename'] = df['isic_id'] + '.jpg'

    # Fill missing values
    df['age_approx'].fillna(df['age_approx'].mean())
    df['sex'].fillna('unknown')
    df['anatom_site_general'].fillna('unknown')

    return df

def prepare_features(df):
    """Prepares features for the model."""

    print("Before one-hot encoding, shape:", df.shape)
    print("Before one-hot encoding, columns:", df.columns)
    
    df[NUMERIC_FEATURES] = df[NUMERIC_FEATURES].fillna(df[NUMERIC_FEATURES].mean())

    # One-hot encode categorical features
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoded_features = encoder.fit_transform(df[CATEGORICAL_FEATURES])
    encoded_feature_names = encoder.get_feature_names_out(CATEGORICAL_FEATURES)

    # Combine numeric and encoded features
    all_feature_names = NUMERIC_FEATURES + list(encoded_feature_names)
    feature_array = np.hstack((df[NUMERIC_FEATURES].values, encoded_features))
    features_df = pd.DataFrame(feature_array, columns=all_feature_names)
    print("After one-hot encoding, shape:", features_df.shape)
    print("After one-hot encoding, columns:", features_df.columns)
    return features_df, all_feature_names

# Define the Dataset class
class SkinLesionDataset(Dataset):
    def __init__(self, image_dir, dataframe, features_df, features, transform=None, use_cache=True, cache_dir='./dataset_cache'):
        self.image_dir = image_dir
        self.dataframe = dataframe
        self.features_df = features_df
        self.features = features
        self.transform = transform
        self.use_cache = use_cache
        self.cache_dir = cache_dir

        if self.use_cache:
            os.makedirs(self.cache_dir, exist_ok =True)

        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.DEBUG)

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        # print(f"DataFrame length: {len(self.dataframe)}, Accessing index: {idx}")  # Debugging

        try:
            start_time = time.time()
            
            if self.use_cache:
                cache_path = self._get_cache_path(idx)
                if os.path.exists(cache_path):
                    # with multiprocessing.Pool() as pool:
                    #     item = pool.apply(self._load_from_cache, args=(cache_path,)) # load in parallel
                    item = self._load_from_cache(cache_path)
                    # self.logger.debug(f"Cache Load Time: {time.time() - start_time:.4f} seconds")
                    # print(f"Cache Load Time: {time.time() - start_time:.4f} seconds")
                    return item
                else: # Does not exist, load and save
                    load_start_time = time.time()
                    image, features, target = self._load_item(idx)
                    # print(f"Item Load Time: {time.time() - load_start_time:.4f} seconds")
                    # self.logger.debug(f"Item Load Time: {time.time() - load_start_time:.4f}: seconds")
                    
                    transform_start_time = time.time()
                    if target == 1:
                        image = self.positive_transforms(image)
                    else:
                        image = self.negative_transforms(image)
                    # print(f"Transform Time: {time.time - transform_start_time:.4f} seconds")
                    # self.logger.debug(f"Transform Time: {time.time() - start_time:.4f} seconds")
        
                    cache_start_time = time.time()
                    item = (image, features, target)
                    # with multiprocessing.Pool as pool:
                    #     pool.apply_async(self._save_to_cache, args=(cache_path, item))
                    self._save_to_cache(cache_path, item)
                    # print(f"Cache Save Time: {time.time() - cache_start_time:.4f} seconds")
                    # self.logger.debug(f"Cache Save Time: {time.time() - cache_start_time:4.f} seconds")
        
            else: # Caching disabled, just load and transform
                load_start_time = time.time()
                image, features, target = self._load_item(idx)
                # print(f"Item Load Time: {time.time() - load_start_time:.4f} seconds")
                # self.logger.debug(f"Item Load Time: {time.time() - load_start_time:.4f} seconds")
        
                transform_start_time = time.time()
                if target == 1:
                    image = self.positive_transforms(image)
                else:
                    image = self.negative_transforms(image)
                # print(f"Transform Time: {time.time() - transform_start_time:.4f} seconds")
                # self.logger.debug(f"Transform Time: {time.time() - transform_start_time} seconds")
        
            # print(f"Total GetItem Time: {time.time() - start_time:.4f} seconds")
            # self.logger.debug(f"Total GetItem Time: {time.time() - start_time:.4f} seconds")
            return image, features, target
        except Exception as e:
            print(f"Error getting item at index {idx}: {e}")
            set.logger.error(f"Error getting item at index {idx}: {e}")
            raise

    @staticmethod
    def positive_transforms(image):
        positive_transforms = transforms.Compose([
            transforms.RandomHorizontalFlip(),
            transforms.RandomVerticalFlip(),
            transforms.RandomRotation(30),
            transforms.ColorJitter(brightness=0.2,
                                   contrast=0.2, saturation=0.2, hue=0.1)
        ])
        return positive_transforms(image)
    @staticmethod
    def negative_transforms(image):
        negative_transforms = transforms.Compose([
            transforms.RandomHorizontalFlip(),
            transforms.ColorJitter(brightness=0.1, contrast=0.1),
        ])
        return negative_transforms(image)
        
    def _load_item(self, idx):
        try:
            load_start = time.time()
            img_name = os.path.join(self.image_dir, self.dataframe.iloc[idx]['image_filename'])
            image = Image.open(img_name)
            # print(f"Image Load Time: {time.time() - load_start}")
            if self.transform:
                transform_start = time.time()
                image = self.transform(image)
                # print(f"Transform Time: {time.time() - transform_start}")
    
            feature_time = time.time()
            features = torch.tensor(self.features_df.iloc[idx].values, dtype=torch.float)
            # print(f"Feature Time: {time.time() - feature_time}")
            target_time = time.time()
            target = torch.tensor(self.dataframe.iloc[idx]['target'], dtype=torch.long)
            # print(f"Target Time: {time.time() - target_time}")
            return image, features, target
            # self.logger.debug("Image Load Time: {time.time() - load_start}")

        except Exception as e:
            self.logger.error(f"Error loading item at index {idx}: {e}")
            raise
    
    def _get_cache_path(self, idx):
        filename = f"{idx}_{self.dataframe.iloc[idx]['image_filename']}"
        hashed_filename = hashlib.md5(filename.encode()).hexdigest()
        return os.path.join(self.cache_dir, f"{hashed_filename}.pkl")

    def _save_to_cache(self, cache_path, item):
        try:
            with open(cache_path, 'wb') as f:
                pickle.dump(item, f)
        except Exception as e:
            # self.logger.error(f"Error Saving item to disk at index {idx}: {e}")
            raise
            
    def _load_from_cache(self, cache_path):
        try:
            with open(cache_path, 'rb') as f:
                return pickle.load(f)
        except Exception as e:
            # self.logger.error(f"Error loading item from cache at index {idx}: {e}")
            raise
    def preload(self):
        for i in tqdm(range(len(self)), desc='Preloading Data'):
            _ = self[i]
        # print(f'Preloaded {len(self)} items into cache.')

if __name__ == "__main__":
    # Load and preprocess metadata
    df = load_and_preprocess_metadata(METADATA_PATH)

    # Split data into features and target
    features_data = df[FEATURES]
    target_labels = df['target']

    # Initial train-test split (before oversampling)
    # Create a test set that represents the true distribution
    features_train_val, features_test, target_train_val, target_test = train_test_split(
        features_data, target_labels, test_size=0.25, random_state=42, stratify=target_labels
    )

    # Split the remaining data into training and validation sets
    features_train, features_val, target_train, target_val = train_test_split(
        features_train_val, target_train_val, test_size=0.2, random_state=42, stratify=target_train_val
    )

    # Oversample the minority class in the training set only
    oversampler = RandomOverSampler(sampling_strategy=OVERSAMPLING_RATIO, random_state=42)
    features_train_resampled, target_train_resampled = oversampler.fit_resample(features_train, target_train)

    # Prepare features for each split
    train_features, all_features = prepare_features(features_train_resampled)
    val_features, _ = prepare_features(features_val)
    test_features, _ = prepare_features(features_test)

    # Image transformation
    transform = transforms.Compose([
        transforms.Resize((160, 160)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    # Reconstruct DataFrames for each set
    train_df = pd.concat([features_train_resampled, target_train_resampled, df['image_filename']], axis=1)
    val_df = pd.concat([features_val, target_val, df['image_filename']], axis=1)
    test_df = pd.concat([features_test, target_test, df['image_filename']], axis=1)

    # Create datasets
    train_dataset = SkinLesionDataset(IMAGE_DIR, train_df[:len(features_train_resampled)], train_features, all_features, transform=transform, use_cache=True, cache_dir='./train_cache')
    val_dataset = SkinLesionDataset(IMAGE_DIR, val_df[:len(features_val)], val_features, all_features, transform=transform, use_cache=True, cache_dir='/val_cache')
    test_dataset = SkinLesionDataset(IMAGE_DIR, test_df[:len(features_test)], test_features, all_features, transform=transform, use_cache=True, cache_dir='./test_cache')

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

    print('Data loaders and datasets created successfully!')

Before one-hot encoding, shape: (264438, 6)
Before one-hot encoding, columns: Index(['age_approx', 'sex', 'anatom_site_general', 'clin_size_long_diam_mm',
       'image_type', 'tbp_tile_type'],
      dtype='object')
After one-hot encoding, shape: (264438, 14)
After one-hot encoding, columns: Index(['age_approx', 'clin_size_long_diam_mm', 'sex_female', 'sex_male',
       'sex_nan', 'anatom_site_general_anterior torso',
       'anatom_site_general_head/neck', 'anatom_site_general_lower extremity',
       'anatom_site_general_posterior torso',
       'anatom_site_general_upper extremity', 'anatom_site_general_nan',
       'image_type_TBP tile: close-up', 'tbp_tile_type_3D: XP',
       'tbp_tile_type_3D: white'],
      dtype='object')
Before one-hot encoding, shape: (60159, 6)
Before one-hot encoding, columns: Index(['age_approx', 'sex', 'anatom_site_general', 'clin_size_long_diam_mm',
       'image_type', 'tbp_tile_type'],
      dtype='object')
After one-hot encoding, shape: (60159, 14)
A

In [2]:
len(all_features)

14

In [3]:
# Preload data
# print("Preloading training data...")
# train_dataset.preload()
# print("Preloading validation data...")
# val_dataset.preload()
# print("Preloading test data...")
# test_dataset.preload()

In [7]:
import torch
import torch.nn as nn
from torchvision.models import resnet50, ResNet50_Weights
import torch.optim as optim
from sklearn.metrics import f1_score
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn.metrics import confusion_matrix

class SkinLesionModel(nn.Module):
    def __init__(self, num_classes, num_features):
        super(SkinLesionModel, self).__init__()

        # Image feature extractor (pretrained resnet model)
        self.resnet = resnet50(weights=ResNet50_Weights.DEFAULT)
        self.resnet.fc = nn.Identity() # Remove the final fully connected layer
    
        # Freeze the parameters of the ResNet
        for param in self.resnet.parameters():
            param.requires_grad = True
    
        # Additional features processing
        self.feature_fc = nn.Sequential(
            nn.Linear(num_features, 64),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
    
        # Combine image features and additional features
        self.classifier = nn.Sequential(
            nn.Linear(2048 + 64, 512), # 2048 from ResNet50, 64 from additional features
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes)
        )
    def forward(self, image, features):
        # Process image through ResNet
        # Input shape: [batch_size, 3, 160, 160]
        # Output shape: [batch_size, 2048]
        image_features = self.resnet(image)

        # Process additional features
        # Input shape: [batch_size, num_features]
        # Output shape: [batch_size, 64]
        processed_features = self.feature_fc(features)

        # Combine features
        # Output shape: [batch_size, 2048 + 64]
        combined_features = torch.cat((image_features, processed_features), dim=1)

        # Final classification
        # Output shape: [batch_size, num_classes]
        output = self.classifier(combined_features)

        return output

def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10, device='cuda'):
    model.to(device)
    print(f'Sending model to device: {device}')
    for epoch in range(num_epochs):
        print(f'\nEpoch {epoch+1}/{num_epochs}')
        print('-' * 60)

        # Training phase
        model.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0
        train_positives = 0
        train_true_positives = 0
        train_false_positives = 0
        train_predictions = []
        train_targets = []

        # print("Checking Data Loader...")
        # for i, (images, features, targets) in enumerate(train_loader):
        #     if i == 0:
        #         print(f'First batch loaded. Shapes: images: {images.shape}, features {features.shape}, targets {targets.shape}')
        #         break
        # print("Data Loader Check Complete")
        
        print('Training:')
        progress_bar = tqdm(train_loader, desc='Training')
        for batch_idx, (images, features, targets) in enumerate(progress_bar):
            # print(f'Processing batch {batch_idx+1}/{len(train_loader)}')
            # print(f'Images shape: {images.shape}, Features shape: {features.shape}, Targets shape: {targets.shape}')
            images = images.to(device)
            features = features.to(device)
            targets = targets.to(device)

            optimizer.zero_grad()
            outputs = model(images, features)
            loss = criterion(outputs, targets.float().unsqueeze(1))
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            predictions = (torch.sigmoid(outputs) > 0.5).squeeze()
            train_correct += (predictions == targets).sum().item()
            train_total += targets.size(0)

            batch_positives = predictions.sum().item()
            train_positives += batch_positives
            train_true_positives += ((predictions == 1) & (targets == 1)).sum().item()
            train_false_positives += ((predictions == 1) & (targets == 0)).sum().item()

            progress_bar.set_postfix({
                'loss': f'{loss.item():.4f}',
                'acc': f'{train_correct/train_total:.4f}',
                'pos_pred': f'{batch_positives}/{targets.size(0)}'
            })

        train_loss /= len(train_loader)
        train_accuracy = train_correct / train_total
        train_precision = train_true_positives / (train_positives + 1e-8)
        train_recall = train_true_positives / (targets.sum().item() + 1e-8)
        train_f1 = 2 * (train_precision * train_recall) / (train_precision + train_recall + 1e-8)
            
        print('\nTraining Results:')
        print(f'Loss: {train_loss:.4f}, Accuracy: {train_accuracy}')
        print(f'Total Predictions: {train_total}, Positive Predictions: {train_positives}')
        print(f'True Positives: {train_true_positives}, False Positives: {train_false_positives}')
        print(f'Precision: {train_precision:.4f}, Recall: {train_recall:.4f}, F1: {train_f1:.4f}')
        


        # Validation
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        val_predictions = []
        val_targets = []

        with torch.no_grad():
            progress_bar = tqdm(val_loader, desc='Validation')
            for batch_idx, (images, features, targets) in enumerate(progress_bar):
                images = images.to(device)
                features = features.to(device)
                targets = targets.to(device)

                outputs = model(images, features)
                loss = criterion(outputs, targets.float().unsqueeze(1))

                val_loss += loss.item()
                predictions = (torch.sigmoid(outputs) > 0.5).squeeze()
                val_correct += (predictions == targets).sum().item()
                val_total += targets.size(0)

                val_predictions.extend(predictions.cpu().numpy())
                val_targets.extend(targets.cpu().numpy())
                
                progress_bar.set_postfix({
                    'loss': f'{loss.item():.4f}',
                    'acc': f'{val_correct/val_total:.4f}'
                })

        val_loss /= len(val_loader)
        val_accuracy = val_correct / val_total
        val_f1 = f1_score(val_targets, val_predictions)
        val_confusion_matrix = confusion_matrix(val_targets, val_predictions)

        print(f'\nValidation Results:')
        print(f'Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.4f}, F1: {val_f1:.4f}')
        print('Confusion Matrix:')
        print(val_confusion_matrix)
        print('-' * 60)

    print("Training Complete!")

In [8]:
import torch.utils.data as data
import numpy as np
# Hyperparameters and setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Setting up Hyperparameters\nDevice = {device}")
num_classes = 1
num_features = len(all_features)
batch_size = 32
num_epochs = 5
learning_rate = 0.001

# Create model, loss function, and optimizer
model = SkinLesionModel(num_classes, num_features)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

Setting up Hyperparameters
Device = cuda


In [7]:
# Train the model
train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, device)

Sending model to device: cuda

Epoch 1/5
------------------------------------------------------------
Training:


Training: 100%|█████████████████████████████| 7748/7748 [32:06<00:00,  4.02it/s, loss=0.0002, acc=0.9932, pos_pred=2/7]



Training Results:
Loss: 0.0295, Accuracy: 0.9931991722836018
Total Predictions: 247911, Positive Predictions: 21675
True Positives: 21263, False Positives: 412
Precision: 0.9810, Recall: 10631.4999, F1: 1.9618


Validation: 100%|█████████████████████████████████████████| 1292/1292 [05:11<00:00,  4.14it/s, loss=0.0001, acc=0.9968]



Validation Results:
Loss: 0.0147, Accuracy: 0.9968, F1: 0.9823
Confusion Matrix:
[[37557     5]
 [  126  3630]]
------------------------------------------------------------

Epoch 2/5
------------------------------------------------------------
Training:


Training: 100%|█████████████████████████████| 7748/7748 [32:20<00:00,  3.99it/s, loss=0.0006, acc=0.9958, pos_pred=1/7]



Training Results:
Loss: 0.0197, Accuracy: 0.9957847776016393
Total Predictions: 247911, Positive Predictions: 21954
True Positives: 21723, False Positives: 231
Precision: 0.9895, Recall: 21722.9998, F1: 1.9789


Validation: 100%|█████████████████████████████████████████| 1292/1292 [05:09<00:00,  4.17it/s, loss=0.0007, acc=0.9969]



Validation Results:
Loss: 0.0120, Accuracy: 0.9969, F1: 0.9827
Confusion Matrix:
[[37499    63]
 [   67  3689]]
------------------------------------------------------------

Epoch 3/5
------------------------------------------------------------
Training:


Training: 100%|█████████████████████████████| 7748/7748 [32:17<00:00,  4.00it/s, loss=0.0001, acc=0.9964, pos_pred=1/7]



Training Results:
Loss: 0.0179, Accuracy: 0.9963777323313608
Total Predictions: 247911, Positive Predictions: 21997
True Positives: 21818, False Positives: 179
Precision: 0.9919, Recall: 21817.9998, F1: 1.9836


Validation: 100%|█████████████████████████████████████████| 1292/1292 [05:12<00:00,  4.14it/s, loss=0.0001, acc=0.9973]



Validation Results:
Loss: 0.0171, Accuracy: 0.9973, F1: 0.9848
Confusion Matrix:
[[37550    12]
 [  101  3655]]
------------------------------------------------------------

Epoch 4/5
------------------------------------------------------------
Training:


Training: 100%|█████████████████████████████| 7748/7748 [32:19<00:00,  3.99it/s, loss=0.0000, acc=0.9969, pos_pred=1/7]



Training Results:
Loss: 0.0153, Accuracy: 0.9969464848272163
Total Predictions: 247911, Positive Predictions: 22064
True Positives: 21922, False Positives: 142
Precision: 0.9936, Recall: 21921.9998, F1: 1.9870


Validation: 100%|█████████████████████████████████████████| 1292/1292 [05:15<00:00,  4.10it/s, loss=0.0001, acc=0.9963]



Validation Results:
Loss: 0.0152, Accuracy: 0.9963, F1: 0.9798
Confusion Matrix:
[[37474    88]
 [   64  3692]]
------------------------------------------------------------

Epoch 5/5
------------------------------------------------------------
Training:


Training: 100%|█████████████████████████████| 7748/7748 [32:50<00:00,  3.93it/s, loss=0.0012, acc=0.9964, pos_pred=0/7]



Training Results:
Loss: 0.0186, Accuracy: 0.9964019345652271
Total Predictions: 247911, Positive Predictions: 22015
True Positives: 21830, False Positives: 185
Precision: 0.9916, Recall: 2183000000000.0000, F1: 1.9832


Validation: 100%|█████████████████████████████████████████| 1292/1292 [05:16<00:00,  4.09it/s, loss=0.0003, acc=0.9978]


Validation Results:
Loss: 0.0093, Accuracy: 0.9978, F1: 0.9879
Confusion Matrix:
[[37554     8]
 [   82  3674]]
------------------------------------------------------------
Training Complete!





In [10]:
# Saving the Weights (Only run if training was run)
MODEL_SAVE_PATH = '..\\Model_Weights\\Attempt_1\\evaluation.pth'

torch.save(model.state_dict(), MODEL_SAVE_PATH)

print(f'Model saved to: {MODEL_SAVE_PATH}')

Model saved to: ..\Model_Weights\Attempt_1\evaluation.pth


In [9]:
import torch

MODEL_SAVE_PATH = '..\\Model_Weights\\Attempt_1\\evaluation.pth'

# Hyperparameters and setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {device}')

model.load_state_dict(torch.load(MODEL_SAVE_PATH))
model.to(device)
print(f'Model loaded from {MODEL_SAVE_PATH}')

Device: cuda
Model loaded from ..\Model_Weights\Attempt_1\evaluation.pth


In [10]:
# Testing the model on the test dataset
model.eval()

all_predictions = []
all_targets = []

with torch.no_grad():
    for images, features, targets in tqdm(test_loader, desc="Evaluating"):
        images = images.to(device)
        features = features.to(device)
        outputs = model(images, features)
        probabilities = torch.sigmoid(outputs)
        predictions = (probabilities > 0.5).int()

        all_predictions.extend(predictions.cpu().numpy())
        all_targets.extend(targets.cpu().numpy())

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

accuracy = accuracy_score(all_targets, all_predictions)
precision = precision_score(all_targets, all_predictions)
recall = recall_score(all_targets, all_predictions)
f1 = f1_score(all_targets, all_predictions)
conf_matrix = confusion_matrix(all_targets, all_predictions)

print(f'Test Accuracy: {accuracy:.4f}')
print(f'Test Precision: {precision:.4f}')
print(f'Test Recall: {recall:.4f}')
print(f'Test F1 Score: {f1:.4f}')

print('Confusion Matrix:')
print(conf_matrix)

Evaluating: 100%|██████████████████████████████████████████████████████████████████| 3134/3134 [13:33<00:00,  3.85it/s]


Test Accuracy: 0.9995
Test Precision: 0.6667
Test Recall: 0.9592
Test F1 Score: 0.7866
Confusion Matrix:
[[100120     47]
 [     4     94]]


In [None]:
# Pickle Save
# Saving the Encoder for later use
import pandas as pd

CATEGORICAL_FEATURES = ['sex', 'anatom_site_general', 'image_type', 'tbp_tile_type']
TRAIN_METADATA_PATH = 'train-metadata.csv'

train_df_categorical = pd.read_csv(TRAIN_METADATA_PATH, usecols=CATEGORICAL_FEATURES)

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(train_df_categorical)

import pickle

with open('encoder.pkl', 'wb') as f:
    pickle.dump(encoder, f)

print('pickle dumped')

In [12]:
# Print total positive samples in the original dataset
print("Total positive samples in the original dataset:")
print(target_labels.value_counts())

# Print positive samples in the initial train and test splits
print("\nPositive samples in the initial training set (before oversampling):")
print(target_train.value_counts())
print("\nPositive samples in the initial test set:")
print(target_temp.value_counts())

# Print positive samples after oversampling the training set
print("\nPositive samples in the training set after oversampling:")
print(target_train_resampled.value_counts())


# Print positive samples in each split after the final stratified split
print("\nPositive samples in the final training set:")
print(target_train.value_counts())
print("\nPositive samples in the final validation set:")
print(target_val.value_counts())
print("\nPositive samples in the final test set:")
print(target_test.value_counts())

Total positive samples in the original dataset:
target
0    400666
1       393
Name: count, dtype: int64

Positive samples in the initial training set (before oversampling):
target
0    240399
1       236
Name: count, dtype: int64

Positive samples in the initial test set:
target
0    75125
1     7512
Name: count, dtype: int64

Positive samples in the training set after oversampling:
target
0    240399
1     24039
Name: count, dtype: int64

Positive samples in the final training set:
target
0    240399
1       236
Name: count, dtype: int64

Positive samples in the final validation set:
target
0    60100
1       59
Name: count, dtype: int64

Positive samples in the final test set:
target
0    100167
1        98
Name: count, dtype: int64
