<a href="https://colab.research.google.com/github/LifeLiveOn/Pytorch-jupiterNb/blob/main/Melanoma_CNNs_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
import pandas as pd
from PIL import Image
import os

# Define the dataset class
class MelanomaDataset(Dataset):
    """Dataset pairing each lesion image with its tabular metadata and label.

    The CSV is interpreted by column position: column 0 is the image name
    (without the ``.jpg`` extension), the last column is the label, and every
    column in between is read as a numeric metadata feature.

    Args:
        csv_file: Path of the CSV file describing the samples.
        img_dir: Directory containing the ``<image name>.jpg`` files.
        transform: Optional callable applied to the loaded PIL image.
    """

    def __init__(self, csv_file, img_dir, transform=None):
        self.data = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, metadata, label)`` for the row at position ``idx``.

        Returns:
            image: The RGB image, passed through ``transform`` when one is set.
            metadata: 1-D ``float32`` tensor built from the middle CSV columns.
            label: The raw value stored in the last CSV column.
        """
        img_name = os.path.join(self.img_dir, self.data.iloc[idx, 0] + '.jpg')

        # Force three channels: a grayscale, palette or RGBA file would otherwise
        # reach the 3-channel Normalize step with the wrong channel count.
        # convert() returns a new, fully loaded image, so the file can be closed.
        with Image.open(img_name) as raw_image:
            image = raw_image.convert('RGB')

        # Extract metadata features (everything between the name and the label)
        metadata = self.data.iloc[idx, 1:-1].values.astype(float)

        label = self.data.iloc[idx, -1]

        if self.transform:
            image = self.transform(image)

        return image, torch.as_tensor(metadata, dtype=torch.float32), label

# Define the CNN model
class MelanomaCNN(nn.Module):
    """Two-branch classifier that fuses image features with tabular metadata.

    The image branch is an EfficientNet-B0 whose own classifier is replaced by
    an identity layer; the fusion head below assumes it then emits 1280
    features per image. The metadata branch is a single linear layer with a
    ReLU. Both embeddings are concatenated and mapped to one output value per
    sample (no sigmoid is applied inside the model).

    Args:
        num_metadata_features: Width of the metadata vector given to ``forward``.
    """

    def __init__(self, num_metadata_features):
        super().__init__()

        # NOTE(review): newer torchvision releases prefer the `weights=` argument
        # over `pretrained=True` — confirm against the installed version.
        backbone = models.efficientnet_b0(pretrained=True)
        backbone.classifier = nn.Identity()
        self.efficientnet = backbone

        self.metadata_fc = nn.Linear(num_metadata_features, 64)

        # 1280 image features + 64 metadata features -> 512 -> 1
        head_layers = [
            nn.Linear(1280 + 64, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 1),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, img, metadata):
        """Return the head's output for a batch of images and metadata rows."""
        image_embedding = self.efficientnet(img)
        metadata_embedding = self.metadata_fc(metadata).relu()
        fused = torch.cat([image_embedding, metadata_embedding], dim=1)
        return self.classifier(fused)

# Tunable settings for the preprocessing and optimisation set-up below.
IMAGE_SIZE = (224, 224)
# Per-channel statistics used for normalisation
# (presumably the ImageNet values matching the pretrained backbone — TODO confirm).
CHANNEL_MEAN = [0.485, 0.456, 0.406]
CHANNEL_STD = [0.229, 0.224, 0.225]
BATCH_SIZE = 32
LEARNING_RATE = 0.001

# Preprocessing applied to every image: resize, convert to tensor, normalise.
preprocessing_steps = [
    transforms.Resize(IMAGE_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=CHANNEL_MEAN, std=CHANNEL_STD),
]
transform = transforms.Compose(preprocessing_steps)

# Training data is reshuffled every epoch; validation data keeps its CSV order.
train_dataset = MelanomaDataset(csv_file='train.csv', img_dir='train_images', transform=transform)
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = MelanomaDataset(csv_file='val.csv', img_dir='val_images', transform=transform)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE)

# The metadata width is read off the first training sample (this loads one image).
num_metadata_features = len(train_dataset[0][1])
model = MelanomaCNN(num_metadata_features=num_metadata_features)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

# Training loop
num_epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    # ---- Training pass ----
    model.train()
    train_loss = 0.0
    for images, metadata, labels in train_loader:
        images = images.to(device)
        metadata = metadata.to(device)
        # The loss needs float targets shaped like the model output, hence
        # the cast and the extra trailing dimension.
        labels = labels.to(device).float().unsqueeze(1)

        optimizer.zero_grad()
        outputs = model(images, metadata)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Accumulate so the epoch summary reports the mean over all batches
        # rather than whatever the last batch happened to produce.
        train_loss += loss.item()

    # ---- Validation pass ----
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, metadata, labels in val_loader:
            images = images.to(device)
            metadata = metadata.to(device)
            labels = labels.to(device).float().unsqueeze(1)

            outputs = model(images, metadata)
            val_loss += criterion(outputs, labels).item()

            # `outputs` are raw logits (the criterion is BCEWithLogitsLoss), so
            # they must go through a sigmoid before the 0.5 probability cut-off.
            predicted = (torch.sigmoid(outputs) > 0.5).float()
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # max(..., 1) keeps the summary printable when a loader yields no batches.
    mean_train_loss = train_loss / max(len(train_loader), 1)
    mean_val_loss = val_loss / max(len(val_loader), 1)
    val_accuracy = 100 * correct / max(total, 1)

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {mean_train_loss:.4f}, Val Loss: {mean_val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%")

# Save the model
torch.save(model.state_dict(), 'melanoma_model.pth')

# Function for inference
def predict(model, image_path, metadata):
    """Return the model's predicted probability for one image.

    Relies on the module-level ``transform`` and ``device``.

    Args:
        model: Network called as ``model(image_batch, metadata_batch)`` that
            returns a single logit for a batch of one.
        image_path: Path of the image file to score.
        metadata: Sequence of numeric metadata features for that image.

    Returns:
        The sigmoid of the model's output logit, as a Python float.
    """
    model.eval()

    # Force three channels so grayscale/RGBA/palette files still match the
    # 3-channel normalisation; convert() returns a loaded copy, so the file
    # handle can be closed straight away.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert('RGB')

    # Add the batch dimension the model expects.
    image = transform(image).unsqueeze(0).to(device)
    metadata = torch.as_tensor(metadata, dtype=torch.float32).unsqueeze(0).to(device)

    with torch.no_grad():
        output = model(image, metadata)
        probability = torch.sigmoid(output).item()

    return probability

# Example usage
# map_location remaps the saved tensors onto the current device, so a checkpoint
# written on a GPU machine can still be loaded on a CPU-only one.
model.load_state_dict(torch.load('melanoma_model.pth', map_location=device))
# The metadata list must have as many entries as the model's metadata input width.
probability = predict(model, 'test_image.jpg', [0.5, 0.3, 0.2])  # Replace with actual metadata values
print(f"Probability of melanoma: {probability:.4f}")

FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

This is my implementation so far

In [None]:
import numpy as np
from PIL import Image
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import pandas as pd
import os
import torch
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
import cv2
from tqdm import tqdm


In [None]:
# Sampling settings: how many rows to draw per class, and the seed used for
# every random draw in this cell.
N_NEGATIVE_SAMPLES = 5000
N_POSITIVE_SAMPLES = 500
SAMPLING_SEED = 42

# Load the data
train_data = pd.read_csv('train.csv')

# Path to the image folder
ImgPath = 'jpeg/train'

# Step 1: Split the rows by class
data_target_1 = train_data[train_data['target'] == 1]  # rows where target is 1
data_target_0 = train_data[train_data['target'] == 0]  # rows where target is 0

# Step 2: Draw a fixed-size random sample from each class
sampled_target_0 = data_target_0.sample(n=N_NEGATIVE_SAMPLES, random_state=SAMPLING_SEED)
sampled_target_1 = data_target_1.sample(n=N_POSITIVE_SAMPLES, random_state=SAMPLING_SEED)

# Steps 3-4: Stack both samples, then shuffle the rows and renumber the index
combined_data = (
    pd.concat([sampled_target_0, sampled_target_1])
    .sample(frac=1, random_state=SAMPLING_SEED)
    .reset_index(drop=True)
)

# Step 5: Preprocess the combined data
# Encode sex numerically; any value other than 'male'/'female' becomes NaN
combined_data['sex'] = combined_data['sex'].map({'male': 0, 'female': 1})

# One indicator column per anatomical site
site_dummies = pd.get_dummies(combined_data['anatom_site_general_challenge'], prefix='site')

# Append the indicator columns, then drop the original categorical columns
columns_to_drop = ['anatom_site_general_challenge', 'diagnosis', 'benign_malignant']
combined_data_processed = (
    pd.concat([combined_data, site_dummies], axis=1)
    .drop(columns=columns_to_drop)
)

# Step 6: Select features for modeling (image name first, then the numeric features)
feature_columns = ['image_name', 'age_approx', 'sex', *site_dummies.columns]
train_data_for_modeling = combined_data_processed[feature_columns]



In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

# Define a fixed size to resize all images to
image_size = (224, 224)

# Per-channel statistics used by the final normalisation step.
normalize_mean = [0.485, 0.456, 0.406]
normalize_std = [0.229, 0.224, 0.225]

# Image pipeline: resize to `image_size`, apply random flips and a random
# rotation of up to 15 degrees, convert to a tensor, then normalise.
# NOTE(review): the random steps make every pass over the images produce
# different pixels — confirm that is intended for the feature extraction below.
augmentation_steps = [
    transforms.Resize(image_size),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.RandomRotation(15),
    transforms.ToTensor(),
    transforms.Normalize(mean=normalize_mean, std=normalize_std),
]
transform = transforms.Compose(augmentation_steps)

**This is `data_loader.py`, kept separate from `jp.ipynb` in order to use the parallel computing power of the GPU and CPU.**

In [None]:

import os
import cv2
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm


class ImageDataSet(Dataset):
    """Dataset yielding ``(image, metadata)`` pairs from a feature table.

    Args:
        train_data: DataFrame whose first column holds image names (without the
            ``.jpg`` extension); all remaining columns are cast to ``float32``
            and used as metadata.
        path: Directory that contains the ``<image name>.jpg`` files.
        transform: Optional callable applied to the PIL image.
    """

    def __init__(self, train_data, path, transform=None):
        self.ImgPath = path
        self.transform = transform
        # Split the table by position: names on the left, numeric features on the right.
        self.image_names = train_data.iloc[:, 0]
        self.metadata = train_data.iloc[:, 1:].astype(np.float32)

    def __len__(self):
        """Return the number of images listed in the table."""
        return len(self.image_names)

    def __getitem__(self, idx):
        """Load the image at position ``idx`` and return it with its metadata.

        Raises:
            FileNotFoundError: If OpenCV could not read the image file.
        """
        img_path = os.path.join(self.ImgPath, f"{self.image_names.iloc[idx]}.jpg")

        # cv2.imread signals failure by returning None instead of raising.
        bgr_pixels = cv2.imread(img_path)
        if bgr_pixels is None:
            raise FileNotFoundError(f"Image not found: {img_path}")

        # Reorder the channels from BGR to RGB before wrapping them as a PIL image.
        image = Image.fromarray(cv2.cvtColor(bgr_pixels, cv2.COLOR_BGR2RGB))
        if self.transform:
            image = self.transform(image)

        feature_row = self.metadata.iloc[idx].to_numpy()
        return image, torch.tensor(feature_row, dtype=torch.float32)

# Updated DataLoader function
def images_dataloader(train_data, img_folder, batch_size, num_workers=0, device='cuda', flatten=True, transform=None):
    """Load every image and its metadata into two in-memory NumPy arrays.

    Batches are read in order (no shuffling), so row ``i`` of both returned
    arrays corresponds to row ``i`` of ``train_data``.

    Args:
        train_data: Feature table passed to ``ImageDataSet``.
        img_folder: Directory containing the image files.
        batch_size: Number of samples loaded per batch.
        num_workers: Number of DataLoader worker processes.
        device: Kept for backward compatibility and otherwise unused. The
            result is always returned as CPU NumPy arrays and no computation
            is done on the batches, so copying them to another device and
            straight back only cost time and made the default ``'cuda'``
            fail on machines without a GPU.
        flatten: When true, each image is flattened to one row of values.
        transform: Optional transform forwarded to ``ImageDataSet``.

    Returns:
        ``(images, metadata)`` as NumPy arrays, or ``(None, None)`` when the
        dataset cannot be created or the batches cannot be concatenated (for
        example when there are no samples). The failure reason is printed.
    """
    try:
        dataset = ImageDataSet(train_data, img_folder, transform=transform)
    except Exception as e:
        print(f"Error creating dataset: {e}")
        return None, None

    # No pinned memory: nothing is transferred to an accelerator any more.
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False,
                            num_workers=num_workers)

    all_images = []
    all_metadata = []

    for images, metadata in tqdm(dataloader, desc="Processing batches"):
        if flatten:
            images = images.reshape(images.size(0), -1)

        # detach() keeps the NumPy conversion valid even if a transform
        # produced tensors that track gradients.
        all_images.append(images.detach().cpu().numpy())
        all_metadata.append(metadata.detach().cpu().numpy())

    try:
        all_images = np.concatenate(all_images, axis=0)
        all_metadata = np.concatenate(all_metadata, axis=0)
    except Exception as e:
        print(f"Error concatenating data: {e}")
        return None, None

    return all_images, all_metadata



**Load the dataset with the image loader to get the data ready for training.**

In [None]:
from data_loader import images_dataloader

img_folder = 'jpeg/train'

# flatten=False keeps each image's own dimensions; the rows follow the order of
# train_data_for_modeling because the loader does not shuffle.
X_images, X_metadata = images_dataloader(
    train_data=train_data_for_modeling,
    img_folder=img_folder,
    batch_size=64,
    num_workers=4,  # Set to 0 if you encounter issues
    device=device,
    flatten=False,
    transform=transform
)

# images_dataloader reports failures by printing a message and returning
# (None, None). Stop here with a clear error instead of letting a later cell
# fail on `None.shape`.
if X_images is None or X_metadata is None:
    raise RuntimeError(
        "images_dataloader returned no data; see the error message printed above"
    )

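Apply PCA because the images contribute a very large number of features and have a different shape from the metadata, so the images must be flattened first; this may lose some key features. The NaN values in the data must also be handled with an imputer, either by removing them or by filling in the mean.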

In [None]:

# Report the raw array shapes before any reduction.
for array_name, array in (("X_images", X_images), ("X_metadata", X_metadata)):
    print(f"{array_name} shape: {array.shape}")

# PCA needs one row per sample, so flatten each image first.
flat_images = X_images.reshape(len(X_images), -1)

# Keep as many principal components as are needed to explain 95% of the variance.
# NOTE(review): PCA is fitted here on every row, before the train/test split in
# a later cell, so the held-out rows influence the projection — confirm this is acceptable.
pca = PCA(n_components=0.95)
X_images_reduced = pca.fit_transform(flat_images)

# Place the reduced image features and the metadata side by side, one row per sample.
new_combine_traindata = np.concatenate([X_images_reduced, X_metadata], axis=1)




```
X_images shape: (5500, 3, 224, 224)
X_metadata shape: (5500, 8)
```



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Labels come from the same shuffled frame the feature rows were built from.
y = combined_data['target']

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    new_combine_traindata,
    y,
    test_size=0.2,
    random_state=42,
)

# Fill missing values with column means learned from the training rows only,
# then apply those same means to both splits.
# Other available strategies: 'median', 'most_frequent', or a constant value.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training can be chained.
logistic_model = LogisticRegression(
    C=0.1,
    max_iter=3000,
    random_state=42,
).fit(X_train_imputed, y_train)

# Predictions on the held-out rows: class labels (stored under two names)
# and per-class probabilities.
y_pred = logistic_model.predict(X_test_imputed)
y_pred_class = logistic_model.predict(X_test_imputed)
y_pred_proba = logistic_model.predict_proba(X_test_imputed)

In [None]:
from sklearn.metrics import f1_score, log_loss, accuracy_score, precision_score, recall_score, confusion_matrix

# Metrics for the logistic-regression predictions on the held-out rows.
# Every call passes the true labels first and the predictions second, so all
# metrics are computed the same way round.
accuracy = accuracy_score(y_test, y_pred_class)
log_loss_value = log_loss(y_test, y_pred_proba)  # log_loss expects the full probability array
f1 = f1_score(y_test, y_pred_class)  # f1_score expects discrete class labels

print("Accuracy:", accuracy)
print("Log Loss:", log_loss_value)
print("F1 Score:", f1)

precision = precision_score(y_test, y_pred_class)
print("Precision:", precision)

recall = recall_score(y_test, y_pred_class)
print("Recall:", recall)

# With true labels first, rows are the true classes and columns the predicted ones.
cm = confusion_matrix(y_test, y_pred_class)
print("Confusion Matrix:\n", cm)






```
Accuracy: 0.8936363636363637
Log Loss: 0.2769834811995161
F1 Score: 0.10687022900763359
Precision: 0.3333333333333333
Recall: 0.06363636363636363
Confusion Matrix:
 [[976  14]
 [103   7]]
```



In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours baseline (k = 5) trained on the imputed features;
# fit() returns the estimator, so it can be chained onto the constructor.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train_imputed, y_train)
y_pred_knn = knn_model.predict(X_test_imputed)

# Score the held-out predictions with the metric functions imported in the metrics cell.
knn_accuracy, knn_f1 = (
    metric(y_test, y_pred_knn) for metric in (accuracy_score, f1_score)
)

print("KNN Accuracy:", knn_accuracy)
print("KNN F1 Score:", knn_f1)



```
KNN Accuracy: 0.89
KNN F1 Score: 0.07633587786259542

```



In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes baseline: train on the imputed training rows, then
# predict the held-out rows in one chained call (fit() returns the estimator).
nb_model = GaussianNB()
y_pred_nb = nb_model.fit(X_train_imputed, y_train).predict(X_test_imputed)

# Score with the metric functions imported in the metrics cell.
nb_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_nb)
nb_f1 = f1_score(y_true=y_test, y_pred=y_pred_nb)

print("Naive Bayes Accuracy:", nb_accuracy)
print("Naive Bayes F1 Score:", nb_f1)


```
Naive Bayes Accuracy: 0.8054545454545454
Naive Bayes F1 Score: 0.27702702702702703
```

