In [3]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import kagglehub
import torch
import random
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader, Dataset
from torchvision.io import read_image
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from PIL import Image

# Set seeds for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False

# Load dataset path
path = kagglehub.dataset_download("mehradaria/leukemia")
print("Path to dataset files:", path)

data_dir = os.path.join(path, "leukemia")

# Define transforms
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])

# Custom dataset class
class LeukemiaDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.classes = ['Benign', 'Early', 'Pre', 'Pro']
        self.data = []
        for label in self.classes:
            class_path = os.path.join(root_dir, label)
            for img_name in os.listdir(class_path):
                img_path = os.path.join(class_path, img_name)
                self.data.append((img_path, self.classes.index(label)))
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        img_path, label = self.data[idx]
        image = Image.open(img_path).convert('RGB')
        if self.transform:
            image = self.transform(image)
        return image, label

# Load dataset
dataset = LeukemiaDataset(data_dir, transform=transform)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Define CNN model
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.fc1 = nn.Linear(128 * 28 * 28, 512)
        self.fc2 = nn.Linear(512, 4)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
    
    def forward(self, x):
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = self.pool(self.relu(self.conv3(x)))
        x = x.view(x.size(0), -1)
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.fc2(x)
        return x

# Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training function
def train_model(model, train_loader, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader):.4f}")

# Evaluation function
def evaluate_model(model, test_loader):
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, preds = torch.max(outputs, 1)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(preds.cpu().numpy())
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("F1 Score:", f1_score(y_true, y_pred, average='weighted'))
    print("Classification Report:\n", classification_report(y_true, y_pred, target_names=['Benign', 'Early', 'Pre', 'Pro']))

# Train and evaluate
train_model(model, train_loader, criterion, optimizer, epochs=10)
evaluate_model(model, test_loader)


Downloading from https://www.kaggle.com/api/v1/datasets/download/mehradaria/leukemia?dataset_version_number=1...


100%|████████████████████████████████████████| 110M/110M [00:11<00:00, 10.4MB/s]


Extracting files...
Path to dataset files: /Users/manishankar/.cache/kagglehub/datasets/mehradaria/leukemia/versions/1


FileNotFoundError: [Errno 2] No such file or directory: '/Users/manishankar/.cache/kagglehub/datasets/mehradaria/leukemia/versions/1/leukemia/Benign'

In [4]:
!pip install kaggle


Collecting kaggle
  Downloading kaggle-1.6.17.tar.gz (82 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.7/82.7 kB[0m [31m709.8 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25ldone
[?25h  Created wheel for kaggle: filename=kaggle-1.6.17-py3-none-any.whl size=105786 sha256=79cdf5b45c69414ecbf96c67bf6e79ceb016e22f5a5c1a9fea3ffffbfeb14eb4
  Stored in directory: /Users/manishankar/Library/Caches/pip/wheels/46/d2/26/84d0a1acdb9c6baccf7d28cf06962ec80529fe1ad938489983
Successfully built kaggle
Installing collected packages: kaggle
Successfully installed kaggle-1.6.17


In [13]:
import os
import shutil

# Define the path for Kaggle API credentials
kaggle_dir = os.path.expanduser("~/.kaggle")
os.makedirs(kaggle_dir, exist_ok=True)

# Move kaggle.json to ~/.kaggle directory
shutil.move("kaggle.json", os.path.join(kaggle_dir, "kaggle.json"))

# Set permissions to prevent access issues
os.chmod(os.path.join(kaggle_dir, "kaggle.json"), 600)

print("✅ kaggle.json has been moved successfully!")


FileNotFoundError: [Errno 2] No such file or directory: 'kaggle.json'

In [16]:
import os
import shutil

# Define paths
current_dir = os.getcwd() 
print(current_dir)

/Users/manishankar


In [17]:
import os
import shutil

# Define the source and destination paths
source_path = "/Users/manishankar/kaggle.json"
destination_dir = os.path.expanduser("~/.kaggle")
destination_path = os.path.join(destination_dir, "kaggle.json")

# Create ~/.kaggle directory if it doesn't exist
os.makedirs(destination_dir, exist_ok=True)

# Move the file
shutil.move(source_path, destination_path)

# Set permissions
os.chmod(destination_path, 600)

print("✅ kaggle.json has been moved successfully!")


FileNotFoundError: [Errno 2] No such file or directory: '/Users/manishankar/kaggle.json'

In [18]:
import os

# Define the path for the new directory
directory_path = "/Users/manishankar/all_dataset"

# Create the directory if it does not exist
os.makedirs(directory_path, exist_ok=True)

print(f"✅ Directory 'all_dataset' created at: {directory_path}")


✅ Directory 'all_dataset' created at: /Users/manishankar/all_dataset


In [19]:
import os
import shutil

# Define the source and destination paths
source_path = "/Users/manishankar/all_dataset/kaggle.json"  # Correct location of your file
destination_dir = os.path.expanduser("~/.kaggle")  # Kaggle API directory
destination_path = os.path.join(destination_dir, "kaggle.json")

# Create ~/.kaggle directory if it doesn't exist
os.makedirs(destination_dir, exist_ok=True)

# Move the file to the correct location
shutil.move(source_path, destination_path)

# Set permissions
os.chmod(destination_path, 600)

print("✅ kaggle.json has been moved successfully to ~/.kaggle/")


✅ kaggle.json has been moved successfully to ~/.kaggle/


In [22]:
import os

file_path = "/Users/manishankar/.kaggle/kaggle.json"
permissions = oct(os.stat(file_path).st_mode)[-3:]
print(f"Current permissions for kaggle.json: {permissions}")


Current permissions for kaggle.json: 130


In [1]:
import os
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
from torchvision.io import read_image
from PIL import Image
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, f1_score

# =============================
# 1. Set up Reproducibility
# =============================
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# =============================
# 2. Define Dataset Paths
# =============================
dataset_path = "/Users/manishankar/all_dataset/Original"
classes = ["Benign", "Early", "Pre", "Pro"]

# Image transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])

# =============================
# 3. Custom Dataset Class
# =============================
class LeukemiaDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.data = []
        
        for label in classes:
            class_path = os.path.join(root_dir, label)
            for img_name in os.listdir(class_path):
                img_path = os.path.join(class_path, img_name)
                self.data.append((img_path, classes.index(label)))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_path, label = self.data[idx]
        image = Image.open(img_path).convert("RGB")
        
        if self.transform:
            image = self.transform(image)

        return image, label

# =============================
# 4. Load Data
# =============================
dataset = LeukemiaDataset(dataset_path, transform=transform)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# =============================
# 5. Improved CNN Model
# =============================
class ImprovedCNN(nn.Module):
    def __init__(self):
        super(ImprovedCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(128 * 28 * 28, 512)
        self.fc2 = nn.Linear(512, 4)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.pool(torch.relu(self.bn1(self.conv1(x))))
        x = self.pool(torch.relu(self.bn2(self.conv2(x))))
        x = self.pool(torch.relu(self.bn3(self.conv3(x))))
        x = x.view(-1, 128 * 28 * 28)
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

# =============================
# 6. Train and Evaluate CNN
# =============================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cnn_model = ImprovedCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(cnn_model.parameters(), lr=0.001)

def train_model(model, train_loader, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader):.4f}")

def evaluate_model(model, test_loader):
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, preds = torch.max(outputs, 1)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(preds.cpu().numpy())
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("F1 Score:", f1_score(y_true, y_pred, average='weighted'))
    print("Classification Report:\n", classification_report(y_true, y_pred, target_names=classes))

# Train and Evaluate CNN
train_model(cnn_model, train_loader, criterion, optimizer, epochs=10)
evaluate_model(cnn_model, test_loader)

# =============================
# 7. ResNet-50 Model
# =============================
resnet_model = models.resnet50(pretrained=True)
resnet_model.fc = nn.Linear(resnet_model.fc.in_features, 4)  # Adjust output for 4 classes
resnet_model = resnet_model.to(device)

optimizer_resnet = optim.Adam(resnet_model.parameters(), lr=0.001)

# Train and Evaluate ResNet-50
print("Training ResNet-50...")
train_model(resnet_model, train_loader, criterion, optimizer_resnet, epochs=10)
print("Evaluating ResNet-50...")
evaluate_model(resnet_model, test_loader)

# =============================
# 8. SVM Classifier using Deep Features
# =============================
def extract_features(model, data_loader):
    model.eval()
    features, labels = [], []
    
    with torch.no_grad():
        for images, batch_labels in data_loader:
            images = images.to(device)
            outputs = model(images)
            features.append(outputs.cpu().numpy())
            labels.extend(batch_labels.numpy())
    
    features = np.vstack(features)
    return features, np.array(labels)

# Extract deep features using ResNet
print("Extracting deep features for SVM...")
train_features, train_labels = extract_features(resnet_model, train_loader)
test_features, test_labels = extract_features(resnet_model, test_loader)

# Train SVM on extracted features
svm_model = SVC(kernel='linear', C=1.0)
svm_model.fit(train_features, train_labels)

# Evaluate SVM
svm_predictions = svm_model.predict(test_features)
print("SVM Accuracy:", accuracy_score(test_labels, svm_predictions))
print("SVM F1 Score:", f1_score(test_labels, svm_predictions, average='weighted'))
print("SVM Classification Report:\n", classification_report(test_labels, svm_predictions, target_names=classes))


Epoch 1, Loss: 7.3962
Epoch 2, Loss: 0.3274
Epoch 3, Loss: 0.1824
Epoch 4, Loss: 0.1271
Epoch 5, Loss: 0.1077
Epoch 6, Loss: 0.0981
Epoch 7, Loss: 0.0846
Epoch 8, Loss: 0.0996
Epoch 9, Loss: 0.1351
Epoch 10, Loss: 0.0788
Accuracy: 0.9877300613496932
F1 Score: 0.9877304541097388
Classification Report:
               precision    recall  f1-score   support

      Benign       0.97      0.97      0.97       116
       Early       0.99      0.99      0.99       203
         Pre       1.00      0.98      0.99       182
         Pro       0.99      1.00      0.99       151

    accuracy                           0.99       652
   macro avg       0.99      0.99      0.99       652
weighted avg       0.99      0.99      0.99       652



Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /Users/manishankar/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████████████████████████████████| 97.8M/97.8M [01:09<00:00, 1.47MB/s]


Training ResNet-50...
Epoch 1, Loss: 0.2407
Epoch 2, Loss: 0.0942
Epoch 3, Loss: 0.0829
Epoch 4, Loss: 0.0299
Epoch 5, Loss: 0.0372
Epoch 6, Loss: 0.0429
Epoch 7, Loss: 0.0251
Epoch 8, Loss: 0.0173
Epoch 9, Loss: 0.0203
Epoch 10, Loss: 0.0189
Evaluating ResNet-50...
Accuracy: 0.9969325153374233
F1 Score: 0.9969416181444438
Classification Report:
               precision    recall  f1-score   support

      Benign       0.98      1.00      0.99       116
       Early       1.00      1.00      1.00       203
         Pre       1.00      0.99      1.00       182
         Pro       1.00      1.00      1.00       151

    accuracy                           1.00       652
   macro avg       1.00      1.00      1.00       652
weighted avg       1.00      1.00      1.00       652

Extracting deep features for SVM...
SVM Accuracy: 1.0
SVM F1 Score: 1.0
SVM Classification Report:
               precision    recall  f1-score   support

      Benign       1.00      1.00      1.00       116
       