In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

In [2]:
import torchvision
import torchvision.transforms as transforms
import torchvision.models as models

In [10]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json
# --- The change is in the next line ---
!cp ~/.kaggle/kaggle.json ~/.config/kaggle/ # Copy the file to the path expected by the API

Saving kaggle.json to kaggle (1).json
User uploaded file "kaggle (1).json" with length 68 bytes


In [11]:
import os
import kagglehub
import kaggle
from torch.utils.data import Dataset, DataLoader

from PIL import Image
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

In [12]:
kaggle.api.authenticate()
kaggle.api.dataset_download_files('andrewmvd/lung-and-colon-cancer-histopathological-images', path='./', unzip=True)

Dataset URL: https://www.kaggle.com/datasets/andrewmvd/lung-and-colon-cancer-histopathological-images


In [13]:
class MyLungCancerDataset(Dataset):
    def __init__(self, data_folder, csv_path):
        self.data_folder = data_folder
        self.labels_df = pd.read_csv(csv_path)

        # Recursively get all image paths inside data_folder
        self.image_paths = []
        for root, _, files in os.walk(data_folder):
            for file in files:
                if file.endswith(".jpeg"):
                    self.image_paths.append(os.path.join(root, file))

        # Extract only images present in CSV file
        self.image_paths = [path for path in self.image_paths if os.path.basename(path) in self.labels_df['file_name'].tolist()]

        # Define transformations
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])

        # Map labels to integers
        self.label_map = {
            'lung_n': 0,
            'lung_aca': 1,
            'lung_scc': 2,
            'colon_n': 3,
            'colon_aca': 4
        }

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, index):
        img_path = self.image_paths[index]
        image = Image.open(img_path).convert("RGB")  # Ensure RGB format
        transformed_image = self.transform(image)

        # Get label from CSV
        image_name = os.path.basename(img_path)
        label_str = self.labels_df[self.labels_df.file_name == image_name]['label'].values[0]
        label = self.label_map.get(label_str, -1)  # Assign -1 if label is missing

        return transformed_image, label

In [14]:
train_dataset=MyLungCancerDataset(
    data_folder='/content/lung_colon_image_set',
    csv_path='/content/drive/MyDrive/lung_cancer/lung_and_colon_image_labels.csv'
)

In [15]:
len(train_dataset)

25000

In [16]:
train_dataset[0]

(tensor([[[ 0.0510,  0.4745,  0.6314,  ...,  0.5843,  0.6471,  0.8431],
          [-0.1059,  0.3804,  0.4902,  ...,  0.6706,  0.6549,  0.8745],
          [-0.0118,  0.3804,  0.4353,  ...,  0.6549,  0.6863,  0.9137],
          ...,
          [ 0.8039,  0.8118,  0.8196,  ...,  0.8118,  0.7098,  0.7176],
          [ 0.8667,  0.8824,  0.8824,  ...,  0.8118,  0.7490,  0.7020],
          [ 0.8980,  0.9373,  0.9529,  ...,  0.8431,  0.8196,  0.7490]],
 
         [[-0.2000,  0.2078,  0.3569,  ...,  0.1922,  0.3333,  0.6549],
          [-0.3882,  0.0980,  0.2078,  ...,  0.2863,  0.3490,  0.6706],
          [-0.3098,  0.1059,  0.1686,  ...,  0.3098,  0.3882,  0.6863],
          ...,
          [ 0.4667,  0.5059,  0.5529,  ...,  0.4824,  0.3333,  0.2941],
          [ 0.5451,  0.6000,  0.6471,  ...,  0.5451,  0.4353,  0.3098],
          [ 0.5922,  0.6784,  0.7333,  ...,  0.6078,  0.5451,  0.3882]],
 
         [[ 0.5373,  0.7961,  0.8275,  ...,  0.8039,  0.7569,  0.8510],
          [ 0.4431,  0.8118,

In [17]:
train_dataloader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=2
)
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x79accc757850>

In [18]:
class MyCNN(nn.Module):
  def __init__(self):
    super(MyCNN,self).__init__()

    self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1)
    self.relu1 = nn.ReLU()
    self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)

    self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
    self.relu2 = nn.ReLU()
    self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)

    self.linear1 = nn.Linear(in_features=32 * 56 * 56, out_features=256)
    self.relu3 = nn.ReLU()
    self.linear2 = nn.Linear(in_features=256, out_features=5)
    self.softmax = nn.Softmax(dim=1)

  def forward(self,x):
    x=self.conv1(x)
    x=self.relu1(x)
    x=self.pool1(x)

    x=self.conv2(x)
    x=self.relu2(x)
    x=self.pool2(x)

    x=x.view(-1,32*56*56)
    x=self.linear1(x)
    x=self.relu3(x)
    x=self.linear2(x)
    x=self.softmax(x)

    return x

In [19]:
model = MyCNN()
model

MyCNN(
  (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (relu1): ReLU()
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (relu2): ReLU()
  (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (linear1): Linear(in_features=100352, out_features=256, bias=True)
  (relu3): ReLU()
  (linear2): Linear(in_features=256, out_features=5, bias=True)
  (softmax): Softmax(dim=1)
)

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MyCNN().to(device)

In [21]:
loss_func = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

In [22]:
writer = SummaryWriter(log_dir='train_logs_1')
writer

<torch.utils.tensorboard.writer.SummaryWriter at 0x79accc2bca10>

In [23]:
num_epoch = 10

In [24]:
ckpt_folder='ckpt'
os.makedirs(ckpt_folder, exist_ok=True)

In [25]:
for epoch in range(num_epoch):
  model.train()
  for iteration_, (images, labels) in enumerate(tqdm(train_dataloader, total=len(train_dataloader))):
    images, labels = images.to(device), labels.to(device)  # Move to GPU
    optimizer.zero_grad()
    pred = model(images)
    loss_value = loss_func(pred, labels)
    loss_value.backward()
    optimizer.step()
    global_iteration = epoch * len(train_dataloader) + iteration_
    writer.add_scalar('train_loss_iter', loss_value, global_iteration)

  print(f'Epoch={epoch}', f'Training loss={loss_value.item()}')

  writer.add_scalar('train_loss_epoch', loss_value, epoch)

  model.eval()
  with torch.no_grad():

    loss_sum=0
    pred_list, label_list = [], []

    for images, labels in tqdm(train_dataloader, total=len(train_dataloader)):
      images, labels = images.to(device), labels.to(device)  # Move to GPU
      pred = model(images)
      loss_value = loss_func(pred, labels)
      loss_sum += loss_value.item()
      pred_list.extend(torch.argmax(pred, dim=1).tolist())
      label_list.extend(labels.tolist())

    print(f'Test loss={loss_sum/len(train_dataloader)}')
    writer.add_scalar('test_loss_epoch', loss_value,epoch)

    # Ensure pred_list and label_list are tensors before concatenation
    final_pred = torch.tensor(pred_list)
    final_label = torch.tensor(label_list)

    # If you're performing classification, you likely want to compare predictions directly
    epoch_accuracy = accuracy_score(final_label, final_pred)
    epoch_precision = precision_score(final_label, final_pred, average='weighted') # Assuming you need weighted average for multi-class
    epoch_recall = recall_score(final_label, final_pred, average='weighted')
    epoch_f1 = f1_score(final_label, final_pred, average='weighted')

    writer.add_scalar('test_accuracy_epoch', epoch_accuracy, epoch)
    writer.add_scalar('test_precision_epoch', epoch_precision, epoch)
    writer.add_scalar('test_recall_epoch', epoch_recall, epoch)
    writer.add_scalar('test_f1_epoch', epoch_f1, epoch)

    print(classification_report(final_label, final_pred))

    torch.save(model.state_dict(), os.path.join('ckpt', f'ckpt_{epoch}.pth'))

100%|██████████| 782/782 [04:16<00:00,  3.05it/s]


Epoch=0 Training loss=1.035280466079712


100%|██████████| 782/782 [04:12<00:00,  3.10it/s]


Test loss=1.1268312078912546
              precision    recall  f1-score   support

           0       0.95      0.94      0.95      5000
           1       0.78      0.78      0.78      5000
           2       0.83      0.85      0.84      5000
           3       0.64      0.96      0.76      5000
           4       0.87      0.43      0.58      5000

    accuracy                           0.79     25000
   macro avg       0.82      0.79      0.78     25000
weighted avg       0.82      0.79      0.78     25000



100%|██████████| 782/782 [04:17<00:00,  3.03it/s]


Epoch=1 Training loss=1.0344170331954956


100%|██████████| 782/782 [04:04<00:00,  3.20it/s]


Test loss=1.0856393282218357
              precision    recall  f1-score   support

           0       0.99      0.94      0.96      5000
           1       0.81      0.83      0.82      5000
           2       0.88      0.85      0.87      5000
           3       0.68      0.98      0.80      5000
           4       0.88      0.54      0.67      5000

    accuracy                           0.83     25000
   macro avg       0.85      0.83      0.83     25000
weighted avg       0.85      0.83      0.83     25000



100%|██████████| 782/782 [04:14<00:00,  3.08it/s]


Epoch=2 Training loss=1.1701042652130127


100%|██████████| 782/782 [04:01<00:00,  3.23it/s]


Test loss=1.082475118548669
              precision    recall  f1-score   support

           0       0.97      0.94      0.95      5000
           1       0.81      0.85      0.83      5000
           2       0.89      0.86      0.87      5000
           3       0.79      0.68      0.73      5000
           4       0.70      0.81      0.75      5000

    accuracy                           0.83     25000
   macro avg       0.83      0.83      0.83     25000
weighted avg       0.83      0.83      0.83     25000



100%|██████████| 782/782 [04:14<00:00,  3.07it/s]


Epoch=3 Training loss=1.031019926071167


100%|██████████| 782/782 [04:04<00:00,  3.20it/s]


Test loss=1.0637927675033774
              precision    recall  f1-score   support

           0       0.98      0.94      0.96      5000
           1       0.83      0.84      0.83      5000
           2       0.88      0.88      0.88      5000
           3       0.72      0.97      0.82      5000
           4       0.90      0.60      0.72      5000

    accuracy                           0.85     25000
   macro avg       0.86      0.85      0.84     25000
weighted avg       0.86      0.85      0.84     25000



100%|██████████| 782/782 [04:13<00:00,  3.08it/s]


Epoch=4 Training loss=1.0901637077331543


100%|██████████| 782/782 [04:11<00:00,  3.11it/s]


Test loss=1.0522458193552158
              precision    recall  f1-score   support

           0       1.00      0.94      0.97      5000
           1       0.79      0.90      0.84      5000
           2       0.91      0.84      0.88      5000
           3       0.80      0.85      0.82      5000
           4       0.82      0.76      0.79      5000

    accuracy                           0.86     25000
   macro avg       0.86      0.86      0.86     25000
weighted avg       0.86      0.86      0.86     25000



100%|██████████| 782/782 [04:07<00:00,  3.16it/s]


Epoch=5 Training loss=1.1607240438461304


100%|██████████| 782/782 [04:05<00:00,  3.19it/s]


Test loss=1.0360201068241577
              precision    recall  f1-score   support

           0       1.00      0.94      0.97      5000
           1       0.87      0.83      0.85      5000
           2       0.86      0.93      0.89      5000
           3       0.76      0.96      0.85      5000
           4       0.92      0.69      0.79      5000

    accuracy                           0.87     25000
   macro avg       0.88      0.87      0.87     25000
weighted avg       0.88      0.87      0.87     25000



100%|██████████| 782/782 [04:19<00:00,  3.02it/s]


Epoch=6 Training loss=1.0223369598388672


100%|██████████| 782/782 [04:05<00:00,  3.18it/s]


Test loss=1.028660893973792
              precision    recall  f1-score   support

           0       1.00      0.94      0.97      5000
           1       0.87      0.82      0.85      5000
           2       0.85      0.93      0.89      5000
           3       0.78      0.95      0.86      5000
           4       0.92      0.73      0.81      5000

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.87     25000
weighted avg       0.88      0.88      0.87     25000



100%|██████████| 782/782 [04:17<00:00,  3.04it/s]


Epoch=7 Training loss=1.1175885200500488


100%|██████████| 782/782 [04:09<00:00,  3.14it/s]


Test loss=1.0161886387468908
              precision    recall  f1-score   support

           0       1.00      0.94      0.97      5000
           1       0.87      0.86      0.87      5000
           2       0.88      0.93      0.90      5000
           3       0.81      0.94      0.87      5000
           4       0.91      0.78      0.84      5000

    accuracy                           0.89     25000
   macro avg       0.89      0.89      0.89     25000
weighted avg       0.89      0.89      0.89     25000



100%|██████████| 782/782 [04:11<00:00,  3.11it/s]


Epoch=8 Training loss=1.0322984457015991


100%|██████████| 782/782 [04:11<00:00,  3.11it/s]


Test loss=1.0116104774767785
              precision    recall  f1-score   support

           0       1.00      0.94      0.97      5000
           1       0.83      0.93      0.88      5000
           2       0.94      0.87      0.90      5000
           3       0.83      0.94      0.88      5000
           4       0.92      0.81      0.86      5000

    accuracy                           0.90     25000
   macro avg       0.90      0.90      0.90     25000
weighted avg       0.90      0.90      0.90     25000



100%|██████████| 782/782 [04:15<00:00,  3.06it/s]


Epoch=9 Training loss=1.1457183361053467


100%|██████████| 782/782 [04:06<00:00,  3.18it/s]


Test loss=1.0054890516468935
              precision    recall  f1-score   support

           0       1.00      0.94      0.97      5000
           1       0.88      0.89      0.88      5000
           2       0.90      0.93      0.92      5000
           3       0.88      0.86      0.87      5000
           4       0.86      0.89      0.87      5000

    accuracy                           0.90     25000
   macro avg       0.90      0.90      0.90     25000
weighted avg       0.90      0.90      0.90     25000

