In [None]:
import torch
from torch import nn 
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.utils import resample
from torchvision.transforms import v2
import pandas as pd
import numpy as np
import cv2 as cv
import os 
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import glob 
from random import sample
from tqdm.notebook import tqdm, trange
from torchvision.transforms import v2
from torchmetrics.classification import BinaryPrecision
from torchmetrics.classification import BinaryRecall
from torchmetrics.classification import BinaryAccuracy

In [None]:
# Load one example lesion image and display it.
sample_image_path = "/kaggle/input/skin-cancer-mnist-ham10000/HAM10000_images_part_1/ISIC_0028933.jpg"
img = cv.imread(sample_image_path)
if img is None:
    # cv.imread signals an unreadable/missing file by returning None.
    raise FileNotFoundError(f"Could not read image: {sample_image_path}")

# OpenCV decodes to BGR while matplotlib expects RGB, so reverse the channel
# axis before plotting (same convention as the grid cells further down).
plt.imshow(img[..., ::-1])

In [None]:
# Reference image plus the list of every file in the part-1 image folder.
img = cv.imread("/kaggle/input/skin-cancer-mnist-ham10000/HAM10000_images_part_1/ISIC_0028933.jpg")
img_path_list = glob.glob("/kaggle/input/skin-cancer-mnist-ham10000/HAM10000_images_part_1/*")

print(len(img_path_list))
print(img.shape)

# Draw nine random files and show them on a 3x3 grid.
img_path_sample = sample(img_path_list, k=9)

plt.figure(figsize=(9, 9))
for grid_slot, image_path in enumerate(img_path_sample, start=1):
    plt.subplot(3, 3, grid_slot)
    img = cv.imread(image_path)
    # Reverse the last axis: BGR (OpenCV) -> RGB (matplotlib).
    plt.imshow(img[:, :, ::-1])


In [None]:
# Tile the nine sampled images into a borderless 3x3 mosaic and save it.
for grid_slot, image_path in enumerate(img_path_sample, start=1):
    plt.subplot(3, 3, grid_slot)
    img = cv.imread(image_path)
    plt.imshow(img[:, :, ::-1])  # BGR -> RGB
    plt.axis('off')  # hide the axes and their tick labels

# Remove the gaps between the individual images.
plt.subplots_adjust(wspace=0, hspace=0)
plt.savefig("results__2.jpg", dpi=100)
plt.show()

In [None]:
# Peek at one entry of the globbed file list to see what the paths look like.
img_path_list[100]

In [None]:
# Read the HAM10000 metadata table and store image ids with pandas' string dtype.
metadata = (
    pd.read_csv("/kaggle/input/skin-cancer-mnist-ham10000/HAM10000_metadata.csv")
    .astype({'image_id': 'string'})
)

# Quick structural overview, then the frame itself as the cell output.
metadata.info()
print(metadata.columns)
metadata

In [None]:
# The images are split across two folders.
f1 = '/kaggle/input/skin-cancer-mnist-ham10000/HAM10000_images_part_1/'
f2 = '/kaggle/input/skin-cancer-mnist-ham10000/HAM10000_images_part_2/'


def _locate_image(image_id):
    """Return the part-1 path if that file exists, otherwise the part-2 path."""
    first_choice = f1 + image_id + '.jpg'
    if os.path.exists(first_choice):
        return first_choice
    return f2 + image_id + '.jpg'


metadata["path"] = metadata["image_id"].apply(_locate_image)
# Record whether the resolved path is really on disk (checked in the next cell).
metadata["exists"] = metadata["path"].apply(os.path.exists)

In [None]:
# How many metadata rows resolved to a file that is present on disk?
metadata.loc[:, "exists"].value_counts()


Cases include a representative collection of all important diagnostic categories in the realm of pigmented lesions:

- actinic keratoses and intraepithelial carcinoma / Bowen's disease (`akiec`),
- basal cell carcinoma (`bcc`),
- benign keratosis-like lesions (solar lentigines / seborrheic keratoses and lichen-planus-like keratoses, `bkl`),
- dermatofibroma (`df`),
- melanoma (`mel`),
- melanocytic nevi (`nv`),
- and vascular lesions (angiomas, angiokeratomas, pyogenic granulomas and hemorrhage, `vasc`).

In [None]:
# Drop the 'akiec' diagnosis. Take an explicit copy so that adding a column
# below writes to an independent frame instead of a slice of `metadata`
# (which would raise SettingWithCopyWarning).
metadata2 = metadata[metadata.dx != "akiec"].copy()

# Binary target: 1 for melanoma ('mel') or basal cell carcinoma ('bcc'), else 0.
# Computed from metadata2's own 'dx' column so no index alignment is involved.
metadata2["label"] = metadata2["dx"].isin(["mel", "bcc"]).astype("int64")

metadata2.to_csv("metadata.csv")

In [None]:
# Number of images per diagnosis code in the full metadata table.
metadata.dx.value_counts()


In [None]:
# Diagnosis code -> malignancy group; used to colour the per-diagnosis bars.
# NOTE(review): 'akiec' is marked 'Cancerous' here but is filtered out of
# metadata2 before the binary label is built — confirm that is intended.
categories = {
    'nv': 'Non-cancerous',
    'mel': 'Cancerous',
    'bkl': 'Non-cancerous',
    'bcc': 'Cancerous',
    'akiec': 'Cancerous',
    'vasc': 'Non-cancerous',
    'df': 'Non-cancerous'
}

# Colour and display name of each binary label value (1 = 'mel' or 'bcc').
label_colors = {0: 'green', 1: 'red'}
label_names = {0: 'Non-cancerous', 1: 'Cancerous'}

fontsize = 15

# Frequencies per diagnosis (all data) and per binary label (metadata2).
value_counts = metadata["dx"].value_counts()
value_counts2 = metadata2["label"].value_counts()

# One colour per bar, derived from what each bar actually represents.
colors = ['red' if categories[category] == 'Cancerous' else 'green' for category in value_counts.index]
colors2 = [label_colors[label] for label in value_counts2.index]

# Two panels sharing the frequency axis.
fig, ax = plt.subplots(1, 2, figsize=(16, 6), sharey=True)

# Left panel: one bar per diagnosis code.
ax[0].bar(value_counts.index, value_counts.values, color=colors, edgecolor='black')
ax[0].set_ylabel('Frequency', fontsize=fontsize)

# Custom legend entries shared by both panels.
red_patch = plt.Line2D([0], [0], color='red', lw=4, label='Cancerous (malignant)')
green_patch = plt.Line2D([0], [0], color='green', lw=4, label='Non-cancerous (benign)')
ax[0].legend(handles=[red_patch, green_patch], fontsize=fontsize)

# Right panel: one bar per binary label, drawn on a categorical axis so the
# ticks read as class names rather than the raw 0/1 values.
ax[1].bar([label_names[label] for label in value_counts2.index],
          value_counts2.values, color=colors2, edgecolor='black')
ax[1].legend(handles=[red_patch, green_patch], fontsize=fontsize)

# Style the ticks through tick_params; this avoids overriding tick labels by
# hand with set_xticklabels.
ax[0].tick_params(axis='x', labelrotation=90, labelsize=fontsize)
ax[0].tick_params(axis='y', labelsize=fontsize)
ax[1].tick_params(axis='x', labelsize=fontsize)

# Save and show the figure.
plt.tight_layout()
plt.savefig("results.jpg", dpi=100)
plt.show()


In [None]:
# Class balance of the binary label.
metadata2.label.value_counts()


In [None]:
# Diagnosis frequencies that remain after the 'akiec' rows were removed.
metadata2.dx.value_counts()


In [None]:
# Binary label frequencies (same check as two cells above).
metadata2.loc[:, "label"].value_counts()


# **Split the dataset**

In [None]:
# Label distribution right before splitting.
metadata2["label"].value_counts()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set, stratified on the binary label
# and with a fixed seed so the split is repeatable.
train, test = train_test_split(
    metadata2,
    test_size=0.2,
    random_state=42,
    stratify=metadata2.label,
)

# Label distribution inside the training split.
train["label"].value_counts()

In [None]:
# Bring both classes of the training split to the same size:
#   label 1 is drawn WITH replacement, label 0 WITHOUT replacement.
samples_per_class = 3875
positive_rows = train[train.label == 1]
negative_rows = train[train.label == 0]

df1 = resample(positive_rows, replace=True, n_samples=samples_per_class, random_state=42)
df2 = resample(negative_rows, replace=False, n_samples=samples_per_class, random_state=42)

# Stack the two halves and renumber the rows from 0.
bal_train = pd.concat([df1, df2], axis=0).reset_index(drop=True)
bal_train

In [None]:
# Count repeated image ids among the label-0 rows drawn without replacement.
df2["image_id"].duplicated().value_counts()

In [None]:
# Label distribution of the balanced training frame.
bal_train["label"].value_counts()

In [None]:
# Write the balanced training frame and the raw train/test splits to CSV files.
for split_frame, csv_name in (
    (bal_train, "balenced_train.csv"),
    (train, "train.csv"),
    (test, "test.csv"),
):
    split_frame.to_csv(csv_name)

# **Create Custom Dataset**

In [None]:
# PIL image -> float32 image scaled to [0, 1]; the original note marks this
# pair as the stand-in for ToTensor().
to_float_image = v2.Compose([v2.ToImage(), v2.ToDtype(torch.float32, scale=True)])

# Deterministic preprocessing shared by training and evaluation.
# The mean/std values are presumably the ImageNet statistics — TODO confirm.
transform_main = v2.Compose([
    v2.Resize(size=232, interpolation=v2.InterpolationMode.BILINEAR),
    v2.CenterCrop(size=224),
    to_float_image,
    v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Random augmentation, used for training only.
transform_aug = v2.Compose([
    v2.RandomRotation(degrees=45),
    v2.RandomHorizontalFlip(p=0.5),
])

# Training pipeline: augment first, then the shared preprocessing.
train_transform = v2.Compose([transform_aug, transform_main])

train_transform, transform_aug, transform_main

In [None]:
class ham(Dataset):
    """Image/label dataset described by a CSV file.

    The CSV is loaded with ``pd.read_csv`` and must contain a ``path`` column
    (location of the image file) and a ``label`` column (numeric target).
    """

    def __init__(self, csv_dir, transform=None):
        """Load the sample table.

        Args:
            csv_dir: Path of the CSV file listing the samples.
            transform: Optional callable applied to every loaded image.
        """
        self.csv_dir = csv_dir
        self.datas = pd.read_csv(self.csv_dir)
        self.transform = transform

    def __getitem__(self, x):
        """Return ``(image, label)`` for the sample at position ``x``.

        The image is an RGB PIL image, or whatever ``transform`` turns it
        into; the label is a scalar ``torch.float32`` tensor.
        """
        # Positional access: correct whatever index the frame carries.
        path = self.datas["path"].iloc[x]
        label = self.datas["label"].iloc[x]

        # Decode inside a context manager so the file handle is released right
        # away, and force 3 channels so a 3-channel Normalize always applies.
        with Image.open(path) as handle:
            img = handle.convert("RGB")

        label = torch.tensor(label, dtype=torch.float32)
        if self.transform is not None:
            img = self.transform(img)

        return img, label

    def __len__(self):
        """Number of rows in the sample table."""
        return len(self.datas)
    
# NOTE(review): these CSVs come from a separate input dataset, while this
# notebook writes "balenced_train.csv" / "test.csv" to the working directory —
# confirm the uploaded files correspond to the splits created above.
train_dataset = ham(
    csv_dir="/kaggle/input/model-data/balenced_train.csv",
    transform=train_transform,   # augmentation + preprocessing
)
test_dataset = ham(
    csv_dir="/kaggle/input/model-data/balenced_test.csv",
    transform=transform_main,    # preprocessing only
)

In [None]:
# Fetch one training sample and inspect the label's shape.
sample_pair = train_dataset[150]
img, label = sample_pair
label.size()

In [None]:
# Same batch size for both loaders (the original note lists 64 and 32).
loader_batch_size = 64

# Training batches are reshuffled; test batches keep a fixed order.
train_dataloader = DataLoader(train_dataset, batch_size=loader_batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=loader_batch_size, shuffle=False)

# **Model VGG16_BN**

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    Device = "cuda"
else:
    Device = "cpu"
Device

In [None]:
from torchvision.models import vgg16_bn, VGG16_BN_Weights

# Build VGG16-BN with the IMAGENET1K_V1 pre-trained weights, then move it to
# the GPU when available (CPU otherwise).
weights_vgg16_bn = VGG16_BN_Weights.IMAGENET1K_V1
model_vgg16_bn = vgg16_bn(weights=weights_vgg16_bn)
model_vgg16_bn = model_vgg16_bn.to('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
from torchinfo import summary

# Freeze every parameter the model currently holds.
model_vgg16_bn.requires_grad_(False)

# Replace the last classifier layer with a single-output linear layer. It is
# created after the freeze above, so that call did not touch it.
num_ftrs = model_vgg16_bn.classifier[6].in_features
model_vgg16_bn.classifier[6] = nn.Linear(in_features=num_ftrs, out_features=1)

# Layer-by-layer overview, including which parts are trainable.
summary(
    model=model_vgg16_bn,
    input_size=(1, 3, 224, 224),  # make sure this is "input_size", not "input_shape"
    col_names=["input_size", "output_size", "num_params", "trainable"],
    col_width=20,
    row_settings=["var_names"],
)

In [None]:
def _run_vgg16_bn_epoch(model, loader, loss_fn, device, optimizer=None, progress=None):
    """Run one pass over ``loader`` and return ``(mean_loss, accuracy)``.

    With an ``optimizer`` the model is switched to training mode and updated
    after every batch. Without one it is switched to eval mode and the pass
    runs with gradient tracking disabled.

    Args:
        model: Module producing one logit per sample (output is squeezed on dim 1).
        loader: Iterable of ``(data, target)`` batches that supports ``len()``.
        loss_fn: Loss applied to ``(logits, target)``.
        device: Device the batches are moved to.
        optimizer: Optimizer to step after each batch, or ``None`` to evaluate only.
        progress: Optional progress bar; receives the batch loss via ``set_postfix``.
    """
    training = optimizer is not None
    model.train(training)
    running_loss = 0.
    correct, total = 0, 0

    # Autograd bookkeeping is only needed when the weights are being updated.
    with torch.set_grad_enabled(training):
        for data, target in loader:
            data, target = data.to(device), target.to(device)

            output = model(data).squeeze(dim=1)
            loss = loss_fn(output, target)

            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            batch_loss = loss.item()
            if progress is not None:
                progress.set_postfix(loss=batch_loss)
            running_loss += batch_loss

            # Threshold the sigmoid of the logits at 0.5 to get hard predictions.
            pred = (torch.sigmoid(output.detach()) > 0.5).float()
            total += target.size(0)
            correct += (pred == target).sum().item()

    # An empty loader yields NaN metrics instead of a ZeroDivisionError.
    n_batches = len(loader)
    mean_loss = running_loss / n_batches if n_batches else float("nan")
    accuracy = correct / total if total else float("nan")
    return mean_loss, accuracy


def train_vgg16_bn(model, device, train_loader, validation_loader, epochs, lr, name,
                   checkpoint_epochs=(0, 5, 10, 11, 25, 30, 35, 39, 40)):
    """Train ``model`` with BCE-with-logits loss and Adam, validating after every epoch.

    Args:
        model: Module producing one logit per sample.
        device: Device used for the model, the loss and the batches.
        train_loader: Batches of ``(data, target)`` used for the weight updates.
        validation_loader: Batches of ``(data, target)`` used for evaluation only.
        epochs: Number of epochs to run.
        lr: Learning rate passed to Adam.
        name: Tag inserted into the checkpoint file names.
        checkpoint_epochs: Zero-based epoch indices after which a checkpoint
            (epoch, model state, optimizer state) is written to
            ``./model_vgg16_bn_{name}_e{epoch}.pth``. Pass an empty collection
            to disable checkpointing. No separate final checkpoint is written.

    Returns:
        ``(train_loss, train_acc, validation_loss, validation_acc)``: four
        lists with one entry per epoch.
    """
    model.to(device)
    loss_fn = nn.BCEWithLogitsLoss().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    train_loss, validation_loss = [], []
    train_acc, validation_acc = [], []

    with tqdm(range(epochs), unit='epoch') as tepochs:
        tepochs.set_description('Training')

        for epoch in tepochs:
            epoch_loss, epoch_acc = _run_vgg16_bn_epoch(
                model, train_loader, loss_fn, device,
                optimizer=optimizer, progress=tepochs)
            train_loss.append(epoch_loss)
            train_acc.append(epoch_acc)

            epoch_loss, epoch_acc = _run_vgg16_bn_epoch(
                model, validation_loader, loss_fn, device)
            validation_loss.append(epoch_loss)
            validation_acc.append(epoch_acc)

            print(f"Epoches: {epoch}")
            print(f"\nTrain loss: {train_loss[-1]:.5f} | Train acc: {train_acc[-1]:.5f} \nValidation loss: {validation_loss[-1]:.5f} | Validation acc: {validation_acc[-1]:.5f}\n")

            if epoch in checkpoint_epochs:
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict()
                }, f"./model_vgg16_bn_{name}_e{epoch}.pth")

    return train_loss, train_acc, validation_loss, validation_acc

In [None]:
# Stage 1: a single epoch with the pre-trained layers frozen.
# The test loader is passed as the validation loader.
frozen_stage_history = train_vgg16_bn(
    model_vgg16_bn,
    Device,
    train_dataloader,
    test_dataloader,
    epochs=1,
    lr=0.0001,
    name="freeze",
)
(train_loss_vgg16_bn,
 train_acc_vgg16_bn,
 validation_loss_vgg16_bn,
 validation_acc_vgg16_bn) = frozen_stage_history

In [None]:
# Make every parameter trainable again before fine-tuning.
model_vgg16_bn.requires_grad_(True)

# Show the summary again; the "trainable" column reflects the change.
summary(
    model=model_vgg16_bn,
    input_size=(1, 3, 224, 224),  # make sure this is "input_size", not "input_shape"
    col_names=["input_size", "output_size", "num_params", "trainable"],
    col_width=20,
    row_settings=["var_names"],
)

In [None]:
# Stage 2: fine-tune all layers for 11 epochs at a much smaller learning rate.
# The test loader is again passed as the validation loader.
finetune_history = train_vgg16_bn(
    model_vgg16_bn,
    Device,
    train_dataloader,
    test_dataloader,
    epochs=11,
    lr=0.000001,
    name="finetune",
)
train_loss2, train_acc2, validation_loss2, validation_acc2 = finetune_history

In [None]:
# import shutil

# # Create a zip archive
# shutil.make_archive('/kaggle/working/model_vgg16_bn_freeze', 'zip', '/kaggle/working')

# # Now, you can download the zip file from the output section
