# Imports

In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import cv2
from PIL import Image
import gc

import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from torchvision import datasets, transforms, models
from torch.utils.data import Dataset
from torch.optim.lr_scheduler import ReduceLROnPlateau

!pip install torchtoolbox==0.1.4.1
import torchtoolbox.transform as transforms
from torch.utils.data import Dataset, DataLoader, Subset

import time
import datetime
import random

from sklearn.model_selection import StratifiedKFold, GroupKFold, train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

!pip install efficientnet_pytorch
from efficientnet_pytorch import EfficientNet

import os 

import warnings
warnings.simplefilter('ignore')

In [28]:
# Install the Weights & Biases client and authenticate this session.
!pip install wandb          # plotting / experiment-tracking tool
!wandb login

In [29]:
# Open a Weights & Biases run; metrics logged with wandb.log() during
# training are attached to this run.
import wandb

wandb.init(entity="grimm0404", project="Melanoma")

# 设置随机种子

In [30]:
def seed_everything(seed_value):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and, when
    available, all CUDA devices) and switches cuDNN to deterministic mode.

    Args:
        seed_value: Integer seed shared by all generators.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    # Only affects interpreters started after this point (e.g. worker
    # processes); the hash seed of the running process is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        # The cuDNN auto-tuner may pick different kernels from run to run,
        # so it has to be off for results to be repeatable.
        torch.backends.cudnn.benchmark = False

seed = 1234
seed_everything(seed)

# 选择训练设备

In [31]:
# Prefer the GPU when one is visible to PyTorch, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

## 创建Dataset

In [32]:
class CustomDataset(Dataset):
    """Image dataset driven by a metadata DataFrame.

    Every row of ``df`` must provide an ``image_name`` column holding the
    file stem (``.jpg`` is appended). When ``train`` is true the row's
    ``target`` column is returned as the label.

    Args:
        df: Metadata frame, one row per image.
        img_dir: Directory containing the ``<image_name>.jpg`` files.
        train: If true, ``__getitem__`` returns ``(image, label)``;
            otherwise it returns only the image.
        transforms: Optional callable applied to the decoded RGB array.
    """

    def __init__(self, df: pd.DataFrame, img_dir, train: bool = True, transforms= None):
        self.df = df
        self.img_dir = img_dir
        self.transforms = transforms
        self.train = train

    def __getitem__(self, index):
        """Return the sample at positional ``index``.

        Raises:
            FileNotFoundError: If the image file is missing or unreadable.
        """
        row = self.df.iloc[index]
        img_path = os.path.join(self.img_dir, row['image_name'] + '.jpg')

        images = cv2.imread(img_path)
        if images is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError('Could not read image: {}'.format(img_path))

        # OpenCV decodes to BGR, while the ImageNet mean/std used for
        # normalization and the PIL-based inference path are RGB.
        images = cv2.cvtColor(images, cv2.COLOR_BGR2RGB)

        if self.transforms:
            images = self.transforms(images)

        # as_tensor avoids a redundant copy (and a warning) when the
        # transform pipeline already produced a tensor.
        images = torch.as_tensor(images, dtype=torch.float32)

        if self.train:
            labels = torch.tensor(row['target'], dtype=torch.float32)
            return images, labels

        return images

    def __len__(self):
        """Number of rows in the metadata frame."""
        return len(self.df)

In [33]:
# Image folders of the 256px resized dataset.
train_img_dir = '/kaggle/input/melanoma-external-malignant-256/train/train/'
test_img_dir = '/kaggle/input/melanoma-external-malignant-256/test/test/'

# Labelled training metadata and the competition's test metadata.
df = pd.read_csv('/kaggle/input/melanoma-external-malignant-256/train.csv')
test_df = pd.read_csv('/kaggle/input/siim-isic-melanoma-classification/test.csv')

In [None]:
test_df.head()       # preview the first rows of the test metadata

In [34]:
# Fraction of the labelled data held out for validation.
vld_size=0.20

# Stratify on the target so both splits keep the same class balance.
train, valid = train_test_split(
    df,
    test_size=vld_size,
    stratify=df.target,
    random_state=42,
)

train_df = pd.DataFrame(train)
validation_df = pd.DataFrame(valid)

# Validation size first, then training size.
for split_frame in (validation_df, train_df):
    print(len(split_frame))

## 数据集划分可视化

In [35]:
def plot_class_counts(ax, frame, title, class_names):
    """Draw a bar chart of the ``target`` class counts of ``frame`` on ``ax``.

    Args:
        ax: Matplotlib axes to draw on.
        frame: DataFrame with a ``target`` column.
        title: Axes title.
        class_names: Display names for target values 0 and 1, in that order.

    Returns:
        The class counts ordered as ``class_names``.
    """
    # value_counts() sorts by frequency; reindex so bar i always belongs to
    # class_names[i] regardless of which class is more common.
    # Assumes targets are encoded as 0/1 - TODO confirm against the CSV.
    counts = frame['target'].value_counts().reindex([0, 1], fill_value=0)
    ax.bar(class_names, counts.values)
    ax.set_title(title)

    # Write each count just above its bar.
    for position, count in enumerate(counts.values):
        ax.text(position, count, str(int(count)),
                ha='center', va='bottom', fontsize=15)
    return counts


fig2 = plt.figure(figsize=(20, 5))
ax3 = fig2.add_subplot(1,2,1)
ax4 = fig2.add_subplot(1,2,2)

dx = ['Benign', 'Malignant']
counts1 = plot_class_counts(ax3, train_df, "Training Set", dx)
counts2 = plot_class_counts(ax4, validation_df, "Validation Set", dx)

plt.show()

### 数据增强

In [36]:
# Per-channel statistics passed to Normalize by all three pipelines.
norm_mean = [0.485, 0.456, 0.406]
norm_std = [0.229, 0.224, 0.225]

# Training: random geometric and colour augmentation, then tensor + normalize.
training_transforms = transforms.Compose([
    transforms.RandomRotation(30),
    transforms.RandomResizedCrop(256, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.ColorJitter(brightness=32. / 255.,saturation=0.5,hue=0.01),
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
])

# Validation: deterministic resize + centre crop only.
validation_transforms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(256),
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
])

# Testing: same deterministic steps as validation.
testing_transforms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(256),
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
])

## Loading Datasets

In [37]:
# Training split: labelled, with random augmentation.
training_dataset = CustomDataset(df = train_df,
                                 img_dir = train_img_dir,
                                 train = True,
                                 transforms = training_transforms )

# Validation split: labelled images from the training folder. It must use the
# deterministic validation pipeline; random augmentation here would make the
# validation loss/AUC noisy and not comparable between epochs.
validation_dataset = CustomDataset(df = validation_df,
                                   img_dir = train_img_dir,
                                   train = True,
                                   transforms = validation_transforms )

# Test split: unlabelled, so the dataset returns images only.
testing_dataset = CustomDataset(df = test_df,
                                img_dir = test_img_dir,
                                train= False,
                                transforms = testing_transforms )

In [38]:
# Only the training loader shuffles; validation and test keep row order so
# predictions line up with their DataFrames.
train_loader = DataLoader(training_dataset, batch_size=32, shuffle=True, num_workers=4)
validate_loader = DataLoader(validation_dataset, shuffle=False, batch_size=16)
test_loader = DataLoader(testing_dataset, shuffle=False, batch_size=16)

In [None]:
# Show the first image of a few training batches to eyeball the augmentation.
preview_mean = np.array([0.485, 0.456, 0.406])
preview_std = np.array([0.229, 0.224, 0.225])

cnt = 0
for imgs, labels in train_loader:
    if cnt >= 6:
        break
    # CHW tensor -> HWC array, as expected by matplotlib.
    img = imgs[0].numpy().transpose((1, 2, 0))
    # Undo Normalize so the pixel values are back in [0, 1]; displaying the
    # normalized tensor directly would be clipped and look washed out.
    img = np.clip(img * preview_std + preview_mean, 0, 1)
    plt.imshow(img)
    plt.show()
    cnt += 1

In [39]:
# Number of batches per loader: train, validation, test.
for loader in (train_loader, validate_loader, test_loader):
    print(len(loader))

# 创建Model

**适用于efficientnet-b2**

In [40]:
class Net(nn.Module):
    """Binary classifier built on top of a pretrained backbone.

    The backbone's final linear layer is replaced by a projection to
    ``hidden_features`` units, followed by a single-logit output layer.
    The input width of the replaced layer is read from the backbone, so any
    EfficientNet variant (head stored as ``_fc``) or torchvision-style model
    (head stored as ``fc``) can be passed in.

    Args:
        arch: Backbone module. If it has no linear ``_fc``/``fc`` attribute
            it is used unchanged and must already emit ``hidden_features``
            values per sample.
        hidden_features: Width of the intermediate embedding (default 500).
    """

    def __init__(self, arch, hidden_features=500):
        super(Net, self).__init__()
        self.arch = arch

        # Swap the classification head for one of the right input width.
        for head_name in ('_fc', 'fc'):
            head = getattr(self.arch, head_name, None)
            if isinstance(head, nn.Linear):
                setattr(self.arch, head_name,
                        nn.Linear(in_features=head.in_features,
                                  out_features=hidden_features,
                                  bias=True))
                break

        # NOTE: the attribute name (sic) is kept as-is so that previously
        # saved checkpoints still match this module's parameter names.
        self.ouput = nn.Linear(hidden_features, 1)

    def forward(self, images):
        """Return one raw logit per image (apply sigmoid for a probability)."""
        features = self.arch(images)
        return self.ouput(features)

**适用于efficientnet-b3**

In [41]:
# class Model(nn.Module):
#     def __init__(self):
#         super(Model, self).__init__()
#         #加载预训练模型
#         self.resnetmodel = EfficientNet.from_pretrained('efficientnet-b3')
        
#         self.fc = nn.Sequential(nn.Linear(1000, 512), nn.ReLU(),
#                                   nn.Linear(512, 1))
# #                                 , nn.Sigmoid())
        
#     def forward(self, x):
#         x = self.resnetmodel(x)
#         return self.fc(x)

# **选择预训练模型**

In [42]:
# Load the pretrained EfficientNet-B2 backbone, wrap it in the classifier
# head and move the whole model to the selected device.
# (Alternative backbone tried earlier: 'efficientnet-b3'.)
arch = EfficientNet.from_pretrained('efficientnet-b2')
model = Net(arch=arch).to(device)

# Hyperparameters

In [43]:
# Best validation AUC seen so far.
best_val = 0

# Checkpoint file for the best model. The name is fixed: it was previously
# built with an f-string on best_val, which is evaluated once (while
# best_val is 0) and therefore never reflected the actual score.
model_path = 'melanoma_model_0.pth'

epochs = 10

# Stop training after this many consecutive epochs without a validation
# AUC improvement.
es_patience = 3

# The model emits one raw logit per image, so use the logit-based BCE loss.
criterion = nn.BCEWithLogitsLoss()

optimizer = optim.Adam(model.parameters(), lr=0.0005)

# Reduce the learning rate when validation AUC plateaus. The scheduler's
# patience has to be smaller than es_patience, otherwise early stopping
# ends the run before the learning rate is ever reduced.
scheduler = ReduceLROnPlateau(optimizer=optimizer, mode='max', patience=1, verbose=True, factor=0.2)

# Training the Model

In [44]:
# Per-epoch histories. Losses are stored as per-batch averages so the
# training and validation curves share the same scale.
loss_history=[]
train_acc_history=[]
val_loss_history=[]
val_acc_history=[]
val_auc_history=[]

patience = es_patience
Total_start_time = time.time()
model.to(device)

for e in range(epochs):

    start_time = time.time()
    correct = 0
    running_loss = 0
    model.train()

    # ---- training pass ----
    for images, labels in train_loader:

        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()

        output = model(images)
        loss = criterion(output, labels.view(-1,1))
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Threshold the sigmoid output at 0.5 to count correct predictions.
        train_preds = torch.round(torch.sigmoid(output))
        correct += (train_preds == labels.view(-1,1)).sum().item()

    train_acc = correct / len(train_df)
    epoch_train_loss = running_loss / len(train_loader)

    # ---- validation pass ----
    model.eval()
    preds=[]
    val_loss = 0

    with torch.no_grad():
        for val_images, val_labels in validate_loader:

            val_images, val_labels = val_images.to(device), val_labels.to(device)

            val_output = model(val_images)
            val_loss += criterion(val_output, val_labels.view(-1,1)).item()
            preds.append(torch.sigmoid(val_output).cpu())

    # The validation loader does not shuffle, so predictions are in the same
    # order as the rows of validation_df.
    pred = torch.cat(preds).numpy().ravel()
    val_targets = validation_df['target'].values
    epoch_val_loss = val_loss / len(validate_loader)
    val_accuracy = accuracy_score(val_targets, (pred > 0.5).astype(int))
    val_auc_score = roc_auc_score(val_targets, pred)

    # Wall-clock time of this epoch, formatted as H:MM:SS.
    training_time = str(datetime.timedelta(seconds=time.time() - start_time))[:7]

    # One log call per epoch keeps all metrics on the same wandb step.
    wandb.log({"train_loss": epoch_train_loss,
               "train_acc": train_acc,
               "val_loss": epoch_val_loss,
               "val_acc": val_accuracy,
               "val_auc_score": val_auc_score})

    print("Epoch: {}/{}.. ".format(e+1, epochs),
          "Training Loss: {:.3f}".format(epoch_train_loss),
          "Training Accuracy: {:.3f}".format(train_acc),
          "Validation Loss: {:.3f}".format(epoch_val_loss),
          "Validation Accuracy: {:.3f}".format(val_accuracy),
          "Validation AUC Score: {:.3f}".format(val_auc_score),
          "Training Time: {}".format( training_time))

    # Record the epoch before the early-stopping check so the final epoch is
    # not missing from the plots.
    loss_history.append(epoch_train_loss)
    train_acc_history.append(train_acc)
    val_loss_history.append(epoch_val_loss)
    val_acc_history.append(val_accuracy)
    val_auc_history.append(val_auc_score)

    scheduler.step(val_auc_score)

    if val_auc_score >= best_val:
        best_val = val_auc_score
        patience = es_patience         # improvement: reset the counter
        torch.save(model, model_path)  # keep the best model so far
    else:
        patience -= 1
        if patience == 0:
            print('Early stopping. Best Val roc_auc: {:.3f}'.format(best_val))
            break

# Total wall-clock training time.
total_training_time = str(datetime.timedelta(seconds=time.time() - Total_start_time  ))[:7]
print("Total Training Time: {}".format(total_training_time))

# 画图（losses and accuracies）

In [45]:
# Plot the loss curves (left) and the accuracy / AUC curves (right) of the
# run that just finished, and save the figure to disk.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 5))

for series, series_label in ((loss_history, 'Training Loss'),
                             (val_loss_history, 'Validation Loss')):
    ax1.plot(series, label=series_label)
ax1.set_title("Losses")
ax1.set_xlabel('Epochs')
ax1.set_ylabel('Loss')
ax1.legend()

for series, series_label in ((train_acc_history, 'Training accuracy'),
                             (val_auc_history, 'Validation AUC Score')):
    ax2.plot(series, label=series_label)
ax2.set_title("Accuracies")
ax2.set_xlabel('Epochs')
ax2.set_ylabel('Accuracy')
ax2.legend()

plt.savefig('d01.jpg')
plt.show()

In [46]:
# Drop objects that are no longer needed and run the garbage collector.
del training_dataset, validation_dataset
del train_loader, validate_loader
del images, val_images, val_labels
gc.collect()

# Testing

In [47]:
# The test split has no ground truth; add an all-zero placeholder column.
# A 1-D array is used because a DataFrame column is one-dimensional.
test_df['target'] = np.zeros(len(test_df))

In [48]:
# Placeholder labels of the test split as a float tensor.
test_labels = torch.tensor(test_df['target'].values, dtype=torch.float32)

In [49]:
# Reload the best checkpoint written during training.
# NOTE(review): torch.load unpickles arbitrary Python objects - only load
# checkpoint files produced by this notebook.
# map_location lets a checkpoint saved on GPU load on a CPU-only machine.
model = torch.load(model_path, map_location=device)
model.eval()
model.to(device)

test_preds=[]
with torch.no_grad():

    # The test dataset yields images only (train=False).
    for test_images in test_loader:

        test_images = test_images.to(device)

        test_output = model(test_images)
        test_pred = torch.sigmoid(test_output)

        test_preds.append(test_pred.cpu())

# Flatten the per-batch (batch, 1) probabilities into one 1-D array.
test_pred = torch.cat(test_preds).numpy().ravel()
test_pred2 = torch.tensor(test_pred)

# NOTE: test_labels are all-zero placeholders, so this number is the share of
# test images predicted benign, not a real accuracy.
test_accuracy = accuracy_score(test_labels.cpu().numpy(), np.round(test_pred))

print("Test Accuracy: {}".format(test_accuracy))

In [50]:
# Use the competition's sample submission as a template: overwrite its
# target column with the test predictions and write the result out.
submission_template = '/kaggle/input/siim-isic-melanoma-classification/sample_submission.csv'
sub = pd.read_csv(submission_template)
sub.loc[:, "target"] = test_pred
sub.to_csv('submission.csv', index=False)

In [51]:
def process_image(image_path):
    """Load an image file and prepare it as model input.

    The image is converted to RGB, shrunk so its shorter side is at most
    256 pixels, centre-cropped to 256x256, scaled to [0, 1], normalized with
    the per-channel mean/std below and returned channel-first.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A float NumPy array of shape (3, 256, 256).
    """
    # The context manager closes the file handle; convert() returns an
    # independent in-memory copy. Forcing RGB guarantees three channels for
    # grayscale, palette or RGBA files.
    with Image.open(image_path) as source_image:
        pil_image = source_image.convert('RGB')

    # Resize: thumbnail() keeps the aspect ratio and never enlarges.
    if pil_image.size[0] > pil_image.size[1]:
        pil_image.thumbnail((5000, 256))
    else:
        pil_image.thumbnail((256, 5000))

    # Centre crop with an integer pixel box (left, upper, right, lower).
    left_margin = (pil_image.width - 256) // 2
    upper_margin = (pil_image.height - 256) // 2
    pil_image = pil_image.crop((left_margin, upper_margin,
                                left_margin + 256, upper_margin + 256))

    # Normalize
    np_image = np.array(pil_image) / 255
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    np_image = (np_image - mean) / std

    # HWC -> CHW, the layout PyTorch expects.
    np_image = np_image.transpose((2, 0, 1))

    return np_image

# Confusion Matrix

In [52]:
# NOTE: the test split has no ground truth. `test` holds all-zero placeholder
# labels, so this matrix only shows how many test images were predicted
# benign vs. malignant.
test = np.zeros(len(test_df))
test_df['target'] = test
pred = np.round(test_pred)

# Fix the label set so the matrix is always 2x2, even when only one class
# occurs in the labels and predictions.
cm = confusion_matrix(test, pred, labels=[0, 1])

cm_df = pd.DataFrame(cm,
                     index = ['Benign','Malignant'],
                     columns = ['Benign','Malignant'])

plt.figure(figsize=(5.5,4))
# fmt='d' prints the integer counts instead of scientific notation.
sb.heatmap(cm_df, annot=True, fmt='d')
plt.title('Confusion Matrix \nAccuracy:{0:.3f}'.format(test_accuracy))
plt.ylabel('True label')
plt.xlabel('Predicted label')

plt.savefig('Confusion Matrix01.jpg')
plt.show()

# 查全率、查准率、F1

In [None]:
def precision_recall_f1(y_true, y_pred):
    """Compute precision, recall and F1 for binary 0/1 labels.

    Args:
        y_true: Ground-truth labels (array-like of 0/1).
        y_pred: Predicted labels (array-like of 0/1).

    Returns:
        Tuple ``(precision, recall, f1)``. A metric whose denominator is
        zero is reported as 0.0.
    """
    y_true = np.asarray(y_true).ravel().astype(int)
    y_pred = np.asarray(y_pred).ravel().astype(int)

    true_positives = int(np.sum((y_true == 1) & (y_pred == 1)))
    predicted_positives = int(np.sum(y_pred == 1))
    actual_positives = int(np.sum(y_true == 1))

    precision = true_positives / predicted_positives if predicted_positives else 0.0
    recall = true_positives / actual_positives if actual_positives else 0.0
    denominator = precision + recall
    f1 = 2 * precision * recall / denominator if denominator else 0.0
    return precision, recall, f1


# The test split has no ground truth, so the metrics are computed on the
# held-out validation split with the currently loaded model.
metrics_dataset = CustomDataset(df = validation_df,
                                img_dir = train_img_dir,
                                train = True,
                                transforms = validation_transforms)
metrics_loader = DataLoader(metrics_dataset, batch_size=16, shuffle=False)

model.eval()
metrics_probs = []
with torch.no_grad():
    for metrics_images, _ in metrics_loader:
        metrics_output = model(metrics_images.to(device))
        metrics_probs.append(torch.sigmoid(metrics_output).cpu())

# Same 0.5 threshold as predict().
metrics_pred = (torch.cat(metrics_probs).numpy().ravel() > 0.5).astype(int)
metrics_true = validation_df['target'].values

print('actual positives: {}'.format(int((metrics_true == 1).sum())))
print('predicted positives: {}'.format(int(metrics_pred.sum())))

precision, recall, F1 = precision_recall_f1(metrics_true, metrics_pred)
print('recall: {:.3f}'.format(recall))
print('precision: {:.3f}'.format(precision))
print('F1: {:.3f}'.format(F1))

# **测试：输入一张图片判断类别（melanoma or benign）**

In [53]:
def predict(image_path, model, topk=1):
    """Classify a single image file as melanoma or benign.

    The model is switched to eval mode and run without gradient tracking on
    whatever device its parameters live on.

    Args:
        image_path: Path of the image to classify.
        model: Module that maps a (1, 3, H, W) float tensor to one logit.
        topk: Number of probabilities to return; capped at the number of
            model outputs.

    Returns:
        Tuple ``(top_probabilities, top_classes)``: a list with the predicted
        melanoma probability and a one-element list holding ``"Melanoma"``
        (probability > 0.5) or ``"Benign"``.
    """
    image = process_image(image_path)

    # Run on the model's own device instead of assuming CUDA is present.
    try:
        model_device = next(model.parameters()).device
    except StopIteration:
        model_device = torch.device('cpu')

    # Array -> float tensor with a leading batch dimension.
    image = torch.from_numpy(image).to(device=model_device, dtype=torch.float32)
    image = image.unsqueeze(0)

    model.eval()
    with torch.no_grad():
        output = model(image)
        probabilities = torch.sigmoid(output)

    k = min(topk, probabilities.shape[-1])
    top_probabilities, _ = probabilities.topk(k)
    top_probabilities = top_probabilities.cpu().numpy().tolist()[0]

    # Threshold the probability to pick the class name.
    top_classes = []
    if probabilities.item() > 0.5:
        top_classes.append("Melanoma")
    else:
        top_classes.append("Benign")

    return top_probabilities, top_classes

# Image to classify (an alternative sample from the test folder:
# '../input/siim-isic-melanoma-classification/jpeg/test/ISIC_0074618.jpg').
predict_image_path = '../input/siim-isic-melanoma-classification/jpeg/train/ISIC_0502582.jpg'

probs, classes = predict(predict_image_path, model)
for result in (probs, classes):
    print(result)

In [57]:
def imshow(image, ax=None, title=None):
    """Display a normalized channel-first image array.

    Args:
        image: Array of shape (3, H, W) normalized with the mean/std below.
        ax: Axes to draw on; a new figure and axes are created when None.
        title: Optional axes title.

    Returns:
        The axes the image was drawn on.
    """
    axes = ax
    if axes is None:
        _, axes = plt.subplots()

    channel_mean = np.array([0.485, 0.456, 0.406])
    channel_std = np.array([0.229, 0.224, 0.225])

    # Move channels last, undo the normalization, then clip to [0, 1] so the
    # picture does not render as noise.
    restored = channel_std * image.transpose((1, 2, 0)) + channel_mean
    restored = np.clip(restored, 0, 1)

    if title is not None:
        axes.set_title(title)

    axes.imshow(restored)

    return axes

# Preprocess the chosen image and display it.
image = process_image(image_path=predict_image_path)
imshow(image=image)

In [58]:
# Show the image with the predicted diagnosis as its title
# (green for benign, red otherwise).
plt.figure(figsize = (6,10))
plot_1 = plt.subplot(2,1,1)

image = process_image(predict_image_path)
imshow(image, plot_1)

if 'Benign' in classes:
    font = {"color": 'g'}
else:
    font = {"color": 'r'}
plot_1.set_title("Diagnosis: {}".format(classes), fontdict=font);