In [1]:
import pandas as pd
import os
import numpy as np
import cv2
from PIL import Image

import torch
import torch.nn as nn

from torchvision import datasets, models, transforms
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt
import glob
import re
from tqdm import tqdm
import time

In [2]:
class CombineDataset(Dataset):

    def __init__(self, frame, id_col, label_name, path_imgs, use_cache = False, transform = None):
        
        self.frame = frame
        self.id_col = id_col
        self.label_name = label_name
        self.path_imgs = path_imgs
        self.inputs_dtype = torch.float32
        
        self.transform = transform
        
        self.use_cache = use_cache
        
        if self.use_cache:
            self.cached_data = []

            progressbar = tqdm(range(len(self.path_imgs)), desc='Caching')
            
            for i, img in zip(progressbar, self.path_imgs):
                img = Image.open(img)
                if self.transform is not None:
                    img= self.transform(img)
                
                self.cached_data.append(img)
            

    def __len__(self):
        return (self.frame.shape[0])

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        img = self.cached_data[idx]

        feats = [feat for feat in self.frame.columns if feat not in [self.label_name,self.id_col]]
        feats  = np.array(self.frame[feats].iloc[idx])
        feats = feats.tolist()
        feats = torch.FloatTensor(feats)

        label = np.array(self.frame[self.label_name].iloc[idx])
        label = torch.tensor(label, dtype=torch.long)
        
        name = self.frame[self.id_col].iloc[idx]

        return img, feats, label, name

In [3]:
data_dir = "C:\\...\\DATASET\\"
metadata = pd.read_csv('C:\\...\\MetaData2.csv')

numbers = re.compile(r'(\d+)')
def numericalSort(value):
    parts = numbers.split(value)
    parts[1::2] = map(int, parts[1::2])
    return parts

data_dir2 = sorted(glob.glob('C:\\...\\*.jpg'), key=numericalSort)

labels_cat = ['0. vascular lesions', '1. nevus', '2. solar lentigo', '3. dermatofibroma', '4. seborrheic ketarosis', '5. benign ketarosis', '6. actinic keratosis', '7. basal cell carcinoma', '8. squamous cell carcinoma', '9. melanoma']

img_size = 256, 256
batch_size = 4
num_classes = 10
num_epochs = 7


In [4]:
metadata_age = metadata[['age']]
Age_grup = []

for i in metadata_age['age']:
    if i <= 44:
        Age_grup += ['young']
    elif 45 <= i <= 59:
        Age_grup += ['middle']
    elif 60 <= i <= 74:
        Age_grup += ['elderly']
    elif 75 <= i:
        Age_grup += ['senile']
    else:
        Age_grup += ['error']
        
metadata['age'] = Age_grup

encode_map_categor = {'0. vascular lesions': 0, '1. nevus': 1, '2. solar lentigo': 2, '3. dermatofibroma': 3, '4. seborrheic ketarosis': 4, '5. benign ketarosis': 5, '6. actinic keratosis': 6, '7. basal cell carcinoma': 7, '8. squamous cell carcinoma': 8, '9. melanoma': 9}
encode_map_age = {'young': 0, 'middle': 1, 'elderly': 2, 'senile': 3}
encode_map_sex = {'male': 0, 'female': 1}
encode_map_anatomloc = {'anterior torso': 0, 'head/neck': 1, 'lateral torso': 2, 'lower extremity': 3, 'oral/genital': 4, 'palms/soles': 5, 'posterior torso': 6, 'upper extremity': 7}

metadata['diagnosis'].replace(encode_map_categor, inplace=True)
metadata['age'].replace(encode_map_age, inplace=True)
metadata['sex'].replace(encode_map_sex, inplace=True)
metadata['anatomloc'].replace(encode_map_anatomloc, inplace=True)

In [5]:
md = metadata[['name']]
md

Unnamed: 0,name
0,ISIC_0010512
1,ISIC_0010889
2,ISIC_0024468
3,ISIC_0024470
4,ISIC_0024511
...,...
41720,ISIC_0072937
41721,ISIC_0072964
41722,ISIC_0073012
41723,ISIC_0073031


In [6]:
Duplicate = pd.read_csv('C:\\Users\\...\\ISIC_2020_Training_Duplicates.csv')
Duplicate1 = Duplicate[['image_name_1']]
Duplicate2 = Duplicate[['image_name_2']]
Duplicate1

Unnamed: 0,image_name_1
0,ISIC_0079038
1,ISIC_0087297
2,ISIC_0088137
3,ISIC_0112097
4,ISIC_0148783
...,...
420,ISIC_8879370
421,ISIC_8889856
422,ISIC_8987085
423,ISIC_9218360


In [7]:
list_duplicates = []
for i in md['name']:
    for j in Duplicate1['image_name_1']:
        if i in j:
            list_duplicates += ['duplicate']
        else:
            list_duplicates += ['not a duplicate']

list_duplicates1 = pd.DataFrame(list_duplicates, columns=['duplicates_1'])

list_duplicates = []
for i in md['name']:
    for j in Duplicate2['image_name_2']:
        if i in j:
            list_duplicates += ['duplicate']
        else:
            list_duplicates += ['not a duplicate']
            
list_duplicates2 = pd.DataFrame(list_duplicates, columns=['duplicates_2'])

In [8]:
list_duplicates1['duplicates_1'].value_counts()

not a duplicate    17733125
Name: duplicates_1, dtype: int64

In [9]:
list_duplicates2['duplicates_2'].value_counts()

not a duplicate    17733125
Name: duplicates_2, dtype: int64

In [10]:
list_duplicates2

Unnamed: 0,duplicates_2
0,not a duplicate
1,not a duplicate
2,not a duplicate
3,not a duplicate
4,not a duplicate
...,...
17733120,not a duplicate
17733121,not a duplicate
17733122,not a duplicate
17733123,not a duplicate
