In [None]:
import glob
import numpy as np
import pandas as pd
import os
import PIL
import matplotlib.pyplot as plt
import seaborn as sns
import math
import random
import multiprocessing as mp
from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array, array_to_img
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense
from keras.optimizers import Adam
from tensorflow.keras import optimizers
from tensorflow.keras.utils import Sequence
from tensorflow.keras.callbacks import Callback
import tensorflow as tf

In [None]:
# Seed Python's built-in RNG so that `random.sample` (used for undersampling below)
# is reproducible. `random.seed` must be *called*; assigning to it would replace the
# function with an integer and leave the generator unseeded.
random.seed(42)

In [None]:
# Read the competition metadata tables; the cell displays the first training rows.
train_csv_path = "/kaggle/input/siim-isic-melanoma-classification/train.csv"
test_csv_path = "/kaggle/input/siim-isic-melanoma-classification/test.csv"
train_df, test_df = pd.read_csv(train_csv_path), pd.read_csv(test_csv_path)
train_df.head()

In [None]:
# Column dtypes and non-null counts of the training metadata (reveals missing values).
train_df.info()

In [None]:
# First rows of the test metadata.
test_df.head()

In [None]:
# Column dtypes and non-null counts of the test metadata.
test_df.info()

In [None]:
# Number of distinct values per column in the training metadata.
train_df.nunique()

In [None]:
# Number of distinct values per column in the test metadata.
test_df.nunique()

## Insight
1. Missing values

Both the metadata and the image files show that the total number of training images is 33126. In the training metadata, some columns have missing values (NaN), namely sex, age_approx and anatom_site_general_challenge.
In the test data, on the other hand, only "anatom_site_general_challenge" has missing values.

Missing values will be handled before model building and training. There are several options (dropping rows or filling in values), but before we decide, let's look at the columns in more detail first.

2. Unique values
    
    1) One patient may have multiple images in the datasets.
 
    2) "benign_malignant" and "target" are both binary labels; they encode what we need to predict, so "benign_malignant" should not be included in the features.

## target distribution

In [None]:
# Pass the series by keyword: newer seaborn releases treat the first positional
# argument as `data`, not as the variable to count.
sns.countplot(x=train_df['target'])

## Insight
* This step helps us to find out the target values (Y values) distribution.

* The classification in this dataset is binary, as shown by the nunique function. It is worth noting that the distribution of target values in the training sample is extremely imbalanced: 99% of the target values are zero.

* This extreme imbalance of Y values may cause a dilemma:
    * you may get a good validation accuracy (the validation dataset is a randomly selected subset of the training dataset, so it is reasonable to assume that even a naive guess may get >90% accuracy), but the test dataset may differ from the training dataset and the test accuracy may be low;
    * or you may get both high validation accuracy and high test accuracy - the problem is that any algorithm would do the same;
    * the real problem is that the imbalance makes it harder to identify malignant melanoma, because the corresponding samples are too few to learn their features from.
* Therefore, having too many benign samples does not help us to identify malignant melanoma. This suggests that we could reduce the number of benign samples to mitigate overfitting, and we will do so in the training sections.

## Age,Sex and melanoma distribution

In [None]:
# Two stacked count plots of approximate age split by sex:
# benign cases on top, malignant cases below, sharing the x axis.
fig, ax = plt.subplots(2,1, figsize=(16,8), sharex=True)

# Boolean row masks, reused by later cells.
cond1 = train_df.benign_malignant == 'benign'
cond2 = train_df.benign_malignant == 'malignant'

# `.where(mask)` turns ages of non-matching rows into NaN so they are not counted.
# The ages are passed as `x=` because newer seaborn releases treat the first
# positional argument as `data`.
sns.countplot(x=train_df['age_approx'].where(cond1), hue=train_df.sex, ax=ax[0])
ax[0].set_title("Benign cases among genders in train dataset", pad=10)

sns.countplot(x=train_df['age_approx'].where(cond2), hue=train_df.sex, ax=ax[1])
ax[1].set_title("Malignant cases among genders in train dataset", pad=10)
plt.show()

## Insight
* It is obvious that age plays a key role in melanoma diagnoses. Males aged 60-75 seem to be at higher risk of malignant tumors, while females aged 55 and 65 are the most at-risk groups.

* As for sex, it seems that males are at higher risk of malignant tumors than females.
 
* It is also noticeable that males aged 45-50 have a relatively high potential of developing a malignant tumor. We could speculate that, besides age and sex, there are other effects causing the high ratios for males at such early ages.

## Anatom site distribution analysis

In [None]:
# Anatomical-site counts for benign rows only (non-benign rows become NaN and are not counted).
train_df['anatom_site_general_challenge'].where(cond1).value_counts()

In [None]:
# Side-by-side horizontal count plots of the anatomical site:
# benign rows on the left, malignant rows on the right, sharing the y axis.
fig, ax = plt.subplots(1, 2, figsize=(16, 8), sharey=True)
site_panels = [
    (cond1, "Benign: Anatom site distribution"),
    (cond2, "Maglinant: Anatom site distribution"),
]
for panel_ax, (row_mask, panel_title) in zip(ax, site_panels):
    # Rows outside the mask become NaN and are left out of the counts.
    sns.countplot(y=train_df['anatom_site_general_challenge'].where(row_mask), ax=panel_ax)
    panel_ax.set_title(panel_title, pad=10)
ax[1].label_outer()

## Insight
* "anatom_site_general_challenge" represents the position of the melanoma on the patient. As the two parallel graphs show, "torso" is the part of the body with a high potential of malignant skin cancer compared to other parts.

* It seems that most of the melanoma images are located at "head/neck", but more than 99% of the melanomas in the training set are benign.

* Note that some rows of the "anatom_site_general_challenge" column are missing (null).

## diagnosis and result
The last feature column in training dataset is "diagnosis" (which is not in test dataset), we will explore the relationship between diagnosis and the final result.

In [None]:
# Overview of the diagnosis labels, then their distribution among benign rows.
diagnosis_values = train_df['diagnosis'].unique()
print("total kinds of diagnosis:", len(diagnosis_values))
print("list of diagnosis:", diagnosis_values)

benign_diagnosis_counts = train_df['diagnosis'].where(cond1).value_counts()
print("\nBenign diagonsis distribution:\n", benign_diagnosis_counts)
benign_diagnosis_counts.plot(kind='bar')

In [None]:
# Distribution of diagnosis labels among malignant rows, printed and drawn as bars.
malignant_diagnosis_counts = train_df['diagnosis'].where(cond2).value_counts()
print(malignant_diagnosis_counts)
malignant_diagnosis_counts.plot(kind='bar')

## Insight
* Well, it seems that all the "melanoma" diagnoses are labelled as "malignant";
* Among the "benign" results, most of the diagnoses are "unknown", which is not informative.

# Shallow learning
## Data Preprocessing
In this section, we will begin by handling missing values and encoding the categorical columns:
* The first step is handling missing values. Since the test data also has missing values, we cannot simply drop all the NaN rows. In this case, we can fill them with the most common value;
* "age_approx" is a numerical column, so what we need to do is normalisation; we will use MinMaxScaler to normalise the column;
* Regarding the categorical columns "sex" and "anatom_site_general_challenge", we have seen that those two columns are correlated with the target diagnosis result. We will use a different encoding for each of the two columns:

    * "sex" is binary, so we will use a conditional expression to convert the boolean result to a numerical one;
    * "anatom_site_general_challenge" has only 6 unique values, which we consider equally important, so we will try "OneHotEncoder".

### Training metadata preprocessing

In [None]:
### Set up the training baseline dataframe.
# This copy is replaced at the end of this cell by the undersampled frame
# returned from `random_selection`.
train_df_baseline = train_df.copy()

# Randomly select a certain number of samples labelled as "benign"
def random_selection(df1, df2, k_samples=10000):
    """Undersample `df1` and combine the sample with all rows of `df2`.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame to draw a random subset of rows from (by index label).
    df2 : pandas.DataFrame
        Frame whose rows are all kept.
    k_samples : int, default 10000
        Number of rows to draw from `df1`. If `df1` has fewer rows than this,
        all of its rows are used instead of raising an error.

    Returns
    -------
    pandas.DataFrame
        The sampled rows of `df1` concatenated with `df2`, sorted by index.
    """
    candidate_index = df1.index.to_list()
    # `random.sample` raises ValueError when asked for more items than exist,
    # so cap the request at the number of available rows.
    k_samples = min(k_samples, len(candidate_index))
    random_index = random.sample(candidate_index, k_samples)
    select_df = pd.concat([df1.loc[random_index], df2])
    # Return a new sorted frame rather than sorting in place.
    return select_df.sort_index()

# Split the training metadata by label, then keep every malignant row together
# with a random subset of the benign rows.
benign_train_df = train_df.loc[cond1]
malignant_train_df = train_df.loc[cond2]

train_df_baseline = random_selection(benign_train_df, malignant_train_df)

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
import category_encoders as ce

# Metadata columns used as model inputs.
features = ["sex", "age_approx", "anatom_site_general_challenge"]
# Fill missing values with each column's most frequent value.
# The fitted imputer stays in `impute` so the test cell can reuse it.
impute = SimpleImputer(strategy='most_frequent')
train_df_baseline[features] = impute.fit_transform(train_df_baseline[features])


# Encoding category columns
# Encode "sex" as a binary column: 1 where the value is 'male', 0 otherwise.
train_df_baseline = train_df_baseline.assign(sex_enc=(train_df_baseline.sex == 'male').astype('int'))

# One-hot encode "anatom_site_general_challenge".
# NOTE(review): `sparse=` appears to have been renamed `sparse_output=` in newer
# scikit-learn releases - confirm against the installed version.
oneHot_enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
oneHot_encoded = oneHot_enc.fit_transform(train_df_baseline['anatom_site_general_challenge'].values.reshape(-1,1))
# No column names are passed, so the one-hot columns get default integer labels.
oneHot_df = pd.DataFrame(oneHot_encoded, index=train_df_baseline.index)
train_df_baseline = train_df_baseline.join(oneHot_df)

# Label-encoding "age_approx" was tried and is left disabled:
# label_enc = LabelEncoder()
# label_encoded = label_enc.fit_transform(train_df_baseline['age_approx'])

# Normalise "age_approx" into a new "age_enc" column; the fitted scaler is reused for the test set.
# Column order matters here: later cells pick the model inputs with `columns[-8:]`,
# i.e. the columns appended in this cell (sex_enc, the one-hot columns, age_enc).
scaler = MinMaxScaler()
train_df_baseline['age_enc'] = scaler.fit_transform(train_df_baseline['age_approx'].values.reshape(-1,1))


print(train_df_baseline.shape)
train_df_baseline.head()

### test metadata preprocessing

In [None]:
# Apply the transformers fitted on the training metadata to a copy of the test
# metadata. Only `transform` is called here, so nothing is re-fitted on test data.
test_df_baseline = test_df.copy()
test_df_baseline[features] = impute.transform(test_df_baseline[features])

# Binary sex encoding: 1 where the value is 'male', 0 otherwise.
test_df_baseline = test_df_baseline.assign(sex_enc=(test_df_baseline.sex == 'male').astype('int'))

# One-hot columns get default integer labels, matching the training frame.
oneHot_encoded = oneHot_enc.transform(test_df_baseline['anatom_site_general_challenge'].values.reshape(-1,1))
oneHot_df = pd.DataFrame(oneHot_encoded, index=test_df_baseline.index)
test_df_baseline = test_df_baseline.join(oneHot_df)

# Scale age with the scaler fitted on the training ages.
test_df_baseline['age_enc'] = scaler.transform(test_df_baseline['age_approx'].values.reshape(-1,1))

print(test_df_baseline.shape)
test_df_baseline.head()

# data training and prediction

* In this section we will use only the metadata for training, to set up a baseline result.

In [None]:
### First attempt: a k-nearest-neighbours classifier on the metadata features.
from sklearn.neighbors import KNeighborsClassifier

# The last eight columns of the training frame are the encoded metadata features.
baseline_features = train_df_baseline.columns[-8:].to_list()
trainX, trainY = train_df_baseline[baseline_features], train_df_baseline['target']
testX = test_df_baseline[baseline_features]

knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')
knn.fit(trainX, trainY)

# Class probabilities for the test rows (displayed as the cell output).
knn.predict_proba(testX)
#### KNN is not suitable for this competition because the result is almost binary

In [None]:
# RandomForest on metadata only is the baseline result (submission result 0.679).

from sklearn.ensemble import RandomForestClassifier

# The last eight columns of the training frame are the encoded metadata features.
baseline_features = train_df_baseline.columns[-8:].to_list()
trainX, trainY = train_df_baseline[baseline_features], train_df_baseline['target']
testX = test_df_baseline[baseline_features]

rfclass = RandomForestClassifier(random_state=42)
rfclass.fit(trainX, trainY)

# Keep the probability of the second class (column 1) as the prediction.
rfc_predict = rfclass.predict_proba(testX)[:, 1]
test_df_baseline['rfc_pred'] = rfc_predict
test_df_baseline.head(2)

# Deep Learning Section

In [None]:
# Folders holding the JPEG images, and a listing of every file in each.
train_path = '/kaggle/input/siim-isic-melanoma-classification/jpeg/train/'
test_path = '/kaggle/input/siim-isic-melanoma-classification/jpeg/test/'

train_files, test_files = glob.glob(train_path + "*"), glob.glob(test_path + "*")
image_counts = (len(train_files), len(test_files))
print('Number of train images: %d \nNumber of test images:%d' % image_counts)

In [None]:
# Split the undersampled rows into training and validation parts (fixed seed).
image_names = train_df_baseline['image_name']
image_labels = train_df_baseline['benign_malignant']
X_train_image, X_val_image, Y_train_image, Y_val_image = train_test_split(
    image_names, image_labels, random_state=42
)

In [None]:
# File name + label table for the training images, ordered by row index.
image_df_train = (
    train_df_baseline
    .loc[X_train_image.index, ['image_name', 'benign_malignant']]
    .assign(image_name=lambda frame: frame['image_name'] + '.jpg')
    .sort_index()
)
print(image_df_train.shape)
image_df_train.head(2)

In [None]:
# File name + label table for the validation images, ordered by row index.
image_df_valid = (
    train_df_baseline
    .loc[X_val_image.index, ['image_name', 'benign_malignant']]
    .assign(image_name=lambda frame: frame['image_name'] + '.jpg')
    .sort_index()
)
print(image_df_valid.shape)
image_df_valid.head(2)

In [None]:
# Single-column table of test image file names (image id plus the .jpg extension).
image_df_test = test_df_baseline[['image_name']].copy()
image_df_test['image_name'] = image_df_test['image_name'] + '.jpg'
print(image_df_test.shape)
image_df_test.head(2)

In [None]:
# Batch generators for the training, validation and test images.
# Every image goes through MobileNetV2's own preprocessing function.
from keras.applications import mobilenet_v2

imageDG = ImageDataGenerator(preprocessing_function=mobilenet_v2.preprocess_input)

# Settings shared by the two labelled generators (both read from the train folder).
labelled_flow_kwargs = dict(
    directory=train_path,
    x_col='image_name',
    y_col='benign_malignant',
    target_size=(224, 224),
    classes=['benign', 'malignant'],
    batch_size=32,
)
train_batches = imageDG.flow_from_dataframe(dataframe=image_df_train, **labelled_flow_kwargs)

valid_batches = imageDG.flow_from_dataframe(dataframe=image_df_valid, **labelled_flow_kwargs)

# The test generator has no labels and keeps the dataframe order (shuffle=False).
test_batches = imageDG.flow_from_dataframe(
    dataframe=image_df_test,
    directory=test_path,
    x_col='image_name',
    target_size=(224, 224),
    shuffle=False,
    classes=None,
    class_mode=None,
    batch_size=32,
)

### Here we have generated the train, valid and test batches for CNN training, validation and prediction. Notice that we use the preprocessing function from MobileNetV2, because we will use the pre-trained MobileNetV2 model to build our image training model (MobileNetV2 paper: https://arxiv.org/pdf/1801.04381.pdf).
### Before building the model, we will first ensure that our images can be plotted correctly. The plotImages function is copied directly from the Tensorflow documentation: https://www.tensorflow.org/tutorials/images/classification#visualize_training_images

In [None]:
def plotImages(images_arr):
    """Draw the given images in a single row of ten panels.

    Images are paired with panels in order; pairing stops as soon as either
    the images or the ten panels run out. Axes are hidden on every panel used.
    """
    fig, panel_grid = plt.subplots(1, 10, figsize=(20, 20))
    panels = panel_grid.flatten()
    for picture, panel in zip(images_arr, panels):
        panel.imshow(picture)
        panel.axis('off')
    plt.tight_layout()
    plt.show()

In [None]:
# Pull one training batch, show its images and print the first ten labels.
train_batch = next(train_batches)
imgs, labels = train_batch
plotImages(imgs)
print(labels[:10])

In [None]:
# Pull one validation batch, show its images and print the first ten labels.
valid_batch = next(valid_batches)
imgs, labels = valid_batch
plotImages(imgs)
print(labels[:10])

In [None]:
# The test generator yields images only (no labels), so a batch is just the image array.
imgs = next(test_batches)
plotImages(images_arr=imgs)

## Insight
* Well, we are not doctors; we cannot tell whether the melanoma shown in each image is benign or malignant;
* However, we can confirm that we are on the right path: all the images are loaded by the generators with none left out, the training images have been separated into training and validation sets that can be used for training and validation, and the batches of images in all three sets can be plotted correctly.

# Deep Learning Model

* As we mentioned above, we will use the existing MobileNetV2 model to build our own model. This reduces the training time, and the pre-trained model has many layers and tuned parameters, which makes it much better than a simple CNN model built from scratch.

In [None]:
# from keras.applications import mobilenet
# mobile = mobilenet.MobileNet()
# mobile.summary()

In [None]:

# Instantiate MobileNetV2 with its default arguments and report the model's Python type.
mobile = mobilenet_v2.MobileNetV2()
print("type of mobilenet_v2 model:", type(mobile))

In [None]:
# Transfer learning: reuse MobileNetV2 up to its second-to-last layer and attach
# a new two-unit softmax layer as the output.
from keras import Model

penultimate_output = mobile.layers[-2].output
output = Dense(units=2, activation='softmax')(penultimate_output)
model = Model(inputs=mobile.input, outputs=output)

# Freeze every layer except the newly added final one.
frozen_layers = model.layers[:-1]
for layer in frozen_layers:
    layer.trainable = False

model.summary()



* Since the original MobileNetV2 model has 1000 final outputs, which is obviously not suitable for us (we only have two), we will substitute our own layer for the last layer of the MobileNetV2 model.
* To achieve this, we reuse the MobileNetV2 model except for its last layer.
* Notice that the pre-trained parameters are set as "not trainable", which means they will not be tuned while training our new model.

In [None]:
# Report how many GPUs TensorFlow can see.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
gpu_count = len(physical_devices)
print("Num GPUs Available: ", gpu_count)

In [None]:
# Compile with Adam and categorical cross-entropy; the generators yield
# one-hot labels for the two classes.
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# `fit_generator` is deprecated; `Model.fit` accepts generators directly, which
# also matches how `model.predict` is called with generators later in this notebook.
model.fit(train_batches, validation_data=valid_batches, epochs=2,
          steps_per_epoch=len(train_batches), validation_steps=len(valid_batches),
          workers=mp.cpu_count(), use_multiprocessing=True, verbose=1)

In [None]:
# from keras.layers.normalization import BatchNormalization
# from keras.layers import ZeroPadding2D
# from keras.layers import InputLayer
# from keras.layers import ReLU

# def CNN_model(input_shape=(48,48,3)):
#     model = Sequential()
#     # first Convolution layer, first time need input_shape, valid faster and better 
#     model.add(InputLayer(input_shape=(48,48,3)))
#     model.add(ZeroPadding2D(padding=((1,0),(1,0))))
#     model.add(Conv2D(32, (3, 3), padding='valid', strides=(1,1)))
#     model.add(BatchNormalization())
#     model.add(MaxPooling2D(pool_size=(2, 2)))
#     model.add(Dropout(0.25))
#     # second Convolution layer
#     model.add(Conv2D(64, (3, 3), padding='valid', activation='relu', strides=1,))
#     # second MaxPooling layer
#     model.add(MaxPooling2D(pool_size=(2, 2)))
#     model.add(Dropout(0.25))
#     # third Convolution layer
#     model.add(Conv2D(128, (3, 3), padding='valid', activation='relu', strides=1,))
#     # third MaxPooling layer
#     model.add(MaxPooling2D(pool_size=(2, 2)))
#     model.add(Dropout(0.25))
#     # Flatten
#     model.add(Flatten())
#     model.add(Dense(units = 256, activation='relu', name="Dense_1"))
#     model.add(Dense(units = 2, activation='softmax'))
    
#     model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
    
#     model.fit_generator(train_batches, validation_data=valid_batches, epochs=2, \
#                                    steps_per_epoch=len(train_batches), validation_steps=len(valid_batches), \
#                                    workers=mp.cpu_count(), use_multiprocessing=True, verbose=1)
#     model.summary()
#     return model

# input_model = CNN_model()
# feature_model = Model(inputs=input_model.input, outputs=model.get_layer(index=-2).output)
# for layer in feature_model.layers[:-1]:
#     layer.trainable = False
# feature_model.summary()

In [None]:
# `train_batches` was created with the default shuffle=True, so predicting on it
# returns rows in a random order that does not match `image_df_train.index`,
# which is later used to label these predictions. Predict from an unshuffled,
# label-free generator so that row i of the output belongs to row i of `image_df_train`.
train_predict_batches = imageDG.flow_from_dataframe(dataframe=image_df_train, directory=train_path, x_col='image_name',
                                                    target_size=(224,224), shuffle=False, classes=None, class_mode=None, batch_size=32)
image_feature_train = model.predict(train_predict_batches, workers=mp.cpu_count(), use_multiprocessing=True, verbose=1)
print(image_feature_train.shape)

In [None]:
# `valid_batches` was created with the default shuffle=True, so predicting on it
# returns rows in a random order that does not match `image_df_valid.index`,
# which is later used to label these predictions. Predict from an unshuffled,
# label-free generator so that row i of the output belongs to row i of `image_df_valid`.
valid_predict_batches = imageDG.flow_from_dataframe(dataframe=image_df_valid, directory=train_path, x_col='image_name',
                                                    target_size=(224,224), shuffle=False, classes=None, class_mode=None, batch_size=32)
image_feature_valid = model.predict(valid_predict_batches, workers=mp.cpu_count(), use_multiprocessing=True, verbose=1)
print(image_feature_valid.shape)

In [None]:
# Model outputs for the test images; `test_batches` is unshuffled, so the rows
# follow the order of `image_df_test`.
predict_workers = mp.cpu_count()
image_feature_test = model.predict(
    test_batches,
    workers=predict_workers,
    use_multiprocessing=True,
    verbose=1,
)
print(image_feature_test.shape)

In [None]:
# Wrap the model outputs in DataFrames that share their row index with the image
# tables, using one `imft_<n>` column per output unit.
n_image_features = image_feature_train.shape[1]
cols = [f'imft_{num}' for num in range(n_image_features)]

train_feature_df = pd.DataFrame(data=image_feature_train, index=image_df_train.index, columns=cols)
valid_feature_df = pd.DataFrame(data=image_feature_valid, index=image_df_valid.index, columns=cols)
test_feature_df = pd.DataFrame(data=image_feature_test, index=image_df_test.index, columns=cols)

In [None]:
# Final model inputs: image-model outputs side by side with the encoded metadata
# columns of the same rows.
train_metadata = train_df_baseline.loc[image_df_train.index, baseline_features]
valid_metadata = train_df_baseline.loc[image_df_valid.index, baseline_features]
test_metadata = test_df_baseline.loc[image_df_test.index, baseline_features]

train_X_final = pd.concat([train_feature_df, train_metadata], axis=1)
valid_X_final = pd.concat([valid_feature_df, valid_metadata], axis=1)
test_X_final = pd.concat([test_feature_df, test_metadata], axis=1)

In [None]:
# (rows, columns) of the training image table, for comparison with the feature frames.
image_df_train.shape

In [None]:
# Preview the metadata features of the training rows; `.head()` keeps the cell
# output short instead of rendering the whole frame.
train_df_baseline.loc[image_df_train.index, baseline_features].head()

In [None]:
# Preview the image-model outputs for the training rows; `.head()` keeps the
# cell output short instead of rendering the whole frame.
train_feature_df.head()

In [None]:
# Gradient-boosted trees on the combined image + metadata features, with
# validation AUC as the metric and early stopping after 10 rounds.
import lightgbm as lgb

train_target = train_df_baseline.loc[image_df_train.index, 'target']
valid_target = train_df_baseline.loc[image_df_valid.index, 'target']
dtrain = lgb.Dataset(data=train_X_final, label=train_target)
dvalid = lgb.Dataset(data=valid_X_final, label=valid_target)

num_boost_round = 1000
param = {'num_leaves': 64, 'objective': 'binary', 'metric': 'auc'}

bst = lgb.train(
    params=param,
    train_set=dtrain,
    num_boost_round=num_boost_round,
    valid_sets=dvalid,
    early_stopping_rounds=10,
    verbose_eval=True,
)

In [None]:
# Preview the test frame; `.head()` keeps the cell output short instead of
# rendering the whole frame.
test_df_baseline.head()

In [None]:
# Score the test rows with the boosted model and store the scores next to the metadata.
test_df_baseline['lgb_pred'] = target_pred = bst.predict(test_X_final)
# Compare the first few rows of predictions from RandomForest and CNN+LGB
test_df_baseline.head()

In [None]:
# Merge on "image_name" so the submission follows the row order of
# sample_submission.csv, in case it differs from the order in test.csv.
ss = pd.read_csv("/kaggle/input/siim-isic-melanoma-classification/sample_submission.csv")
lgb_scores = test_df_baseline[['image_name', 'lgb_pred']]
ss = ss.merge(lgb_scores, on='image_name')

# Keep the image id and publish the LightGBM score under the "target" column name.
upload = ss[['image_name', 'lgb_pred']].rename(columns={'lgb_pred': 'target'})

upload.to_csv('submission.csv', index=False)
upload.head()