In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from glob import glob
import seaborn as sns
from PIL import Image
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf
from sklearn.utils import resample
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing import image
import pickle

In [4]:
# Initializing the path of the working directory
base_dir = r"C:\Users\MSI\Documents\MiniProject"

# Mapping the image id with their path so that it can be referred easily
imageid_path_dict = {os.path.splitext(os.path.basename(x))[0]: x 
                     for x in glob(os.path.join(base_dir, 'sample', '*.jpg'))}

# This dictionary is useful for displaying more human-friendly labels later on
lesion_type_dict = {
    'nv': 'Melanocytic nevi',
    'mel': 'Melanoma',
    'bkl': 'Benign keratosis-like lesions ',
    'bcc': 'Basal cell carcinoma',
    'akiec': 'Actinic keratoses',
    'vasc': 'Vascular lesions',
    'df': 'Dermatofibroma'
}

In [5]:
# Reffering to the medadata having the information about all the images we have.
skin_df = pd.read_csv(os.path.join(base_dir, 'HAM10000_metadata.csv'))

# Creating New Columns path , cell_type and cell_type_idx for better readability

# Keeping the path of the respected image
skin_df['path'] = skin_df['image_id'].map(imageid_path_dict.get)

# Keeping the human understable Name with the respected name
skin_df['cell_type'] = skin_df['dx'].map(lesion_type_dict.get)

# Keeping the index of the cell_type of the dict
skin_df['cell_type_idx'] = pd.Categorical(skin_df['cell_type']).codes

In [6]:
skin_df['cell_type'].value_counts()

Melanocytic nevi                  6705
Melanoma                          1113
Benign keratosis-like lesions     1099
Basal cell carcinoma               514
Actinic keratoses                  327
Vascular lesions                   142
Dermatofibroma                     115
Name: cell_type, dtype: int64

In [7]:
#Showing the content of the records
skin_df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,path,cell_type,cell_type_idx
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,vidir_modern,C:\Users\MSI\Documents\MiniProject\sample\ISIC...,Benign keratosis-like lesions,2
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern,C:\Users\MSI\Documents\MiniProject\sample\ISIC...,Benign keratosis-like lesions,2
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern,C:\Users\MSI\Documents\MiniProject\sample\ISIC...,Benign keratosis-like lesions,2
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern,C:\Users\MSI\Documents\MiniProject\sample\ISIC...,Benign keratosis-like lesions,2
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern,C:\Users\MSI\Documents\MiniProject\sample\ISIC...,Benign keratosis-like lesions,2


In [8]:
# Preprocessing
# # Checking the records again to see,is there is any record having empty field
skin_df.isnull().sum()

lesion_id         0
image_id          0
dx                0
dx_type           0
age              57
sex               0
localization      0
dataset           0
path              0
cell_type         0
cell_type_idx     0
dtype: int64

In [9]:
# Above we find that there are 57 records whose age attribute have null value
# So in this step we are filling that empty fields with the mean value of the attribute age
skin_df['age'].fillna((skin_df['age'].mean()) , inplace = True)

In [10]:
# Checking the records again to see,is there is any record having empty field
skin_df.isnull().sum()

lesion_id        0
image_id         0
dx               0
dx_type          0
age              0
sex              0
localization     0
dataset          0
path             0
cell_type        0
cell_type_idx    0
dtype: int64

In [11]:
# Balancing the dataset
cell_type_idx = 'cell_type_idx'
record_0 = skin_df[skin_df[cell_type_idx] == 0]
record_1 = skin_df[skin_df[cell_type_idx] == 1]
record_2 = skin_df[skin_df[cell_type_idx] == 2]
record_3 = skin_df[skin_df[cell_type_idx] == 3]
record_4 = skin_df[skin_df[cell_type_idx] == 4]
record_5 = skin_df[skin_df[cell_type_idx] == 5]
record_6 = skin_df[skin_df[cell_type_idx] == 6]

In [12]:
n = 500;
record_bal_0 = resample(record_0 , replace = True , n_samples = n , random_state = 42)
record_bal_1 = resample(record_1 , replace = True , n_samples = n , random_state = 42)
record_bal_2 = resample(record_2 , replace = True , n_samples = n , random_state = 42)
record_bal_3 = resample(record_3 , replace = True , n_samples = n , random_state = 42)
record_bal_4 = resample(record_4 , replace = True , n_samples = n , random_state = 42)
record_bal_5 = resample(record_5 , replace = True , n_samples = n , random_state = 42)
record_bal_6 = resample(record_6 , replace = True , n_samples = n , random_state = 42)

In [13]:
balance_record = pd.concat([record_bal_0 , record_bal_1 , record_bal_2 , record_bal_3 , record_bal_4 , record_bal_5 , record_bal_6])

In [14]:
balance_record['image'] = balance_record['path'].map(lambda x: np.asarray(Image.open(x).resize((64,64))).flatten())

In [15]:
balance_record.shape

(3500, 12)

In [16]:
balance_record.dtypes

lesion_id         object
image_id          object
dx                object
dx_type           object
age              float64
sex               object
localization      object
dataset           object
path              object
cell_type         object
cell_type_idx       int8
image             object
dtype: object

In [17]:
X = np.asarray(balance_record['image'].tolist())
X.shape

(3500, 12288)

In [18]:
Y = balance_record['cell_type_idx']

In [19]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.20,random_state=42)

In [20]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.transform(x_test)

In [21]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(x_train , y_train)

In [22]:
y_predict = classifier.predict(x_test)
pickle.dump(classifier,open("bayes.pkl", "wb"))
print(y_predict)

[3 1 4 3 4 0 6 3 2 3 3 2 3 2 2 1 0 1 2 5 6 3 2 1 3 2 2 2 1 2 4 1 1 1 6 6 1
 0 6 5 6 6 3 5 4 4 2 3 6 2 3 2 2 3 2 2 4 0 4 3 6 6 5 1 3 5 1 1 6 5 5 2 3 4
 1 4 6 3 6 3 2 1 4 1 5 5 1 2 3 1 4 1 1 4 0 5 3 2 6 6 3 4 2 1 4 3 2 3 3 1 3
 1 2 3 4 1 2 1 2 1 1 1 5 1 1 1 5 1 1 3 2 3 3 2 6 4 3 3 3 4 2 2 4 6 1 5 1 3
 2 2 2 4 4 1 2 2 1 3 2 1 5 5 2 4 2 2 2 6 4 2 3 1 3 2 6 2 2 5 1 0 1 2 0 2 1
 1 2 4 3 5 3 5 6 0 4 1 1 3 2 3 2 1 2 5 0 3 2 2 6 3 4 6 2 6 3 3 2 0 1 2 0 1
 3 2 2 3 6 2 2 3 5 6 1 3 2 3 5 1 0 1 6 2 3 2 2 1 4 2 6 2 0 3 3 3 1 3 1 2 6
 6 5 3 6 3 1 4 5 4 1 3 3 3 1 2 1 2 0 2 6 6 3 2 3 6 2 1 0 6 0 0 3 1 3 2 5 5
 2 5 2 1 2 0 1 1 2 1 2 2 3 2 5 3 2 1 1 5 6 1 5 3 1 1 4 6 2 1 1 1 3 2 2 3 1
 2 4 2 3 4 3 5 1 0 1 5 2 1 4 1 6 3 4 1 1 2 1 4 4 6 2 3 2 3 2 1 0 6 0 1 6 5
 2 0 2 3 2 0 0 1 4 3 3 3 2 1 4 2 1 2 6 6 3 1 4 4 6 1 2 5 0 6 2 3 4 1 5 3 3
 2 4 3 1 4 6 1 5 5 4 1 3 0 3 1 3 6 0 3 2 1 5 2 1 2 1 3 2 4 3 2 2 1 2 4 1 6
 6 5 5 2 2 2 3 1 1 2 4 1 5 4 3 4 3 2 5 2 5 1 2 6 3 6 3 1 6 3 1 0 3 2 6 2 2
 1 3 1 2 4 4 2 1 1 4 3 3 

In [23]:
compare = np.vstack((y_test , y_predict)).T
compare[:5,:]

array([[3, 3],
       [4, 1],
       [4, 4],
       [3, 3],
       [0, 4]], dtype=int8)

In [24]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test , y_predict)
print(cm)

[[18 27 25 35  8  0 13]
 [ 6 45 22 17  0  3 12]
 [ 4 10 48 11  7 13  2]
 [ 1 28 20 36  7  0  9]
 [ 2  9 10  7 42  8  9]
 [ 2 12 25  7  6 31  6]
 [ 5 20 15 21  4  9 23]]


In [25]:
true = 0
false = 0

for row in range(cm.shape[0]):
    for col in range(cm.shape[1]):
        if row == col:
            true += cm[row , col]
        else:
            false += cm[row , col]
            
print('Correct Prediction : ',true)
print('Incorrect Prediction : ',false)
print('\nAccuracy : ', (true / (cm.sum())).round(2))

Correct Prediction :  243
Incorrect Prediction :  457

Accuracy :  0.35
