# Grupo:

#### Alessandro de Freitas Guerreiro - 11233891
#### Matheus Yasuo Ribeiro Utino - 11233689
#### Vinícius Silva Montanari - 11233709

## Tema: Classificação por imagens da patologia retinopatia diabética utilizando técnicas de aprendizado de máquina
### Universidade de São Paulo
### Instituto de Ciências Matemáticas e de Computação (ICMC)
### SCC0276 - Aprendizado de Máquina (2022)

In [7]:
# Standard dependencies
import cv2
import time
import scipy as sp
import numpy as np
import random as rn
import pandas as pd
from tqdm import tqdm
from PIL import Image
from functools import partial
import matplotlib.pyplot as plt

# Machine Learning
import tensorflow as tf
import keras
from keras_preprocessing.image import ImageDataGenerator
from sklearn.metrics import cohen_kappa_score

In [8]:
pip install efficientnet

# Data Analysis

In [9]:
# Locations of the dataset files inside the Kaggle input directory

KAGGLE_DIR = '../input/aptos2019-blindness-detection/'
TRAIN_DF_PATH = f"{KAGGLE_DIR}train.csv"  # CSV read into df_train
TRAIN_IMG_PATH = f"{KAGGLE_DIR}train_images/"  # folder holding the training images

In [10]:
# Read the label file and append the ".png" extension to every id_code,
# since id_code is later used as the image file name

df_train = pd.read_csv(TRAIN_DF_PATH).assign(id_code=lambda frame: frame['id_code'] + ".png")

In [11]:
# Print the dimensions of the train dataframe, then display its first rows

print("Train dataset size :", df_train.shape, "\n")
df_train.head()

In [12]:
# Count the rows of each diagnosis class in the train dataset

df_train.groupby('diagnosis').count()

In [13]:
# Pie chart with the share of each diagnosis class in the train dataset

# Class names keyed by label, so every slice is named after its own class whatever
# order value_counts() returns the classes in
# (APTOS 2019 grading: 0 = no DR, 1 = mild, 2 = moderate, 3 = severe, 4 = proliferative DR).
DIAGNOSIS_NAMES = {0: 'Normal', 1: 'Mild', 2: 'Moderate', 3: 'Severe', 4: 'Proliferative'}

diagnosis_counts = df_train['diagnosis'].value_counts()
diagnosis_counts.plot(labels = [DIAGNOSIS_NAMES[label] for label in diagnosis_counts.index],
                      kind = 'pie', autopct='%1.1f%%')
plt.savefig('pie_train_classes.png', dpi = 400)
plt.show()

In [14]:
# Drop repeated rows and report the resulting size
df_train = df_train.drop_duplicates()
print("Train dataset size :", df_train.shape, "\n")

# Report whether any cell of the dataframe is missing
print("There are Nan in train dataset? ", bool(df_train.isna().any().any()), "\n")

In [15]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Plot one example image for each of the five diagnosis classes

fig, ax = plt.subplots(1, 5, figsize=(15, 6))
for i in range(5):
    # Fixed seed so the same example images are shown on every run
    sample = df_train[df_train['diagnosis'] == i].sample(1, random_state=42)
    image_name = sample['id_code'].item()
    # Reuse the configured image folder instead of repeating the literal path
    X = mpimg.imread(TRAIN_IMG_PATH + image_name)
    print("Shape of Image: " + str(X.shape))
    ax[i].set_title(f"Image: {image_name}\n Label = {sample['diagnosis'].item()}", 
                    weight='bold', fontsize=10)
    ax[i].axis('off')
    ax[i].imshow(X);
plt.savefig('real_images.png', dpi = 200)

# Preprocessing

In [16]:
import tensorflow as tf
from tensorflow import keras
from keras import *
from keras.models import Sequential, load_model
from keras.models import Model
from keras.layers import GlobalAveragePooling2D, Dropout
from keras.layers.core import Dense
import tensorflow.keras as keras
from tensorflow.keras.applications import EfficientNetB3

# Image geometry and batch size used for feature extraction

IMG_WIDTH = 320
IMG_HEIGHT = 320
NUM_DIMENSIONS = 3
BATCH_SIZE = 1

# Keras image tensors are laid out as (height, width, channels)
INPUT_SHAPE = (IMG_HEIGHT, IMG_WIDTH, NUM_DIMENSIONS)

# Feature extractor: EfficientNetB3 with ImageNet weights and without its top
# (classification) layers, followed by global average pooling so that every image
# is reduced to a single feature vector.
# NOTE(review): the cells visible here never train this network; it is only used
# for prediction - confirm no fine-tuning happens elsewhere.

efnb3 = EfficientNetB3(weights='imagenet', include_top = False, input_shape = INPUT_SHAPE)

model = Sequential()
model.add(efnb3)
model.add(GlobalAveragePooling2D())

model.summary()


In [None]:
# Write a diagram of all layers of EfficientNetB3 to model.png

tf.keras.utils.plot_model(efnb3, to_file='model.png')

In [38]:
# Image Data Generator

train_datagen = ImageDataGenerator()

# Load the images listed in df_train and resize them.
# target_size follows the Keras (height, width) convention.
# NOTE(review): train_datagen defines no validation_split, so subset='training'
# presumably keeps every row of df_train - confirm.

train_generator = train_datagen.flow_from_dataframe(df_train, 
                                                    x_col='id_code', 
                                                    y_col='diagnosis',
                                                    directory = TRAIN_IMG_PATH,
                                                    target_size=(IMG_HEIGHT, IMG_WIDTH),
                                                    batch_size=BATCH_SIZE,
                                                    seed=42,
                                                    class_mode='raw', 
                                                    subset='training')


In [39]:
# Generate data

def generateData(generator, numElements):
    """Extract the model output for each batch drawn from `generator`.

    Every batch is passed through the global `model` (the feature extractor
    built in an earlier cell), so that cell must have been run first.

    Args:
        generator: iterator yielding `(images, labels)` batches.
        numElements: number of batches to draw from the generator.

    Returns:
        Tuple `(x, y)` of numpy arrays stacking, in draw order, the model
        output and the labels of every batch.
    """
    x = []
    y = []

    for _ in tqdm(range(numElements)):
        img, label = next(generator)
        # predict_on_batch runs a single forward pass; model.predict would set up a
        # new prediction loop and print its own progress bar on every iteration.
        x.append(model.predict_on_batch(img))
        y.append(label)

    return np.array(x), np.array(y)

In [40]:
# Draw len(train_generator) batches from the generator and extract their features
# (BATCH_SIZE = 1, so presumably one batch per image - confirm)
print('Generating train dataset!')
x_train, y_train = generateData(train_generator, len(train_generator))

In [41]:
# One model output per single-image batch: expected (numElements, 1, numFeatures)
x_train.shape

In [42]:
# One label per single-image batch: expected (numElements, 1)
y_train.shape

In [43]:
# Drop the batch axis of the extracted features:
# (numElements, 1, numFeatures) -> (numElements, numFeatures)
x_train = x_train.squeeze(axis = 1).copy()

# Flatten the labels: (numElements, 1) -> (numElements,)
y_train = y_train.reshape(-1)

In [44]:
#Generate train, validate and test 

from sklearn.model_selection import train_test_split

# Hold out 15% of the samples as the test set, stratified by class
x_train_2, x_test, y_train_2, y_test = train_test_split(
    x_train, y_train,
    test_size=0.15, stratify=y_train, shuffle=True, random_state=42,
)

# Split the remaining samples again: 20% of them become the validation set
x_train_final, x_val, y_train_final, y_val = train_test_split(
    x_train_2, y_train_2,
    test_size=0.2, stratify=y_train_2, shuffle=True, random_state=42,
)

In [45]:
from imblearn.over_sampling import SMOTE

# Balance the classes with the Synthetic Minority Oversampling Technique (SMOTE).
# Only the training split is resampled; x_val and x_test are left untouched.

sm = SMOTE(random_state=42)
x_train_SMOTE, y_train_SMOTE = sm.fit_resample(x_train_final, y_train_final)

In [46]:
from imblearn.over_sampling import RandomOverSampler 

# Balance the classes with RandomOverSampler.
# Only the training split is resampled; x_val and x_test are left untouched.

sm = RandomOverSampler(random_state=42)
x_train_ROV, y_train_ROV = sm.fit_resample(x_train_final, y_train_final)

In [47]:
from imblearn.over_sampling import BorderlineSMOTE

# Balance the classes with BorderlineSMOTE.
# Only the training split is resampled; x_val and x_test are left untouched.

sm = BorderlineSMOTE(random_state = 42)
x_train_BLM, y_train_BLM = sm.fit_resample(x_train_final, y_train_final)

In [48]:
from imblearn.over_sampling import ADASYN 

# Balance the classes with ADASYN.
# Only the training split is resampled; x_val and x_test are left untouched.

sm = ADASYN(random_state=42)
x_train_ADASYN, y_train_ADASYN = sm.fit_resample(x_train_final, y_train_final)

In [49]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score, make_scorer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

#Get results of validate and test dataset

def getResults(nameClassifier, nameDataAugmentation, x_train, y_train, x_val, y_val, x_test, y_test):
    """Tune a classifier on the validation split and report its scores on the test split.

    Every combination of the classifier's two hyper-parameters is fitted on
    (x_train, y_train) and scored on (x_val, y_val). The combination with the
    highest quadratic weighted kappa (the first one in case of ties) is fitted
    again on the training data and scored on (x_test, y_test). All results are
    printed; nothing is returned.

    Args:
        nameClassifier: "KNN", "Decision Tree" or "SVM".
        nameDataAugmentation: name of the resampling technique; only used in
            the printed header.
        x_train, y_train: features and labels used to fit every candidate.
        x_val, y_val: features and labels used to choose the hyper-parameters.
        x_test, y_test: features and labels used for the final evaluation.

    Raises:
        ValueError: if nameClassifier is not one of the supported names.
    """
    # Imported here so the function works regardless of which classifier cell has already run.
    from sklearn import svm
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.tree import DecisionTreeClassifier

    # name -> (values of the 1st hyper-parameter, values of the 2nd one, estimator factory)
    search_spaces = {
        "KNN": (
            range(2, 13),
            ['uniform', 'distance'],
            lambda n_neighbors, weights: KNeighborsClassifier(n_neighbors = n_neighbors, weights = weights),
        ),
        "Decision Tree": (
            range(2, 7),
            ['gini', 'entropy'],
            lambda max_depth, criterion: DecisionTreeClassifier(max_depth = max_depth, criterion = criterion, random_state = 42),
        ),
        "SVM": (
            [10, 100, 1000],
            ['linear', 'poly', 'rbf', 'sigmoid'],
            lambda C, kernel: svm.SVC(C = C, kernel = kernel, random_state = 42),
        ),
    }

    # Fail early with a clear message instead of an IndexError on an empty result list
    if nameClassifier not in search_spaces:
        raise ValueError(
            "Unknown classifier " + repr(nameClassifier) + "; expected one of " + str(list(search_spaces))
        )

    first_values, second_values, build_classifier = search_spaces[nameClassifier]

    def evaluate(first, second, x_eval, y_eval):
        """Fit on the training data and return [first, second, accuracy, macro F1, quadratic kappa]."""
        clf = build_classifier(first, second)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_eval)
        return [
            first,
            second,
            accuracy_score(y_eval, y_pred),
            f1_score(y_eval, y_pred, average='macro'),
            cohen_kappa_score(y_eval, y_pred, weights = 'quadratic'),
        ]

    print("Using ", nameClassifier, "and", nameDataAugmentation, "data augmentation technic\n\n")

    results_val = [
        evaluate(first, second, x_val, y_val)
        for first in first_values
        for second in second_values
    ]

    print("Results of validate:\n")
    for row in results_val:
        print(row)
    print("\n")

    # Index 4 holds the kappa score; max() keeps the first row among ties
    best = max(results_val, key = lambda row: row[4])
    print("Best validate results and params:", best)

    # Refit with the selected hyper-parameters and score on the held-out test split
    results_test = [evaluate(best[0], best[1], x_test, y_test)]

    print("Result of test:", results_test)
    print("\n\n\n")

# K-Nearest Neighbors (KNN)

In [50]:
from sklearn.neighbors import KNeighborsClassifier

# Evaluate KNN on the original training split and on each oversampled version of it
training_sets = [
    ("Nothing", x_train_final, y_train_final),
    ("Synthetic Minority Oversampling Technique (SMOTE)", x_train_SMOTE, y_train_SMOTE),
    ("RandomOverSampler", x_train_ROV, y_train_ROV),
    ("BorderlineSMOTE", x_train_BLM, y_train_BLM),
    ("ADASYN", x_train_ADASYN, y_train_ADASYN),
]
for augmentation_name, x_fit, y_fit in training_sets:
    getResults("KNN", augmentation_name, x_fit, y_fit, x_val, y_val, x_test, y_test)

# Decision Tree

In [51]:
from sklearn.tree import DecisionTreeClassifier

# Evaluate the decision tree on the original training split and on each oversampled version of it
training_sets = [
    ("Nothing", x_train_final, y_train_final),
    ("Synthetic Minority Oversampling Technique (SMOTE)", x_train_SMOTE, y_train_SMOTE),
    ("RandomOverSampler", x_train_ROV, y_train_ROV),
    ("BorderlineSMOTE", x_train_BLM, y_train_BLM),
    ("ADASYN", x_train_ADASYN, y_train_ADASYN),
]
for augmentation_name, x_fit, y_fit in training_sets:
    getResults("Decision Tree", augmentation_name, x_fit, y_fit, x_val, y_val, x_test, y_test)

# Support Vector Machines (SVM)

In [52]:
from sklearn import svm

# Evaluate the SVM on the original training split and on each oversampled version of it
training_sets = [
    ("Nothing", x_train_final, y_train_final),
    ("Synthetic Minority Oversampling Technique (SMOTE)", x_train_SMOTE, y_train_SMOTE),
    ("RandomOverSampler", x_train_ROV, y_train_ROV),
    ("BorderlineSMOTE", x_train_BLM, y_train_BLM),
    ("ADASYN", x_train_ADASYN, y_train_ADASYN),
]
for augmentation_name, x_fit, y_fit in training_sets:
    getResults("SVM", augmentation_name, x_fit, y_fit, x_val, y_val, x_test, y_test)