In [None]:
import hashlib
import os
import random

import cv2
import joblib
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from google.colab import drive
from skimage.transform import rotate
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Image-size constant; no cell visible in this notebook reads it.
IMM_SIZE = 150

# Shared accumulators: loaded images, their stage labels, and duplicate counts.
images, values, duplicates = [], [], []

In [None]:
# Stage code -> names that identify that stage (matched against folder names when loading).
stages_dict = {
    "CN": ['None', 'NonDemented'],
    "EMCI": ['Very Mild', 'VeryMildDemented'],
    "LMCI": ['Mild', 'MildDemented'],
    "MCI": ['Moderate', 'ModerateDemented'],
    "AD": ['Servere', 'SevereDemented'],
}
# Stage codes in display order (same order as the dictionary keys).
stages_list = list(stages_dict)

# Per-stage counters, all starting at zero.
count_dumplicates = dict.fromkeys(stages_list, 0)
count_all_images = dict.fromkeys(stages_list, 0)

Завантаження зображень та перевірка на дублікати

In [None]:
def load_images(path, images, values, count_dumplicates, count_all_images):
  """Load every readable image under ``path`` and skip pixel-identical duplicates.

  ``path`` is expected to contain one sub-folder per class. A folder name is
  mapped to a stage code through the module-level ``stages_dict``; entries that
  are not directories, folders whose name is not listed there, and files that
  OpenCV cannot decode are skipped instead of raising.

  Duplicates are detected by the SHA-256 of the decoded pixel data. The hash
  table is seeded from the images already present in ``images``, so a picture
  loaded by an earlier call (e.g. from another folder) is also recognised as a
  duplicate.

  Args:
    path: Root directory holding the class sub-folders.
    images: List that unique decoded images are appended to (mutated in place).
    values: List that receives the stage code of every appended image.
    count_dumplicates: Dict stage code -> number of duplicates skipped.
    count_all_images: Dict stage code -> number of unique images kept.

  Returns:
    The same four containers, after being updated in place.
  """
  # Digest of pixel data -> file name first seen with it (None for pre-loaded images).
  hashes = {hashlib.sha256(img.tobytes()).hexdigest(): None for img in images}
  # Sorted so the load order does not depend on the file system's listing order.
  for f in sorted(os.listdir(path)):
    folder = os.path.join(path, f)
    if not os.path.isdir(folder):
      continue  # stray file next to the class folders
    index = [key for key, value in stages_dict.items() if f in value]
    if not index:
      continue  # folder name does not correspond to any known stage
    stage = index[0]
    for files in sorted(os.listdir(folder)):
      image = cv2.imread(os.path.join(folder, files))
      if image is None:
        continue  # cv2.imread returns None for unreadable / non-image files
      filehash = hashlib.sha256(image.tobytes()).hexdigest()
      if filehash in hashes:
        count_dumplicates[stage] += 1
      else:
        images.append(image)
        values.append(stage)
        count_all_images[stage] += 1
        hashes[filehash] = files
  return images, values, count_dumplicates, count_all_images
  

In [None]:
# Read both on-disk folders into the same accumulators; the data is re-split further down.
for split_dir in ('train', 'test'):
  images, values, count_dumplicates, count_all_images = load_images(
      split_dir, images, values, count_dumplicates, count_all_images)

Приклад кожної стадії хвороби Альцгеймера в наборі даних

In [None]:
# Show one randomly chosen image per stage, side by side in a single row.
plt.figure(figsize=(20, 30))
for position, stage in enumerate(stages_list, start=1):
  indices = [i for i, x in enumerate(values) if x == stage]
  if not indices:
    # random.sample raises ValueError on an empty population, which would abort
    # the whole cell when a stage has no images; leave that slot empty instead.
    continue
  sample_index = random.sample(indices, 1)[0]
  plt.subplot(1, len(stages_list), position)
  plt.imshow(images[sample_index], cmap='gray')
  plt.title(stage)

Побудова стовпчастої діаграми для відображення кількості зображень по кожній стадії хвороби та кількості дублікатів

In [None]:
# Stacked bar chart: unique images per stage, with the skipped duplicates stacked on top.
stages = [stage_code for stage_code in count_all_images]
counts = [count_all_images[name] for name in stages]
duplicates = [count_dumplicates[name] for name in stages]

colors = ['#CCCCFF', '#FF9999']
unique_color, duplicate_color = colors
bar_width = 0.35

fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(stages, counts, width=bar_width, color=unique_color, label='Кількість унікальних зображень')
ax.bar(stages, duplicates, width=bar_width, bottom=counts, color=duplicate_color, label='Кількість дублікатів')
ax.set_xlabel('Стадії хвороби')
ax.set_ylabel('Кількість')
ax.legend()

plt.show()

Аугментація даних — дзеркальне відображення зображень відносно вертикальної осі та поворот на випадковий ненульовий кут у межах від −15° до 15°

In [None]:
def dataset_augmentation(arr, values, ind, method):
  """Append an augmented copy of each selected image to the dataset.

  For every index ``i`` in ``ind`` a transformed version of ``arr[i]`` is
  appended to ``arr`` and the matching label ``values[i]`` is appended to
  ``values``. ``method`` picks the transform:

  * ``"rotate"`` - rotate by a random non-zero whole number of degrees drawn
    from [-15, 15];
  * ``"flip"`` - ``cv2.flip(image, 1)``.

  Any other ``method`` leaves both lists untouched. The lists are modified in
  place and also returned as ``(arr, values)``.
  """
  if method not in ("rotate", "flip"):
    return arr, values

  for position in ind:
    if method == "flip":
      augmented = cv2.flip(arr[position], 1)
    else:
      # Redraw until the angle is non-zero so the copy differs from the source.
      angle = random.randint(-15, 15)
      while angle == 0:
        angle = random.randint(-15, 15)
      augmented = rotate(arr[position], angle=angle)
    arr.append(augmented)
    values.append(values[position])
  return arr, values

In [None]:
def find_all_index_images(arr, stage):
  """Return the positions in ``arr`` whose element equals ``stage``, in ascending order."""
  matches = []
  for position, label in enumerate(arr):
    if label == stage:
      matches.append(position)
  return matches

In [None]:
# NOTE: every call appends to `images`/`values` in place, so re-running this
# cell augments the already augmented data again.

# EMCI: flip every EMCI image, then look the indices up again so that the
# rotation pass also covers the flipped copies that were just appended.
indexes = find_all_index_images(values, "EMCI")
images, values = dataset_augmentation(images, values, indexes, 'flip')
indexes = find_all_index_images(values, "EMCI")
images, values = dataset_augmentation(images, values, indexes, 'rotate')

# CN: the indices are looked up once, so the flip and the rotation are both
# applied to the images present before this step (flipped copies are not rotated).
indexes = find_all_index_images(values, "CN")
images, values = dataset_augmentation(images, values, indexes, 'flip')
images, values = dataset_augmentation(images, values, indexes, 'rotate')

# LMCI: flipped copies only.
indexes = find_all_index_images(values, "LMCI")
images, values = dataset_augmentation(images, values, indexes, 'flip')

Нормалізація зображень

In [None]:
def normalization(array):
  """Min-max scale every image in ``array`` to the range [0, 1], in place.

  Each entry is rescaled independently as ``(x - min) / (max - min)``. An image
  whose pixels are all equal has no range to scale, so it is mapped to all
  zeros instead of dividing by zero (which would fill it with NaN).

  Args:
    array: Mutable sequence of array-likes; every entry is replaced by its
      normalized float64 version.

  Returns:
    The same ``array`` object, after modification.
  """
  for i in range(len(array)):
    img = np.asarray(array[i], dtype=np.float64)
    min_val = img.min()
    span = img.max() - min_val
    if span == 0:
      # Constant image: the plain formula would evaluate 0 / 0 here.
      array[i] = np.zeros_like(img)
    else:
      array[i] = (img - min_val) / span
  return array

In [None]:
# Min-max scale every image; normalization() overwrites the list entries in place.
images = normalization(images)

In [None]:
# Number of samples per stage label in `values`.
class_counts = {label: values.count(label) for label in set(values)}
print(class_counts)

Поділ на навчальну, тестову та валідаційну вибірки 60:20:20

In [None]:
# Hold out 20% of the data for testing, stratified by label.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    images, values, test_size=0.2, random_state=20, stratify=values)

# 0.25 of the remaining 80% equals 20% of the full dataset (0.25 x 0.8 = 0.2).
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.25, random_state=20, stratify=y_trainval)

Кругові діаграми кількості зображень кожного класу в навчальній, валідаційній і тестовій вибірках

In [None]:
# One pie chart per split showing the absolute number of samples of every class.
colors = ['#9999FF', '#99CCFF','#FFCC99', '#FFFF99', '#FF9999']
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(20, 5))

# (title, labels) pairs drawn left to right; replaces three copy-pasted blocks.
splits_to_plot = [('Train Set', y_train), ('Validation Set', y_val), ('Test Set', y_test)]
for axis, (title, y_split) in zip(ax, splits_to_plot):
  class_labels, class_counts = np.unique(y_split, return_counts=True)
  total = sum(class_counts)
  # autopct receives a percentage; convert it back to the absolute count.
  # `total` is bound as a default argument so each chart uses its own split size.
  axis.pie(class_counts, labels=class_labels, colors=colors,
           autopct=lambda pct, total=total: '{:d}'.format(int(round(pct * total / 100.0))),
           startangle=140)
  axis.set_title(title)

plt.show()

In [None]:
# Turn every split into NumPy arrays.
X_train, X_test, X_val = (np.array(split) for split in (X_train, X_test, X_val))
y_train, y_test, y_val = (np.array(labels) for labels in (y_train, y_test, y_val))

# Flatten each image into a single feature row: (n_samples, n_features).
X_train = X_train.reshape((len(X_train), -1))
X_test = X_test.reshape((len(X_test), -1))
X_val = X_val.reshape((len(X_val), -1))

SVM - підбір гіперпараметрів та передбачення на валідаційній вибірці

In [None]:
# Grid-search an RBF-kernel SVM, then score the refitted best model on the validation split.
svc = SVC()
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

grid = GridSearchCV(svc, param_grid, refit = True, verbose = 3)

grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_estimator_)

grid_predictions = grid.predict(X_val)

accuracy = accuracy_score(y_val, grid_predictions)
# Fix the row/column order to the fitted class order so it matches the display labels.
confusion_mat = confusion_matrix(y_val, grid_predictions, labels=grid.classes_)
print("Accuracy:", accuracy)
print(classification_report(y_val, grid_predictions))

# GridSearchCV fits clones of `svc`, so `svc` itself is never fitted and has no
# `classes_` attribute; read the label order from the fitted search object.
ConfusionMatrixDisplay(confusion_matrix=confusion_mat, display_labels=grid.classes_).plot(cmap='Blues')

Decision Tree - підбір гіперпараметрів та передбачення на валідаційній вибірці

In [None]:
# Grid-search a decision tree, then score the refitted best model on the validation split.
dfc = DecisionTreeClassifier(random_state=42)

param_grid = {'criterion': ['gini', 'entropy'],
              'max_depth': [2, 4, 6, 8, 10, 12]}

grid = GridSearchCV(dfc, param_grid, refit = True, verbose = 3)

grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_estimator_)

grid_predictions = grid.predict(X_val)

accuracy = accuracy_score(y_val, grid_predictions)
# Fix the row/column order to the fitted class order so it matches the display labels.
confusion_mat = confusion_matrix(y_val, grid_predictions, labels=grid.classes_)
print("Accuracy:", accuracy)
print(classification_report(y_val, grid_predictions))

# Take the labels from this cell's fitted search object rather than from the
# unfitted SVC template of another cell (which has no `classes_` attribute).
ConfusionMatrixDisplay(confusion_matrix=confusion_mat, display_labels=grid.classes_).plot(cmap='Blues')

Random Forest - підбір гіперпараметрів та передбачення на валідаційній вибірці

In [None]:
# Grid-search a random forest, then score the refitted best model on the validation split.
# Seeded so that repeated runs of the notebook give the same search result.
rfc = RandomForestClassifier(random_state=42)

param_grid = {'n_estimators': [100, 200, 300],
              'max_depth': [5, 10, 15, None],}


grid = GridSearchCV(rfc, param_grid, refit = True, verbose = 3)

grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_estimator_)

grid_predictions = grid.predict(X_val)

accuracy = accuracy_score(y_val, grid_predictions)
# Fix the row/column order to the fitted class order so it matches the display labels.
confusion_mat = confusion_matrix(y_val, grid_predictions, labels=grid.classes_)
print("Accuracy:", accuracy)
print(classification_report(y_val, grid_predictions))

# Take the labels from this cell's fitted search object rather than from the
# unfitted SVC template of another cell (which has no `classes_` attribute).
ConfusionMatrixDisplay(confusion_matrix=confusion_mat, display_labels=grid.classes_).plot(cmap='Blues')

MLP - підбір гіперпараметрів та передбачення на валідаційній вибірці

In [None]:
# Grid-search an MLP, then score the refitted best model on the validation split.
# Seeded so that weight initialisation (and therefore the search result) is repeatable.
mlp = MLPClassifier(random_state=42)

param_grid = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam']
}

grid = GridSearchCV(mlp, param_grid, refit = True, verbose = 3)

grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_estimator_)

grid_predictions = grid.predict(X_val)

accuracy = accuracy_score(y_val, grid_predictions)
# Fix the row/column order to the fitted class order so it matches the display labels.
confusion_mat = confusion_matrix(y_val, grid_predictions, labels=grid.classes_)
print("Accuracy:", accuracy)
print(classification_report(y_val, grid_predictions))

# Take the labels from this cell's fitted search object rather than from the
# unfitted SVC template of another cell (which has no `classes_` attribute).
ConfusionMatrixDisplay(confusion_matrix=confusion_mat, display_labels=grid.classes_).plot(cmap='Blues')

Voting Classifier - hard voting

In [None]:
# Members of the hard-voting ensemble. The stochastic ones are seeded so the
# reported validation metrics can be reproduced on a re-run.
rfc_1 = RandomForestClassifier(n_estimators=300, random_state=42)
mlp_1 = MLPClassifier(activation='tanh', hidden_layer_sizes=(50, 50, 50), solver='sgd', random_state=42)
svc_1 = SVC(C=1, gamma=0.001, kernel='rbf')

# Hard voting: the ensemble predicts the majority class label of its members.
eclf1 = VotingClassifier(estimators=[('rf', rfc_1), ('mlp', mlp_1), ('svc', svc_1)], voting='hard', n_jobs=-1, verbose=True)

In [None]:
# Train the hard-voting ensemble and predict the validation split (fit returns the estimator itself).
y_pred_eclf1 = eclf1.fit(X_train, y_train).predict(X_val)

In [None]:
# Validation metrics for the hard-voting ensemble.
confusion_mat = confusion_matrix(y_val, y_pred_eclf1)
accuracy = accuracy_score(y_val, y_pred_eclf1)
print("Accuracy:", accuracy)
print("Confusion matrix:\n", confusion_mat)
print(classification_report(y_val, y_pred_eclf1))

hard_vote_display = ConfusionMatrixDisplay(confusion_matrix=confusion_mat, display_labels=eclf1.classes_)
hard_vote_display.plot(cmap='Blues')

In [None]:
# Display the sub-estimators held by the hard-voting ensemble after fitting.
eclf1.estimators_

Voting Classifier - soft voting

In [None]:
# Members of the soft-voting ensemble. All of them are seeded so the model that
# gets saved and its reported metrics can be reproduced on a re-run.
rfc_2 = RandomForestClassifier(n_estimators=300, random_state=42)
mlp_2 = MLPClassifier(activation='tanh', hidden_layer_sizes=(50, 50, 50), solver='sgd', random_state=42)
# probability=True is required for soft voting (it enables predict_proba); the
# probability calibration is randomised, hence the seed.
svc_2 = SVC(C=1, gamma=0.001, kernel='rbf', probability=True, random_state=42)

# Soft voting: the ensemble averages the members' predicted class probabilities.
eclf2 = VotingClassifier(estimators=[('rf', rfc_2), ('mlp', mlp_2), ('svc', svc_2)],  voting='soft', verbose=True)

In [None]:
# Train the soft-voting ensemble and predict the validation split (fit returns the estimator itself).
y_pred_eclf2 = eclf2.fit(X_train, y_train).predict(X_val)

In [None]:
# Validation metrics for the soft-voting ensemble.
confusion_mat = confusion_matrix(y_val, y_pred_eclf2)
accuracy = accuracy_score(y_val, y_pred_eclf2)
print("Accuracy:", accuracy)
print(classification_report(y_val, y_pred_eclf2))

soft_vote_display = ConfusionMatrixDisplay(confusion_matrix=confusion_mat, display_labels=eclf2.classes_)
soft_vote_display.plot(cmap='Blues')

In [None]:
# Display the sub-estimators held by the soft-voting ensemble after fitting.
eclf2.estimators_

In [None]:
# Persist the soft-voting ensemble to a file in the working directory.
joblib.dump(eclf2, 'ensemble_model.joblib')

Voting Classifier - soft voting, класифікація на тестовій вибірці

In [None]:
# Reload the ensemble from the file written by the joblib.dump cell.
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code;
# only load model files that come from a trusted source.
loaded_ensemble = joblib.load('ensemble_model.joblib')

In [None]:
# Predict the held-out test split with the reloaded ensemble.
y_pred_final = loaded_ensemble.predict(X_test)

In [None]:
# Final metrics of the reloaded ensemble on the test split.
confusion_mat = confusion_matrix(y_test, y_pred_final)
accuracy = accuracy_score(y_test, y_pred_final)
print("Accuracy:", accuracy)
print("Confusion matrix:\n", confusion_mat)
print(classification_report(y_test, y_pred_final))

test_display = ConfusionMatrixDisplay(confusion_matrix=confusion_mat, display_labels=loaded_ensemble.classes_)
test_display.plot(cmap='Blues')