In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import cv2
from skimage import segmentation
from skimage.measure import label, regionprops, regionprops_table, shannon_entropy
import os
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score
from sklearn.model_selection import train_test_split
from skimage.filters import threshold_otsu

from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics,svm
from sklearn import tree
from skimage import feature

from sklearn.preprocessing import MaxAbsScaler

#Class and functions used

##Functions

In [27]:
def display_img(image,title='Image',x_label=None,y_label=None,cmap_type='gray',show_axis=False,colorBar=False,F_size=(8,6)):
  plt.figure(figsize=F_size)
  plt.imshow(image,cmap=cmap_type)
  plt.title(title)
  plt.xlabel(x_label)
  plt.ylabel(y_label)
  if colorBar: plt.colorbar()
  if not show_axis: plt.axis('off')
  plt.show()

def count_0_1(array):

  count1,count0 = 0,0

  for element in array:

    if element == 1:
      count1+=1
    else:
      count0+=1
  
  return count0,count1

def generate_knn_model(train_data,label_train_data,test_data):
    knn = KNeighborsClassifier()
    knn.fit(train_data,label_train_data)
    predicted = knn.predict(test_data)
    return predicted

def generate_naive_bayes_model(train_data,label_train_data,test_data):
    gnb = GaussianNB()
    gnb.fit(train_data, label_train_data)
    predicted = gnb.predict(test_data)
    return predicted

def generate_svm_model(train_data,label_train_data,test_data):
    clf = svm.SVC(kernel='linear')
    clf.fit(train_data, label_train_data)
    predicted = clf.predict(test_data)
    return predicted

def generate_MLP_model(X_train, y_train,test_data):
    classifier = MLPClassifier(hidden_layer_sizes=(100,100,100), max_iter=500,activation = 'relu',solver='adam',random_state=1)
    classifier.fit(X_train, y_train)
    predicted = classifier.predict(test_data)
    return predicted

def generate_random_forest_model(X_train, y_train,test_data):
    rfc = RandomForestClassifier(criterion= 'entropy', max_depth= 8, max_features='auto', n_estimators=200)
    rfc.fit(X_train,y_train)
    predicted = rfc.predict(test_data)
    return predicted

def generate_SGDC_model(train_data,label_train_data,test_data):
    clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=200)
    clf.fit(train_data, label_train_data)
    predicted = clf.predict(test_data)
    return predicted

def generate_decision_tree_model(train_data,label_train_data,test_data):
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(train_data, label_train_data)
    predicted = clf.predict(test_data)
    return predicted
    



##Classes

In [22]:
class classification:

  def __init__(self,x_train, x_test, y_train, y_test):
    
    self.x_train, self.x_test, self.y_train, self.y_test = x_train, x_test, y_train, y_test
  
  def extraction(self):

    x = [self.x_train,self.x_test]
    image_props = {'training':[],'test':[]}
    group_names = ['training','test']

    for index,group in enumerate(x):

      for index2,image in enumerate(group):

        regions = regionprops(image)
        contour_area , contour_perimeter, contour_convex_area, diameter = self.get_contours_param(regions)
        image_props[group_names[index]].append([contour_area , contour_perimeter, contour_convex_area, diameter,shannon_entropy(image,base=2)])
      
    
    return image_props

  def get_contours_param(self,contour):
    contour_area = contour[0].filled_area
    contour_perimeter = contour[0].perimeter
    contour_convex_area = contour[0].convex_area
    diameter = contour[0].equivalent_diameter
    self.features_extracted = ['area','perimeter','convex_area','diameter','shannon_entropy']
    return contour_area , contour_perimeter, contour_convex_area, diameter

class LocalBinaryPatterns:
	def __init__(self, numPoints, radius):
		# store the number of points and radius
		self.numPoints = numPoints
		self.radius = radius
	def describe(self, image, eps=1e-7):
		# compute the Local Binary Pattern representation
		# of the image, and then use the LBP representation
		# to build the histogram of patterns
		lbp = feature.local_binary_pattern(image, self.numPoints,
			self.radius, method="uniform")
		(hist, _) = np.histogram(lbp.ravel(),
			bins=np.arange(0, self.numPoints + 3),
			range=(0, self.numPoints + 2))
		# normalize the histogram
		hist = hist.astype("float")
		hist /= (hist.sum() + eps)
		# return the histogram of Local Binary Patterns
		return hist


#Loading dataset

In [4]:
select_images_randomly = False

#Get images from google drive
directory_path_malignos = '/content/drive/MyDrive/cropped_and_treated_nods_maligno/'
directory_path_benignos = '/content/drive/MyDrive/cropped_and_treated_nods_benigno/'

directory_files = os.listdir(directory_path_malignos)
array_of_images_malignos = [plt.imread( os.path.join(directory_path_malignos,file) ) for file in directory_files]

directory_files = os.listdir(directory_path_benignos)
array_of_images_benignos = [plt.imread( os.path.join(directory_path_benignos,file) ) for file in directory_files]

print(f'Number of cropped maligno images: {len(array_of_images_malignos)}')
print(f'Number of cropped benigno images: {len(array_of_images_benignos)}')


Number of cropped maligno images: 262
Number of cropped benigno images: 275


##Splitting data into training and test set

The x and y arrays are composed of images. The features will be extracted in the next section.

In [23]:
x = [*array_of_images_malignos,*array_of_images_benignos]
y = [*[1 for _ in array_of_images_malignos],*[0 for _ in array_of_images_benignos]]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42,shuffle=True)

print(f'Size of training set {len(y_train)}')
count0,count1=count_0_1(y_train)
print(f'class 0 size: {count0} class 1 size: {count1}. ratio: {count1/(count0+count1)}')
print(f'Size of test set {len(y_test)}')
count0,count1=count_0_1(y_test)
print(f'class 0 size: {count0} class 1 size: {count1}. ratio: {count1/(count0+count1)}')

Size of training set 359
class 0 size: 186 class 1 size: 173. ratio: 0.4818941504178273
Size of test set 178
class 0 size: 89 class 1 size: 89. ratio: 0.5


#Obtaining features

The extrated features were area, perimeter, convex area, diameter and shannon entropy.



In [32]:
#Pipeline 1 dataset
model = classification(x_train, x_test, y_train, y_test)
result = model.extraction()
print(len(y_train))

norm =  MaxAbsScaler()
norm.fit(result['training'])
norm_x_train = norm.transform(result['training'])

norm =  MaxAbsScaler()
norm.fit(result['test'])
norm_x_test = norm.transform(result['test'])

#Pipeline 2 dataset

desc = LocalBinaryPatterns(24, 8)
x_train_desc = [desc.describe(image) for image in x_train]
x_test_desc = [desc.describe(image) for image in x_test]

#Pipeline 3 dataset

new_train = [[*norm_x_train[index],*x_train_desc[index]] for index in range(len(norm_x_train)) ]
new_test = [[*norm_x_test[index],*x_test_desc[index]] for index in range(len(norm_x_test))]


359


#Classification

## Pipeline 1

In [37]:

result_knn = generate_knn_model(norm_x_train,y_train,norm_x_test)
result_naive = generate_naive_bayes_model(norm_x_train,y_train,norm_x_test)
result_svm = generate_svm_model(norm_x_train,y_train,norm_x_test)
result_mlp = generate_MLP_model(norm_x_train,y_train,norm_x_test)
result_forest = generate_random_forest_model(norm_x_train,y_train,norm_x_test)
result_SGDC = generate_SGDC_model(norm_x_train,y_train,norm_x_test)
resut_tree = generate_decision_tree_model(norm_x_train,y_train,norm_x_test)

all_results = [result_knn,result_naive,result_svm,result_mlp,result_forest,result_SGDC,resut_tree]

result_metrics_pipeline1 = []

for result_per_model in all_results:

  accuracy = metrics.accuracy_score(result_per_model,y_test)
  recall = metrics.recall_score(result_per_model,y_test)
  precision = metrics.precision_score(result_per_model,y_test)
  f1_score = metrics.f1_score(result_per_model,y_test)

  result_metrics_pipeline1.append([accuracy,recall,precision,f1_score])

result_df_pipeline1 = pd.DataFrame(result_metrics_pipeline1,columns = ['accuracy','recall','precision','f1_score'],index = ['knn','naive','svm','mlp','forest','SGDC','tree'])
display(result_df_pipeline1)

Unnamed: 0,accuracy,recall,precision,f1_score
knn,0.769663,0.772727,0.764045,0.768362
naive,0.792135,0.776596,0.820225,0.797814
svm,0.752809,0.835821,0.629213,0.717949
mlp,0.814607,0.804348,0.831461,0.81768
forest,0.814607,0.797872,0.842697,0.819672
SGDC,0.657303,0.868421,0.370787,0.519685
tree,0.780899,0.797619,0.752809,0.774566


##Pipeline 2

In [36]:
result_knn = generate_knn_model(x_train_desc,y_train,x_test_desc)
result_naive = generate_naive_bayes_model(x_train_desc,y_train,x_test_desc)
result_svm = generate_svm_model(x_train_desc,y_train,x_test_desc)
result_mlp = generate_MLP_model(x_train_desc,y_train,x_test_desc)
result_forest = generate_random_forest_model(x_train_desc,y_train,x_test_desc)
result_SGDC = generate_SGDC_model(x_train_desc,y_train,x_test_desc)
resut_tree = generate_decision_tree_model(x_train_desc,y_train,x_test_desc)

all_results = [result_knn,result_naive,result_svm,result_mlp,result_forest,result_SGDC,resut_tree]

result_metrics_pipeline2 = []

for result_per_model in all_results:

  accuracy = metrics.accuracy_score(result_per_model,y_test)
  recall = metrics.recall_score(result_per_model,y_test)
  precision = metrics.precision_score(result_per_model,y_test)
  f1_score = metrics.f1_score(result_per_model,y_test)

  result_metrics_pipeline2.append([accuracy,recall,precision,f1_score])

result_df_pipeline2 = pd.DataFrame(result_metrics_pipeline2,columns = ['accuracy','recall','precision','f1_score'],index = ['knn','naive','svm','mlp','forest','SGDC','tree'])
display(result_df_pipeline2)

Unnamed: 0,accuracy,recall,precision,f1_score
knn,0.775281,0.802469,0.730337,0.764706
naive,0.735955,0.85,0.573034,0.684564
svm,0.696629,0.753623,0.58427,0.658228
mlp,0.842697,0.858824,0.820225,0.83908
forest,0.820225,0.827586,0.808989,0.818182
SGDC,0.730337,0.847458,0.561798,0.675676
tree,0.758427,0.7875,0.707865,0.745562


##Pipeline 3

In [35]:
result_knn = generate_knn_model(new_train,y_train,new_test)
result_naive = generate_naive_bayes_model(new_train,y_train,new_test)
result_svm = generate_svm_model(new_train,y_train,new_test)
result_mlp = generate_MLP_model(new_train,y_train,new_test)
result_forest = generate_random_forest_model(new_train,y_train,new_test)
result_SGDC = generate_SGDC_model(new_train,y_train,new_test)
resut_tree = generate_decision_tree_model(new_train,y_train,new_test)

all_results = [result_knn,result_naive,result_svm,result_mlp,result_forest,result_SGDC,resut_tree]

result_metrics = []
#performing the classifications
for result_per_model in all_results:

  accuracy = metrics.accuracy_score(result_per_model,y_test)
  recall = metrics.recall_score(result_per_model,y_test)
  precision = metrics.precision_score(result_per_model,y_test)
  f1_score = metrics.f1_score(result_per_model,y_test)

  result_metrics.append([accuracy,recall,precision,f1_score])

result_df_pipeline3 = pd.DataFrame(result_metrics,columns = ['accuracy','recall','precision','f1_score'],index = ['knn','naive','svm','mlp','forest','SGDC','tree'])
display(result_df_pipeline3)

Unnamed: 0,accuracy,recall,precision,f1_score
knn,0.803371,0.787234,0.831461,0.808743
naive,0.741573,0.852459,0.58427,0.693333
svm,0.775281,0.865672,0.651685,0.74359
mlp,0.859551,0.872093,0.842697,0.857143
forest,0.808989,0.823529,0.786517,0.804598
SGDC,0.516854,0.508876,0.966292,0.666667
tree,0.780899,0.784091,0.775281,0.779661


#Comparison of Pipelines

##Pipeline 1 x Pipeline 2

In [42]:
display(result_df_pipeline1-result_df_pipeline2)
result = result_df_pipeline1/result_df_pipeline2
display(result)

Unnamed: 0,accuracy,recall,precision,f1_score
knn,-0.005618,-0.029742,0.033708,0.003656
naive,0.05618,-0.073404,0.247191,0.11325
svm,0.05618,0.082198,0.044944,0.059721
mlp,-0.02809,-0.054476,0.011236,-0.021401
forest,-0.005618,-0.029714,0.033708,0.00149
SGDC,-0.073034,0.020963,-0.191011,-0.155991
tree,0.022472,0.010119,0.044944,0.029004


Unnamed: 0,accuracy,recall,precision,f1_score
knn,0.992754,0.962937,1.046154,1.004781
naive,1.076336,0.913642,1.431373,1.165434
svm,1.080645,1.10907,1.076923,1.09073
mlp,0.966667,0.936569,1.013699,0.974495
forest,0.993151,0.964096,1.041667,1.001821
SGDC,0.9,1.024737,0.66,0.769134
tree,1.02963,1.01285,1.063492,1.038903


##Pipeline 1 x Pipeline 3

In [43]:
display(result_df_pipeline1-result_df_pipeline3)
result = result_df_pipeline1/result_df_pipeline3
display(result)

Unnamed: 0,accuracy,recall,precision,f1_score
knn,-0.033708,-0.014507,-0.067416,-0.040382
naive,0.050562,-0.075863,0.235955,0.104481
svm,-0.022472,-0.029851,-0.022472,-0.025641
mlp,-0.044944,-0.067745,-0.011236,-0.039463
forest,0.005618,-0.025657,0.05618,0.015074
SGDC,0.140449,0.359545,-0.595506,-0.146982
tree,0.0,0.013528,-0.022472,-0.005095


Unnamed: 0,accuracy,recall,precision,f1_score
knn,0.958042,0.981572,0.918919,0.950069
naive,1.068182,0.911007,1.403846,1.150694
svm,0.971014,0.965517,0.965517,0.965517
mlp,0.947712,0.922319,0.986667,0.953959
forest,1.006944,0.968845,1.071429,1.018735
SGDC,1.271739,1.706548,0.383721,0.779528
tree,1.0,1.017253,0.971014,0.993466


#Pipeline 2 x Pipeline 3

In [46]:
display(result_df_pipeline2-result_df_pipeline3)
result = result_df_pipeline2/result_df_pipeline3
display(result)

Unnamed: 0,accuracy,recall,precision,f1_score
knn,-0.02809,0.015235,-0.101124,-0.044037
naive,-0.005618,-0.002459,-0.011236,-0.00877
svm,-0.078652,-0.112048,-0.067416,-0.085362
mlp,-0.016854,-0.013269,-0.022472,-0.018062
forest,0.011236,0.004057,0.022472,0.013584
SGDC,0.213483,0.338582,-0.404494,0.009009
tree,-0.022472,0.003409,-0.067416,-0.034099


Unnamed: 0,accuracy,recall,precision,f1_score
knn,0.965035,1.019353,0.878378,0.945548
naive,0.992424,0.997115,0.980769,0.987352
svm,0.898551,0.870565,0.896552,0.885203
mlp,0.980392,0.984784,0.973333,0.978927
forest,1.013889,1.004926,1.028571,1.016883
SGDC,1.413043,1.665353,0.581395,1.013514
tree,0.971223,1.004348,0.913043,0.956264
