In [1]:
import os
import pandas as pd
import numpy as np
from PIL import Image
from tqdm import tqdm
from sklearn import preprocessing
from sklearn.metrics import classification_report, accuracy_score

In [2]:
label = pd.read_csv('./dataset/label.csv')
label.head()

Unnamed: 0,file_name,label
0,IMAGE_0000.jpg,meningioma_tumor
1,IMAGE_0001.jpg,no_tumor
2,IMAGE_0002.jpg,meningioma_tumor
3,IMAGE_0003.jpg,glioma_tumor
4,IMAGE_0004.jpg,meningioma_tumor


In [3]:
ImageData = []
for i in tqdm(range(label.shape[0])):  
    org_img = Image.open('./dataset/image/' + label['file_name'][i]).convert('L') #Image(512, 512), gray-scale
    img = np.array(org_img) # numpy(512, 512)
    ImageData.append(img)

100%|██████████| 3000/3000 [00:25<00:00, 118.36it/s]


In [4]:
X = np.array(ImageData)
X = X.reshape((X.shape[0], -1))
print("X.shape is", X.shape)

data_classes = ['no_tumor', 'meningioma_tumor', 'glioma_tumor', 'pituitary_tumor']
Y = np.array(label['label'].apply(data_classes.index)) # Transform into 4 classes
print("Y.shape is", Y.shape)

X.shape is (3000, 262144)
Y.shape is (3000,)


In [5]:
# Do the Normalization
min_max_scaler = preprocessing.MinMaxScaler()
X_scaler = min_max_scaler.fit_transform(X) # Normalize to [0,1]

In [6]:
# Divide the dataset into training set, validation set and test set.

train_images = X_scaler[:2400][:]
valid_images = X_scaler[2400:2700][:]
test_images = X_scaler[2700:][:]

train_labels = Y[:2400]
valid_labels = Y[2400:2700]
test_labels = Y[2700:]

In [8]:
from sklearn.linear_model import LogisticRegression

# Build Logistic Regression Model
logreg = LogisticRegression(penalty='l2', solver='newton-cg', multi_class='multinomial')
# Train the model using the training sets
logreg.fit(train_images, train_labels)
y_pred_LR = logreg.predict(test_images)

print('Accuracy on test set: '+ str(accuracy_score(test_labels, y_pred_LR)))
print(classification_report(test_labels, y_pred_LR)) #text report showing the main classification metrics

Accuracy on test set: 0.7933333333333333
              precision    recall  f1-score   support

           0       0.92      0.69      0.79        48
           1       0.69      0.68      0.68        81
           2       0.71      0.78      0.74        85
           3       0.92      0.98      0.95        86

    accuracy                           0.79       300
   macro avg       0.81      0.78      0.79       300
weighted avg       0.80      0.79      0.79       300



In [7]:
from sklearn.ensemble import RandomForestClassifier

# Build Random Forest Model
RandomForest = RandomForestClassifier(n_estimators=150)
# Train the model using the training sets
RandomForest = RandomForest.fit(train_images, train_labels)

y_pred_RF = RandomForest.predict(test_images)
# y_pred_proba_RF = RandomForest.predict_proba(test_images)
print('Accuracy on test set: '+ str(accuracy_score(test_labels, y_pred_RF)))
print(classification_report(test_labels, y_pred_RF)) #text report showing the main classification metrics

Accuracy on test set: 0.8833333333333333
              precision    recall  f1-score   support

           0       0.91      0.88      0.89        48
           1       0.83      0.88      0.85        81
           2       0.93      0.79      0.85        85
           3       0.89      0.99      0.93        86

    accuracy                           0.88       300
   macro avg       0.89      0.88      0.88       300
weighted avg       0.89      0.88      0.88       300



In [8]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt

# Compare the performance between decision tree and random forests

rfc_l = []
clf_l = []

for i in range(10):
    rfc = RandomForestClassifier(n_estimators=25)
    rfc_s = cross_val_score(rfc, X_scaler, Y, cv=10).mean()
    rfc_l.append(rfc_s)
    clf = DecisionTreeClassifier()
    clf_s = cross_val_score(clf, X_scaler, Y, cv=10).mean()
    clf_l.append(clf_s)

plt.plot(range(1,11),rfc_l,label = "Random Forest")
plt.plot(range(1,11),clf_l,label = "Decision Tree")
plt.legend()
plt.show()

In [None]:
# Draw the learning curve of n_estimator

superpa = []
for i in range(150):
    rfc = RandomForestClassifier(n_estimators=i+1,n_jobs=-1)
    rfc_s = cross_val_score(rfc, X_scaler, Y, cv=10).mean()
    superpa.append(rfc_s)
print(max(superpa),superpa.index(max(superpa)))
plt.figure(figsize=[20,5])
plt.plot(range(1,151),superpa)
plt.show()

In [10]:
from sklearn import svm
from sklearn.svm import SVC

SVM_rbf = svm.SVC(C=1.0, kernel='rbf', gamma=0.5, probability=True) # (C=0.5, kernel='linear', gamma=1)
# Train the model using the training sets
SVM_rbf.fit(train_images, train_labels)

y_pred_SVM = SVM_rbf.predict(test_images)
y_pred_proba_SVM = SVM_rbf.predict_proba(test_images)

print('Accuracy on test set: '+ str(accuracy_score(test_labels, y_pred_SVM)))
print(classification_report(test_labels, y_pred_SVM)) #text report showing the main classification metrics