In [1]:
import os
import pandas as pd
import numpy as np
from PIL import Image
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [3]:
# Load the label table and preview its first rows; the columns used later
# in this notebook are `file_name` and `label`.
label = pd.read_csv('./dataset/label.csv')
label.head()

Unnamed: 0,file_name,label
0,IMAGE_0000.jpg,meningioma_tumor
1,IMAGE_0001.jpg,no_tumor
2,IMAGE_0002.jpg,meningioma_tumor
3,IMAGE_0003.jpg,glioma_tumor
4,IMAGE_0004.jpg,meningioma_tumor


In [2]:
from collections import Counter

def get_images_and_labels(dir_path, train_end=2400, valid_end=2700):
    '''
    Read ``label.csv`` from the dataset root and return the image file names
    together with their labels encoded as integer class ids.

    Class ids follow the order of ``data_classes`` below
    (0 = no_tumor, 1 = meningioma_tumor, 2 = glioma_tumor, 3 = pituitary_tumor).
    The class distribution of three consecutive row ranges
    ``[:train_end]``, ``[train_end:valid_end]`` and ``[valid_end:]`` is printed
    for information only; the returned Series are NOT split.

    :param dir_path: root directory of the image dataset, with or without a
                     trailing path separator
    :param train_end: row position where the first (train) range ends
    :param valid_end: row position where the second (validation) range ends
    :return: images_list, labels_list (pandas Series covering every CSV row)
    :raises ValueError: if the CSV contains a label that is not in ``data_classes``
    '''

    print("BEGIN TO LOAD DATA!")
    # os.path.join copes with a missing trailing separator, unlike `dir_path + 'label.csv'`.
    dataset_csv = pd.read_csv(os.path.join(dir_path, 'label.csv'))
    data_classes = ['no_tumor', 'meningioma_tumor', 'glioma_tumor', 'pituitary_tumor']
    images_list = dataset_csv['file_name']                        # image file names
    labels_list = dataset_csv['label'].apply(data_classes.index)  # integer class ids

    # Positional slices, so the counts do not depend on the Series index labels.
    train_label = Counter(labels_list.iloc[:train_end])
    valid_label = Counter(labels_list.iloc[train_end:valid_end])
    test_label = Counter(labels_list.iloc[valid_end:])

    print(train_label, '\n', valid_label, '\n', test_label, '\n')

    return images_list, labels_list

# Dataset root directory; label.csv and the image/ folder are read from here.
dataset_path = './dataset/'
# File names and integer-encoded labels for every row of label.csv (unsplit).
images, labels =  get_images_and_labels(dataset_path)

BEGIN TO LOAD DATA!
Counter({2: 694, 1: 683, 3: 663, 0: 360}) 
 Counter({1: 91, 3: 82, 2: 81, 0: 46}) 
 Counter({3: 86, 2: 85, 1: 81, 0: 48}) 



In [3]:
# Load every image listed in `images` as a grayscale NumPy array.
# (The older commented-out `ImageData` loop did the same work and was removed.)
sample_images = []
# Iterate over the Series values directly: `images[index]` is a label-based
# lookup and only works while the Series carries a default 0..n-1 index.
for file_name in tqdm(images):
    img_file = os.path.join(dataset_path, 'image', file_name)
    # The context manager releases the file handle as soon as the pixels are copied.
    with Image.open(img_file) as org_img:
        sample_images.append(np.array(org_img.convert('L')))  # 'L' = 8-bit grayscale

100%|██████████| 3000/3000 [00:12<00:00, 244.50it/s]


In [5]:
# Leftover torch experiment, kept commented out (`sample` is not defined in this notebook view).
# import torch

# input = torch.Tensor(sample['image'])
# input = np.array(input)

# Stack the first two loaded images into one array for a quick look.
# NOTE(review): `input` shadows the Python builtin of the same name; no later
# cell in this view reads it — consider renaming or dropping this cell.
input = np.array(sample_images[0:2])

In [6]:
# Build the feature matrix: one row per image, all pixels flattened into a
# single vector, then scaled from [0, 255] to [0, 1].
X = np.array(sample_images)
X = X.reshape((X.shape[0], -1)) / 255
print("X.shape is", X.shape)

# Target vector: one integer class id per image.
Y = np.array(labels)
print("Y.shape is", Y.shape)

# Sanity check: every image must have exactly one label.
assert X.shape[0] == Y.shape[0], "X and Y must contain the same number of samples"

In [3]:
# Show the NumPy version of the running kernel (environment provenance).
np.__version__

'1.21.4'

In [6]:
# Shuffle and hold out 25% of the samples for testing (scikit-learn's default
# share, spelled out here); the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)
x_train.shape

(2250, 262144)

In [7]:
from sklearn.linear_model import LogisticRegression

def logRegrPredict(x_train, y_train, xtest):
    """
    Fit an L2-penalised multinomial logistic regression (newton-cg solver)
    on the training data and predict labels for ``xtest``.

    :param x_train: training feature matrix
    :param y_train: training labels
    :param xtest: feature matrix to predict on
    :return: predicted labels for ``xtest``
    """
    classifier = LogisticRegression(
        multi_class='multinomial',
        solver='newton-cg',
        penalty='l2',
    )
    # fit() returns the fitted estimator, so training and prediction can be chained.
    return classifier.fit(x_train, y_train).predict(xtest)

# Train logistic regression, then report overall accuracy and per-class metrics.
y_pred_LR = logRegrPredict(x_train, y_train, x_test)
print('Accuracy on test set: ' + str(accuracy_score(y_true=y_test, y_pred=y_pred_LR)))
# Text report with precision, recall, f1-score and support for each class.
print(classification_report(y_true=y_test, y_pred=y_pred_LR))


Accuracy on test set: 0.7853333333333333
                  precision    recall  f1-score   support

    glioma_tumor       0.72      0.77      0.74       219
meningioma_tumor       0.71      0.69      0.70       206
        no_tumor       0.86      0.65      0.74       123
 pituitary_tumor       0.90      0.98      0.94       202

        accuracy                           0.79       750
       macro avg       0.80      0.77      0.78       750
    weighted avg       0.79      0.79      0.78       750



In [23]:
from sklearn.ensemble import RandomForestClassifier

def RandomForestPredict(x_train, y_train, x_test, n_estimators=200, random_state=0):
    """
    Fit a random forest on the training data and predict labels for ``x_test``.

    The forest is seeded so that repeated runs give the same predictions;
    pass ``random_state=None`` to get the previous unseeded behaviour.

    :param x_train: training feature matrix
    :param y_train: training labels
    :param x_test: feature matrix to predict on
    :param n_estimators: number of trees in the forest
    :param random_state: seed for the forest's randomness (0 matches the seed
                         used for the train/test split in this notebook)
    :return: predicted labels for ``x_test``
    """
    forest = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    forest.fit(x_train, y_train)
    return forest.predict(x_test)

# Train the random forest, then report overall accuracy and per-class metrics.
y_pred_RF = RandomForestPredict(x_train, y_train, x_test)
print('Accuracy on test set: ' + str(accuracy_score(y_true=y_test, y_pred=y_pred_RF)))
# Text report with precision, recall, f1-score and support for each class.
print(classification_report(y_true=y_test, y_pred=y_pred_RF))

Accuracy on test set: 0.892
                  precision    recall  f1-score   support

    glioma_tumor       0.96      0.78      0.86       219
meningioma_tumor       0.80      0.92      0.86       206
        no_tumor       0.90      0.90      0.90       123
 pituitary_tumor       0.93      0.98      0.95       202

        accuracy                           0.89       750
       macro avg       0.90      0.90      0.89       750
    weighted avg       0.90      0.89      0.89       750



In [8]:
from sklearn import svm
from sklearn.svm import SVC

def SVMPredict(x_train, y_train, x_test):
    """
    Fit a linear-kernel support vector classifier (C=0.5) on the training
    data and predict labels for ``x_test``.

    :param x_train: training feature matrix
    :param y_train: training labels
    :param x_test: feature matrix to predict on
    :return: predicted labels for ``x_test``
    """
    # NOTE(review): scikit-learn documents `gamma` as a coefficient for the
    # 'rbf', 'poly' and 'sigmoid' kernels, so it presumably has no effect with
    # kernel='linear' — confirm whether an RBF kernel was intended.
    linear_svc = svm.SVC(kernel='linear', C=0.5, gamma=1)
    linear_svc.fit(x_train, y_train)
    return linear_svc.predict(x_test)

# Train the SVM, then report overall accuracy and per-class metrics.
y_pred_SVM = SVMPredict(x_train, y_train, x_test)
print('Accuracy on test set: ' + str(accuracy_score(y_true=y_test, y_pred=y_pred_SVM)))
# Text report with precision, recall, f1-score and support for each class.
print(classification_report(y_true=y_test, y_pred=y_pred_SVM))

Accuracy on test set: 0.792
                  precision    recall  f1-score   support

    glioma_tumor       0.73      0.77      0.75       219
meningioma_tumor       0.71      0.70      0.71       206
        no_tumor       0.82      0.68      0.75       123
 pituitary_tumor       0.93      0.97      0.95       202

        accuracy                           0.79       750
       macro avg       0.80      0.78      0.79       750
    weighted avg       0.79      0.79      0.79       750



# To Do List:
* 初始数据集绘制散点图：https://www.heywhale.com/mw/project/5ed755db946a0e002cb803f7
* (LR) 可视化分类结果，链接同上
* RandomForest交叉验证结果：https://zhuanlan.zhihu.com/p/62993244
* RandomForest 里 n_estimators 的学习曲线：http://kylin.ink/uncategorized/1467
* 每一个ml方法加上训练过程loss配图