In [None]:
import os
import cv2
import random
import numpy as np 
import pandas as pd 
import seaborn as sns

# Scikit-Learn >= 1.0 is required: the 'squared_error' loss name passed to
# SGDClassifier further down only exists from that release on.
import re
import sklearn
# Compare (major, minor) numerically. Comparing the raw version strings is
# lexicographic and gives wrong answers (e.g. "0.9" >= "0.20" is True).
_sklearn_version = tuple(
    int(part) for part in re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups()
)
assert _sklearn_version >= (1, 0), f"scikit-learn >= 1.0 is required, found {sklearn.__version__}"

# To plot pretty figures
%matplotlib inline
import matplotlib as mlp
import matplotlib.pyplot as plt

## Data Loading

In [None]:
# Class sub-directory names expected under each split directory.
labels = ['glioma_tumor', 'meningioma_tumor', 'no_tumor', 'pituitary_tumor']
TRAIN_PATH = '../input/brain-tumor-classification-mri/Training'
TEST_PATH = '../input/brain-tumor-classification-mri/Testing'
new_size = (255, 255)  # target size handed to cv2.resize

# Integer code assigned to each class name.
class_map = {
    'no_tumor': 0,
    'glioma_tumor': 1,
    'pituitary_tumor': 2,
    'meningioma_tumor': 3
}


def load_split(split_dir, class_names=labels, size=new_size):
    """Load every image of one dataset split as a scaled grayscale array.

    Parameters
    ----------
    split_dir : str
        Directory containing one sub-directory per class name.
    class_names : list of str
        Sub-directory names to read; each image is labelled with the name
        of the sub-directory it was found in.
    size : tuple of int
        Target size passed to ``cv2.resize``.

    Returns
    -------
    images : np.ndarray
        Stacked grayscale images, divided by 255.
    names : np.ndarray
        Class name of each image, in the same order as ``images``.

    Raises
    ------
    ValueError
        From ``np.stack`` when no readable image was found.
    """
    images, names = [], []
    for class_name in class_names:
        class_dir = os.path.join(split_dir, class_name)
        # os.listdir returns entries in arbitrary order; sort for a reproducible sample order.
        for file_name in sorted(os.listdir(class_dir)):
            file_path = os.path.join(class_dir, file_name)
            img = cv2.imread(file_path)
            if img is None:
                # cv2.imread signals an unreadable / non-image file with None
                # rather than raising; skip it instead of crashing in cv2.resize.
                print("Skipping unreadable file: ", file_path)
                continue
            img = cv2.resize(img, size)
            # cv2.imread decodes colour images in BGR channel order, so the
            # matching grayscale conversion is BGR2GRAY (not RGB2GRAY).
            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)/255
            images.append(img)
            names.append(class_name)
    return np.stack(images), np.array(names)


train_img, train_labels = load_split(TRAIN_PATH)
print("train_img shape : ", train_img.shape)
print("train_labels shape : ", train_labels.shape)

test_img, test_labels = load_split(TEST_PATH)
print("test_img shape : ", test_img.shape)
print("test_labels shape : ", test_labels.shape)

# Replace the class names by their integer codes.
train_labels = np.array([class_map[label] for label in train_labels])
test_labels = np.array([class_map[label] for label in test_labels])

## Data pre-processing

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler #the Standard Scaler : X2 = (X1 - E(X1))/sqrt(Var(X1))
from sklearn.decomposition import PCA

# Flatten each image into a single feature vector. -1 lets NumPy infer the
# number of pixels, so this cell keeps working if the resize target changes
# (the original hard-coded 255*255, duplicating `new_size`).
train_img = train_img.reshape((train_img.shape[0], -1))
test_img = test_img.reshape((test_img.shape[0], -1))

# Standardise every pixel with statistics estimated on the training set only,
# then apply the same transform to the test set.
S = StandardScaler()
X_train = S.fit_transform(train_img)
X_test = S.transform(test_img)

# Keep the first 100 principal components, fitted on the training set only.
# A fixed random_state makes the projection reproducible when scikit-learn
# selects its randomized SVD solver.
P = PCA(n_components = 100, random_state = 42)
pca_train = P.fit_transform(X_train)
pca_test = P.transform(X_test)

In [None]:
# Append the integer label as a final column to the PCA features of each split.
data_train = np.column_stack((pca_train, train_labels))
data_test = np.column_stack((pca_test, test_labels))

# Pool both splits row-wise, then separate the features (every column but the
# last) from the labels (last column).
all_data = np.concatenate((data_train, data_test), axis=0)
X, y = all_data[:, :-1], all_data[:, -1]

# Linear Classifier

This estimator (`SGDClassifier`) implements regularized linear models with stochastic gradient descent (SGD) learning: the gradient of the loss is estimated one sample at a time and the model is updated along the way with a decreasing strength schedule (aka learning rate). SGD allows minibatch (online/out-of-core) learning via the `partial_fit` method. For best results using the default learning rate schedule, the data should have zero mean and unit variance.

This implementation works with data represented as dense or sparse arrays of floating point values for the features. The model it fits can be controlled with the loss parameter; by default, it fits a linear support vector machine (SVM).

The regularizer is a penalty added to the loss function that shrinks model parameters towards the zero vector using either the squared euclidean norm L2 or the absolute norm L1 or a combination of both (Elastic Net). If the parameter update crosses the 0.0 value because of the regularizer, the update is truncated to 0.0 to allow for learning sparse models and achieve online feature selection.

In [None]:
from sklearn.linear_model import SGDClassifier 
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix, f1_score
from sklearn.model_selection import cross_validate, cross_val_score

In [None]:
# Baseline linear classifier trained with SGD on the PCA features.
# SGD shuffles the samples between epochs; a fixed random_state makes the
# reported score reproducible across runs.
clf = SGDClassifier(loss = 'squared_error', random_state = 42)
clf.fit(pca_train, train_labels)
print("Score on the training data set: ", clf.score(pca_train, train_labels))

The score is extremely poor, which suggests that our data is not linearly separable (at least not with this loss function).

In [None]:
# Accuracy on the held-out test split of the classifier currently bound to `clf`.
sgd_test_accuracy = clf.score(pca_test, test_labels)
print("Score on the testing data set: ", sgd_test_accuracy)

Indeed, on the testing data set, only about one out of four samples is correctly classified. A random classifier would give similar results.

# Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression 

In [None]:
# Unregularised multinomial logistic regression on the PCA features.
# - `penalty=None` replaces the string 'none', which was deprecated in
#   scikit-learn 1.2 and later removed.
# - `multi_class` is no longer passed: it is deprecated in recent releases,
#   and the default already selects the multinomial formulation for a
#   multi-class problem with the default solver.
cls = LogisticRegression(penalty = None, max_iter = 1500, tol = 0.0001)
cls.fit(pca_train, train_labels)

print("Score on the training data set: ", cls.score(pca_train, train_labels))
print("Score on the testing data set: ", cls.score(pca_test, test_labels))

The score on the test set is poor: logistic regression does not work effectively on this data.

### Testing the variation of the different learning parameters

In [None]:
# Regularisation strengths (`alpha`) to try. The variable keeps its original
# name `ccp_alphas` because the plotting cell below reads it.
ccp_alphas = np.linspace(0.0001, 0.01, 20)
clfs = []
for alpha in ccp_alphas:
    # Same seed for every alpha, so differences between the fitted models come
    # from the regularisation strength and not from SGD's random shuffling.
    LC = SGDClassifier(loss = 'squared_error', alpha = alpha, random_state = 42)
    LC.fit(pca_train, train_labels)
    clfs.append(LC)

In [None]:
# Accuracy of every classifier in `clfs` on the training and testing splits.
train_scores = [model.score(pca_train, train_labels) for model in clfs]
test_scores = [model.score(pca_test, test_labels) for model in clfs]

# One curve per split, drawn against the swept alpha values.
fig, ax = plt.subplots(figsize=(10, 5))
ax.set(
    xlabel="alpha",
    ylabel="accuracy",
    title="Accuracy vs alpha for training and testing sets",
)
for split_name, split_scores in (("train", train_scores), ("test", test_scores)):
    ax.plot(ccp_alphas, split_scores, marker="o", label=split_name, drawstyle="steps-post")
ax.legend()
ax.grid()
plt.show()

### Adaptive learning rate

With `learning_rate='adaptive'`, the learning rate stays at `eta0` as long as the training loss keeps decreasing. Each time `n_iter_no_change` consecutive epochs fail to decrease the training loss by `tol` (or fail to increase the validation score by `tol` if `early_stopping` is True), the current learning rate is divided by 5.

In [None]:
# Same squared-error model, now with the adaptive learning-rate schedule
# starting from eta0 and a larger iteration budget.
# random_state fixes SGD's sample shuffling so the displayed score is reproducible.
clf = SGDClassifier(loss = 'squared_error', learning_rate = 'adaptive', eta0 = 0.001, max_iter = 15000, random_state = 42)
clf.fit(pca_train, train_labels)
# Last expression of the cell: the training accuracy is shown as the cell output.
clf.score(pca_train, train_labels)

### Different type of loss

In [None]:
# Loss functions to compare. 'log_loss' is the current name of the logistic
# loss; the former name 'log' was deprecated in scikit-learn 1.1 and later removed.
loss_func = ['hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']

# One fitted classifier per loss, in the same order as `loss_func`.
clfs = []
for func in loss_func:
    # Same seed for every loss so the comparison is not blurred by SGD's
    # random shuffling of the samples.
    clf = SGDClassifier(loss = func, random_state = 42)
    clf.fit(pca_train, train_labels)
    clfs.append(clf)

In [None]:
# Accuracy of every classifier in `clfs` (one per loss function) on both splits.
train_scores = [clf.score(pca_train, train_labels) for clf in clfs]
test_scores = [clf.score(pca_test, test_labels) for clf in clfs]

# Bar positions 1..len(loss_func); the loss names are used as tick labels.
number = list(range(1, len(loss_func) + 1))

# One bar chart per split. The bars carry a label so that ax.legend() has
# something to show (it was previously called with no labelled artist).
for split_name, split_scores, bar_color in (
    ("training", train_scores, 'blue'),
    ("testing", test_scores, 'red'),
):
    fig, ax = plt.subplots(figsize=(25, 5))
    ax.set_xlabel("Loss function")
    ax.set_ylabel("Accuracy")
    ax.set_title(f"Accuracy vs loss function for {split_name} sets")
    ax.bar(number, split_scores, tick_label = loss_func, width = 0.8,
           color = [bar_color], label = f"{split_name} accuracy")
    ax.legend()
    ax.grid()
    plt.show()

### From what we can see here, the 'hinge' loss performs better. However, the accuracy on the testing set is still poor. Let's try to improve that result.

In [None]:
# Finer sweep of the regularisation strength (`alpha`) for the hinge loss.
# The variable keeps its original name `ccp_alphas` because the plotting cell
# below reads it.
ccp_alphas = np.linspace(0.0001, 0.01, 500)
clfs = []
for alpha in ccp_alphas:
    # Same seed for every alpha, so the curve reflects the effect of alpha
    # rather than run-to-run noise from SGD's random shuffling.
    LC = SGDClassifier(loss = 'hinge', alpha = alpha, random_state = 42)
    LC.fit(pca_train, train_labels)
    clfs.append(LC)

In [None]:
# Score each classifier from the sweep on both splits.
train_scores = [fitted.score(pca_train, train_labels) for fitted in clfs]
test_scores = [fitted.score(pca_test, test_labels) for fitted in clfs]

# Wide figure: the sweep contains many alpha values.
fig, ax = plt.subplots(figsize=(30, 5))
ax.set(
    xlabel="alpha",
    ylabel="accuracy",
    title="Accuracy vs alpha for training and testing sets",
)
curves = {"train": train_scores, "test": test_scores}
for curve_label, curve_values in curves.items():
    ax.plot(ccp_alphas, curve_values, marker="o", label=curve_label, drawstyle="steps-post")
ax.legend()
ax.grid()
plt.show()

In [None]:
from tqdm import tqdm

# Regularisation penalties to compare.
penaltys = ['l1', 'l2', 'elasticnet']

# P[k] = [train accuracies, test accuracies] over the repeated fits for penaltys[k].
# NOTE: this rebinds `P`, which the pre-processing cell used for the fitted PCA object.
P = []

for pen in penaltys:
    print('\nPenalty mode: ', pen)
    p1, p2 = [], []
    for i in tqdm (range (50), desc="Loading..."):
        # Seed each repetition with its index: the 50 fits still differ from
        # one another, but the whole experiment is reproducible, and every
        # penalty is evaluated on the same set of seeds.
        clf = SGDClassifier(loss = 'hinge', penalty = pen, random_state = i)
        clf.fit(pca_train, train_labels)
        p1.append(clf.score(pca_train, train_labels))
        p2.append(clf.score(pca_test, test_labels))
    
    P.append([p1, p2])

In [None]:
# Mean accuracy per penalty mode. Each entry of P is
# [train accuracies, test accuracies] for one penalty.
# (The original loop variable was named `set`, shadowing the builtin.)
scores_train = [float(np.mean(train_runs)) for train_runs, _ in P]
scores_test = [float(np.mean(test_runs)) for _, test_runs in P]

fig, (ax1, ax2) = plt.subplots(2, figsize = (10, 10))

# Top panel: training accuracy. The bars are labelled so ax.legend() has an entry.
ax1.set_xlabel("Penalty mode")
ax1.set_ylabel("Mean accuracy")
ax1.set_title("Accuracy vs penalty mode for training sets over 50 classifiers")
ax1.bar([1, 2, 3], scores_train, tick_label = penaltys, width = 0.8, color = ['blue'], label = "train")
# Fixed y-range kept from the original; bars outside it are clipped.
ax1.set_ylim(0.7, 0.8)
ax1.legend()
ax1.grid()

# Bottom panel: testing accuracy. The x axis shows penalty modes (it was
# mislabelled "alpha").
ax2.set_xlabel("Penalty mode")
ax2.set_ylabel("Mean accuracy")
ax2.set_title("Mean accuracy vs penalty mode for testing sets over 50 classifiers")
ax2.bar([1, 2, 3], scores_test, tick_label = penaltys, width = 0.8, color = ['red'], label = "test")
# Fixed y-range kept from the original; bars outside it are clipped.
ax2.set_ylim(0.4, 0.50)
ax2.legend()
ax2.grid()

plt.show()