In [None]:
import os
import cv2
import random
import numpy as np 
import pandas as pd 
import seaborn as sns

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# To plot pretty figures
%matplotlib inline
import matplotlib as mlp
import matplotlib.pyplot as plt

## Data Loading

In [None]:
labels = ['glioma_tumor', 'meningioma_tumor', 'no_tumor', 'pituitary_tumor']
TRAIN_PATH = '../input/brain-tumor-classification-mri/Training'
TEST_PATH = '../input/brain-tumor-classification-mri/Testing'
new_size = (255, 255)

# Integer id assigned to every class name (ids are listed in increasing order).
class_map = {
    'no_tumor': 0,
    'glioma_tumor': 1,
    'pituitary_tumor': 2,
    'meningioma_tumor': 3
}


def load_split(root_dir, class_names, size):
    """Load every image below ``root_dir/<class_name>`` as a scaled grayscale array.

    Parameters
    ----------
    root_dir : str
        Directory containing one sub-directory per class.
    class_names : list of str
        Sub-directory names to read; each name is also used as the label.
    size : tuple of int
        Target (width, height) passed to ``cv2.resize``.

    Returns
    -------
    images : np.ndarray
        Stacked grayscale images with pixel values divided by 255.
    names : np.ndarray
        Class name of each image, in the same order as ``images``.
    """
    images, names = [], []
    for name in class_names:
        class_dir = os.path.join(root_dir, name)
        # Sorted so the sample order does not depend on the file system.
        for file_name in sorted(os.listdir(class_dir)):
            file_path = os.path.join(class_dir, file_name)
            img = cv2.imread(file_path)
            if img is None:
                # cv2.imread returns None for files it cannot decode; resizing None would crash.
                print(f"Skipping unreadable file: {file_path}")
                continue
            img = cv2.resize(img, size)
            # cv2.imread yields BGR channel order, so the BGR->gray conversion is the matching one.
            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) / 255
            images.append(img)
            names.append(name)
    return np.stack(images), np.array(names)


train_img, train_labels = load_split(TRAIN_PATH, labels, new_size)

print("train_img shape : ", train_img.shape)
print("train_labels shape : ", train_labels.shape)

test_img, test_labels = load_split(TEST_PATH, labels, new_size)

print("test_img shape : ", test_img.shape)
print("test_labels shape : ", test_labels.shape)

# Replace the class names by their integer ids.
train_labels = np.array([class_map[label] for label in train_labels])
test_labels = np.array([class_map[label] for label in test_labels])

## Data pre-processing

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler #the Standard Scaler : X2 = (X1 - E(X1))/sqrt(Var(X1))
from sklearn.decomposition import PCA

# Flatten every image into one feature vector; -1 infers the pixel count,
# so this cell keeps working if new_size is changed.
train_img = train_img.reshape((train_img.shape[0], -1))
test_img = test_img.reshape((test_img.shape[0], -1))

# The scaler and the PCA are fitted on the training images only and then applied to the test images.
S = StandardScaler()
X_train = S.fit_transform(train_img)
X_test = S.transform(test_img)
# Fixed seed: with this many features PCA may select its randomized SVD solver,
# whose components otherwise change from run to run.
P = PCA(n_components = 100, random_state = 42)
pca_train = P.fit_transform(X_train)
pca_test = P.transform(X_test)

In [None]:
# Glue the label onto each PCA feature row as a trailing column.
data_train = np.hstack((pca_train, train_labels.reshape(-1, 1)))
data_test = np.hstack((pca_test, test_labels.reshape(-1, 1)))

# Pool both splits; the cross-validation cells below work on this merged set.
all_data = np.vstack((data_train, data_test))
X, y = all_data[:, :-1], all_data[:, -1]

# C4.5/J48

Testing scikit-learn's `DecisionTreeClassifier` (an optimized CART implementation, used here in place of C4.5/J48) and computing cross-validated metrics for different hyperparameters.

In [None]:
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix, f1_score
from sklearn.model_selection import cross_validate, cross_val_score

In [None]:
def C45(X, y, max_depth, criterion= "gini"):
    """Cross-validate a decision tree for each depth limit and print the mean scores.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Class label of each row of ``X``.
    max_depth : iterable
        Depth limits to evaluate, one tree configuration per value.
    criterion : str, default "gini"
        Splitting criterion forwarded to ``DecisionTreeClassifier``.

    Returns
    -------
    None
        The 10-fold mean of every metric is printed.
    """
    # Printed metric label -> scikit-learn scorer name.
    scorers = {
        "Accuracy": "accuracy",
        "Precision": "precision_weighted",
        "Recall": "recall_weighted",
        "f1-score": "f1_weighted",
        "roc score": "roc_auc_ovr_weighted",
    }

    for depth in max_depth:
        clf = DecisionTreeClassifier(max_depth = depth, criterion = criterion)
        # A single cross_validate call scores all metrics on the same 10 fitted trees,
        # instead of refitting 10 new trees for every metric.
        scores = cross_validate(clf, X, y, cv=10, scoring=list(scorers.values()))

        print(f'\n max_depth {depth}, criterion {criterion}')
        for label, scorer in scorers.items():
            print(f"10-fold {label} = ", scores[f"test_{scorer}"].mean())
        

In [None]:
# Evaluate five depth limits with C45's default criterion ("gini").
C45(X, y, max_depth = [5, 10, 20, 50, 100])

Accuracy increases with max_depth.
However, the change in accuracy and the other metrics between max_depth = 20 and max_depth = 100 is not significant.

In [None]:
# Plotting a decision tree with max_depth = 3

clf = DecisionTreeClassifier(max_depth = 3)
clf.fit(pca_train, train_labels)

# Node captions: the keys of class_map, in the order they were inserted.
node_class_names = [name for name in class_map]

plt.figure(figsize=(20, 12))
sklearn.tree.plot_tree(clf, class_names = node_class_names, fontsize=9)
plt.show()

The Gini impurity tends to decrease at each split (ideally reaching 0 in pure leaves), but this is not necessarily always the case, which is not optimal. This sub-optimal behavior is reflected in the "value" attribute of each node, where there is often no predominant class.

## OTHER HYPERPARAMETERS TO INVESTIGATE (for tree size reduction)

- other splitting criteria than "gini" (Gini impurity) --> see random forest section
- min_samples_leaf (by default = 1)
- (possibly) min_impurity_decrease (i.e. splitting only if it induces an impurity decrease greater than or equal to the threshold)
- ccp_alpha : prunes the tree (removes branches). The greater ccp_alpha is, the smaller the resulting tree (i.e. more nodes are pruned)

### Cost-Complexity Pruning (CCP alpha)

Pruning the tree $T$ to minimize :
$$R_{\alpha}(T) = R(T) + \alpha |\tilde{T}|$$
Where $R(T)$ is the weighted impurity (see splitting criterion) and $|\tilde{T}|$ is the number of terminal nodes in $T$. 
The more we increase $\alpha$, the more we prune $T$

In [None]:
ccp_alphas = np.linspace(0, 0.05, 20)
# One depth-50 tree per pruning strength, each fitted on the training PCA features.
clfs = [
    DecisionTreeClassifier(max_depth = 50, ccp_alpha = alpha).fit(pca_train, train_labels)
    for alpha in ccp_alphas
]

In [None]:
# Accuracy of every pruned tree on the training and on the testing features.
train_scores = [tree.score(pca_train, train_labels) for tree in clfs]
test_scores = [tree.score(pca_test, test_labels) for tree in clfs]

fig, ax = plt.subplots(figsize=(10, 5))
ax.set(
    xlabel="alpha",
    ylabel="accuracy",
    title="Accuracy vs alpha for training and testing sets",
)
for split_name, split_scores in (("train", train_scores), ("test", test_scores)):
    ax.plot(ccp_alphas, split_scores, marker="o", label=split_name, drawstyle="steps-post")
ax.legend()
plt.show()

Clear overfitting: the first step (alpha = 0) reaches 100% accuracy on the training set but only about 70% on the testing set (this is also reflected in the 10-fold cross-validation). CCP alpha pruning, however, makes the accuracy worse on both the training and testing sets. Pruning isn't effective in preventing overfitting here.

## Splitting Criteria

Scikit-learn notably provides two splitting criteria (impurity measures): Gini and Entropy.


To calculate Gini and Entropy impurities at node m, given that we have n classes :
$$ \forall k \in \{1, ... n\},$$
$$\ p_{mk} = \frac{\Bigl|\Bigl\{sample \ | \ sample \in \ m, sample \ class = k\Bigr\}\Bigr|}{\Bigl|\Bigl\{sample \ | \ sample \in \ m \Bigr\}\Bigr|} $$
$$ Gini(m) = \sum_{k=1}^{n} p_{mk}(1 - p_{mk})$$
$$Entropy(m) = -\sum_{k=1}^{n} p_{mk}\log(p_{mk})$$

A feature and a threshold can be selected according to the minimization of these impurities.
As results with Gini criterion have already been computed, we do it with the Entropy one this time

In [None]:
# testing Entropy criterion
# Same depth grid as the Gini run, so the two criteria can be compared depth by depth.

C45(X, y, max_depth = [5, 10, 20, 50, 100], criterion = "entropy")

Results a little worse than with Gini impurity criterion. Moreover, some overfitting seems to happen with max depth past 50.

## Minimum samples per leaf

Self explanatory

In [None]:
min_samp = np.linspace(1, 100, 20).astype('int')
# One depth-50 tree per minimum leaf size, each fitted on the training PCA features.
clfs = [
    DecisionTreeClassifier(max_depth = 50, min_samples_leaf = m).fit(pca_train, train_labels)
    for m in min_samp
]

In [None]:
# Accuracy of every tree on the training and on the testing features.
train_scores = [tree.score(pca_train, train_labels) for tree in clfs]
test_scores = [tree.score(pca_test, test_labels) for tree in clfs]

fig, ax = plt.subplots(figsize=(10, 5))
ax.set(
    xlabel="min samples per leaf",
    ylabel="accuracy",
    title="Accuracy vs min_samples_leaf for training and testing sets",
)
for split_name, split_scores in (("train", train_scores), ("test", test_scores)):
    ax.plot(min_samp, split_scores, marker="o", label=split_name, drawstyle="steps-post")
ax.legend()
plt.show()

Increasing the minimum number of samples per leaf makes the accuracy worse on both the training and testing sets. Moreover, it does not reduce overfitting.

## Minimum impurity decrease

A node is split only if the split decreases the impurity by at least this hyperparameter value (min_impurity_decrease).

In [None]:
imp = np.linspace(0, 0.05, 20)
# One depth-50 tree per impurity-decrease threshold, each fitted on the training PCA features.
clfs = [
    DecisionTreeClassifier(max_depth = 50, min_impurity_decrease = m).fit(pca_train, train_labels)
    for m in imp
]

In [None]:
# Accuracy of every tree on the training and on the testing features.
train_scores = [tree.score(pca_train, train_labels) for tree in clfs]
test_scores = [tree.score(pca_test, test_labels) for tree in clfs]

fig, ax = plt.subplots(figsize=(10, 5))
ax.set(
    xlabel="min impurity decrease",
    ylabel="accuracy",
    title="Accuracy vs min_impurity_decrease for training and testing sets",
)
for split_name, split_scores in (("train", train_scores), ("test", test_scores)):
    ax.plot(imp, split_scores, marker="o", label=split_name, drawstyle="steps-post")
ax.legend()
plt.show()

Overall decrease of accuracy on both the training and testing sets as min_impurity_decrease increases.

## Grid Search for optimal hyperparameters

WARNING : CAN TAKE SOME TIME (~3-5min)

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values: both criteria and three evenly spaced values per numeric hyperparameter.
criterion = ('gini', 'entropy')
max_depth = np.linspace(1, 101, 3).astype('int')
min_samples_leaf = np.linspace(1, 101, 3).astype('int')
min_impurity_decrease = np.linspace(0, 0.05, 3)
ccp_alpha = np.linspace(0, 0.05, 3)
parameters = dict(
    criterion=criterion,
    max_depth=max_depth,
    min_samples_leaf=min_samples_leaf,
    min_impurity_decrease=min_impurity_decrease,
    ccp_alpha=ccp_alpha,
)

# Exhaustive search: every combination is scored with 10-fold cross-validation.
DT = DecisionTreeClassifier()
clf = GridSearchCV(DT, parameters, cv=10)
clf.fit(X, y)

In [None]:
# Report the best hyperparameter combination found by the search fitted above and its score.
print("best params : ", clf.best_params_)
print("best score = ", clf.best_score_)

Tree-size hyperparameters do not help increase the classifier's accuracy. If needed, max_depth can be fixed at 50.

## RandomizedSearchCV for Optimizing  hyperparameters

Faster than GridSearchCV because not all parameter values are tried out.

In [None]:
from sklearn.model_selection import RandomizedSearchCV

DT = DecisionTreeClassifier()
# Candidate values: both criteria and three evenly spaced values per numeric hyperparameter.
criterion = ('gini', 'entropy')
max_depth = np.linspace(1, 101, 3).astype('int')
min_samples_leaf = np.linspace(1, 101, 3).astype('int')
min_impurity_decrease = np.linspace(0, 0.05, 3)
ccp_alpha = np.linspace(0, 0.05, 3)
parameters = {'criterion': criterion, 'max_depth' : max_depth, 'min_samples_leaf':min_samples_leaf, 'min_impurity_decrease':min_impurity_decrease, 'ccp_alpha':ccp_alpha}
# Fixed seed: without it a different subset of combinations is sampled on every run,
# so the reported "best params" could not be reproduced.
clf = RandomizedSearchCV(DT, parameters, cv=10, random_state=42)
clf.fit(X, y)


In [None]:
# Report the best hyperparameter combination found by the search fitted above and its score.
print("best params : ", clf.best_params_)
print("best score = ", clf.best_score_)

## Tree 1
With the best working parameters from the randomized search.

In [None]:
# DT: entropy tree with large leaves (min_samples_leaf=101) and a non-zero impurity threshold.
# The scoring cell below prints it under the "Random Search CV" heading.
DT = DecisionTreeClassifier(max_depth = 51, min_impurity_decrease = 0.025, criterion='entropy', ccp_alpha=0.0, min_samples_leaf=101)
clf = DT.fit(pca_train, train_labels)

# DT2: Gini tree limited only by max_depth=101.
# The scoring cell below prints it under the "Grid Search CV" heading.
DT2 = DecisionTreeClassifier(max_depth = 101, min_impurity_decrease = 0.0, criterion='gini', ccp_alpha=0.0, min_samples_leaf=1)
clf2 = DT2.fit(pca_train, train_labels)

In [None]:
# (printed heading, fitted tree) pairs, reported in this order.
for heading, tree in (("\n Grid Search CV", clf2), ("\n Random Search CV", clf)):
    train_scores = tree.score(pca_train, train_labels)
    test_scores = tree.score(pca_test, test_labels)

    print(heading)
    print(f"Training Accuracy...{train_scores}")
    print(f"Test Accuracy...{test_scores}")

The decision tree does not generalize well to new data. The test accuracy is way lower than the training accuracy.

## Tree 2
Moving 30% of the training samples to the testing set.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the training features.
x_train_30, x_test_30, y_train_30, y_test_30 = train_test_split(
    pca_train, train_labels, test_size=0.30, random_state=42
)

# Attach the labels to the held-out rows, then stack them on top of the original test set.
new_test_data = np.hstack((x_test_30, y_test_30.reshape(-1, 1)))
all_test_data = np.vstack((new_test_data, data_test))

x_test_30, y_test_30 = all_test_data[:, :-1], all_test_data[:, -1]

In [None]:
# Refit both tree configurations (DT and DT2) on the reduced training split.
clf = DT.fit(x_train_30, y_train_30)
clf2 = DT2.fit(x_train_30, y_train_30)

In [None]:
# (printed heading, fitted tree) pairs, reported in this order.
for heading, tree in (("\n Grid Search CV", clf2), ("\n Random Search CV", clf)):
    train_scores = tree.score(x_train_30, y_train_30)
    test_scores = tree.score(x_test_30, y_test_30)

    print(heading)
    print(f"Training Accuracy...{train_scores}")
    print(f"Test Accuracy...{test_scores}")

## Tree 3

Moving 60% of the training samples to the testing set.


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 60% of the training features.
x_train_60, x_test_60, y_train_60, y_test_60 = train_test_split(
    pca_train, train_labels, test_size=0.60, random_state=42
)

# Attach the labels to the held-out rows, then stack them on top of the original test set.
new_test_data = np.hstack((x_test_60, y_test_60.reshape(-1, 1)))
all_test_data = np.vstack((new_test_data, data_test))

x_test_60, y_test_60 = all_test_data[:, :-1], all_test_data[:, -1]

In [None]:
# Refit both tree configurations (DT and DT2) on the reduced training split.
clf = DT.fit(x_train_60, y_train_60)
clf2 = DT2.fit(x_train_60, y_train_60)

In [None]:
# (printed heading, fitted tree) pairs, reported in this order.
for heading, tree in (("\n Grid Search CV", clf2), ("\n Random Search CV", clf)):
    train_scores = tree.score(x_train_60, y_train_60)
    test_scores = tree.score(x_test_60, y_test_60)

    print(heading)
    print(f"Training Accuracy...{train_scores}")
    print(f"Test Accuracy...{test_scores}")

## Summary table

TODO

In [None]:
# Evaluation Buffers
# Row labels of the summary table, one per configuration evaluated.
hyperparam = ["gini max_depth 20",
              "gini max_depth 50",
              "gini max_depth 100",
              "entropy max_depth 50",
              "min_impurity_decrease 0.02",
              "min_impurity_decrease 0.05",
              "min_samples_leaf 10",
              "min_samples_leaf 20",
              "ccp_alpha 0.02",
              "ccp_alpha 0.05"]
# One list per metric; evaluator() appends one value per configuration.
Acc = []
Pr = []
Rec = []
F1 = []
AUC = []

def evaluator(X, y, max_depth = (None,), criterion = ("gini",), impurity_dec = (0.0,), leaf_samples = (1,), ccp_alphas = (0.0,)):
    """Cross-validate every hyperparameter combination and store the mean scores.

    The combinations are visited with ``max_depth`` varying slowest and
    ``ccp_alphas`` fastest. For each one, the 10-fold mean of every metric is
    appended to the module-level lists ``Acc``, ``Pr``, ``Rec``, ``F1`` and ``AUC``.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Class label of each row of ``X``.
    max_depth, criterion, impurity_dec, leaf_samples, ccp_alphas : iterable
        Candidate values for the ``max_depth``, ``criterion``,
        ``min_impurity_decrease``, ``min_samples_leaf`` and ``ccp_alpha``
        arguments of ``DecisionTreeClassifier``.

    Returns
    -------
    None
    """
    from itertools import product

    scoring = ["accuracy", "precision_weighted", "recall_weighted", "f1_weighted", "roc_auc_ovr_weighted"]
    # Buffers filled in the same order as `scoring`.
    buffers = [Acc, Pr, Rec, F1, AUC]

    for depth, crit, mid, ls, ccp in product(max_depth, criterion, impurity_dec, leaf_samples, ccp_alphas):
        clf = DecisionTreeClassifier(max_depth = depth, criterion = crit, min_impurity_decrease=mid, min_samples_leaf = ls, ccp_alpha=ccp)
        # A single cross_validate call scores all metrics on the same 10 fitted trees,
        # instead of refitting 10 new trees for every metric.
        scores = cross_validate(clf, X, y, cv=10, scoring=scoring)
        for buffer, scorer in zip(buffers, scoring):
            buffer.append(scores[f"test_{scorer}"].mean())

In [None]:
# The calls are ordered to match the labels in `hyperparam` (3 + 1 + 2 + 2 + 2 = 10 rows).
# NOTE: evaluator appends to the global buffers, so re-running this cell without
# re-running the cell that resets them adds duplicate rows.
evaluator(X,y, max_depth = [20, 50, 100])
evaluator(X,y,max_depth = [50], criterion = ["entropy"])
evaluator(X,y,max_depth = [50], impurity_dec = [0.02, 0.05])
evaluator(X,y,max_depth = [50], leaf_samples = [10, 20])
evaluator(X,y,max_depth = [50], ccp_alphas = [0.02, 0.05])

In [None]:
# Keep the printed frame on one line per row instead of wrapping wide tables.
pd.set_option('expand_frame_repr', False)

# Column title -> values collected by the evaluation cells.
column_titles = (
    'Hyperparameter',
    'Accuracy',
    'Weighted Precision',
    'Weighted Recall',
    'Weighted F1 Score',
    'Weighted AUC',
)
data = dict(zip(column_titles, (hyperparam, Acc, Pr, Rec, F1, AUC)))
resume = pd.DataFrame(data)

print(resume)

## Random forest

A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting.

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
def randomForest(X, y, n = 100, crit = 'gini', max_depth = 1):
    """Return the 10-fold mean scores of a random forest.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Class label of each row of ``X``.
    n : int, default 100
        Number of trees (``n_estimators``).
    crit : str, default 'gini'
        Splitting criterion.
    max_depth : int, default 1
        Depth limit of every tree.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, roc)``: mean accuracy, weighted
        precision, weighted recall, weighted F1 and weighted one-vs-rest ROC AUC.
    """
    clf = RandomForestClassifier(n_estimators = n, criterion = crit, max_depth = max_depth)

    scoring = ["accuracy", "precision_weighted", "recall_weighted", "f1_weighted", "roc_auc_ovr_weighted"]
    # No fit on the full data beforehand: cross-validation fits its own copy of the
    # estimator for every fold, so that fit was wasted work. A single cross_validate
    # call also scores all metrics on the same 10 forests.
    scores = cross_validate(clf, X, y, cv=10, scoring=scoring)

    return tuple(scores[f"test_{scorer}"].mean() for scorer in scoring)

### Warning: extremely long run

In [None]:
from tabulate import tabulate

estimators = [10, 20]
criterions = ['gini', 'entropy']
depth = [5, 10, 20]


# One table per criterion: rows are depth limits, columns are forest sizes.
for crit in criterions:
    print(f"\nUsing criterion {crit}:")
    # The backslash is escaped: a bare "\ " is an invalid escape sequence in a string literal.
    lines = [["max_depth \\ n_estimators"] + estimators]

    for dep in depth:
        row = [f"\n max_depth: {dep}"]

        for esti in estimators:
            acc, precision, recall, f1, roc = randomForest(X, y, esti, crit, dep)
            row.append(f"accuracy: {acc}\n precision: {precision}\n recall: {recall}\n f1: {f1}\n roc: {roc}")

        lines.append(row)

    print(tabulate(lines))
    

## RandomizedSearchCV for Optimizing RandomForests  hyperparameters

In [None]:
from sklearn.model_selection import RandomizedSearchCV

DT = RandomForestClassifier()
# Every forest size from 10 to 50: the same set of integers the former
# np.linspace(10, 50, 100).astype('int') produced, without its duplicates.
n_estimators = np.arange(10, 51)
criterion = ('gini', 'entropy')
max_depth = np.linspace(1, 101, 3).astype('int')
min_samples_leaf = np.linspace(1, 101, 3).astype('int')
min_impurity_decrease = np.linspace(0, 0.05, 3)
ccp_alpha = np.linspace(0, 0.05, 3)
# n_estimators is now part of the search space; it used to be built but never passed on.
parameters = {'n_estimators': n_estimators, 'criterion': criterion, 'max_depth' : max_depth, 'min_samples_leaf':min_samples_leaf, 'min_impurity_decrease':min_impurity_decrease, 'ccp_alpha':ccp_alpha}
# Fixed seed so the same combinations are sampled on every run.
clf = RandomizedSearchCV(DT, parameters, cv=10, random_state=42)
clf.fit(X, y)

In [None]:
# Report the best hyperparameter combination found by the search fitted above and its score.
print("best params : ", clf.best_params_)
print("best score = ", clf.best_score_)

## Tree 1 (RandomForest)

In [None]:
# From here on DT is a random forest (the name previously held a single decision tree).
# It is fitted on the full training PCA features.
DT = RandomForestClassifier(max_depth = 101, min_impurity_decrease = 0.0, criterion='gini', ccp_alpha=0.0, min_samples_leaf=1)
clf = DT.fit(pca_train, train_labels)

In [None]:
# Accuracy on the data the forest was fitted on vs. on the original test set.
train_scores = clf.score(pca_train, train_labels)
test_scores = clf.score(pca_test, test_labels) 
print(f"Training Accuracy...{train_scores}")
print(f"Test Accuracy...{test_scores}")

## Tree 2 (RandomForest)

In [None]:
# Refit the forest on the reduced training split, from which 30% of the samples were held out.
clf = DT.fit(x_train_30, y_train_30)

In [None]:
# Accuracy on the reduced training split vs. on the enlarged test set (held-out rows + original test set).
train_scores = clf.score(x_train_30, y_train_30)
test_scores = clf.score(x_test_30, y_test_30) 
print(f"Training Accuracy...{train_scores}")
print(f"Test Accuracy...{test_scores}")

## Tree 3 (RandomForest)

In [None]:
# Refit the forest on the reduced training split, from which 60% of the samples were held out.
clf = DT.fit(x_train_60, y_train_60)

In [None]:
# Accuracy on the reduced training split vs. on the enlarged test set (held-out rows + original test set).
train_scores = clf.score(x_train_60, y_train_60)
test_scores = clf.score(x_test_60, y_test_60) 
print(f"Training Accuracy...{train_scores}")
print(f"Test Accuracy...{test_scores}")