## Course: TM10007 - Machine learning
Editors: Lishia Vergeer, Amy Roos, Maaike Pruijt, Hilde Roording.

Description: The aim of this code is to predict the tumor grade of glioma’s(high or low) before surgery, 
based on features extracted from a combination of four MRI images: 
T2-weighted, T2-weighted FLAIR and T1-weighted before and after injection of contrast agent.

#### Import packages

In [237]:
# General packages
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets as ds

# Import code
from brats.load_data import load_data

# Performance 
from sklearn.model_selection import train_test_split
from sklearn import decomposition
import seaborn


# Pipeline and gridsearch
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV

#preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
# scaler
from sklearn.preprocessing import RobustScaler

#Machine learning classifiers
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn import feature_selection 
from sklearn import preprocessing
from sklearn import neighbors
from sklearn.model_selection import ShuffleSplit

import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn import svm
from sklearn.model_selection import cross_val_score, KFold
from sklearn.neighbors import KNeighborsRegressor



#### Load data

In [238]:
data_brats = load_data()

# Convert to dataframe
X = pd.DataFrame(data_brats)

# Print data 
print(f'The number of samples in data_brats: {len(X.index)}')
print(f'The number of columns in data_brats: {len(X.columns)}')

The number of samples in data_brats: 167
The number of columns in data_brats: 725


  data = data.append(data2)


#### Split data in X and y
Split in X (data) and y (label)

In [239]:
# split column label from dataset X
y = X.pop('label')

#### Split data in train and test set
This function creates a panda dataframe and splits the data into test and train components.
This is done with test_size variable and the function train_test_split from the sklearn module.
Returns a train set with the data of 80% and a test set of 20% of the subjects.



In [240]:
# Split data in train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2)  


#### Divergent values to NaN

In [241]:
# infinity to NaN
X_train[X_train==np.inf]=np.nan
X_test[X_test==np.inf]=np.nan

# non-numeric features to NaN
X_train = X_train.replace(['#DIV/0!'], np.nan)
X_train = X_train.apply(pd.to_numeric, errors='coerce')

X_test  = X_test.replace(['#DIV/0!'], np.nan)
X_test = X_test.apply(pd.to_numeric, errors='coerce')

# Outliers to NaN
for column in X_train:
    # Information outliers
    outliers = [x for x in X_train[column] if x < lower or x > upper]
    outliers_removed = [x for x in X_train[column] if x >= lower and x <= upper]
    print('Identified outliers: %d' % len(outliers))
    print('Non-outlier observations: %d' % len(outliers_removed))

    # Removing outliers
    q1 = X_train[column].quantile(0.25)
    q3 = X_train[column].quantile(0.75)
    iqr = q3-q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    print(column)

    X_train[column].loc[X_train[column] > upper] = np.nan
    X_train[column].loc[X_train[column] < lower] = np.nan

    # Information outliers after removal
    outliers = [x for x in X_train[column] if x < lower or x > upper]
    outliers_removed = [x for x in X_train[column] if x >= lower and x <= upper]
    print('Identified outliers: %d' % len(outliers))
    print('Non-outlier observations: %d' % len(outliers_removed))

Identified outliers: 133
Non-outlier observations: 0
VOLUME_ET
Identified outliers: 0
Non-outlier observations: 127
Identified outliers: 18
Non-outlier observations: 115
VOLUME_NET
Identified outliers: 0
Non-outlier observations: 126
Identified outliers: 21
Non-outlier observations: 112
VOLUME_ED
Identified outliers: 0
Non-outlier observations: 130
Identified outliers: 2
Non-outlier observations: 131
VOLUME_TC
Identified outliers: 0
Non-outlier observations: 128
Identified outliers: 44
Non-outlier observations: 89


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[column].loc[X_train[column] > upper] = np.nan


VOLUME_WT
Identified outliers: 0
Non-outlier observations: 132
Identified outliers: 133
Non-outlier observations: 0
VOLUME_BRAIN
Identified outliers: 0
Non-outlier observations: 129
Identified outliers: 133
Non-outlier observations: 0
VOLUME_ET_OVER_NET
Identified outliers: 0
Non-outlier observations: 117
Identified outliers: 1
Non-outlier observations: 131
VOLUME_ET_OVER_ED
Identified outliers: 0
Non-outlier observations: 121
Identified outliers: 24
Non-outlier observations: 108
VOLUME_NET_OVER_ED
Identified outliers: 0
Non-outlier observations: 116
Identified outliers: 0
Non-outlier observations: 133
VOLUME_ET_over_TC
Identified outliers: 0
Non-outlier observations: 133
Identified outliers: 0
Non-outlier observations: 133
VOLUME_NET_over_TC
Identified outliers: 0
Non-outlier observations: 133
Identified outliers: 41
Non-outlier observations: 92
VOLUME_ED_over_TC
Identified outliers: 0
Non-outlier observations: 119
Identified outliers: 0
Non-outlier observations: 133
VOLUME_ET_OVER_WT

#### Drop columns

In [242]:
# If the total number of NaN observations in a column are greater than 40%, delete the entire column.
perc = 40.0
min_count = int(((100-perc)/100)*X_train.shape[0] + 1)
X_train_drop = X_train.dropna(axis=1, thresh=min_count)
X_labels = X_train_drop.keys()

X_test_drop = X_test[X_labels]

#### Fill the NaN observations.


In [243]:
data_fill_train = X_train_drop.fillna(X_train_drop.mean()) 
data_fill_test = X_test_drop.fillna(X_test_drop.mean()) 

#### Scale features

In [244]:
# Scaler
scaler = MinMaxScaler()
scaler.fit(data_fill_train)
X_train_scaled = scaler.transform(data_fill_train)
X_test_scaled = scaler.transform(data_fill_test)

#### Transform features

In [245]:
# Perform a PCA
pca = decomposition.PCA(n_components=5)
pca.fit(X_train_scaled) 
X_train_pca = pca.transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)


#### Cross validation

In [246]:
# Create a cross-validation object
cv = KFold(n_splits=5, shuffle= True, random_state = 1)

X = X_train_pca

y = y_train.values
y = np.where(y=='GBM', 1, y)
y = np.where(y=='LGG', 0, y)
y = y.tolist()

classifiers = (KNeighborsClassifier(), RandomForestClassifier(), SVC(kernel="rbf", C=0.025, probability=True), NuSVC(probability=True), DecisionTreeClassifier(), AdaBoostClassifier(), GradientBoostingClassifier() )

for cls in classifiers:

    list_scores = cross_val_score(cls, X, y, scoring = 'accuracy', cv=cv)
    mean = np.mean(list_scores)

    print(list_scores)
    print(f'{cls} mean:', mean)
  

[0.92592593 0.88888889 0.66666667 0.84615385 0.76923077]
KNeighborsClassifier() mean: 0.8193732193732194
[0.88888889 0.77777778 0.81481481 0.84615385 0.80769231]
RandomForestClassifier() mean: 0.827065527065527
[0.66666667 0.66666667 0.55555556 0.53846154 0.61538462]
SVC(C=0.025, probability=True) mean: 0.6085470085470085
[0.92592593 0.88888889 0.74074074 0.88461538 0.88461538]
NuSVC(probability=True) mean: 0.8649572649572649
[0.51851852 0.62962963 0.7037037  0.80769231 0.65384615]
DecisionTreeClassifier() mean: 0.6626780626780626
[0.81481481 0.77777778 0.74074074 0.73076923 0.65384615]
AdaBoostClassifier() mean: 0.7435897435897435
[0.85185185 0.74074074 0.81481481 0.76923077 0.73076923]
GradientBoostingClassifier() mean: 0.7814814814814814


#### Classifier

In [247]:
# Construct classifiers
clsfs = [RandomForestClassifier(n_estimators=1, random_state=42),
         RandomForestClassifier(n_estimators=5, random_state=42),
         RandomForestClassifier(n_estimators=200, random_state=42)]