<a href="https://colab.research.google.com/github/Kristelwen/TM10007_PROJECT/blob/master/neural_networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## TM10007 Assignment

In [1]:
# Run this to use from colab environment
!pip install -q --upgrade git+https://github.com/Kristelwen/TM10007_PROJECT

  Building wheel for brats (setup.py) ... [?25l[?25hdone


## Data loading and cleaning

In [0]:
# Importing modules
# General packages
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
from time import time
from sklearn import datasets as ds
from scipy.stats import randint

# Preprocessing packages
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFECV

# Classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier

# SVM Kernels
from sklearn.kernel_approximation import RBFSampler
from sklearn.metrics.pairwise import rbf_kernel, sigmoid_kernel
from sklearn.decomposition import KernelPCA

# Model selection
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit

# Regularization
from sklearn.linear_model import Lasso, RidgeClassifier
from sklearn.feature_selection import SelectFromModel

In [3]:
# Data loading functions. Uncomment the one you want to use
# from adni.load_data import load_data
from brats.load_data import load_data
#from hn.load_data import load_data

data = load_data()
print(f'The number of samples: {len(data.index)}')
print(f'The number of columns: {len(data.columns)}')


The number of samples: 167
The number of columns: 725


## Preprocessing

In [0]:
# Drop columns which contain NaN values
threshold = math.floor(len(data)/2)  # calculate threshold, half of total rows
data_drop = data.dropna(thresh=threshold, axis=1)  # Delete columns/features with more than 'threshold' NaNs
data_drop = data_drop.dropna(axis=0)  # Delete rows/subjects with NaNs

# Split data and labels
labels = data_drop['label']
data_drop = data_drop.drop(columns="label")  # Data without labels

# Convert labels 'GBM' and 'LGG' to respectively 0 and 1
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

# Delete columns with strings (2 columns)
# data_strings = data_drop.select_dtypes(include=[object])
# columns_strings = list(data_strings.columns)
# data_no_strings = data_drop.drop(columns_strings, axis=1)

# Replace infinity values with NaN
#data_no_strings = data_no_strings.replace([np.inf, -np.inf], np.nan)
# print(np.isinf(data_no_strings.values).any()) - check if dataframe contains infinity values

# Split the data in a train (80%) and test set (20%) - OF MOET DIT VOOR FEATURE SCALING? Omdat je niks mag fitten op testdata
data_train, data_test, label_train, label_test = train_test_split(data_drop, labels, test_size=0.1)
data_train2, data_val, label_train2, label_val = train_test_split(data_train, label_train, test_size=0.1)

# Feature scaling
scaler = RobustScaler()
transformer = scaler.fit(data_train2.values)
data_scaled_train2 = transformer.transform(data_train2.values)
data_df_train2 = pd.DataFrame(data_scaled_train2, index = data_train2.index, columns = data_train2.columns)

data_scaled_val = transformer.transform(data_val.values)
data_df_val = pd.DataFrame(data_scaled_val, index = data_val.index, columns = data_val.columns)

data_scaled_test = transformer.transform(data_test.values)
data_df_test = pd.DataFrame(data_scaled_test, index = data_test.index, columns = data_test.columns)

# Optie 1 feature selection: PCA
  # Training set 2
pca_train = PCA(n_components=80)  # Create a PCA with 20 components
pca_train.fit(data_scaled_train2)  # Fit PCA
data_train_pca2 = pca_train.transform(data_scaled_train2)  # Transform train data using PCA
#df_train_pca2 = pd.DataFrame(data_train_pca2, index = data_scaled_train2.index)  # Put train data back in dataframe with 20 most important features
 
  # Training set 1
#data_train_pca = pca_train.transform(data_train)
  # Validatie set
data_val_pca = pca_train.transform(data_scaled_val)  # Transform test data using PCA

  # Test set
data_test_pca = pca_train.transform(data_scaled_test)  # Transform test data using PCA

# Optie 2 feature selection: RFECV


## Neural Networks

In [0]:
# Importing NN modules
from sklearn.neural_network import MLPClassifier

# Cross-validation / performance
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from scipy.stats import randint

In [0]:
# Preprocessing -> train 1 wel scalen want cross validatie op toepassen
transformer1 = scaler.fit(data_train.values)
data_scaled_train = transformer1.transform(data_train.values)
data_df_train = pd.DataFrame(data_scaled_train, index = data_train.index, columns = data_train.columns)


In [0]:
# Function definitions
class NeuralNetwork:
    def __init__(self, x, y):
        self.input      = x
        self.weights1   = np.random.rand(self.input.shape[1],4) 
        self.weights2   = np.random.rand(4,1)                 
        self.y          = y
        self.output     = np.zeros(self.y.shape)

    def feedforward(self):
        self.layer1 = sigmoid(np.dot(self.input, self.weights1))
        self.output = sigmoid(np.dot(self.layer1, self.weights2))

    def backprop(self):
        # application of the chain rule to find derivative of the loss function with respect to weights2 and weights1
        d_weights2 = np.dot(self.layer1.T, (2*(self.y - self.output) * sigmoid_derivative(self.output)))
        d_weights1 = np.dot(self.input.T,  (np.dot(2*(self.y - self.output) * sigmoid_derivative(self.output), self.weights2.T) * sigmoid_derivative(self.layer1)))

        # update the weights with the derivative (slope) of the loss function
        self.weights1 += d_weights1
        self.weights2 += d_weights2

In [92]:
# Hyperparameter optimization of Neural Network

# Our parameter to optimize is the number of estimators, which we vary uniformlybetween 1 and 400
parameter_space = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,), (7,2), (7,7,7),(9,9,9,9),(50,50,50,50), (50,50,50,50,50)],
    'activation': ['logistic','identity','tanh'],  # relu 
    'solver': ['sgd', 'adam'],  # lbfgs
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],  # invscaling
}
# Now use the classifiers on all datasets
fitted_mlps = list()
    
# Within a 5-fold cross-validation, try out 20 different number of trees
clf = RandomizedSearchCV(MLPClassifier(max_iter=2000), parameter_space, cv=5, random_state=42, return_train_score=True)

# Fit the classifier
clf.fit(data_train, label_train)

# Save for next part
fitted_mlps.append(clf)

# Get the best estimator and best parameters belonging to that estimator
print('Best parameters found:\n', clf.best_params_)
# print("")
# All results
# means = clf.cv_results_['mean_test_score']
# stds = clf.cv_results_['std_test_score']
# for mean, std, params in zip(means, stds, clf.cv_results_['params']):
#     print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

# print(f'\n The best estimator is {clf.best_estimator_} \n The best amount of trees is {clf.best_params_}')


  ret = a @ b
  ret = a @ b
  y = np.array(y > threshold, dtype=np.int)
  y = np.array(y > threshold, dtype=np.int)
  ret = a @ b
  ret = a @ b
  ret = umr_sum(arr, axis, dtype, out, keepdims)
  y = np.array(y > threshold, dtype=np.int)
  y = np.array(y > threshold, dtype=np.int)
  ret = a @ b
  ret = a @ b
  y = np.array(y > threshold, dtype=np.int)
  y = np.array(y > threshold, dtype=np.int)
  ret = a @ b
  ret = a @ b
  y = np.array(y > threshold, dtype=np.int)
  y = np.array(y > threshold, dtype=np.int)
  ret = a @ b
  ret = a @ b
  y = np.array(y > threshold, dtype=np.int)
  y = np.array(y > threshold, dtype=np.int)


Best parameters found:
 {'solver': 'adam', 'learning_rate': 'constant', 'hidden_layer_sizes': (7, 7, 7), 'alpha': 0.05, 'activation': 'identity'}


In [93]:
pd.DataFrame(clf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_solver,param_learning_rate,param_hidden_layer_sizes,param_alpha,param_activation,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.370547,0.336066,0.004404,6e-05,adam,constant,"(7, 2)",0.05,logistic,"{'solver': 'adam', 'learning_rate': 'constant'...",0.703704,0.692308,0.692308,0.692308,0.692308,0.694587,0.004558,4,0.692308,0.695238,0.695238,0.695238,0.695238,0.694652,0.001172
1,2.948804,0.950444,0.004881,0.000168,sgd,constant,"(100,)",0.0001,tanh,"{'solver': 'sgd', 'learning_rate': 'constant',...",0.62963,0.692308,0.692308,0.730769,0.692308,0.687464,0.032529,9,0.817308,0.857143,0.752381,0.857143,0.790476,0.81489,0.040199
2,2.410722,0.019928,0.00471,7.3e-05,sgd,constant,"(7, 2)",0.0001,identity,"{'solver': 'sgd', 'learning_rate': 'constant',...",0.703704,0.692308,0.692308,0.692308,0.692308,0.694587,0.004558,4,0.692308,0.695238,0.695238,0.695238,0.695238,0.694652,0.001172
3,0.114645,0.085951,0.004419,0.000144,adam,adaptive,"(7, 2)",0.0001,tanh,"{'solver': 'adam', 'learning_rate': 'adaptive'...",0.481481,0.769231,0.692308,0.5,0.423077,0.573219,0.133358,10,0.509615,0.685714,0.695238,0.647619,0.314286,0.570495,0.144315
4,1.388168,0.471474,0.004444,6.9e-05,adam,constant,"(7, 7, 7)",0.05,identity,"{'solver': 'adam', 'learning_rate': 'constant'...",0.703704,0.692308,0.615385,0.807692,0.769231,0.717664,0.066428,1,0.971154,0.933333,0.961905,0.92381,0.847619,0.927564,0.04363
5,0.111796,0.031122,0.004343,0.000107,adam,constant,"(7, 7, 7)",0.05,tanh,"{'solver': 'adam', 'learning_rate': 'constant'...",0.777778,0.692308,0.692308,0.692308,0.692308,0.709402,0.034188,2,0.759615,0.714286,0.695238,0.695238,0.695238,0.711923,0.024961
6,0.2607,0.149117,0.004332,2.1e-05,adam,adaptive,"(7, 2)",0.0001,logistic,"{'solver': 'adam', 'learning_rate': 'adaptive'...",0.703704,0.692308,0.692308,0.692308,0.692308,0.694587,0.004558,4,0.692308,0.695238,0.695238,0.695238,0.695238,0.694652,0.001172
7,0.33286,0.057417,0.004452,0.000163,sgd,adaptive,"(7, 7, 7)",0.0001,logistic,"{'solver': 'sgd', 'learning_rate': 'adaptive',...",0.703704,0.692308,0.692308,0.692308,0.692308,0.694587,0.004558,4,0.692308,0.695238,0.695238,0.695238,0.695238,0.694652,0.001172
8,0.345172,0.023982,0.004582,0.00014,sgd,constant,"(7, 7, 7)",0.0001,logistic,"{'solver': 'sgd', 'learning_rate': 'constant',...",0.703704,0.692308,0.692308,0.692308,0.692308,0.694587,0.004558,4,0.692308,0.695238,0.695238,0.695238,0.695238,0.694652,0.001172
9,0.351032,0.078825,0.005424,0.001152,adam,constant,"(100,)",0.0001,logistic,"{'solver': 'adam', 'learning_rate': 'constant'...",0.703704,0.730769,0.730769,0.692308,0.653846,0.702279,0.028533,3,0.701923,0.695238,0.67619,0.704762,0.72381,0.700385,0.015376


In [96]:
# Fitting the Neural Network Classifier
MLP = MLPClassifier(solver='adam', learning_rate='constant',hidden_layer_sizes=(9,9,9,9), alpha=0.0001, activation='identity')
# MLP = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    # hidden_layer_sizes=(7, 2), random_state=1)
MLP.fit(data_df_train2, label_train2)

# Predictions
train2_pred = MLP.predict(data_df_train2)
val_pred = MLP.predict(data_df_val)
train_pred = MLP.predict(data_df_train)

# Errors
error_train2 = (sum(abs(train2_pred - label_train2))/len(data_train2))*100
error_train2 = (round(error_train2, 2))
print ('The error for the training set is {}%'.format(error_train2))

error_val = (sum(abs(val_pred - label_val))/len(data_val))*100
error_val = (round(error_val, 2))
print ('The error for the validation set is {}%'.format(error_val))

# Confusion matrix and classification report
print('Confusion matrix and classification report of validation set')
print(confusion_matrix(label_val, val_pred))
print(classification_report(label_val, val_pred))

print('Confusion matrix and classification report of training set')
print(confusion_matrix(label_train2, train2_pred))
print(classification_report(label_train2, train2_pred))

print('Confusion matrix and classification report of bigger training set')
print(confusion_matrix(label_train, train_pred))
print(classification_report(label_train, train_pred))


The error for the training set is 6.84%
The error for the validation set is 14.29%
Confusion matrix and classification report of validation set
[[9 1]
 [1 3]]
              precision    recall  f1-score   support

           0       0.90      0.90      0.90        10
           1       0.75      0.75      0.75         4

    accuracy                           0.86        14
   macro avg       0.82      0.82      0.82        14
weighted avg       0.86      0.86      0.86        14

Confusion matrix and classification report of training set
[[79  2]
 [ 6 30]]
              precision    recall  f1-score   support

           0       0.93      0.98      0.95        81
           1       0.94      0.83      0.88        36

    accuracy                           0.93       117
   macro avg       0.93      0.90      0.92       117
weighted avg       0.93      0.93      0.93       117

Confusion matrix and classification report of bigger training set
[[86  5]
 [ 4 36]]
              precision 

In [84]:
# Evaluate neural network using three-fold cross-validation
score = cross_val_score(MLP, data_df_train, label_train, cv=3)

print(f'The accuracy of the validation set in the different folds is {score}')

The accuracy of the validation set in the different folds is [0.84090909 0.81818182 0.86046512]


