<a href="https://colab.research.google.com/github/JSSchouten/TM10007_Group_10/blob/master/Data_TM10007.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Run this cell first when working in Google Colab: it installs the course package that provides `brats.load_data`
!pip install -q --upgrade git+https://github.com/karinvangarderen/tm10007_project.git

  Building wheel for brats (setup.py) ... [?25l[?25hdone


# Code

In [0]:
'''
all necessary imports
'''
# general functions
import pandas as pd
import numpy as np
import seaborn
import matplotlib.pyplot as plt
from pathlib import Path

# load data
from brats.load_data  import load_data

# preprocessing and scaling
from sklearn.model_selection    import train_test_split
from sklearn                    import preprocessing
from sklearn.decomposition      import PCA
from sklearn.feature_selection  import SelectKBest, f_classif

# classifiers
from sklearn.model_selection        import cross_val_score, learning_curve
from sklearn.neighbors              import KNeighborsClassifier
from sklearn.ensemble               import RandomForestClassifier
from sklearn                        import svm

# calculate accuracy values
from sklearn.metrics  import accuracy_score
from sklearn.metrics  import confusion_matrix

In [0]:
# Load the dataset through the load_data helper imported from the brats package.
data = load_data()
# Unique column labels of the loaded table. A set has no defined order, so
# neither does this list.
data_columns = list(set(data.columns))
# print(f'The number of samples: {len(data.index)}')
# print(f'The number of columns: {len(data.columns)}')

In [0]:
'''
Learning curve function copied from assignment
'''


def plot_learning_curve(estimator, title, X, y, axes, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Draw the learning curve of an estimator on one matplotlib axes.

    The mean training score and the mean cross-validation score are plotted
    against the number of training examples; a shaded band of one standard
    deviation (computed over the cross-validation folds) surrounds each curve.

    Parameters
    ----------
    estimator : object
        Estimator handed unchanged to ``learning_curve``.

    title : string
        Title for the chart.

    X : array-like
        Feature data handed unchanged to ``learning_curve``.

    y : array-like
        Targets handed unchanged to ``learning_curve``.

    axes : single axes object
        The axes to draw on. It is required: the function calls
        ``set_title``, ``plot``, ``fill_between`` ... directly on it.

    ylim : tuple (ymin, ymax), optional
        When given, it is unpacked into ``axes.set_ylim``.

    cv : optional
        Cross-validation setting forwarded to ``learning_curve``.

    n_jobs : int or None, optional (default=None)
        Parallelism setting forwarded to ``learning_curve``.

    train_sizes : array-like, optional
        Training-set sizes forwarded to ``learning_curve``
        (default: np.linspace(0.1, 1.0, 5)).

    Returns
    -------
    The ``matplotlib.pyplot`` module (``plt``), not a figure or axes.
    """

    # Decorate the axes before any data is drawn
    axes.set_title(title)
    if ylim is not None:
        axes.set_ylim(*ylim)
    axes.set_xlabel("Training examples")
    axes.set_ylabel("Score")

    sizes, scores_train, scores_cv = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # (colour, legend label, mean per training size, std per training size)
    series = (
        ("r", "Training score",
         np.mean(scores_train, axis=1), np.std(scores_train, axis=1)),
        ("g", "Cross-validation score",
         np.mean(scores_cv, axis=1), np.std(scores_cv, axis=1)),
    )

    axes.grid()
    # Shaded +/- one standard deviation bands first ...
    for colour, _, centre, spread in series:
        axes.fill_between(sizes, centre - spread, centre + spread,
                          alpha=0.1, color=colour)
    # ... then the mean curves on top
    for colour, caption, centre, _ in series:
        axes.plot(sizes, centre, 'o-', color=colour, label=caption)
    axes.legend(loc="best")

    return plt

In [0]:
'''
All other functions used
'''


def split(data, test_size=0.2, random_state=None):
  '''
  Divide data in a training and test set (80% - 20% by default).

  The target is the column named 'label'. It is removed from the features by
  name, so the split stays correct wherever that column sits in the table
  (slicing off the last column would leak the label into the features
  whenever 'label' is not the final column).

  Parameters
  ----------
  data : DataFrame containing the feature columns and a 'label' column
  test_size : forwarded to train_test_split (default 0.2)
  random_state : forwarded to train_test_split; the default None keeps the
      split different on every call

  Returns
  -------
  x_train, x_test, y_train, y_test
  '''
  x = data.drop(columns='label')
  y = data['label']
  x_train, x_test, y_train, y_test = train_test_split(
      x, y, test_size=test_size, random_state=random_state)

  return x_train, x_test, y_train, y_test


def delnan(x):
  '''
  Replace all values causing errors by NaN, and replace those and pre-
  existing NaNs by the column's median.

  Steps, in order:
    1. every column is coerced to numeric; entries that cannot be parsed
       (e.g. strings) become NaN
    2. inf and -inf are replaced by NaN
    3. columns with fewer than round(0.5 * number of rows) valid values
       are dropped
    4. the remaining NaNs are filled with the median of their column

  The input is not modified and the surviving columns keep their original
  order, so the result is the same on every run.

  Parameters
  ----------
  x : DataFrame with the raw feature values

  Returns
  -------
  DataFrame with numeric columns, NaNs filled with column medians
  '''
  ## Replace strings by NaN (column by column, so the column order is kept)
  x_num = x.apply(pd.to_numeric, errors='coerce')
  ## Replace inf and -inf by NaN; done after the coercion so that infinities
  ## that were stored as text are removed as well
  x_num = x_num.replace([np.inf, -np.inf], np.nan)
  ## Delete all columns containing over 50% NaN
  x_del_nan = x_num.dropna(axis='columns', thresh=round(0.5 * len(x_num)))
  ## Replace all NaNs with the column's median
  x_finished = x_del_nan.fillna(x_del_nan.median())

  return x_finished


def standardscaler(x_train, x_test):
  '''
  Standard-scale the training and test features.

  A StandardScaler is fitted on the training set only and then applied to
  both sets. The transformed values are wrapped in new DataFrames that reuse
  the column labels of their input; no index is passed, so the new frames
  get pandas' default index.

  Parameters
  ----------
  x_train, x_test : DataFrames (their ``columns`` attribute is read)

  Returns
  -------
  (scaled training DataFrame, scaled test DataFrame)
  '''
  ## Learn the scaling from the training data only
  fitted_scaler = preprocessing.StandardScaler().fit(x_train)

  ## Apply it to both sets and restore the column labels
  train_scaled = pd.DataFrame(fitted_scaler.transform(x_train),
                              columns=x_train.columns)
  test_scaled = pd.DataFrame(fitted_scaler.transform(x_test),
                             columns=x_test.columns)

  return train_scaled, test_scaled


def select_features(x_train, x_test, y_train):
  '''
  Select discriminating features using PCA & univariate feature selection.

  Two independent reductions are fitted on the training set and applied to
  both sets:
    * a PCA keeping the smallest number of components whose cumulative
      explained variance reaches 75%
    * SelectKBest with the ANOVA F-score (f_classif), keeping 5 features

  Parameters
  ----------
  x_train, x_test : DataFrames of features (the column labels of x_train
      are used to name the selected features)
  y_train : training labels, used by SelectKBest

  Returns
  -------
  x_pca_train, x_Kbest_train, x_pca_test, x_Kbest_test : DataFrames
  used_features : list with the labels of the 5 selected columns
  '''
  ## PCA feature selection
  pca = PCA(n_components=None)
  pca.fit(x_train)

  variancelist = np.cumsum(pca.explained_variance_ratio_)
  # plt.figure(figsize=(12,8))
  # plt.plot(variancelist)
  # plt.xlabel('number of components')
  # plt.ylabel('cumulative explained variance');

  ## Determine the amount of components containing 75% of the variance.
  ## searchsorted returns the 0-based position of the first component at
  ## which the cumulative variance reaches 0.75, so the number of components
  ## is that position + 1 (capped at the number of available components).
  comp = int(np.searchsorted(variancelist, 0.75)) + 1
  comp = min(comp, len(variancelist))

  pca_spec = PCA(n_components=comp)
  pca_spec.fit(x_train)

  ## Apply PCA to the different sets
  x_pca_train = pd.DataFrame(pca_spec.transform(x_train))
  x_pca_test = pd.DataFrame(pca_spec.transform(x_test))

  ## Select the five best features and apply them to the different sets
  Kbest = SelectKBest(f_classif, k=5).fit(x_train, y_train)
  x_Kbest_train = pd.DataFrame(Kbest.transform(x_train))
  x_Kbest_test = pd.DataFrame(Kbest.transform(x_test))

  ## Determine the 5 used features: keep the labels whose mask entry is True
  feature_names = list(x_train.columns.values)
  mask = Kbest.get_support() # list of booleans
  used_features = [feature for selected, feature in zip(mask, feature_names)
                   if selected]

  return x_pca_train, x_Kbest_train, x_pca_test, x_Kbest_test, used_features


def pair_plot(x,y,features):
  '''
  Assemble the table for a pairplot of the selected features.

  The feature values get `features` as column labels, the labels are put in
  a 'label' column with a fresh 0..n-1 index, and both are placed side by
  side. The seaborn call itself is commented out, so nothing is drawn and
  the function returns None.
  '''
  feature_frame = pd.DataFrame(x)
  feature_frame.columns = features
  label_frame = pd.DataFrame(y, columns=['label']).reset_index(drop=True)
  total = pd.concat([feature_frame, label_frame], axis=1)
  # fig = seaborn.pairplot(total, hue='label')


def crosval (x_train, y_train, x_test, y_test):
  '''
  Determine classifier performance using cross-validation.

  Three fixed classifiers (linear SVM, 9-nearest-neighbours, small random
  forest) are scored with 5-fold cross-validation on the training set. The
  one with the highest mean score is refitted on the complete training set
  and evaluated on the test set.

  Parameters
  ----------
  x_train, y_train : training features and labels
  x_test, y_test : test features and labels

  Returns
  -------
  clf_max_name : display name of the selected classifier
  perf : [accuracy, sensitivity, specificity] on the test set, where
      sensitivity is the recall of the first class (in sorted label order)
      and specificity the recall of the second class. A value is NaN when
      the test set holds no sample of the corresponding class.
  '''
  ## Identify the classifiers used for cross validation
  clfs = [svm.SVC(C=0.05 ,kernel='linear'), KNeighborsClassifier(n_neighbors=9), RandomForestClassifier(n_estimators=5, random_state=42, max_depth=4)]
  clfs_name = ['Support Vector Machine','K Nearest Neighbors','Random Forest']

  ## Mean 5-fold cross-validation score per classifier.
  ## (plot_learning_curve(clf, str(type(clf)), x_train, y_train, axes=ax)
  ## can be called per classifier here to inspect the learning curves.)
  train_acc = [cross_val_score(clf, x_train, y_train, cv=5).mean()
               for clf in clfs]

  ## Determine the best performing classifier (first one wins on a tie)
  idx_max = int(np.argmax(train_acc))
  clf_max = clfs[idx_max]
  clf_max_name = clfs_name[idx_max]

  ## Apply chosen classifier on test data
  clf_max.fit(x_train, y_train)
  y_pred = clf_max.predict(x_test)
  accuracy = accuracy_score(y_test, y_pred)

  ## Fix the label set explicitly so the matrix keeps one row/column per
  ## class even when the test split happens to contain a single class;
  ## otherwise the indexing below would fail on a 1x1 matrix.
  labels = np.unique(np.concatenate([np.asarray(y_train), np.asarray(y_test)]))
  matrix = confusion_matrix(y_test, y_pred, labels=labels)
  ## A class without test samples gives 0/0 -> NaN; silence the warning
  with np.errstate(divide='ignore', invalid='ignore'):
    sens = matrix[0,0] / (matrix[0,0]+matrix[0,1])
    spec = matrix[1,1] / (matrix[1,0]+matrix[1,1])

  perf = [accuracy, sens, spec]

  return clf_max_name, perf


def print_result(result, feature):
  '''
  Print a text report of all iterations.

  Sections, in order:
    * the complete result dataframe
    * how often each classifier was selected, in percent
    * the mean accuracy, sensitivity and specificity over all iterations
    * how often each feature was selected; the relative counts are
      multiplied by 100*5

  Parameters
  ----------
  result : DataFrame with the columns 'Classifier', 'Accuracy',
      'Sensitivity' and 'Specificity'
  feature : DataFrame with a 'Feature' column
  '''
  thin_rule = '-' * 80
  thick_rule = '=' * 80

  print(thin_rule + '\n' + 'Result')
  print(result)

  print(thin_rule + '\n' + 'Prevalence of classifiers')
  classifier_share = result['Classifier'].value_counts(normalize=True)
  print(classifier_share * 100)

  print(thick_rule + '\n' + 'Results over all iterations')
  summary_rows = (('Mean accuracy:', 'Accuracy'),
                  ('Sensitivity:', 'Sensitivity'),
                  ('Specificity:', 'Specificity'))
  for caption, column in summary_rows:
    print(caption, result[column].mean())
  print(thick_rule)

  print('Prevalence of selected features:' + '\n' + thin_rule + '\n')
  feature_share = feature['Feature'].value_counts(normalize=True)
  print(feature_share * 100 * 5)


def runscript (data_train, data_test, labels_train, labels_test):
  '''
  Run all the different functions needed for one iteration.

  Pipeline: clean both sets (delnan), align the test columns to the training
  columns, scale (standardscaler), select features (select_features), build
  the pairplot table (pair_plot) and finally pick and evaluate the best
  classifier (crosval) on the 5 univariately selected features.

  Parameters
  ----------
  data_train, data_test : raw feature DataFrames of one split
  labels_train, labels_test : the matching labels

  Returns
  -------
  clf : name of the classifier chosen in this iteration
  perf : [accuracy, sensitivity, specificity] on the test set
  features : labels of the selected features
  '''
  x_train = delnan(data_train)
  x_test = delnan(data_test)

  ## delnan decides per set which sparse columns to drop, so the two sets can
  ## end up with different columns. Give the test set exactly the training
  ## columns: extra columns are discarded, columns missing from the test set
  ## are filled with the training median.
  x_test = x_test.reindex(columns=x_train.columns).fillna(x_train.median())

  x_scaled_train, x_scaled_test = standardscaler(x_train, x_test)
  x_pca_train, x_Kbest_train, x_pca_test, x_Kbest_test, features = select_features(x_scaled_train, x_scaled_test, labels_train)
  pair_plot(x_Kbest_train, labels_train, features)
  clf, perf = crosval(x_Kbest_train, labels_train, x_Kbest_test, labels_test)

  return clf, perf, features


def start(data, iterations):
  '''
  Run the iterations. Print and save results.

  Every iteration makes a new train/test split of `data` and runs the whole
  pipeline on it. The per-iteration results are collected in plain lists and
  turned into DataFrames once at the end (DataFrame.append no longer exists
  in pandas 2.x and copied the whole frame on every call).

  Parameters
  ----------
  data : DataFrame with the features and the 'label' column
  iterations : number of split/train/evaluate rounds to run

  Returns
  -------
  outcome : DataFrame with one row per iteration and the columns
      'Classifier', 'Accuracy', 'Sensitivity' and 'Specificity'

  Side effects: prints the report (print_result) and writes `outcome` to
  'results_TM10007.csv' in the current working directory.
  '''
  result_columns = ['Classifier', 'Accuracy','Sensitivity','Specificity']
  result_rows = []       # one dict per iteration
  selected_features = [] # every selected feature of every iteration

  ## Run the script "iterations" time
  for _ in range(iterations):
    x_train, x_test, y_train, y_test = split(data)
    clf, perf, features = runscript(x_train, x_test, y_train, y_test)

    ## add results of iteration
    result_rows.append({'Classifier': str(clf), 'Accuracy': perf[0],
                        'Sensitivity': perf[1], 'Specificity': perf[2]})

    ## add features used in iteration
    selected_features.extend(features)

  outcome = pd.DataFrame(result_rows, columns=result_columns)
  used_features = pd.DataFrame({'Feature': selected_features},
                               columns=['Feature'])

  ## Print results
  print_result(outcome, used_features)

  ## Save results
  path = Path('results_TM10007.csv')
  outcome.to_csv(path)

  return outcome

In [0]:
# Run the complete experiment on all loaded data:
#   first argument  = the full dataset
#   second argument = the number of split/train/evaluate iterations
result = start(data, iterations=100)



--------------------------------------------------------------------------------
Result
                Classifier  Accuracy  Sensitivity  Specificity
0      K Nearest Neighbors  0.823529     0.882353     0.764706
1   Support Vector Machine  0.882353     0.952381     0.769231
2            Random Forest  0.911765     1.000000     0.769231
3   Support Vector Machine  0.911765     0.950000     0.857143
4            Random Forest  0.794118     0.863636     0.666667
..                     ...       ...          ...          ...
95     K Nearest Neighbors  0.852941     0.952381     0.692308
96     K Nearest Neighbors  0.823529     0.952381     0.615385
97           Random Forest  0.911765     0.900000     0.928571
98     K Nearest Neighbors  0.882353     0.952381     0.769231
99  Support Vector Machine  0.882353     0.952381     0.769231

[100 rows x 4 columns]
--------------------------------------------------------------------------------
Prevalence of classifiers
K Nearest Neighbors      