# TM10007 Assignment template Head and Neck cancer Radiomics 

In [17]:
# Run this to use from colab environment
#!pip install -q --upgrade git+https://github.com/karinvangarderen/tm10007_project.git

## Data loading and cleaning

Below are functions to load the dataset of your choice. After that, it is all up to you to create and evaluate a classification method. Beware, there may be missing values in these datasets. Good luck!

## Importing Modules

In [39]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut 
from sklearn.model_selection import RepeatedKFold
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
import sklearn.linear_model 
import numpy as np
import statistics

## 1. Data Collection 

In [5]:
from hn.load_data import load_data

# Load the dataset through the project's loader and report its size.
data = load_data()
print(f'The number of samples: {len(data.index)}')
print(f'The number of columns: {len(data.columns)}')

# Abort when the table contains missing values. The previous form
# (`print(...), sys.exit`) only built a tuple: `sys.exit` was never called and
# `sys` was never imported, so this branch ended in a NameError instead.
if data.isnull().values.any():
    raise SystemExit('In the csv data file, some values are missing or NaN')


The number of samples: 113
The number of columns: 160


## 2. Preprocessing 
The data is split into feature values and labels (high risk or low risk). The number of high-risk and low-risk patients is printed as output. This step also covers handling missing data and applying a standard scaler to the data.


In [36]:
# Feature matrix: every column except 'label', standardised column-wise.
features = data.loc[:, data.columns != 'label'].values
# NOTE(review): the scaler is fitted on all samples, i.e. before the train/test
# split performed further down, so test-set statistics leak into training.
# Consider fitting the scaler on the training split only.
features = StandardScaler().fit_transform(features)

# Encode the class labels as integers: 'T12' -> 0 (low risk), 'T34' -> 1 (high risk).
# An explicit mapping is used so that an unexpected label raises an error
# instead of silently ending up in the label array.
label_codes = {'T12': 0, 'T34': 1}
unknown_labels = set(data['label'].unique()) - set(label_codes)
if unknown_labels:
    raise ValueError(f'Unexpected label values: {sorted(map(str, unknown_labels))}')
labels = data['label'].map(label_codes).values

print(f'Number of high risk patients: {np.count_nonzero(labels)}') 
print(f'Number of low risk patients: {len(labels) - np.count_nonzero(labels)}')


Number of high risk patients: 55
Number of low risk patients: 58


## 3. Feature selection


In [45]:
# TODO: feature selection should be implemented here; the approach has not been decided yet.

## 4. Data mining 
The data is divided into a training, validation and test set. The validation set is used to optimize the hyperparameters during the data mining process. 
Two different classifiers will be used: Random Forest and k-nearest neighbour (the code below currently uses logistic regression). A principal component analysis is computed; the number of components needed to explain 98% of the variance is calculated and used in the analysis. 
- Leave one out cross validation
- KFold cross validation

In [34]:
# splitting the data in a training and a test set 
def split_sets(features, labels, test_size=0.2, random_state=1):
    """
    Split the features and labels into a training set and a test set.

    Parameters
    ----------
    features : array-like
        Feature matrix, one row per sample.
    labels : array-like
        Class labels, one entry per sample.
    test_size : float, optional
        Fraction of the samples held out as test set (default 0.2, i.e. an
        80% / 20% split).
    random_state : int, optional
        Seed passed to ``train_test_split`` so the split is reproducible
        (default 1).

    Returns
    -------
    x_train, x_test, y_train, y_test
        The training features, test features, training labels and test labels.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state)
    return x_train, x_test, y_train, y_test 

# Hold out a test set; the cross-validation calls below only use x_train / y_train.
x_train, x_test, y_train, y_test = split_sets(features, labels) 

#The training set is again divided into a training and a validation set and afterwards classified using a leave one out validation and logistic regression.
def leave_one_out_val(x,y):
    """
    Leave One Out Cross Validation using Logistic Regression as a classifier.

    Every sample is used once as a single-sample validation set while a fresh
    logistic regression model is fitted on all remaining samples.

    Parameters
    ----------
    x : array-like
        Feature matrix, one row per sample.
    y : array-like
        Class labels, one entry per sample.

    Returns
    -------
    float
        Accuracy over all held-out predictions.
    """
    # Index arrays from the splitter require numpy-style indexing.
    x = np.asarray(x)
    y = np.asarray(y)

    predictions = []
    targets = []

    for train_index, val_index in LeaveOneOut().split(x, y):
        # A new, unfitted classifier per split so no state carries over.
        model = sklearn.linear_model.LogisticRegression()
        model.fit(x[train_index], y[train_index])

        # Each validation fold holds exactly one sample; `extend` stores the
        # value itself rather than a nested one-element array.
        predictions.extend(model.predict(x[val_index]))
        targets.extend(y[val_index])

    return accuracy_score(targets, predictions)

# Leave-one-out accuracy computed on the training portion only (x_test / y_test are not used here).
accuracy = leave_one_out_val(x_train,y_train)
print(f'Leave one out cross validation accuracy: {accuracy}')

#The training set is again divided into a training and a validation set and afterwards classified using a Kfold cross validation and logistic regression.
def cross_val(x, y, n_splits=5, n_repeats=10, random_state=1):
    """
    Repeated K-fold cross validation using a Logistic Regression classifier.

    Parameters
    ----------
    x : array-like
        Feature matrix, one row per sample.
    y : array-like
        Class labels, one entry per sample.
    n_splits : int, optional
        Number of folds per repetition (default 5).
    n_repeats : int, optional
        Number of times the K-fold split is repeated with a different
        shuffling (default 10).
    random_state : int or None, optional
        Seed for the fold shuffling. A fixed default keeps the reported
        accuracies reproducible between runs; pass ``None`` for a different
        random split on every call.

    Returns
    -------
    list of float
        One validation accuracy per fold, ``n_splits * n_repeats`` in total.
    """
    # Index arrays from the splitter require numpy-style indexing.
    x = np.asarray(x)
    y = np.asarray(y)

    splitter = RepeatedKFold(
        n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)

    performances = []

    for train_index, val_index in splitter.split(x, y):
        # A new, unfitted classifier per fold so no state carries over.
        model = sklearn.linear_model.LogisticRegression()
        model.fit(x[train_index], y[train_index])

        fold_prediction = model.predict(x[val_index])
        performances.append(accuracy_score(y[val_index], fold_prediction))

    return performances

# NOTE: `accuracy` is rebound here from the single leave-one-out score to the
# list of per-fold accuracies returned by cross_val.
accuracy = cross_val(x_train, y_train)
print(f'KFold cross validation accuracies: {accuracy}')
print(f'Kfold cross validation average accuracy: {statistics.mean(accuracy)}')


Leave one out cross validation accuracy: 0.6666666666666666
KFold cross validation accuracies: [0.7222222222222222, 0.7777777777777778, 0.7222222222222222, 0.6666666666666666, 0.7222222222222222, 0.5555555555555556, 0.6666666666666666, 0.6111111111111112, 0.6111111111111112, 0.7222222222222222, 0.8333333333333334, 0.7222222222222222, 0.7222222222222222, 0.6666666666666666, 0.6666666666666666, 0.7777777777777778, 0.7222222222222222, 0.6111111111111112, 0.5555555555555556, 0.5555555555555556, 0.6111111111111112, 0.6111111111111112, 0.6111111111111112, 0.6666666666666666, 0.6666666666666666, 0.7222222222222222, 0.9444444444444444, 0.5555555555555556, 0.7222222222222222, 0.5555555555555556, 0.7777777777777778, 0.6666666666666666, 0.6666666666666666, 0.7222222222222222, 0.6666666666666666, 0.6111111111111112, 0.5, 0.6666666666666666, 0.8333333333333334, 0.6111111111111112, 0.5555555555555556, 0.6111111111111112, 0.5555555555555556, 0.6666666666666666, 0.6111111111111112, 0.6666666666666666,

## 6. Evaluation
Outcome measures including: 
- boxplots
- confusion matrix 
- ROC curves
