# Report for the sleep data project

## Sleep Data Description


In [None]:
import pywt
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.ndimage    
import scipy.signal    
import os
import pickle
from feature_extractor import *

In [None]:
# Subject "a": the raw recording and its label table, both stored as CSV.
labels = pd.read_csv('../data/by_subject/a_labels.csv')
data = pd.read_csv('../data/by_subject/a_data.csv')

### Median filter justification

In [None]:
# Overview: the complete Ch0 trace of the recording.
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(data['Ch0'])
plt.show()

In [None]:
# Zoom into 200 consecutive samples of Ch0 (positions 1034300 to 1034499).
# NOTE(review): presumably this stretch shows the artifact that motivates the
# median filter; confirm.
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(data['Ch0'].iloc[1034300:1034500])
plt.show()

In [None]:
# Median-filter the whole Ch0 trace with a 3-sample kernel (a single
# vectorised call, no explicit loop).
pre_processed = scipy.signal.medfilt(np.asarray(data['Ch0']), kernel_size=3)

In [None]:
# Same 200-sample window as in the raw zoom plot, now taken from the
# median-filtered signal.
start, stop = 1034300, 1034500
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(pre_processed[start:stop])
plt.show()

In [None]:
# Bin the samples per second of recording time: casting each timestamp to an
# integer yields the bin key (preprocessing may be added to this step later).
seconds = data['Timestamp'].astype(int)
data['TimestampToSec'] = seconds
grouped = data.groupby('TimestampToSec')

In [None]:
# Plot one second of data for all eight channels.
single_sec_data = grouped.get_group(1489016350)
# Ch0 of that second is kept separately; the wavelet cells further down use it.
single_sec_ch = single_sec_data['Ch0']

for ch in range(8):
    plt.plot(single_sec_data['Ch{}'.format(ch)])

plt.show()


## Discrete Wavelet Transform

### Discrete Wavelet Transform Overview

### Feature Extraction for sleep classification 

In [None]:
# Signal extension mode used by every dwt call in signal_decomp.
mode = pywt.Modes.smooth


def signal_decomp(data, wavelet='db4', levels=5):
    """Decompose a signal S with a multi-level discrete wavelet transform.

    Each level applies ``pywt.dwt`` to the approximation coefficients of the
    previous level, so that S = An + Dn + Dn-1 + ... + D1.  Nothing is
    plotted here.

    Parameters
    ----------
    data : array-like
        The signal to decompose.
    wavelet : str or pywt.Wavelet, optional
        Wavelet to use; defaults to ``'db4'``.
    levels : int, optional
        Number of decomposition levels; defaults to 5.

    Returns
    -------
    ca : list
        Approximation coefficients of level 1 .. ``levels``.
    cd : list
        Detail coefficients of level 1 .. ``levels``.
    """
    if not isinstance(wavelet, pywt.Wavelet):
        wavelet = pywt.Wavelet(wavelet)

    approximation = data
    ca = []
    cd = []
    for _ in range(levels):
        # Split the current approximation into a coarser approximation and
        # the detail that was removed from it.
        approximation, detail = pywt.dwt(approximation, wavelet, mode)
        ca.append(approximation)
        cd.append(detail)
    return ca, cd

In [None]:
def Energy(coeffs, k):
    """Return the root of the summed squares of ``coeffs[-k]`` divided by its length.

    ``coeffs`` is a sequence of coefficient arrays.  The entry is selected
    with the index ``-k``, so ``k = 0`` picks the first entry and ``k = 1``
    the last one.
    """
    band = coeffs[-k]
    squared_sum = np.sum(np.square(np.array(band)))
    return np.sqrt(squared_sum) / len(band)

In [None]:
def plot_signal_decomp(data, w, title):
    """Plot a signal and the reconstruction of each of its wavelet levels.

    The coefficients come from ``signal_decomp(data)``; ``w`` is only passed
    to ``pywt.waverec`` for the reconstructions.  The top row shows the
    signal itself, followed by one row per level with the approximation
    (red, left) and the detail (green, right).

    ``data`` must expose ``.index`` (e.g. a pandas Series) because the x-limits
    of the top plot are taken from it.
    """
    approx_coeffs, detail_coeffs = signal_decomp(data)

    # Reconstruct every level in isolation: the coefficient of interest goes
    # into the approximation (or detail) slot, every other slot is None, and
    # level i is followed by i additional None entries.
    rec_a = [pywt.waverec([coeff, None] + [None] * level, w)
             for level, coeff in enumerate(approx_coeffs)]
    rec_d = [pywt.waverec([None, coeff] + [None] * level, w)
             for level, coeff in enumerate(detail_coeffs)]

    fig = plt.figure(figsize=(12, 10))

    # Top row: the signal itself, spanning both columns.
    ax_main = fig.add_subplot(len(rec_a) + 1, 1, 1)
    ax_main.set_title(title, fontsize=20)
    ax_main.plot(data)
    ax_main.set_xlim(data.index[0], data.index[-1])

    def _draw_column(series, first_cell, colour, label_format):
        # One subplot per level; cells of one column are two apart in the
        # two-column grid.
        for level, y in enumerate(series):
            ax = fig.add_subplot(len(series) + 1, 2, first_cell + level * 2)
            ax.plot(y, colour)
            ax.set_xlim(0, len(y) - 1)
            ax.set_ylabel(label_format % (level + 1))

    _draw_column(rec_a, 3, 'r', "A%d")
    _draw_column(rec_d, 4, 'g', "D%d")


In [None]:
# Decompose the single-second Ch0 signal and show every level.
plot_signal_decomp(single_sec_ch, w='db4',
                   title="Single Sec single Channel EEG data")
plt.show()

### ignore from here

In [None]:
# CONSTRUCT FEATURES

# For every label, look up the one-second window of raw data with the same
# timestamp and turn it into a single feature vector.
features = []
# Label timestamps without raw data.  Their rows are missing from `features`,
# so the labels have to be filtered with this list before the two are paired.
missing_timestamps = []
for timestamp in labels['Timestamp']:
    try:
        window = grouped.get_group(timestamp)
    except KeyError:
        # No recording for this label: report it and skip it instead of
        # silently re-using the previous window (or failing with a NameError
        # on the very first label).
        print(timestamp)
        missing_timestamps.append(timestamp)
        continue

    power_all_channels = []
    # channels 1-7 are EEG, the 8th channel is ECG data
    for ch in range(8):
        channel = window['Ch{}'.format(ch)]

        # median filter the data
        filtered = scipy.signal.medfilt(channel, kernel_size=3)

        _, cd = signal_decomp(filtered)
        # One energy value per decomposition level.  Energy() indexes with
        # cd[-k], so k = 0..4 visits cd[0], cd[4], cd[3], cd[2], cd[1]; the
        # order is kept as is so the feature columns do not move.
        power_all_channels.append([Energy(cd, level) for level in range(5)])

    # The features are the per-level energies of every channel, flattened
    # into one vector (8 channels x 5 levels).
    features.append(np.asarray(power_all_channels).flatten())
features = np.asarray(features)
 

In [None]:
# Sanity check: (number of feature vectors, values per vector).
print(np.shape(features))

## Classification

### Load all the features
If no features are available, run `feature_extractor.py` to generate the feature files.

In [None]:
from sklearn import ensemble
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from scipy.stats import randint as sp_randint




In [None]:
# Pre-computed targets and feature matrix (generated by feature_extractor.py).
targets = pd.read_csv("../data/precomputed_features/targets.csv")
targets.columns = ['stages']  # the single target column is renamed to 'stages'
features = pd.read_csv("../data/precomputed_features/features.csv")

### Create a separate test set to test our classifiers on

In [None]:
# Hold out 33% of the samples as a test set; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, targets['stages'], test_size=0.33, random_state=0)


### General Set up
 10-fold cross-validation, random search for hyperparameters

In [None]:
# Utility function to report best scores for Random Search
def report(results, n_top=None):
    """Print the best-ranked candidates of a hyper-parameter search.

    Parameters
    ----------
    results : mapping
        ``cv_results_``-style mapping with the entries ``rank_test_score``,
        ``mean_test_score``, ``std_test_score`` and ``params``.
    n_top : int, optional
        Highest rank to print.  When omitted, every rank present in
        ``results`` is printed.  (The default is resolved at call time, so
        the function can be defined before any search has been configured.)
    """
    ranks = np.asarray(results['rank_test_score'])
    if n_top is None:
        n_top = int(ranks.max()) if ranks.size else 0

    for rank in range(1, n_top + 1):
        # Several candidates can share one rank (ties); print each of them.
        for candidate in np.flatnonzero(ranks == rank):
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

### Random Forest

In [None]:
# Randomized hyper-parameter search for the random forest.
n_iter_search = 20  # number of parameter settings that are sampled

# Random forest whose sampled parameters are overridden during the search.
clf = ensemble.RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    class_weight='balanced',
    n_jobs=-1,
)

# Parameters and the distributions / value lists to sample them from.
param_dist = {
    "n_estimators": sp_randint(1, 100),
    "max_depth": [3, None],
    "max_features": sp_randint(1, 40),
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"],
}

random_search = RandomizedSearchCV(
    clf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
)
random_search.fit(X_train, y_train)

report(random_search.cv_results_, n_top=n_iter_search)


In [None]:
# Random forest with fixed hyper-parameters, scored with 10-fold
# cross-validated predictions on the training set.
# NOTE(review): the settings noted here from an earlier search were
# bootstrap=True, criterion='entropy', max_depth=None, max_features=10,
# n_estimators=61; they differ from the values used below. Confirm which
# are intended.
rf_clf = ensemble.RandomForestClassifier(
    n_estimators=73,
    criterion='entropy',
    class_weight='balanced',
    max_features=21,
    n_jobs=-1,
)

rf_predicted = cross_val_predict(rf_clf, X_train, y_train, cv=10)
rf_acc = metrics.accuracy_score(y_train, rf_predicted)
print("This is the Score: {}".format(rf_acc))


### AdaBoost

In [None]:
# Randomized hyper-parameter search for AdaBoost.
n_iter_search = 20  # number of parameter settings that are sampled

clf = ensemble.AdaBoostClassifier()

# Parameters and the distributions / value lists to sample them from; the
# base estimators are decision trees of depth 1, 2 and 3.
# NOTE(review): newer scikit-learn releases appear to rename `base_estimator`
# and to drop "SAMME.R"; confirm against the installed version.
param_dist = {
    "n_estimators": sp_randint(50, 250),
    "algorithm": ["SAMME", "SAMME.R"],
    "base_estimator": [DecisionTreeClassifier(max_depth=depth)
                       for depth in (1, 2, 3)],
}

random_search = RandomizedSearchCV(
    clf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
)
random_search.fit(X_train, y_train)

report(random_search.cv_results_, n_top=n_iter_search)

In [None]:
# AdaBoost with fixed hyper-parameters (depth-3 trees, 188 estimators,
# SAMME.R), scored with 10-fold cross-validated predictions on the training
# set.
ada_clf = ensemble.AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=3),
    n_estimators=188,
    algorithm='SAMME.R',
)

ada_predicted = cross_val_predict(ada_clf, X_train, y_train, cv=10)
ada_acc = metrics.accuracy_score(y_train, ada_predicted)
print("This is the Score: {}".format(ada_acc))


### Smoothing Bayesian

## Results

In [None]:
from sklearn.metrics import confusion_matrix
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print the confusion matrix and draw it as an annotated heat map.

    The plot goes to the current matplotlib axes.

    Parameters
    ----------
    cm : 2-D array
        Confusion matrix; rows are true labels, columns predicted labels.
    classes : sequence
        Tick labels, one per row/column of ``cm``.
    normalize : bool, optional
        When True every row is divided by its sum before printing and
        plotting, and the cells are annotated with two decimals.
    title : str, optional
        Title of the plot.
    cmap : matplotlib colormap, optional
        Colormap of the heat map.
    """
    cm = np.asarray(cm)
    if normalize:
        # Normalise BEFORE drawing so that the colours, the annotations and
        # the text-colour threshold all refer to the same values.  Rows that
        # sum to zero stay at 0 instead of turning into NaN.
        row_sums = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm.astype('float'), row_sums,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_sums != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=18)
    #plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45, fontsize=18)
    plt.yticks(tick_marks, classes, fontsize=18)

    # Cells darker than half of the maximum get white text, the others black.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        value = cm[i, j]
        label = "%.2f" % value if normalize else str(value)
        plt.text(j, i, label,
                 horizontalalignment="center",
                 color="white" if value > thresh else "black", fontsize=18)

    plt.ylabel('True label', fontsize=18)
    plt.xlabel('Predicted label', fontsize=18)
    # Run the layout last so that the axis labels are taken into account.
    plt.tight_layout()

In [None]:
# Class names used as tick labels, taken from the training targets.
class_names, counts = np.unique(y_train, return_counts=True)

# Print matrix values with two decimals.
np.set_printoptions(precision=2)

# Confusion matrices of the cross-validated predictions on the training set.
rf_cnf_matrix = confusion_matrix(y_train, rf_predicted)
ada_cnf_matrix = confusion_matrix(y_train, ada_predicted)

# Both matrices are drawn side by side, without normalization.
plt.figure(figsize=(20, 10))
panels = (
    (121, rf_cnf_matrix, 'Confusion matrix for the Random Forest '),
    (122, ada_cnf_matrix, 'Confusion matrix for the Ada Boost Classifiers'),
)
for position, matrix, panel_title in panels:
    plt.subplot(position)
    plot_confusion_matrix(matrix, classes=class_names, title=panel_title)

plt.show()

### Test Set

In [None]:
# Fit both classifiers on the full training set and evaluate them on the
# held-out test set.
rf_clf.fit(X_train, y_train)
ada_clf.fit(X_train, y_train)

rf_pred_test = rf_clf.predict(X_test)
ada_pred_test = ada_clf.predict(X_test)

rf_acc_test = metrics.accuracy_score(y_test, rf_pred_test)
print("This is the Score for Random Forest on the test set: {}".format(rf_acc_test))

ada_acc_test = metrics.accuracy_score(y_test, ada_pred_test)
print("This is the Score for Ada-Boost on the test set: {} \n \n".format(ada_acc_test))

rf_cnf_matrix_test = confusion_matrix(y_test, rf_pred_test)
ada_cnf_matrix_test = confusion_matrix(y_test, ada_pred_test)

# Both confusion matrices are drawn side by side, without normalization.
plt.figure(figsize=(20, 10))

# Left: random forest
plt.subplot(121)
plot_confusion_matrix(rf_cnf_matrix_test, classes=class_names,
                      title='Confusion matrix for the Random Forest for the test set')

# Right: AdaBoost (this panel used to carry the random-forest title)
plt.subplot(122)
plot_confusion_matrix(ada_cnf_matrix_test, classes=class_names, normalize=False,
                      title='Confusion matrix for the Ada Boost Classifier for the test set')

plt.show()

## Conclusion