In [288]:
import pandas as pd
import numpy as np
from jupyter_utils import jupyter_setup, load_tracker
jupyter_setup()
import os
from evaluation_and_tracking import IDPerformanceTracker
import torch
from sklearn.metrics import roc_curve, auc
from sklearn.linear_model import LinearRegression

C:\Michi\acoustic_covid_detection\python


In [152]:
# File name handed to the project's IDPerformanceTracker.
# NOTE(review): this is a relative path, so it is resolved against the current
# working directory (presumably the one set up by jupyter_setup() — confirm).
ID_PERFORMANCE_TRACKING = "test_linearRegression.pickle"
# The tracker exposes the tracked results as `id_performance.df` (used below).
id_performance = IDPerformanceTracker(ID_PERFORMANCE_TRACKING)

In [153]:
# Sanity check of the tracker size (presumably the number of rows in
# id_performance.df — TODO confirm against IDPerformanceTracker.__len__).
len(id_performance)

2898

In [154]:
# Split the tracked rows by their `set_type` column: "eval" rows are used to
# fit the regression further down, "test" rows to evaluate it.
eval_data = id_performance.df[id_performance.df.set_type == "eval"]
test_data = id_performance.df[id_performance.df.set_type == "test"]

In [155]:
# Distinct participant IDs in each split, followed by how many there are.
eval_ids = eval_data.ID.unique()
test_ids = test_data.ID.unique()
len(eval_ids), len(test_ids)

(360, 466)

In [156]:
# Recording types that occur in the "eval" rows; the Participant class below
# looks rows up by exactly these `rec_type` strings.
recording_types = eval_data.rec_type.unique()
recording_types

array(['combined_speech', 'combined_coughs', 'combined_vowels',
       'combined_breaths'], dtype=object)

In [900]:
class Participant:
    """Predictions for the four recording types and the label of one participant.

    The values are read from a DataFrame with the columns ``ID``, ``rec_type``,
    ``prediction`` and ``label``. For each recording type the last element of
    the stored ``prediction`` sequence is used as-is (no sigmoid is applied
    here). A recording type without a row for this participant is replaced by
    the neutral value 0.

    Attributes:
        id: the participant identifier.
        label: ``label`` of the first row that belongs to this participant.
        cough, speech, breath, vowels: prediction per recording type, or 0 if
            that recording type is missing.

    Raises:
        ValueError: if ``df`` has no row for ``identifier``, if more than
            ``allow_n_missing_recordings`` recording types are missing, or if
            an (ID, rec_type) pair occurs more than once.
    """

    # Attribute name -> value of the `rec_type` column it is read from.
    _REC_TYPE_BY_ATTRIBUTE = {
        "cough": "combined_coughs",
        "speech": "combined_speech",
        "breath": "combined_breaths",
        "vowels": "combined_vowels",
    }

    # TODO: per-category AUC-ROC / accuracy / loss helpers (before and after
    # the linear regression)?

    def __init__(self, identifier, df, allow_n_missing_recordings=3):
        self.id = identifier

        participant_rows = df[df.ID == identifier]
        if len(participant_rows) == 0:
            # Previously this surfaced as an IndexError from `.values[0]`,
            # which callers that skip participants on ValueError did not catch.
            raise ValueError(f"no entries found for ID {identifier!r}")
        self.label = participant_rows.label.values[0]

        n_missing = 0
        for attribute, rec_type in self._REC_TYPE_BY_ATTRIBUTE.items():
            prediction = self.get_single_prediction(rec_type, participant_rows)
            if prediction is None:
                n_missing += 1
                prediction = 0  # neutral placeholder for a missing recording
            setattr(self, attribute, prediction)

        if n_missing > allow_n_missing_recordings:
            raise ValueError(
                f"{n_missing} recording type(s) missing for ID {identifier!r}, "
                f"but at most {allow_n_missing_recordings} may be missing"
            )

    def get_single_prediction(self, rec_type, df):
        """Return this participant's prediction for ``rec_type``.

        Args:
            rec_type: value of the ``rec_type`` column to look up.
            df: DataFrame with the columns ``ID``, ``rec_type``, ``prediction``.

        Returns:
            The last element of the matching row's ``prediction`` entry, or
            ``None`` if there is no row for this ID and recording type.

        Raises:
            ValueError: if more than one row matches.
        """
        matches = df[(df.ID == self.id) & (df.rec_type == rec_type)]
        n_entries = len(matches)
        if n_entries == 0:
            return None
        if n_entries > 1:
            raise ValueError("there cannot be more than one entry with the same ID and rec type")
        # TODO: apply a sigmoid here? (open question from the original author)
        return matches.prediction.values[0][-1]

    def get_all_predictions(self):
        """Return the predictions as an array ordered cough, speech, breath, vowels."""
        return np.array([self.cough, self.speech, self.breath, self.vowels])

In [901]:
# Spot check: label of one hard-coded participant ID in the "eval" rows.
eval_data[eval_data.ID == "00xKcQMmcAhX8CODgBBLOe7Dm0T2"].label.values[0]

1.0

In [902]:
# Smoke test: build a Participant for the first ID of the "eval" split.
participant = Participant(eval_ids[0], eval_data)

In [903]:
# Exploratory: the four predictions with a constant 1 appended (the result is
# only displayed, not stored).
np.append(participant.get_all_predictions(), 1)

array([-0.63678253, -0.19896442, -0.26176679, -0.22631815,  1.        ])

In [904]:
def get_linregr_matrices(eval_ids, data):
    """Build the prediction matrix and label vector for a set of participants.

    Args:
        eval_ids: iterable of participant IDs to include.
        data: DataFrame passed on to ``Participant`` (columns ``ID``,
            ``rec_type``, ``prediction``, ``label``).

    Returns:
        A tuple ``(predictions_matrix, labels)``. ``predictions_matrix`` has
        one row per accepted participant and four columns (cough, speech,
        breath, vowels); ``labels`` holds the matching labels. The matrix is
        always 2-D: shape ``(1, 4)`` for a single participant and ``(0, 4)``
        when no participant was accepted.

    Participants for which ``Participant`` raises ``ValueError`` (too many
    missing recordings, duplicated entries) are skipped silently.
    """
    prediction_rows = []
    labels = []
    for participant_id in eval_ids:
        try:
            participant = Participant(participant_id, data)
        except ValueError:
            continue
        prediction_rows.append(participant.get_all_predictions())
        labels.append(participant.label)

    if not prediction_rows:
        # Four columns = length of Participant.get_all_predictions().
        return np.empty((0, 4)), np.array([])
    # Stack once at the end instead of growing the arrays inside the loop.
    return np.vstack(prediction_rows), np.array(labels)

In [905]:
def get_accuracy(labels, predictions, threshold=0.5, verbose=False):
    """Return the accuracy in percent, rounded to one decimal place.

    Both ``labels`` and ``predictions`` are binarised with ``> threshold``;
    the accuracy is the share of positions where the two binarised values
    agree, divided by ``len(predictions)``.

    Args:
        labels: array of ground-truth values.
        predictions: array of predicted scores, same length as ``labels``.
        threshold: cut-off applied to both inputs.
        verbose: currently unused.
    """
    is_positive = labels > threshold
    is_predicted_positive = predictions > threshold
    agreements = np.sum(is_predicted_positive == is_positive)
    fraction_correct = agreements / len(predictions)
    return np.round(fraction_correct * 100, 1)

In [906]:
def get_aucroc(labels, predictions):
    """Return the area under the ROC curve in percent, rounded to one decimal.

    ``labels`` and ``predictions`` are handed to ``roc_curve`` unchanged and
    the area is computed with ``auc``; any exception raised by either of them
    propagates to the caller.

    NOTE: with mixup the labels are continuous in [0, 1] rather than binary.
    They are not rounded here, so for such (training) data the AUC-ROC is not
    meaningful.
    """
    false_positive_rate, true_positive_rate, _thresholds = roc_curve(labels, predictions)
    area = auc(false_positive_rate, true_positive_rate)
    return np.round(area * 100, 1)

# LINEAR REGRESSION

In [907]:
def sigmoid(A):
    """Apply the logistic function element-wise and return a NumPy array.

    ``A`` is converted with ``torch.Tensor`` (torch's default tensor type)
    before ``torch.sigmoid`` is applied, so the result carries that type's
    precision rather than the precision of the input.
    """
    as_tensor = torch.Tensor(A)
    squashed = torch.sigmoid(as_tensor)
    return squashed.numpy()

In [908]:
def extend_linregr_matrx(A):
    """Append non-linear features to the regression matrix ``A``.

    For a 2-D input with ``k`` columns the result has ``3 * k`` columns:
    the original values, their squares, and the square roots of their
    absolute values, in that order.

    Further components that are currently not included: a constant (bias)
    column, the sign, cubes, cube roots of the absolute values, and fourth /
    fifth powers.
    """
    feature_groups = (
        A,
        np.power(A, 2),
        np.power(np.abs(A), 1/2),
    )
    return np.concatenate(feature_groups, axis=1)

In [909]:
# Prediction matrix (columns: cough, speech, breath, vowels) and labels for
# the participants of the "eval" split.
A_val, y_val = get_linregr_matrices(eval_ids, eval_data)

In [910]:
# Same construction for the participants of the "test" split.
A_test, y_test = get_linregr_matrices(test_ids, test_data)

In [911]:
# Number of rows (participants kept) per split.
len(A_val), len(A_test)

(360, 466)

In [912]:
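# Optional preprocessing, currently disabled: map the raw predictions through
# the sigmoid before the regression features are built.
# A_val = sigmoid(A_val)
# A_test = sigmoid(A_test)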

In [918]:
# Add the squared and square-root features to both matrices.
# NOTE(review): this cell is not idempotent — it rebinds A_val / A_test to
# their own extension, so running it a second time extends the already
# extended matrices. Re-run the get_linregr_matrices cells first.
A_val = extend_linregr_matrx(A_val)
A_test = extend_linregr_matrx(A_test)

In [919]:
# Fit a linear regression from the (extended) per-recording-type predictions
# to the labels, using the "eval" split as training data.
model = LinearRegression().fit(A_val, y_val)

In [920]:
# AUC-ROC and accuracy on the "eval" split.
# NOTE: the model was fitted on exactly this data, so these are in-sample
# scores; the test-split scores below are the held-out estimate.
y_val_pred = model.predict(A_val)
get_aucroc(y_val, y_val_pred), get_accuracy(y_val, y_val_pred)

(90.2, 84.2)

In [921]:
# AUC-ROC and accuracy on the "test" split, which was not used for fitting.
y_test_pred = model.predict(A_test)
get_aucroc(y_test, y_test_pred), get_accuracy(y_test, y_test_pred)

(89.4, 92.5)

In [922]:
# Fitted weights, one per column of the extended matrix: first the raw
# predictions (cough, speech, breath, vowels), then their squares, then the
# square roots of their absolute values.
model.coef_

array([ 0.2152338 ,  0.26773116,  0.37782496,  0.07782764,  0.26772716,
        0.2044399 ,  0.35342703,  0.01884383, -0.12775933, -0.13247216,
       -0.04418645, -0.0956226 ])

In [800]:
# Baseline: AUC-ROC and accuracy of every single recording type on the
# validation and the test set, i.e. before the linear regression.
# Columns 0-3 of A_val / A_test hold the cough, speech, breath and vowel
# predictions, in that order.
# Keep in mind that recordings missing for an ID (excluded for various
# reasons, e.g. audio quality) were filled with 0, which the sigmoid maps to a
# prediction right in the middle (0.5); this decreases the performance,
# especially the accuracy.
val_probabilities = torch.sigmoid(torch.Tensor(A_val)).numpy()
test_probabilities = torch.sigmoid(torch.Tensor(A_test)).numpy()
for rec_type_idx in range(4):
    pred = val_probabilities[:, rec_type_idx]
    print("eval set:   ", get_aucroc(y_val, pred), get_accuracy(y_val, pred))
    pred = test_probabilities[:, rec_type_idx]
    print("test set:   ", get_aucroc(y_test, pred), get_accuracy(y_test, pred))
    print("#################################################")

eval set:    77.4 83.8
test set:    79.2 87.0
#################################################
eval set:    83.9 87.8
test set:    78.8 88.9
#################################################
eval set:    82.6 82.7
test set:    82.1 86.0
#################################################
eval set:    76.2 82.7
test set:    76.7 87.6
#################################################
