In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%cd /content/drive/MyDrive/Mia/csci566/project

/content/drive/MyDrive/Mia/csci566/project


In [None]:
%cd /content/drive/MyDrive/Mia/csci566/project/readmission

/content/drive/MyDrive/Mia/csci566/project/readmission


Data loading and preprocessing

In [None]:
from tqdm import tqdm
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
import pandas as pd
import os
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score, accuracy_score,roc_auc_score
import random

In [None]:
df = pd.read_csv('1_train_listfile801010.csv')

In [None]:
print(df)

                                                   stay  period_length  y_true
0      88091_279924_episode1_timeseries_readmission.csv        29.0328       1
1      14019_295689_episode1_timeseries_readmission.csv       691.0824       0
2      23847_261301_episode1_timeseries_readmission.csv        33.6120       0
3      78522_287059_episode1_timeseries_readmission.csv       312.6312       1
4      83932_277793_episode2_timeseries_readmission.csv        24.1512       0
...                                                 ...            ...     ...
29863  89097_288765_episode1_timeseries_readmission.csv        27.5832       0
29864  29050_255508_episode1_timeseries_readmission.csv        40.2720       0
29865  31196_236381_episode1_timeseries_readmission.csv        42.1992       0
29866  48233_267174_episode2_timeseries_readmission.csv        22.8960       0
29867  31175_265786_episode1_timeseries_readmission.csv        92.4480       0

[29868 rows x 3 columns]


In [None]:
os.path.exists('78522_287059_episode1_timeseries_readmission.csv')

True

In [None]:
df2 = pd.read_csv('78522_287059_episode1_timeseries_readmission.csv')

In [None]:
numeric_cols = ['Hours', 'Capillary refill rate', 'Diastolic blood pressure', 'Fraction inspired oxygen',\
            'Glascow coma scale total', 'Glucose', 'Heart Rate', 'Height', 'Mean blood pressure',\
            'Oxygen saturation', 'Respiratory rate', 'Systolic blood pressure', 'Temperature',\
            'Weight', 'pH']
df2 = df2[numeric_cols]
for col in numeric_cols:
    df2[col] = df2[col].fillna(method='ffill').fillna(method='bfill').fillna(value=0)
df2_tensor = torch.tensor(df2.values)
# print(df2.head(10))
print(df2_tensor.shape)

torch.Size([436, 15])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].fillna(method='ffill').fillna(method='bfill').fillna(value=0)


In [None]:
df_train = df[:3200]
print(df_train.shape)

(3200, 3)


In [None]:
class Dataset(object):
    """An abstract class representing a Dataset.
    All other datasets should subclass it. All subclasses should override
    ``__len__``, that provides the size of the dataset, and ``__getitem__``,
    supporting integer indexing in range from 0 to len(self) exclusive.
    """

    def __getitem__(self, index):
        raise NotImplementedError

    def __len__(self):
        raise NotImplementedError

In [None]:
class TrainData(Dataset):
    
    def __init__(self, data, labels):
        # padding
        self.data = pad_sequence(data, batch_first=True)[:,0:7680]
        self.labels = labels
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        input = self.data[index]
        label = self.labels[index]
        return input, label

In [None]:
def prepare_data(input):
    df = pd.read_csv(str(input['stay']))
    # only choose numeric-valued features
    numeric_cols = ['Hours', 'Capillary refill rate', 'Diastolic blood pressure', 'Fraction inspired oxygen',\
            'Glascow coma scale total', 'Glucose', 'Heart Rate', 'Height', 'Mean blood pressure',\
            'Oxygen saturation', 'Respiratory rate', 'Systolic blood pressure', 'Temperature',\
            'Weight', 'pH']
    df = df[numeric_cols]
    for col in numeric_cols:
        # fill missing data
        df[col] = df[col].fillna(method='ffill').fillna(method='bfill').fillna(value=0)
    df_tensor = torch.tensor(df.values).reshape(-1)
    return df_tensor

In [None]:
# input size: 1000, runtime: ~6 min
df_train['tensor'] = df_train.apply(prepare_data, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['tensor'] = df_train.apply(prepare_data, axis=1)


In [None]:
df_train['tensor'][0]

tensor([ 12.5000,   0.0000,  40.0000,   0.0000,   0.0000, 105.0000,  78.0000,
          0.0000,  54.0000,  94.0000,  18.0000, 100.0000,  36.4444,  51.7095,
          7.0000,  15.0000,   0.0000,  40.0000,   0.0000,   0.0000, 105.0000,
         78.0000,   0.0000,  54.0000,  94.0000,  18.0000, 100.0000,  36.4444,
         51.7095,   7.0000,  17.3167,   0.0000,  40.0000,   0.0000,   0.0000,
        105.0000,  78.0000,   0.0000,  54.0000,  94.0000,  18.0000, 100.0000,
         36.4444,  51.7095,   7.0000,  17.3333,   0.0000,  40.0000,   0.0000,
          0.0000, 105.0000,  78.0000,   0.0000,  54.0000,  94.0000,  18.0000,
        100.0000,  36.4444,  51.7095,   7.0000,  17.3500,   0.0000,  40.0000,
          0.0000,   0.0000, 105.0000,  78.0000,   0.0000,  54.0000,  94.0000,
         18.0000, 100.0000,  36.4444,  51.7095,   7.0000,  17.3667,   0.0000,
         40.0000,   0.0000,   0.0000, 105.0000,  78.0000,   0.0000,  54.0000,
         94.0000,  18.0000, 100.0000,  36.4444,  51.7095,   7.00

In [None]:
df_test = df[5000:5320]
df_test['tensor'] = df_test.apply(prepare_data, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['tensor'] = df_test.apply(prepare_data, axis=1)


Neural Network

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
data_vectors_train = df_train['tensor'].values
labels_train = df_train['y_true'].values

data_vectors_test = df_test['tensor'].values
labels_test = df_test['y_true'].values

In [None]:
train_data = TrainData(data_vectors_train, labels_train)
train_loader = torch.utils.data.DataLoader(train_data, shuffle=True, batch_size=32)

test_data = TrainData(data_vectors_test, labels_test)
test_loader = torch.utils.data.DataLoader(test_data, shuffle=False, batch_size=32)

In [None]:
for data, target in test_loader:
            print(data)
            print(target)

tensor([[11.0000,  0.0000, 60.0000,  ...,  0.0000,  0.0000,  0.0000],
        [10.1500,  0.0000, 62.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 7.7000,  0.0000, 47.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 1.2500,  0.0000, 89.0000,  ...,  0.0000,  0.0000,  0.0000],
        [13.3000,  0.0000, 75.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 6.4000,  0.0000, 62.0000,  ...,  0.0000,  0.0000,  0.0000]],
       dtype=torch.float64)
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0])
tensor([[ 22.0000,   0.0000,  78.0000,  ...,   0.0000,   0.0000,   0.0000],
        [ 10.8333,   0.0000,  74.0000,  ...,   0.0000,   0.0000,   0.0000],
        [  9.0000,   0.0000,  59.0000,  ...,  37.6111, 124.1000,   5.0000],
        ...,
        [ 10.5000,   0.0000,  62.0000,  ...,   0.0000,   0.0000,   0.0000],
        [  7.9333,   0.0000,  66.0000,  ...,   0.0000,   0.0000,   0.0000],
        [ 14.5833,   0.0000,  60

In [None]:
def pre(predictions):
    results = []
    for i in predictions:
        if i<0.5:
            results.append(0)
        else:
            results.append(1)
    return torch.Tensor(results)

In [None]:
class Model(nn.Module):
    def __init__(self, input_size, output_size):
        super(Model, self).__init__()
        #Defining the layers
        self.fc1 = nn.Linear(input_size, 256)
        self.act1 = nn.ReLU()
        self.fc2 = nn.Linear(256, 128)
        self.act2 = nn.ReLU()
        self.fc3 = nn.Linear(256, output_size)
        self.act3 = nn.Sigmoid()
    
    def forward(self, x):
        
        x = x.float()

        x = self.fc1(x)
        x = self.act1(x)
        # x = self.fc2(x)
        # x = self.act2(x)
        x = self.fc3(x)
        x = self.act3(x)

        return x

In [None]:
def train_model(model, train_loader, test_loader, lr, weight_decay, n_epochs, batch_size):
    # Define Loss, Optimizer
    # criterion = nn.CrossEntropyLoss()
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    # initialize tracker for max test f1 score
    test_f1_max = 0.0 

    # Training Run
    model.train() # prep model for training

    for epoch in tqdm(range(n_epochs)):
        # model.train() # prep model for training
        train_loss = 0.0
        test_loss = 0.0
        num_samples_train = 0
        num_samples_test = 0

        for data, target in train_loader:
            # clear the gradients of all optimized variables
            optimizer.zero_grad()

            # forward pass: compute predicted outputs by passing inputs to the model
            output = model(data).reshape(-1)#.detach()
            output = output.to(torch.float)
            target = target.to(torch.float)

            # print(output)
            # print(target)
            # calculate the loss
            loss = criterion(output, target)
            # loss = F.binary_cross_entropy(output.squeeze(), target)
            # print(output)
            # print(target)
          
            # backward pass: compute gradient of the loss with respect to model parameters
            loss.backward()

            # perform a single optimization step (parameter update)
            optimizer.step()

            # update running training loss
            train_loss += loss.item()*data.size(0)
            num_samples_train += data.size(0)

        model.eval() # prep model for evaluation
        for data, target in test_loader:
            # forward pass: compute predicted outputs by passing inputs to the model
            output = model(data).reshape(-1)#.detach()
            output = output.to(torch.float)
            target = target.to(torch.float)
            # calculate the loss
            loss = criterion(output, target)
            # loss = F.binary_cross_entropy(output.squeeze(), target)
            # update running test loss 
            test_loss += loss.item()*data.size(0)
            num_samples_test += data.size(0)

        train_loss /= num_samples_train
        test_loss /= num_samples_test
        with torch.no_grad():
            y_train_pred = torch.tensor([])
            y_train_true = torch.tensor([])
            y_train_score = torch.tensor([])
            for data, target in train_loader:
                output = model(data)
                # _, pred_batch = output.max(1)
                # print(output)
                score_batch = output
                # _, pred_batch = output.max(1)
                pred_batch = pre(output)
                y_train_pred = torch.cat((y_train_pred, pred_batch))
                y_train_score = torch.cat((y_train_score, score_batch))
                y_train_true = torch.cat((y_train_true, target))
            y_train_pred = np.array(y_train_pred)
            acc_train = accuracy_score(y_train_true, y_train_pred)
            f1_train = f1_score(y_train_true, y_train_pred, average='macro')
            auc_train = roc_auc_score(y_train_true, y_train_score)

            y_test_pred = torch.tensor([])
            y_test_true = torch.tensor([])
            y_test_score = torch.tensor([])
            for data, target in test_loader:
                output = model(data)
                # _, pred_batch = output.max(1)
                score_batch = output
                pred_batch = pre(output)
                _, pred_batch = output.max(1)
                y_test_pred = torch.cat((y_test_pred, pred_batch))
                y_test_score = torch.cat((y_test_score, score_batch))
                y_test_true = torch.cat((y_test_true, target))
            y_test_pred = np.array(y_test_pred)
            acc_test = accuracy_score(y_test_true, y_test_pred)
            f1_test = f1_score(y_test_true, y_test_pred, average='macro')
            auc_test = roc_auc_score(y_test_true, y_test_score)
        print()
        print('Epoch: {} \tTraining Loss: {:.6f} \tTest Loss: {:.6f} \tTraining acc: {:.6f} \tTest acc: {:.6f}\tTraining f1: {:.6f} \tTest f1: {:.6f}\tTraining auc: {:.6f} \tTest auc: {:.6f}'.format(
          epoch+1, 
          train_loss,
          test_loss,
          acc_train,
          acc_test,
          f1_train,
          f1_test,
          auc_train,
          auc_test
          ))
        
      #  # save model if test f1 score has decreased
      #   if f1_test >= test_f1_max:
      #       print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
      #         test_f1_max,
      #         f1_test))
      #       torch.save(model.state_dict(), 'model.pt')
      #       test_f1_max = f1_test

In [None]:
# Instantiate the model with hyperparameters
model = Model(input_size=7680, output_size=1)
# Define hyperparameters
n_epochs = 20
lr = 1e-5
batch_size = 32
weight_decay = 0
train_model(model, train_loader, test_loader, lr, weight_decay, n_epochs, batch_size)

  5%|▌         | 1/20 [00:02<00:50,  2.64s/it]


Epoch: 1 	Training Loss: 0.942517 	Test Loss: 0.945596 	Training acc: 0.846250 	Test acc: 0.868750	Training f1: 0.499094 	Test f1: 0.464883	Training auc: 0.519027 	Test auc: 0.470880


 10%|█         | 2/20 [00:05<00:50,  2.83s/it]


Epoch: 2 	Training Loss: 0.787756 	Test Loss: 0.646966 	Training acc: 0.820937 	Test acc: 0.868750	Training f1: 0.561705 	Test f1: 0.464883	Training auc: 0.609804 	Test auc: 0.497259


 15%|█▌        | 3/20 [00:08<00:45,  2.68s/it]


Epoch: 3 	Training Loss: 0.573468 	Test Loss: 0.596653 	Training acc: 0.857187 	Test acc: 0.868750	Training f1: 0.597130 	Test f1: 0.464883	Training auc: 0.659351 	Test auc: 0.461631


 20%|██        | 4/20 [00:10<00:42,  2.67s/it]


Epoch: 4 	Training Loss: 0.548516 	Test Loss: 0.656960 	Training acc: 0.839375 	Test acc: 0.868750	Training f1: 0.618013 	Test f1: 0.464883	Training auc: 0.684203 	Test auc: 0.522097


 25%|██▌       | 5/20 [00:13<00:41,  2.79s/it]


Epoch: 5 	Training Loss: 0.559587 	Test Loss: 0.743799 	Training acc: 0.872500 	Test acc: 0.868750	Training f1: 0.566916 	Test f1: 0.464883	Training auc: 0.642456 	Test auc: 0.461374


 30%|███       | 6/20 [00:16<00:36,  2.64s/it]


Epoch: 6 	Training Loss: 0.509117 	Test Loss: 0.626390 	Training acc: 0.871563 	Test acc: 0.868750	Training f1: 0.614563 	Test f1: 0.464883	Training auc: 0.729121 	Test auc: 0.486725


 35%|███▌      | 7/20 [00:19<00:39,  3.00s/it]


Epoch: 7 	Training Loss: 0.489955 	Test Loss: 0.838607 	Training acc: 0.778438 	Test acc: 0.868750	Training f1: 0.602478 	Test f1: 0.464883	Training auc: 0.717921 	Test auc: 0.526465


 40%|████      | 8/20 [00:22<00:33,  2.76s/it]


Epoch: 8 	Training Loss: 0.489590 	Test Loss: 0.657670 	Training acc: 0.883750 	Test acc: 0.868750	Training f1: 0.581081 	Test f1: 0.464883	Training auc: 0.738806 	Test auc: 0.470538


 45%|████▌     | 9/20 [00:25<00:31,  2.82s/it]


Epoch: 9 	Training Loss: 0.471825 	Test Loss: 0.574042 	Training acc: 0.870000 	Test acc: 0.868750	Training f1: 0.608072 	Test f1: 0.464883	Training auc: 0.732729 	Test auc: 0.598664


 50%|█████     | 10/20 [00:27<00:27,  2.74s/it]


Epoch: 10 	Training Loss: 0.499918 	Test Loss: 0.692419 	Training acc: 0.886250 	Test acc: 0.868750	Training f1: 0.600763 	Test f1: 0.464883	Training auc: 0.742753 	Test auc: 0.483128


 55%|█████▌    | 11/20 [00:31<00:27,  3.06s/it]


Epoch: 11 	Training Loss: 0.453352 	Test Loss: 0.650711 	Training acc: 0.862187 	Test acc: 0.868750	Training f1: 0.640443 	Test f1: 0.464883	Training auc: 0.754138 	Test auc: 0.489037


 60%|██████    | 12/20 [00:34<00:24,  3.05s/it]


Epoch: 12 	Training Loss: 0.434795 	Test Loss: 0.759300 	Training acc: 0.877500 	Test acc: 0.868750	Training f1: 0.580908 	Test f1: 0.464883	Training auc: 0.749440 	Test auc: 0.489637


 65%|██████▌   | 13/20 [00:37<00:20,  2.98s/it]


Epoch: 13 	Training Loss: 0.407028 	Test Loss: 0.483674 	Training acc: 0.892188 	Test acc: 0.868750	Training f1: 0.651704 	Test f1: 0.464883	Training auc: 0.827203 	Test auc: 0.590185


 70%|███████   | 14/20 [00:39<00:16,  2.81s/it]


Epoch: 14 	Training Loss: 0.388131 	Test Loss: 0.532437 	Training acc: 0.893437 	Test acc: 0.868750	Training f1: 0.625294 	Test f1: 0.464883	Training auc: 0.844650 	Test auc: 0.523381


 75%|███████▌  | 15/20 [00:42<00:13,  2.67s/it]


Epoch: 15 	Training Loss: 0.370929 	Test Loss: 0.502135 	Training acc: 0.901875 	Test acc: 0.868750	Training f1: 0.720416 	Test f1: 0.464883	Training auc: 0.846259 	Test auc: 0.579394


 80%|████████  | 16/20 [00:44<00:10,  2.70s/it]


Epoch: 16 	Training Loss: 0.400861 	Test Loss: 0.657524 	Training acc: 0.896250 	Test acc: 0.868750	Training f1: 0.646326 	Test f1: 0.464883	Training auc: 0.808431 	Test auc: 0.478845


 85%|████████▌ | 17/20 [00:47<00:07,  2.61s/it]


Epoch: 17 	Training Loss: 0.462927 	Test Loss: 0.555765 	Training acc: 0.894375 	Test acc: 0.868750	Training f1: 0.646259 	Test f1: 0.464883	Training auc: 0.831617 	Test auc: 0.547619


 90%|█████████ | 18/20 [00:49<00:05,  2.60s/it]


Epoch: 18 	Training Loss: 0.411533 	Test Loss: 0.528039 	Training acc: 0.902500 	Test acc: 0.868750	Training f1: 0.708743 	Test f1: 0.464883	Training auc: 0.874455 	Test auc: 0.553015


 95%|█████████▌| 19/20 [00:52<00:02,  2.58s/it]


Epoch: 19 	Training Loss: 0.405418 	Test Loss: 0.925310 	Training acc: 0.876563 	Test acc: 0.868750	Training f1: 0.692784 	Test f1: 0.464883	Training auc: 0.790579 	Test auc: 0.428058


100%|██████████| 20/20 [00:55<00:00,  2.78s/it]


Epoch: 20 	Training Loss: 0.500512 	Test Loss: 0.522796 	Training acc: 0.897500 	Test acc: 0.868750	Training f1: 0.661465 	Test f1: 0.464883	Training auc: 0.854706 	Test auc: 0.562008





Oversampling

In [None]:
data_vectors_train = df_train['tensor'].values
data_vectors_train_0 = [data_vectors_train[i] for i in range(len(data_vectors_train)) if labels_train[i] == 0]
majority_len = len(data_vectors_train_0)
data_vectors_train_1 = [data_vectors_train[i] for i in range(len(data_vectors_train)) if labels_train[i] == 1]
data_vectors_train_1_os = random.choices(data_vectors_train_1, k=majority_len)
data_vectors_train_os = data_vectors_train_0 + data_vectors_train_1_os
labels_train_os = np.concatenate((np.zeros(majority_len),np.ones(majority_len)), axis=None).astype('int64')
train_data_os = TrainData(data_vectors_train_os, labels_train_os)
train_loader = torch.utils.data.DataLoader(train_data_os, batch_size=32, shuffle=True)
# data_vectors_test = df_test['tensor'].values

NameError: ignored

In [None]:
# Instantiate the model with hyperparameters
model = Model(input_size=7680, output_size=1)
# Define hyperparameters
n_epochs = 20
lr = 1e-4
batch_size = 32
weight_decay = 0
train_model(model, train_loader, test_loader, lr, weight_decay, n_epochs, batch_size)

  5%|▌         | 1/20 [00:04<01:19,  4.18s/it]


Epoch: 1 	Training Loss: 9.467683 	Test Loss: 9.291238 	Training acc: 0.606746 	Test acc: 0.868750	Training f1: 0.595634 	Test f1: 0.464883	Training auc: 0.651398 	Test auc: 0.610740


 10%|█         | 2/20 [00:09<01:23,  4.62s/it]


Epoch: 2 	Training Loss: 4.810053 	Test Loss: 7.016119 	Training acc: 0.646215 	Test acc: 0.868750	Training f1: 0.644420 	Test f1: 0.464883	Training auc: 0.697581 	Test auc: 0.570658


 15%|█▌        | 3/20 [00:13<01:15,  4.41s/it]


Epoch: 3 	Training Loss: 3.558314 	Test Loss: 3.071363 	Training acc: 0.620919 	Test acc: 0.868750	Training f1: 0.597334 	Test f1: 0.464883	Training auc: 0.694566 	Test auc: 0.625557


 20%|██        | 4/20 [00:17<01:09,  4.36s/it]


Epoch: 4 	Training Loss: 3.637872 	Test Loss: 4.408882 	Training acc: 0.706136 	Test acc: 0.868750	Training f1: 0.701616 	Test f1: 0.464883	Training auc: 0.801548 	Test auc: 0.615450


 25%|██▌       | 5/20 [00:22<01:08,  4.57s/it]


Epoch: 5 	Training Loss: 3.083592 	Test Loss: 4.112598 	Training acc: 0.751346 	Test acc: 0.868750	Training f1: 0.746736 	Test f1: 0.464883	Training auc: 0.817773 	Test auc: 0.598493


 30%|███       | 6/20 [00:26<01:03,  4.51s/it]


Epoch: 6 	Training Loss: 2.917485 	Test Loss: 3.553485 	Training acc: 0.698780 	Test acc: 0.868750	Training f1: 0.697381 	Test f1: 0.464883	Training auc: 0.746953 	Test auc: 0.597936


 35%|███▌      | 7/20 [00:30<00:56,  4.37s/it]


Epoch: 7 	Training Loss: 2.543442 	Test Loss: 3.165892 	Training acc: 0.786329 	Test acc: 0.868750	Training f1: 0.786329 	Test f1: 0.464883	Training auc: 0.852871 	Test auc: 0.583162


 40%|████      | 8/20 [00:36<00:56,  4.72s/it]


Epoch: 8 	Training Loss: 2.603999 	Test Loss: 2.967119 	Training acc: 0.650879 	Test acc: 0.868750	Training f1: 0.624553 	Test f1: 0.464883	Training auc: 0.802977 	Test auc: 0.632151


 45%|████▌     | 9/20 [00:40<00:49,  4.49s/it]


Epoch: 9 	Training Loss: 2.168548 	Test Loss: 3.305994 	Training acc: 0.776642 	Test acc: 0.868750	Training f1: 0.776518 	Test f1: 0.464883	Training auc: 0.852985 	Test auc: 0.582819


 50%|█████     | 10/20 [00:45<00:46,  4.61s/it]


Epoch: 10 	Training Loss: 1.759596 	Test Loss: 3.324629 	Training acc: 0.787047 	Test acc: 0.868750	Training f1: 0.786797 	Test f1: 0.464883	Training auc: 0.855527 	Test auc: 0.588215


 55%|█████▌    | 11/20 [00:49<00:40,  4.55s/it]


Epoch: 11 	Training Loss: 1.845957 	Test Loss: 3.034169 	Training acc: 0.784535 	Test acc: 0.868750	Training f1: 0.783922 	Test f1: 0.464883	Training auc: 0.863459 	Test auc: 0.535886


 60%|██████    | 12/20 [00:53<00:34,  4.32s/it]


Epoch: 12 	Training Loss: 1.536507 	Test Loss: 3.007930 	Training acc: 0.759598 	Test acc: 0.868750	Training f1: 0.758021 	Test f1: 0.464883	Training auc: 0.819086 	Test auc: 0.561665


 65%|██████▌   | 13/20 [00:57<00:29,  4.20s/it]


Epoch: 13 	Training Loss: 0.971123 	Test Loss: 2.222860 	Training acc: 0.686222 	Test acc: 0.868750	Training f1: 0.665191 	Test f1: 0.464883	Training auc: 0.861506 	Test auc: 0.601233


 70%|███████   | 14/20 [01:02<00:26,  4.40s/it]


Epoch: 14 	Training Loss: 0.893737 	Test Loss: 0.794759 	Training acc: 0.824901 	Test acc: 0.868750	Training f1: 0.822958 	Test f1: 0.464883	Training auc: 0.915030 	Test auc: 0.502484


 75%|███████▌  | 15/20 [01:06<00:21,  4.35s/it]


Epoch: 15 	Training Loss: 0.747108 	Test Loss: 0.967497 	Training acc: 0.788841 	Test acc: 0.868750	Training f1: 0.785042 	Test f1: 0.464883	Training auc: 0.900324 	Test auc: 0.585260


 80%|████████  | 16/20 [01:10<00:17,  4.37s/it]


Epoch: 16 	Training Loss: 0.642263 	Test Loss: 1.144829 	Training acc: 0.874955 	Test acc: 0.868750	Training f1: 0.874697 	Test f1: 0.464883	Training auc: 0.937382 	Test auc: 0.562050


 85%|████████▌ | 17/20 [01:14<00:12,  4.23s/it]


Epoch: 17 	Training Loss: 0.390227 	Test Loss: 1.315994 	Training acc: 0.842124 	Test acc: 0.868750	Training f1: 0.841347 	Test f1: 0.464883	Training auc: 0.905096 	Test auc: 0.580207


 90%|█████████ | 18/20 [01:18<00:08,  4.19s/it]


Epoch: 18 	Training Loss: 0.417962 	Test Loss: 1.133682 	Training acc: 0.826337 	Test acc: 0.868750	Training f1: 0.822628 	Test f1: 0.464883	Training auc: 0.915864 	Test auc: 0.556012


 95%|█████████▌| 19/20 [01:22<00:04,  4.08s/it]


Epoch: 19 	Training Loss: 0.386246 	Test Loss: 0.695950 	Training acc: 0.849839 	Test acc: 0.868750	Training f1: 0.848795 	Test f1: 0.464883	Training auc: 0.949219 	Test auc: 0.510106


100%|██████████| 20/20 [01:27<00:00,  4.39s/it]


Epoch: 20 	Training Loss: 0.309922 	Test Loss: 0.730590 	Training acc: 0.881414 	Test acc: 0.868750	Training f1: 0.880700 	Test f1: 0.464883	Training auc: 0.969668 	Test auc: 0.505824





Random forests

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score

In [None]:
data_vectors_train = df_train['tensor'].values
data_vectors_train_0 = [data_vectors_train[i] for i in range(len(data_vectors_train)) if labels_train[i] == 0]
majority_len = len(data_vectors_train_0)
data_vectors_train_1 = [data_vectors_train[i] for i in range(len(data_vectors_train)) if labels_train[i] == 1]
data_vectors_train_1_os = random.choices(data_vectors_train_1, k=majority_len)
data_vectors_train_os = data_vectors_train_0 + data_vectors_train_1_os
labels_train_os = np.concatenate((np.zeros(majority_len),np.ones(majority_len)), axis=None).astype('int64')
train_data_os = TrainData(data_vectors_train_os, labels_train_os)
train_loader = torch.utils.data.DataLoader(train_data_os, batch_size=32, shuffle=True)

In [None]:
x_train = pad_sequence(df_train['tensor'],batch_first=True)

In [None]:
x_test = pad_sequence(df_test['tensor'], batch_first=True)

In [None]:
print(x_train.size())
print(x_test.size())

torch.Size([3200, 60330])
torch.Size([320, 23130])


In [None]:
a = np.zeros([len(x_train),len(max(x_train,key = lambda x: len(x)))])
for i,j in enumerate(x_train):
        a[i][0:len(j)] = j
a = a[:,0:x_test.size(1)]

In [None]:
# convert list of lists with different lengths to a numpy array
b = np.zeros([len(x_test),len(max(x_train,key = lambda x: len(x)))])
for i,j in enumerate(x_test):
        b[i][0:len(j)] = j
b = b[:,0:x_test.size(1)]

In [None]:
print(np.size(b,axis=0))
print(np.size(b,axis=1))

320
23130


In [None]:
rf = RandomForestClassifier(n_estimators=300,max_depth=10)
rf.fit(a, df_train['y_true'])

In [None]:
y_pred = rf.predict(b)
y_pred_prob = rf.predict_proba(b)
print(len(y_pred))
fpr, tpr, thresholds = metrics.roc_curve(df_test['y_true'], y_pred_prob[:,1])
auc = metrics.auc(fpr, tpr)
print('Random forests auc is', auc)
print('Random forests f1 is', f1_score(df_test['y_true'], y_pred, average='macro'))
print('Random forests accuracy is',accuracy_score(df_test['y_true'], y_pred))

320
Random forests auc is 0.6220452209660843
Random forests f1 is 0.46488294314381273
Random forests accuracy is 0.86875


In [None]:
from sklearn.svm import SVC

In [None]:
svm = SVC(probability=True)
svm.fit(a, df_train['y_true'])

In [None]:
y_pred = svm.predict(b)
y_pred_prob = svm.predict_proba(b)
print(len(y_pred))
fpr, tpr, thresholds = metrics.roc_curve(df_test['y_true'], y_pred_prob[:,1])
auc = metrics.auc(fpr, tpr)
print('SVM auc is', auc)
print('SVM f1 is', f1_score(df_test['y_true'], y_pred, average='macro'))
print('SVM accuracy is',accuracy_score(df_test['y_true'], y_pred))

320
SVM auc is 0.6252141144227474
SVM f1 is 0.4889174554945269
SVM accuracy is 0.871875


XG-boost

In [None]:
import xgboost as xgb

In [None]:
model = xgb.XGBClassifier(
    n_estimators = 300,
    max_depth = 10,
)
model.fit(a,df_train['y_true'])

In [None]:
y_pred_prob = model.predict_proba(b)
y_pred = model.predict(b)
print(len(y_pred))
fpr, tpr, thresholds = metrics.roc_curve(df_test['y_true'], y_pred_prob[:,1])
auc = metrics.auc(fpr, tpr)
print('XGboost auc is', auc)
print('XGboost f1 is', f1_score(df_test['y_true'], y_pred, average='macro'))
print('XGboost accuracy is',accuracy_score(df_test['y_true'], y_pred))

320
XGboost auc is 0.5969510106200754
XGboost f1 is 0.4612794612794613
XGboost accuracy is 0.85625
