## Setup

In [1]:
import numpy as np
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

from sklearn.neighbors import KNeighborsClassifier

import json
import random

from utils import *

import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open("data/sub_1000.json","r") as j_file:
    data = json.load(j_file)

In [3]:
split_ratios = [0.7, 0.15, 0.15]

random.shuffle(data)
train_data, val_data, test_data = split_list(data, split_ratios)

train_data += val_data
print(f"""
train: {len(train_data)}
 test: {len(test_data)}
""")


train: 850
 test: 150



In [4]:
def extract_features(dataloader, encoder, device='cuda'):
    features = []
    labels = []

    # Ensure the encoder is on the correct device and switch to evaluation mode
    encoder = encoder.to(device)
    encoder.eval()

    with torch.no_grad():
        for data in dataloader:
            inputs, targets = data['embeddings'].to(device), data['label']  # Move inputs to GPU
            feature = encoder(inputs)
            features.append(feature.cpu().numpy())  # Move features back to CPU and convert to numpy
            labels.append(targets.numpy())

    features = np.vstack(features)
    labels = np.hstack(labels)

    return features, labels


In [5]:
### load embeddings ###
with open('data/text2embeddings.pkl', 'rb') as f:
    text2embeddings = pickle.load(f)

# pretrained_hae = PatientAutoencoder.load_from_checkpoint(checkpoint_path='checkpoints/last.ckpt')
# encoder = nn.Sequential(
#     pretrained_hae.encounter_autoencoder,
#     pretrained_hae.patient_encoder
# )
# for param in encoder.parameters():
#     param.requires_grad = False

num = len([f for f in os.listdir("./checkpoints_tmp") if 'last' in f]) - 1
pretrained_hae = PatientAutoencoder.load_from_checkpoint(checkpoint_path=f'checkpoints_tmp/last-v{num}.ckpt')
encoder = nn.Sequential(
    pretrained_hae.encounter_autoencoder,
    pretrained_hae.patient_encoder
)



### setup dataloaders ###
train_ds = PatientDataset(train_data + val_data, text2embeddings, undersample=True)
test_ds = PatientDataset(test_data, text2embeddings, undersample=False)

train_dl = DataLoader(train_ds, batch_size=20, shuffle=True, collate_fn=collate_fn, num_workers=5, persistent_workers=True)
test_dl = DataLoader(test_ds, batch_size=20, collate_fn=collate_fn, num_workers=5)

# Extract features
X_train, y_train = extract_features(train_dl, encoder)
X_test, y_test = extract_features(test_dl, encoder)


## Kneighbors

In [10]:
# Define the model
kn = KNeighborsClassifier()

# Specify parameters and distributions to sample from
param_dist = {
    'n_neighbors': [2, 4, 6, 8, 10],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [1, 10, 30, 50, 70],
    'p': [1.0, 1.2, 1.4, 1.6, 1.8, 2.0]
}

# Random search of parameters
random_search = RandomizedSearchCV(estimator=kn, param_distributions=param_dist, n_iter=200, cv=10, verbose=0, random_state=42, n_jobs=-1)

# Fit the random search model
random_search.fit(X_train, y_train)


In [11]:
# Predict on test data with the best parameters
y_pred = random_search.predict(X_test)

# Print results
print("Best parameters found: ", random_search.best_params_)
print("Classification report:\n", classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

Best parameters found:  {'weights': 'distance', 'p': 1.6, 'n_neighbors': 6, 'leaf_size': 10, 'algorithm': 'brute'}
Classification report:
               precision    recall  f1-score   support

           0       0.80      0.41      0.54       119
           1       0.21      0.61      0.32        31

    accuracy                           0.45       150
   macro avg       0.51      0.51      0.43       150
weighted avg       0.68      0.45      0.50       150

Accuracy: 0.4533333333333333


In [12]:
confusion_matrix(y_test, y_pred)

array([[49, 70],
       [12, 19]])

## SVM

In [9]:
from sklearn.model_selection import cross_val_score

c_s = np.logspace(0.1, 4, 20)
gamma_s = np.logspace(-9, 3, 13) #['scale', 'auto'] # np.logspace(-9, 3, 7),
kernel_s = ['rbf'] #['linear', 'rbf', 'poly', 'sigmoid']

for c in c_s:
    for gamma in gamma_s:
        for kernel in kernel_s:
            clf = SVC(kernel=kernel, C=c,gamma=gamma, random_state=42).fit(X_train, y_train)
            scores = cross_val_score(clf, X_train, y_train, cv=5)
            print(f"{c} | {gamma} | {kernel} | {scores.mean()}")
            y_pred = clf.predict(X_test)
            print(confusion_matrix(y_pred, y_test))

1.258925411794167 | 9.999999999999999e-10 | rbf | 0.4951807228915662
[[120  30]
 [  0   0]]
1.258925411794167 | 1e-08 | rbf | 0.4951807228915662
[[120  30]
 [  0   0]]
1.258925411794167 | 1e-07 | rbf | 0.4951807228915662
[[120  30]
 [  0   0]]
1.258925411794167 | 1e-06 | rbf | 0.4951807228915662
[[120  30]
 [  0   0]]
1.258925411794167 | 9.999999999999999e-06 | rbf | 0.4951807228915662
[[120  30]
 [  0   0]]
1.258925411794167 | 9.999999999999999e-05 | rbf | 0.4951807228915662
[[120  30]
 [  0   0]]
1.258925411794167 | 0.001 | rbf | 0.4951807228915662
[[120  30]
 [  0   0]]
1.258925411794167 | 0.01 | rbf | 0.4951807228915662
[[120  30]
 [  0   0]]
1.258925411794167 | 0.09999999999999999 | rbf | 0.4951807228915662
[[120  30]
 [  0   0]]
1.258925411794167 | 1.0 | rbf | 0.4975616752725186
[[53 16]
 [67 14]]
1.258925411794167 | 10.0 | rbf | 0.5047045324153758
[[28  8]
 [92 22]]
1.258925411794167 | 100.0 | rbf | 0.5166092943201377
[[40  8]
 [80 22]]
1.258925411794167 | 1000.0 | rbf | 0.51184

In [21]:
# 1.258925411794167 | 10.0 | rbf | 0.5299683544303797
# [[71 22]
#  [46 11]]


# 1.258925411794167 | 1000.0 | rbf | 0.5099683544303797


array([1.00000000e-04, 2.63665090e-04, 6.95192796e-04, 1.83298071e-03,
       4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04])

In [1]:
# Define the model
svm = SVC(verbose=0)

# Specify parameters and distributions to sample from
param_dist = {
    'C': np.logspace(0.1, 4, 20),
    'gamma': np.logspace(-9, 3, 13),
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid']
}

# Random search of parameters
random_search = RandomizedSearchCV(estimator=svm, param_distributions=param_dist, n_iter=50, cv=3, verbose=0, random_state=42, n_jobs=-1)

# Fit the random search model
random_search.fit(X_train, y_train)


NameError: name 'SVC' is not defined

In [22]:
# Predict on test data with the best parameters
y_pred = random_search.predict(X_test)

# Print results
print("Best parameters found: ", random_search.best_params_)
print("Classification report:\n", classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

Best parameters found:  {'weights': 'uniform', 'p': 2.0, 'n_neighbors': 2, 'leaf_size': 10, 'algorithm': 'auto'}
Classification report:
               precision    recall  f1-score   support

           0       0.83      0.74      0.78       121
           1       0.26      0.38      0.31        29

    accuracy                           0.67       150
   macro avg       0.54      0.56      0.54       150
weighted avg       0.72      0.67      0.69       150

Accuracy: 0.6666666666666666


In [20]:
confusion_matrix(y_test, y_pred)

array([[89, 32],
       [18, 11]])