In [1]:
import numpy as np
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

import json
import random

from utils import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the raw records that are shuffled and split in the next cell.
# JSON is UTF-8 by specification, so the encoding is pinned explicitly instead
# of being left to the platform's locale default (which is not UTF-8 everywhere).
with open("data/sub_1000.json", "r", encoding="utf-8") as j_file:
    data = json.load(j_file)

In [3]:
# Fractions handed to split_list; presumably train / validation / test in that
# order, matching the unpacking below.
split_ratios = [0.7, 0.15, 0.15]

# Shuffle with a dedicated, seeded generator so that Restart & Run All always
# yields the same split (the seed matches the random_state used for the
# hyper-parameter search). The shuffle is still performed in place on `data`.
# Note: re-running only this cell reshuffles the already-shuffled list.
random.Random(42).shuffle(data)
train_data, val_data, test_data = split_list(data, split_ratios)

# The validation portion is folded into the training set; model selection
# later relies on cross-validation rather than a held-out validation split.
train_data += val_data
print(f"""
train: {len(train_data)}
 test: {len(test_data)}
""")


train: 850
 test: 150



In [4]:
def extract_features(dataloader, encoder, device='cuda'):
    """Run ``encoder`` over every batch of ``dataloader`` and collect the outputs.

    Args:
        dataloader: Iterable of batches. Each batch is indexed with
            ``'embeddings'`` (a tensor that is moved to ``device`` and fed to
            the encoder) and ``'label'`` (a tensor converted with ``.numpy()``).
        encoder: ``torch.nn.Module`` applied to each ``'embeddings'`` tensor.
            As a side effect it is moved to ``device`` and switched to eval
            mode; neither is restored afterwards.
        device: Device on which the encoder runs. If a CUDA device is
            requested but CUDA is not available, a ``RuntimeWarning`` is
            emitted and the CPU is used instead.

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays: the encoder outputs
        stacked with ``np.vstack`` and the labels joined with ``np.hstack``,
        both in iteration order.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    import warnings  # local import: keeps the function self-contained

    # Fall back to the CPU instead of crashing when CUDA was requested
    # (including via the default) but is not present on this machine.
    device = torch.device(device)
    if device.type == 'cuda' and not torch.cuda.is_available():
        warnings.warn(
            "CUDA is not available; extracting features on the CPU instead.",
            RuntimeWarning,
        )
        device = torch.device('cpu')

    features = []
    labels = []

    # Ensure the encoder is on the correct device and switch to evaluation mode
    encoder = encoder.to(device)
    encoder.eval()

    with torch.no_grad():  # inference only: no autograd graph is needed
        for batch in dataloader:
            inputs, targets = batch['embeddings'].to(device), batch['label']
            feature = encoder(inputs)
            # Bring results back to host memory before converting to NumPy.
            features.append(feature.cpu().numpy())
            labels.append(targets.numpy())

    if not features:
        # np.vstack([]) would fail with a far less helpful message.
        raise ValueError("dataloader yielded no batches; there are no features to extract")

    features = np.vstack(features)
    labels = np.hstack(labels)

    return features, labels


In [5]:
### load embeddings ###
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
with open('data/text2embeddings.pkl', 'rb') as f:
    text2embeddings = pickle.load(f)

# Frozen feature extractor: two sub-modules of the pretrained model applied in
# sequence, with gradients disabled on all of their parameters.
pretrained_hae = PatientAutoencoder.load_from_checkpoint(checkpoint_path='checkpoints/last.ckpt')
encoder = nn.Sequential(
    pretrained_hae.encounter_autoencoder,
    pretrained_hae.patient_encoder
)
for param in encoder.parameters():
    param.requires_grad = False

### setup dataloaders ###
train_ds = PatientDataset(train_data, text2embeddings)
# NOTE(review): val_data was already appended to train_data when the split was
# made, so val_ds / val_dl duplicate training samples and are not consumed
# below. They are kept only so the names remain defined.
val_ds = PatientDataset(val_data, text2embeddings)
test_ds = PatientDataset(test_data, text2embeddings)

# The loaders are used for a single feature-extraction pass, so:
#  * no shuffling: the records were already shuffled before splitting, and
#    shuffling here would only make the row order of X_train (and therefore
#    the cross-validation folds) differ from run to run unless the global
#    torch seed is fixed;
#  * no persistent workers: they would keep the worker processes alive after
#    the single pass for no benefit.
loader_kwargs = dict(batch_size=20, collate_fn=collate_fn, num_workers=5)
train_dl = DataLoader(train_ds, **loader_kwargs)
val_dl = DataLoader(val_ds, **loader_kwargs)
test_dl = DataLoader(test_ds, **loader_kwargs)

# Extract features
X_train, y_train = extract_features(train_dl, encoder)
X_test, y_test = extract_features(test_dl, encoder)


In [None]:
# Define the model
svm = SVC(verbose=1)

# Candidate values, log-spaced over several orders of magnitude.
c_values = np.logspace(-4, 4, 20)
gamma_values = np.logspace(-9, 3, 13)

# Specify the search space. `gamma` is the kernel coefficient for the 'rbf',
# 'poly' and 'sigmoid' kernels and has no effect on the linear kernel, so the
# linear kernel gets its own sub-grid without it. Sampling gamma for 'linear'
# would spend part of the search budget on fits that differ only in an ignored
# parameter.
param_dist = [
    {'kernel': ['linear'], 'C': c_values},
    {'kernel': ['rbf', 'poly', 'sigmoid'], 'C': c_values, 'gamma': gamma_values},
]

# NOTE(review): the extracted features are passed to the SVM without any
# scaling step in this notebook; SVMs are sensitive to feature scale, so
# consider standardizing them first — confirm whether the encoder output
# already has a suitable range.

# Random search of parameters
random_search = RandomizedSearchCV(
    estimator=svm,
    param_distributions=param_dist,
    n_iter=250,
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1,
)

# Fit the random search model
random_search.fit(X_train, y_train)

# Predict on test data with the best parameters (the search refits the best
# candidate on the full training set by default)
y_pred = random_search.predict(X_test)

# Print results
print("Best parameters found: ", random_search.best_params_)
print("Classification report:\n", classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
