In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
import pickle
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
with open('BERT_model_final.pkl', 'rb') as file:
    model = pickle.load(file)

In [None]:
with open("tokenizedBERT.pkl", "rb") as f:
    all_tokenized = pickle.load(f)

In [None]:
def tokenize_query(query_text, tokenizer_name='emilyalsentzer/Bio_ClinicalBERT', max_length=128):

    tokenizer = BertTokenizer.from_pretrained(tokenizer_name)

    inputs = tokenizer.encode_plus(
        query_text,
        None,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_token_type_ids=True,
        return_tensors='pt'
    )

    tokenized_query = {
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
    }

    return tokenized_query

query_text = "There are seviere symptoms of fever while discharged , with blood pressure of 80"
tokenized_query = tokenize_query(query_text)
print("Tokenized Query:", tokenized_query)


Tokenized Query: {'input_ids': tensor([[  101,  1175,  1132, 14516, 15339,  1162,  8006,  1104, 10880,  1229,
         15207,   117,  1114,  1892,  2997,  1104,  2908,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0

In [None]:
query_input_ids = tokenized_query['input_ids']
query_attention_mask = tokenized_query['attention_mask']
query_input_ids_np = query_input_ids.cpu().numpy().reshape(1, -1)
query_attention_mask_np = query_attention_mask.cpu().numpy().reshape(1, -1)

similarities = []
for record in all_tokenized:
    record_input_ids = record['input_ids']
    record_attention_mask = record['attention_mask']


    record_input_ids_np = record_input_ids.cpu().numpy().reshape(1, -1)
    record_attention_mask_np = record_attention_mask.cpu().numpy().reshape(1, -1)


    record_input_ids_np = record_input_ids_np[:, :query_input_ids_np.shape[1]]
    record_attention_mask_np = record_attention_mask_np[:, :query_attention_mask_np.shape[1]]


    sim = cosine_similarity(query_input_ids_np, record_input_ids_np)
    similarities.append(sim)

similarities_array = np.array(similarities)
top_10_indices = similarities_array.argsort(axis=None)[-10:][::-1]
top_10_records = [all_tokenized[idx] for idx in top_10_indices]

print("Top 10 Similar Records:")
for i, record in enumerate(top_10_records, start=1):
    print(f"Record {i}: {record}")

Top 10 Similar Records:
Record 1: {'input_ids': tensor([[  101,  1120, 13119,  ...,  4980, 14452,   102],
        [  101, 19538, 14541,  ...,  1394,  1818,   102],
        [  101, 22196,  1545,  ..., 12754,  3324,   102],
        ...,
        [  101, 14402,  1358,  ...,  4578, 16936,   102],
        [  101, 21692, 20581,  ...,  3862,  1527,   102],
        [  101,   175,  1181,  ..., 11109,  4351,   102]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'), 'labels': tensor([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1], device='cuda:0')}
Record 2: {'input_ids': tensor([[  101,  1120, 13119,  ...,  1161,  5855,   102],
        [  101, 22723,  1604,  ..., 14541,  1545,   102],
        [  101, 22196, 18202,  ..., 12602,   174,   102],
        ...,
        [  101, 22148,

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def predict_record(record):

    input_ids = record['input_ids'].to(device)
    attention_mask = record['attention_mask'].to(device)

    model.eval()

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        _, prediction = torch.max(outputs.logits, dim=1)

    return prediction

def predict_records(records):
    predictions = []
    for record in records:
        prediction = predict_record(record)
        predictions.extend(prediction.tolist())

    counts = torch.bincount(torch.tensor(predictions))

    majority_prediction = torch.argmax(counts)
    return majority_prediction.item()


final_prediction = predict_records(top_10_records)

if final_prediction == 0:
    print("No Readmission Risk within 30 days")
elif final_prediction == 1:
    print("Readmission Risk within 30 days")

Readmission Risk within 30 days


In [None]:
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")

generator_tokenizer = BertTokenizer.from_pretrained('emilyalsentzer/Bio_ClinicalBERT')

def convert_tokenized_query_to_text(tokenized_query):
    input_ids = tokenized_query['input_ids'].tolist()[0]
    text = generator_tokenizer.decode(input_ids, skip_special_tokens=True)
    return text

def convert_records_to_text(records):
    texts = []
    for record in records[:1]:
        input_ids = record['input_ids'].tolist()[0]
        text = generator_tokenizer.decode(input_ids, skip_special_tokens=True)
        texts.append(text)
    return texts


def generate_input_for_gpt2(query_text, prediction_result):
    combined_text = query_text + ". "
    combined_text += ". Prediction: " + prediction_result
    return combined_text

retrieved_texts = convert_records_to_text(top_10_records)
query_text_decoded = convert_tokenized_query_to_text(tokenized_query)



In [None]:
prediction_result = "No Readmission Risk within 30 days" if final_prediction == 0 else "Readmission Risk within 30 days"

input_for_gpt2 = generate_input_for_gpt2(query_text_decoded, prediction_result)

input_ids = gpt2_tokenizer.encode(input_for_gpt2, return_tensors='pt')

output = gpt2_model.generate(
    input_ids,
    max_new_tokens=300,
    num_return_sequences=1,
    temperature=0.7,
    repetition_penalty=1.0,
    top_k=100,
    top_p=0.95
)

generated_text = gpt2_tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


there are seviere symptoms of fever while discharged, with blood pressure of 80.. Prediction: Readmission Risk within 30 days of discharge.

The following are the symptoms of seviere:

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache

Severe headache
