In [None]:
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel

# Define the model class
class HierarchicalClassifier(nn.Module):
    """Clinical-Longformer encoder followed by three independent linear heads.

    The encoder's hidden state at sequence position 0 is used as the pooled
    representation. Each head receives its own dropout pass over that vector.

    Args:
        num_labels_h3: Output width of the third head (``fc_h3``). The first
            two heads (``fc_h1``, ``fc_h2``) each emit a single logit.
    """

    def __init__(self, num_labels_h3):
        super().__init__()
        # Sub-module attribute names define the state_dict keys, so they must
        # stay as they are for previously saved weights to load.
        self.longformer = AutoModel.from_pretrained("yikuan8/Clinical-Longformer")
        self.dropout = nn.Dropout(0.1)
        encoder_width = self.longformer.config.hidden_size
        self.fc_h1 = nn.Linear(encoder_width, 1)
        self.fc_h2 = nn.Linear(encoder_width, 1)
        self.fc_h3 = nn.Linear(encoder_width, num_labels_h3)

    def forward(self, input_ids, attention_mask):
        """Return the tuple ``(logits_h1, logits_h2, logits_h3)``.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.
        """
        encoded = self.longformer(input_ids, attention_mask=attention_mask)
        # Hidden state of the first token of every sequence in the batch.
        first_token_state = encoded.last_hidden_state[:, 0, :]
        heads = (self.fc_h1, self.fc_h2, self.fc_h3)
        # Dropout is applied once per head (a no-op in eval mode).
        return tuple(head(self.dropout(first_token_state)) for head in heads)

# Load the saved model
num_labels_h3 = 13  # Number of determinants
model = HierarchicalClassifier(num_labels_h3)
# map_location='cpu' lets a checkpoint that was saved from a GPU run load on a
# CPU-only machine; predict_batch moves the model to the chosen device later.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (consider weights_only=True if the installed torch supports it).
model.load_state_dict(torch.load('model_Clinical-Longformer.pth', map_location='cpu'))
model.eval()  # Inference mode: disables dropout

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("yikuan8/Clinical-Longformer")

# Preprocess the text data
def preprocess_texts(texts, tokenizer, max_length=512):
    """Tokenize every text and stack the results into two long tensors.

    Each text is encoded on its own with ``tokenizer.encode_plus``, requesting
    special tokens, truncation, and padding to ``max_length``.

    Args:
        texts: Iterable of texts to encode.
        tokenizer: Object exposing ``encode_plus`` whose result is indexable
            by ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length passed to the tokenizer for padding/truncation.

    Returns:
        Tuple ``(input_ids, attention_masks)`` of ``torch.long`` tensors with
        one row per text.
    """
    encodings = [
        tokenizer.encode_plus(
            document,
            None,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
        )
        for document in texts
    ]
    id_rows = [encoding['input_ids'] for encoding in encodings]
    mask_rows = [encoding['attention_mask'] for encoding in encodings]
    return (
        torch.tensor(id_rows, dtype=torch.long),
        torch.tensor(mask_rows, dtype=torch.long),
    )

# Function to make predictions for a batch of texts
def predict_batch(texts, model, tokenizer, device):
    """Run ``model`` on ``texts`` and threshold every head's sigmoid at 0.5.

    Args:
        texts: Texts to classify; tokenized with ``preprocess_texts``.
        model: Callable taking ``(input_ids, attention_mask)`` and returning
            three logit tensors. It is moved to ``device`` on every call.
        tokenizer: Tokenizer handed through to ``preprocess_texts``.
        device: Torch device on which the forward pass runs.

    Returns:
        Tuple ``(pred_h1, pred_h2, pred_h3)`` of boolean NumPy arrays, one per
        head, that are True where the sigmoid of the logit exceeds 0.5.
    """
    model.to(device)
    encoded_ids, encoded_mask = preprocess_texts(texts, tokenizer)
    encoded_ids = encoded_ids.to(device)
    encoded_mask = encoded_mask.to(device)

    def _to_flags(logits):
        # A probability strictly above 0.5 counts as a positive prediction.
        return torch.sigmoid(logits).cpu().numpy() > 0.5

    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        head_logits = model(encoded_ids, encoded_mask)
        flags_h1, flags_h2, flags_h3 = map(_to_flags, head_logits)

    return flags_h1, flags_h2, flags_h3

# Process the data in chunks and mini-batches
chunk_size = 10000  # Rows read from the CSV per chunk
batch_size = 32     # Texts sent through the model per forward pass
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Output schema, defined once: the input text, the two single-logit heads,
# then one column per output of the third head.
prediction_columns = [
    'text', 'opioid_pr_ab', 'determinant_pr_ab',
    *[f'determinant_{i}' for i in range(1, num_labels_h3 + 1)],
]

# Collect one DataFrame per chunk and concatenate once at the end. Calling
# pd.concat inside the loop would re-copy every accumulated row per chunk.
chunk_frames = []

# Load the discharge.csv file in chunks
for chunk in pd.read_csv('discharge.csv', chunksize=chunk_size):
    chunk_predictions = []
    # Missing texts are read as float NaN, which would be rejected by the
    # tokenizer and abort the run; treat them as empty strings instead.
    texts = chunk['text'].fillna('').tolist()
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        pred_h1, pred_h2, pred_h3 = predict_batch(batch_texts, model, tokenizer, device)
        # h1/h2 carry one value per text; h3 is spread over its own columns.
        chunk_predictions.extend(
            [text, h1[0], h2[0], *h3]
            for text, h1, h2, h3 in zip(batch_texts, pred_h1, pred_h2, pred_h3)
        )

    if chunk_predictions:
        chunk_frames.append(pd.DataFrame(chunk_predictions, columns=prediction_columns))

# Build the final DataFrame; fall back to an empty frame with the right
# columns when the input produced no rows.
if chunk_frames:
    predictions_df = pd.concat(chunk_frames, ignore_index=True)
else:
    predictions_df = pd.DataFrame(columns=prediction_columns)

# Save the predictions to a CSV file
predictions_df.to_csv('predictions_2ndjune.csv', index=False)

# Optional sanity check: reload the saved predictions and print the first 5 rows
#predictions_df = pd.read_csv('predictions_2ndjune.csv')
#print(predictions_df.head())
