In [None]:
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
from sentence_transformers import SentenceTransformer

In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [38]:
from sklearn.metrics import mean_squared_error


In [None]:
pubmed_bert = SentenceTransformer('pritamdeka/S-PubMedBert-MS-MARCO')

In [9]:
df_reports = pd.read_csv("data/reports_filtered.csv")
df_reports

Unnamed: 0.1,Unnamed: 0,pt_shsc_id,imaging_date,healthy,fracture_or_metastases,ServDescription,ReportDate,ReportText
0,147,SHSC-1AR4W-5L54C-ITDX3-TF89XE1YWW-ZKFBG-9VWKC-...,2010-01-20,0.0,0.0,Bone Mass Density High Risk - Multiple Sites,2010-01-21 11:39:00.140,bone densitometry (dxa): a baseline bone densi...
1,260,SHSC-1AR4W-5L54C-ITDX3-TF89XE1YWW-ZKFBG-9VWKC-...,2011-02-02,1.0,0.0,Bone Scan(Whole Body)Nuc Med,2011-02-02 16:24:11.000,technetium mdp bone scan findings: there is a ...
2,272,SHSC-1AR4W-5L54C-ITDX3-TF89XE1YWW-ZKFBG-9VWKC-...,2011-02-02,1.0,0.0,X-Ray Chest PA+LAT Routine,2011-02-02 09:12:24.000,chest pa and lateral reference:no previous the...
3,311,SHSC-1AR4W-5L54C-ITDX3-TF89XE1YWW-ZKFBG-9VWKC-...,2011-04-11,0.0,1.0,Abdomen + Pelvis CT with oral C+,2011-04-12 16:27:57.000,ct abdomen pelvis (enhanced) comparison: ct da...
4,389,SHSC-1AR4W-5L54C-ITDX3-TF89XE1YWW-ZKFBG-9VWKC-...,2011-06-06,0.0,1.0,Abdomen + Pelvis CT with oral C+,2011-06-06 15:08:04.000,ct abdomen pelvis (enhanced) comparison: ct da...
...,...,...,...,...,...,...,...,...
1362,60124,SHSC-ZT1YP-WP8FY-45FV0-HFN9E2J3E8-50SNT-5PP8Q-...,2012-01-27,1.0,0.0,Nephrostomy or NUT change,2012-01-27 13:15:45.000,right nephrostomy catheter exchange history: u...
1363,60138,SHSC-ZT1YP-WP8FY-45FV0-HFN9E2J3E8-50SNT-5PP8Q-...,2012-01-27,1.0,0.0,Kidney + Bladder US,2012-01-27 18:48:44.000,findings: right nephrostomy tube in situ. mark...
1364,60158,SHSC-ZT1YP-WP8FY-45FV0-HFN9E2J3E8-50SNT-5PP8Q-...,2012-08-22,0.0,1.0,Abdomen + Pelvis CT with oral C-,2012-08-23 10:57:52.000,ct abdomen and pelvis volumetric ct images hav...
1365,60172,SHSC-ZT1YP-WP8FY-45FV0-HFN9E2J3E8-50SNT-5PP8Q-...,2012-08-22,0.0,1.0,Bone Scan Whole Body+Extra Views+Flow,2012-08-22 15:04:50.000,technetium mdp bone scan whole body: history:p...


In [10]:
df_reports.dtypes

Unnamed: 0                  int64
pt_shsc_id                 object
imaging_date               object
healthy                   float64
fracture_or_metastases    float64
ServDescription            object
ReportDate                 object
ReportText                 object
dtype: object

In [11]:
embeddings = df_reports['ReportText'].apply(pubmed_bert.encode).tolist()
X = np.array(embeddings)
X.shape

(1367, 768)

In [12]:
labels = np.array(df_reports['fracture_or_metastases'] == 1.0)
labels

array([False, False, False, ...,  True,  True,  True])

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

In [None]:
# convert back to pytorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

  X_train = torch.tensor(X_train, dtype=torch.float32)
  X_test = torch.tensor(X_test, dtype=torch.float32)
  y_train = torch.tensor(y_train, dtype=torch.float32)  # Use `torch.long` for classification targets
  y_test = torch.tensor(y_test, dtype=torch.float32)


In [16]:
class EmbeddingsNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(EmbeddingsNN, self).__init__()
        
        self.bn1 = nn.BatchNorm1d(input_dim)
        self.mlp1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.bn2 = nn.BatchNorm1d(hidden_dim)
        self.mlp2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.bn1(x)
        x = self.mlp1(x)
        x = self.relu(x)
        x = self.bn2(x)
        x = self.mlp2(x)
        return x

In [None]:
def train_model(model, dataloader, criterion, optimizer, epochs=50):
    model.train()

    for epoch in range(epochs):
        running_loss = 0.0
        for inputs, targets in dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()

        avg_loss = running_loss / len(dataloader)
        print(f"Epoch [{epoch + 1}/{epochs}], Loss: {avg_loss:.4f}")

In [None]:
input_dim = 768
hidden_dim = 20
output_dim = 1
learning_rate = 0.001
batch_size = 8
epochs = 21         # empirically has converged at about 21

model = EmbeddingsNN(input_dim, hidden_dim, output_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [49]:
dataset = TensorDataset(X_train, y_train)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [None]:
train_model(model, dataloader, criterion, optimizer, epochs)

Epoch [1/100], Loss: 0.5530
Epoch [2/100], Loss: 0.2735
Epoch [3/100], Loss: 0.2360
Epoch [4/100], Loss: 0.2231
Epoch [5/100], Loss: 0.2208
Epoch [6/100], Loss: 0.2192
Epoch [7/100], Loss: 0.2197
Epoch [8/100], Loss: 0.2182
Epoch [9/100], Loss: 0.2180
Epoch [10/100], Loss: 0.2169
Epoch [11/100], Loss: 0.2175
Epoch [12/100], Loss: 0.2170
Epoch [13/100], Loss: 0.2202
Epoch [14/100], Loss: 0.2189
Epoch [15/100], Loss: 0.2178
Epoch [16/100], Loss: 0.2167
Epoch [17/100], Loss: 0.2162
Epoch [18/100], Loss: 0.2168
Epoch [19/100], Loss: 0.2168
Epoch [20/100], Loss: 0.2159
Epoch [21/100], Loss: 0.2167
Epoch [22/100], Loss: 0.2168
Epoch [23/100], Loss: 0.2166
Epoch [24/100], Loss: 0.2157
Epoch [25/100], Loss: 0.2159
Epoch [26/100], Loss: 0.2168
Epoch [27/100], Loss: 0.2176
Epoch [28/100], Loss: 0.2167
Epoch [29/100], Loss: 0.2166
Epoch [30/100], Loss: 0.2159
Epoch [31/100], Loss: 0.2165
Epoch [32/100], Loss: 0.2149
Epoch [33/100], Loss: 0.2157
Epoch [34/100], Loss: 0.2167
Epoch [35/100], Loss: 0

In [None]:

model.eval()
with torch.no_grad():
    predictions = model(X_test)
    predicted_labels = (predictions > 0.5).long().squeeze()

accuracy = accuracy_score(y_test, predicted_labels)
f1 = f1_score(y_test, predicted_labels)

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"F1 Score: {f1:.4f}")


Accuracy: 71.17%
F1 Score: 0.8316
