In [1]:
!pip install torch transformers pandas scikit-learn




In [38]:
import pandas as pd

# Load facts CSV
facts_df = pd.read_csv(
    "/home/liorkob/M.Sc/thesis/data/drugs/processed_verdicts_with_gpt.csv",
    encoding="utf-8-sig",
    quotechar='"'
)

# Load similarity CSV
pairs_df = pd.read_csv(
    "/home/liorkob/M.Sc/thesis/data/drugs/similarity_gt_drugs.csv",
    encoding="utf-8-sig"
)

# Clean whitespace
facts_df["verdict"] = facts_df["verdict"].astype(str).str.strip()
pairs_df["verdict_1"] = pairs_df["verdict_1"].astype(str).str.strip()
pairs_df["verdict_2"] = pairs_df["verdict_2"].astype(str).str.strip()

# Merge paragraph A
merged = pairs_df.merge(
    facts_df[["verdict", "extracted_gpt_facts"]],
    left_on="verdict_1",
    right_on="verdict",
    how="left"
).rename(columns={"extracted_gpt_facts": "verdict_a_paragraph"}).drop(columns=["verdict"])

# Merge paragraph B
merged = merged.merge(
    facts_df[["verdict", "extracted_gpt_facts"]],
    left_on="verdict_2",
    right_on="verdict",
    how="left"
).rename(columns={"extracted_gpt_facts": "verdict_b_paragraph"}).drop(columns=["verdict"])

# Rename similarity column
merged = merged.rename(columns={"Similarity": "similarity_score"})

# Drop rows with missing paragraphs
final_df = merged.dropna(subset=["verdict_a_paragraph", "verdict_b_paragraph"])

# Save
output_path = "/home/liorkob/M.Sc/thesis/data/drugs/verdict_paragraph_pairs.csv"
final_df[["verdict_a_paragraph", "verdict_b_paragraph", "similarity_score"]].to_csv(
    output_path,
    index=False,
    encoding="utf-8-sig"
)

print(f"✅ Saved {len(final_df)} valid paragraph pairs to:")
print(output_path)
missing_a = merged["verdict_a_paragraph"].isna().sum()
missing_b = merged["verdict_b_paragraph"].isna().sum()
print(f"🔍 Missing A: {missing_a}, Missing B: {missing_b}")


✅ Saved 0 valid paragraph pairs to:
/home/liorkob/M.Sc/thesis/data/drugs/verdict_paragraph_pairs.csv
🔍 Missing A: 95, Missing B: 103


🔍 Examples from facts_df: ['ME-19-04-31343-39', 'ME-21-03-10139-230']
🔍 Examples from pairs_df (verdict_1): ['ME-16-07-11608-225', 'ME-16-11-63255-338', 'ME-16-12-26620-433', 'ME-17-01-59971-328', 'ME-17-01-620-279', 'ME-17-03-30876-747', 'ME-17-03-30876-747', 'ME-17-04-17677-800', 'ME-18-02-5680-55', 'ME-18-02-5680-55']
🔍 Examples from pairs_df (verdict_2): ['ME-22-12-47987-640', 'ME-18-01-73652-357', 'ME-16-06-6788-21', 'ME-14-01-16958-980', 'ME-16-12-19902-389', 'SH-23-02-41988-839', 'ME-17-01-59971-328', 'ME-14-01-16958-980', 'ME-22-04-7867-732', 'ME-15-11-31122-886']


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
import torch.nn.functional as F

class SiameseHeBERT(nn.Module):
    def __init__(self, model_name='avichr/heBERT'):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        self.pool = lambda x: x.last_hidden_state[:, 0]  # [CLS]

    def forward(self, ids_a, mask_a, ids_b, mask_b):
        vec_a = self.pool(self.encoder(input_ids=ids_a, attention_mask=mask_a))
        vec_b = self.pool(self.encoder(input_ids=ids_b, attention_mask=mask_b))
        sim = F.cosine_similarity(vec_a, vec_b)
        return sim


In [None]:
from torch.utils.data import Dataset
import torch

class VerdictDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=128):
        self.df = df.copy()
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Normalize score: 1 → 0.0, 2 → 0.5, 3 → 1.0
        self.df['norm_score'] = (df['similarity_score'] - 1) / 2

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        enc_a = self.tokenizer(row['verdict_a_paragraph'], truncation=True, padding='max_length',
                               max_length=self.max_len, return_tensors='pt')
        enc_b = self.tokenizer(row['verdict_b_paragraph'], truncation=True, padding='max_length',
                               max_length=self.max_len, return_tensors='pt')
        return {
            'input_ids_a': enc_a['input_ids'].squeeze(),
            'attention_mask_a': enc_a['attention_mask'].squeeze(),
            'input_ids_b': enc_b['input_ids'].squeeze(),
            'attention_mask_b': enc_b['attention_mask'].squeeze(),
            'label': torch.tensor(row['norm_score'], dtype=torch.float)
        }


In [None]:
from torch.optim import Adam
from tqdm import tqdm
import torch.nn as nn

def train(model, dataloader, optimizer, device):
    model.train()
    loss_fn = nn.MSELoss()

    for batch in tqdm(dataloader):
        for key in batch:
            batch[key] = batch[key].to(device)

        pred = model(batch['input_ids_a'], batch['attention_mask_a'],
                     batch['input_ids_b'], batch['attention_mask_b'])

        loss = loss_fn(pred, batch['label'])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(f"Loss: {loss.item():.4f}")


In [None]:
import pandas as pd
from torch.utils.data import DataLoader

df = pd.read_csv('/home/liorkob/M.Sc/thesis/data/drugs/verdict_paragraph_pairs.csv')  # with columns: verdict_a_paragraph, verdict_b_paragraph, similarity_score
tokenizer = AutoTokenizer.from_pretrained('avichr/heBERT')
dataset = VerdictDataset(df, tokenizer)
loader = DataLoader(dataset, batch_size=8, shuffle=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SiameseHeBERT().to(device)
optimizer = Adam(model.parameters(), lr=2e-5)

train(model, loader, optimizer, device)


In [None]:
model.eval()
with torch.no_grad():
    sim = model(ids_a, mask_a, ids_b, mask_b)
    pred_score = 1 + sim.item() * 2  # Convert [0,1] back to [1,3]
    print(f"Predicted similarity score: {pred_score:.2f}")
