In [None]:
# !pip install torch transformers pandas scikit-learn


### data prep:

In [None]:
# import pandas as pd

# # Load facts CSV
# facts_df = pd.read_csv(
#     "/home/liorkob/M.Sc/thesis/data/drugs/processed_verdicts_with_gpt.csv",
#     encoding="utf-8-sig",
#     quotechar='"'
# )

# # Load similarity CSV
# pairs_df = pd.read_csv(
#     "/home/liorkob/M.Sc/thesis/data/drugs/similarity_gt_drugs.csv",
#     encoding="utf-8-sig"
# )

# # Clean whitespace
# facts_df["verdict"] = facts_df["verdict"].astype(str).str.strip()
# pairs_df["verdict_1"] = pairs_df["verdict_1"].astype(str).str.strip()
# pairs_df["verdict_2"] = pairs_df["verdict_2"].astype(str).str.strip()

# facts_df = facts_df.drop_duplicates(subset="verdict")

# # Merge paragraph A
# merged = pairs_df.merge(
#     facts_df[["verdict", "extracted_gpt_facts"]],
#     left_on="verdict_1",
#     right_on="verdict",
#     how="left"
# ).rename(columns={"extracted_gpt_facts": "verdict_a_paragraph"}).drop(columns=["verdict"])

# # Merge paragraph B
# merged = merged.merge(
#     facts_df[["verdict", "extracted_gpt_facts"]],
#     left_on="verdict_2",
#     right_on="verdict",
#     how="left"
# ).rename(columns={"extracted_gpt_facts": "verdict_b_paragraph"}).drop(columns=["verdict"])



# # Rename similarity column
# merged = merged.rename(columns={"Similarity": "similarity_score"})

# # Drop rows with missing paragraphs
# final_df = merged.dropna(subset=["verdict_a_paragraph", "verdict_b_paragraph"])

# # Save
# output_path = "/home/liorkob/M.Sc/thesis/data/drugs/verdict_paragraph_pairs.csv"
# final_df[["verdict_a_paragraph", "verdict_b_paragraph", "similarity_score"]].to_csv(
#     output_path,
#     index=False,
#     encoding="utf-8-sig"
# )

# print(f"✅ Saved {len(final_df)} valid paragraph pairs to:")
# print(output_path)
# missing_a = merged["verdict_a_paragraph"].isna().sum()
# missing_b = merged["verdict_b_paragraph"].isna().sum()
# print(f"🔍 Missing A: {missing_a}, Missing B: {missing_b}")


In [2]:
import os
import pandas as pd

# Paths
tag_dir = '/home/liorkob/M.Sc/thesis/data/drugs/tag_citations'
gpt_facts_path = '/home/liorkob/M.Sc/thesis/data/drugs/processed_verdicts_with_gpt.csv'
output_path = '/home/liorkob/M.Sc/thesis/data/drugs/verdict_pairs_with_similarity.csv'

# Load verdict facts
verdict_facts = pd.read_csv(gpt_facts_path)
facts_dict = dict(zip(verdict_facts['verdict'], verdict_facts['extracted_gpt_facts']))

# Collect data
rows = []
for file in os.listdir(tag_dir):
    if file.endswith('.csv'):
        verdict_a = file.replace('.csv', '')
        file_path = os.path.join(tag_dir, file)
        
        # Skip empty files
        if os.path.getsize(file_path) == 0:
            continue
        
        try:
            df = pd.read_csv(file_path)
        except pd.errors.EmptyDataError:
            continue

        for _, row in df.iterrows():
            if row['predicted_label'] == 1:
                verdict_b = row['citation']
                a_facts = facts_dict.get(verdict_a, "")
                b_facts = facts_dict.get(verdict_b, "")
                rows.append([verdict_a, a_facts, verdict_b, b_facts, 1])

# Save result
result_df = pd.DataFrame(rows, columns=[
    'verdict_a_name', 'verdict_a_extracted_gpt_facts',
    'verdict_b_name', 'verdict_b_extracted_gpt_facts', 'similarity_score'
])
result_df.to_csv(output_path, index=False)


### Model:

In [None]:
from transformers import AutoTokenizer, AutoModel
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
from torch.optim import Adam
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# ----- Model -----
class SiameseHeBERT(nn.Module):
    def __init__(self, model_name='avichr/heBERT'):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        hidden = self.encoder.config.hidden_size

        self.regressor = nn.Sequential(
            nn.Linear(hidden * 2, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def mean_pooling(self, model_output, attention_mask):
        token_embeddings = model_output.last_hidden_state
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size())
        return (token_embeddings * input_mask_expanded).sum(1) / input_mask_expanded.sum(1)

    def forward(self, ids_a, mask_a, ids_b, mask_b):
        out_a = self.encoder(ids_a, attention_mask=mask_a)
        out_b = self.encoder(ids_b, attention_mask=mask_b)
        vec_a = self.mean_pooling(out_a, mask_a)
        vec_b = self.mean_pooling(out_b, mask_b)
        combined = torch.cat([vec_a, vec_b], dim=1)
        return self.regressor(combined).squeeze()

# ----- Dataset -----
class VerdictDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=128):
        self.df = df.copy()
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.df['norm_score'] = (df['similarity_score'] - 1) / 2  # [1,3] → [0,1]

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        enc_a = self.tokenizer(row['verdict_a_paragraph'], truncation=True, padding='max_length',
                               max_length=self.max_len, return_tensors='pt')
        enc_b = self.tokenizer(row['verdict_b_paragraph'], truncation=True, padding='max_length',
                               max_length=self.max_len, return_tensors='pt')
        return {
            'input_ids_a': enc_a['input_ids'].squeeze(),
            'attention_mask_a': enc_a['attention_mask'].squeeze(),
            'input_ids_b': enc_b['input_ids'].squeeze(),
            'attention_mask_b': enc_b['attention_mask'].squeeze(),
            'label': torch.tensor(row['norm_score'], dtype=torch.float)
        }

# ----- Training Function -----
def train(model, dataloader, optimizer, device, epochs=20):
    model.train()
    loss_fn = nn.MSELoss()
    loss_history = []
    acc_history = []

    for epoch in range(epochs):
        total_loss = 0
        correct = 0
        total = 0

        print(f"\nEpoch {epoch+1}/{epochs}")
        for batch in tqdm(dataloader):
            for key in batch:
                batch[key] = batch[key].to(device)

            pred = model(batch['input_ids_a'], batch['attention_mask_a'],
                         batch['input_ids_b'], batch['attention_mask_b'])
            loss = loss_fn(pred, batch['label'])

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            # Accuracy calculation
            pred_scores = 1 + pred.detach().cpu().numpy() * 2
            label_scores = 1 + batch['label'].detach().cpu().numpy() * 2
            pred_classes = np.round(pred_scores)
            label_classes = np.round(label_scores)
            correct += (pred_classes == label_classes).sum()
            total += len(label_classes)

        avg_loss = total_loss / len(dataloader)
        acc = correct / total

        loss_history.append(avg_loss)
        acc_history.append(acc)

        print(f"Average Loss: {avg_loss:.4f} | Accuracy: {acc:.4f}")

    return loss_history, acc_history




# ----- Load Data -----
df = pd.read_csv('/home/liorkob/M.Sc/thesis/data/drugs/verdict_paragraph_pairs.csv')
tokenizer = AutoTokenizer.from_pretrained('avichr/heBERT')
dataset = VerdictDataset(df, tokenizer)

# ----- Train/Test Split -----

from sklearn.model_selection import train_test_split

df['norm_score'] = (df['similarity_score'] - 1) / 2  # ensure this column exists

train_df, test_df = train_test_split(
    df, test_size=0.2, stratify=np.round(df['norm_score']), random_state=42
)

train_dataset = VerdictDataset(train_df, tokenizer)
test_dataset = VerdictDataset(test_df, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# ----- Train -----
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SiameseHeBERT().to(device)

optimizer = Adam(model.parameters(), lr=2e-5)

loss_history, acc_history = train(model, train_loader, optimizer, device)

# Plot loss
import matplotlib.pyplot as plt

epochs = range(1, len(loss_history) + 1)
plt.figure(figsize=(8, 5))
plt.plot(epochs, loss_history, marker='o')
plt.title("Training Loss Over Epochs")
plt.xlabel("Epoch")
plt.ylabel("Loss (MSE)")
plt.grid(True)
plt.tight_layout()
plt.show()



plt.figure(figsize=(8, 5))
plt.plot(epochs, acc_history, marker='o')
plt.title("Training acc_history Over Epochs")
plt.xlabel("Epoch")
plt.ylabel("acc")
plt.grid(True)
plt.tight_layout()
plt.show()



### #2 model - cross validation

In [15]:
from transformers import AutoTokenizer, AutoModel
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
import pandas as pd
from torch.optim import Adam
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
import pickle

from sklearn.utils.class_weight import compute_class_weight

CUDA_LAUNCH_BLOCKING=1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name='/home/liorkob/M.Sc/thesis/similarity-model/hebert-mlm-verdicts/final'
# model_name="avichr/heBERT"
# ----- Model -----
class SiameseHeBERT(nn.Module):
    def __init__(self, model_name=model_name):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        hidden = self.encoder.config.hidden_size

        self.classifier = nn.Sequential(
            nn.Linear(hidden * 2, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, 1)  # Binary output (logit)
        )

    def mean_pooling(self, model_output, attention_mask):
        token_embeddings = model_output.last_hidden_state
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size())
        return (token_embeddings * input_mask_expanded).sum(1) / input_mask_expanded.sum(1)

    def forward(self, ids_a, mask_a, ids_b, mask_b):
        out_a = self.encoder(ids_a, attention_mask=mask_a)
        out_b = self.encoder(ids_b, attention_mask=mask_b)
        vec_a = self.mean_pooling(out_a, mask_a)
        vec_b = self.mean_pooling(out_b, mask_b)
        combined = torch.cat([vec_a, vec_b], dim=1)
        return self.classifier(combined).squeeze(-1)

# ----- Dataset -----
class VerdictDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=128):
        self.df = df.copy()
        self.tokenizer = tokenizer
        self.max_len = max_len
        # self.df['class_label'] = (df['similarity_score'] - 1).astype(int)  # class 0,1,2
        valid_scores = [1, 2, 3]
        self.df = self.df[self.df['similarity_score'].isin(valid_scores)]
        # self.df['class_label'] = (self.df['similarity_score'] - 1).astype(int)
        self.df['class_label'] = (self.df['similarity_score'] == 3).astype(int)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        enc_a = self.tokenizer(row['verdict_a_paragraph'], truncation=True, padding='max_length',
                               max_length=self.max_len, return_tensors='pt')
        enc_b = self.tokenizer(row['verdict_b_paragraph'], truncation=True, padding='max_length',
                               max_length=self.max_len, return_tensors='pt')
        return {
            'input_ids_a': enc_a['input_ids'].squeeze(),
            'attention_mask_a': enc_a['attention_mask'].squeeze(),
            'input_ids_b': enc_b['input_ids'].squeeze(),
            'attention_mask_b': enc_b['attention_mask'].squeeze(),
            'label': torch.tensor(row['class_label'], dtype=torch.float)
        }

# ----- Train Function -----
def train(model, dataloader, optimizer, device,pos_weight_tensor,epochs=15):
    model.train()
    loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight_tensor)
    for epoch in range(epochs):
        total_loss, correct, total = 0, 0, 0
        for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}"):
            for key in batch:
                batch[key] = batch[key].to(device)

            logits = model(batch['input_ids_a'], batch['attention_mask_a'],
                          batch['input_ids_b'], batch['attention_mask_b'])
            loss = loss_fn(logits, batch['label'])

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            preds = (torch.sigmoid(logits) >= 0.5).long()
            correct += (preds == batch['label']).sum().item()
            total += batch['label'].size(0)

        print(f"Loss: {total_loss / len(dataloader):.4f}, Accuracy: {correct / total:.4f}")

# ----- Evaluation Function -----
def evaluate(model, dataloader, device):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch in dataloader:
            for key in batch:
                batch[key] = batch[key].to(device)

            logits = model(batch['input_ids_a'], batch['attention_mask_a'],
                          batch['input_ids_b'], batch['attention_mask_b'])
            preds = (torch.sigmoid(logits) >= 0.5).long()
            correct += (preds == batch['label']).sum().item()
            total += batch['label'].size(0)

    return correct / total

# ----- Main K-Fold Script -----
df = pd.read_csv('/home/liorkob/M.Sc/thesis/data/drugs/verdict_paragraph_pairs__.csv')

tokenizer = AutoTokenizer.from_pretrained("avichr/heBERT")
dataset = VerdictDataset(df, tokenizer)
print(df['similarity_score'].unique())
print(dataset.df['class_label'].unique())
print("Similarity scores:", df['similarity_score'].unique())
print("Label distribution:", dataset.df['class_label'].value_counts())
assert dataset.df['class_label'].isin([0, 1, 2]).all(), "Invalid label detected!"

# ----- Compute pos_weight -----
labels = dataset.df['class_label']
num_pos = (labels == 1).sum()
num_neg = (labels == 0).sum()
pos_weight_value = num_neg / num_pos
pos_weight_tensor = torch.tensor(pos_weight_value, dtype=torch.float).to(device)

k = 2
kf = KFold(n_splits=k, shuffle=True, random_state=42)
fold_accuracies = []

# splits = list(kf.split(dataset))
# with open("fold_indices.pkl", "wb") as f:
#     pickle.dump(splits, f)

# Later:
with open("fold_indices.pkl", "rb") as f:
    splits = pickle.load(f)



for fold, (train_idx, test_idx) in enumerate(splits):
    print(f"\n--- Fold {fold + 1} ---")
    train_subset = Subset(dataset, train_idx)
    test_subset = Subset(dataset, test_idx)

    train_loader = DataLoader(train_subset, batch_size=8, shuffle=True)
    test_loader = DataLoader(test_subset, batch_size=8)

    model = SiameseHeBERT().to(device)
    optimizer = Adam(model.parameters(), lr=2e-5)
    train(model, train_loader, optimizer, device,pos_weight_tensor)
    acc = evaluate(model, test_loader, device)
    print(f"Test Accuracy: {acc:.4f}")
    fold_accuracies.append(acc)

print(f"\nAverage k-Fold Accuracy: {np.mean(fold_accuracies):.4f}")


from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score, precision_score, recall_score
acc = evaluate(model, test_loader, device)
print(f"Test Accuracy: {acc:.4f}")
fold_accuracies.append(acc)

print(f"\nAverage k-Fold Accuracy: {np.mean(fold_accuracies):.4f}")

def collect_predictions(model, dataloader, device):
    model.eval()
    probs, targets = [], []
    with torch.no_grad():
        for batch in dataloader:
            for key in batch:
                batch[key] = batch[key].to(device)
            logits = model(batch['input_ids_a'], batch['attention_mask_a'],
                           batch['input_ids_b'], batch['attention_mask_b'])
            prob = torch.sigmoid(logits).cpu().numpy()
            label = batch['label'].cpu().numpy()
            probs.extend(prob)
            targets.extend(label)
    return np.array(probs), np.array(targets)

probs, targets = collect_predictions(model, test_loader, device)
print(f"AUC-ROC: {roc_auc_score(targets, probs):.4f}")
preds = (probs >= 0.5).astype(int)  # Add this line

print(f"F1 Score: {f1_score(targets, preds):.4f}")
print(f"Precision: {precision_score(targets, preds):.4f}")
print(f"Recall: {recall_score(targets, preds):.4f}")


Some weights of BertModel were not initialized from the model checkpoint at /home/liorkob/M.Sc/thesis/similarity-model/hebert-mlm-verdicts/final and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[1 3 2 4]
[0 1]
Similarity scores: [1 3 2 4]
Label distribution: class_label
0    72
1    35
Name: count, dtype: int64

--- Fold 1 ---


Epoch 1: 100%|██████████| 7/7 [00:00<00:00, 12.56it/s]


Loss: 0.9651, Accuracy: 0.2642


Epoch 2: 100%|██████████| 7/7 [00:00<00:00, 12.21it/s]


Loss: 0.9690, Accuracy: 0.3962


Epoch 3: 100%|██████████| 7/7 [00:00<00:00, 12.98it/s]


Loss: 0.9589, Accuracy: 0.3774


Epoch 4: 100%|██████████| 7/7 [00:00<00:00, 13.44it/s]


Loss: 0.9479, Accuracy: 0.3774


Epoch 5: 100%|██████████| 7/7 [00:00<00:00, 13.53it/s]


Loss: 0.9284, Accuracy: 0.3774


Epoch 6: 100%|██████████| 7/7 [00:00<00:00, 13.55it/s]


Loss: 0.9074, Accuracy: 0.3774


Epoch 7: 100%|██████████| 7/7 [00:00<00:00, 13.55it/s]


Loss: 0.8668, Accuracy: 0.6038


Epoch 8: 100%|██████████| 7/7 [00:00<00:00, 12.54it/s]


Loss: 0.8072, Accuracy: 0.6604


Epoch 9: 100%|██████████| 7/7 [00:00<00:00, 12.56it/s]


Loss: 0.7179, Accuracy: 0.8113


Epoch 10: 100%|██████████| 7/7 [00:00<00:00, 13.12it/s]


Loss: 0.6443, Accuracy: 0.8679


Epoch 11: 100%|██████████| 7/7 [00:00<00:00, 13.69it/s]


Loss: 0.5318, Accuracy: 0.8868


Epoch 12: 100%|██████████| 7/7 [00:00<00:00, 13.49it/s]


Loss: 0.4433, Accuracy: 0.9623


Epoch 13: 100%|██████████| 7/7 [00:00<00:00, 13.69it/s]


Loss: 0.3587, Accuracy: 0.9811


Epoch 14: 100%|██████████| 7/7 [00:00<00:00, 13.75it/s]


Loss: 0.3016, Accuracy: 0.9623


Epoch 15: 100%|██████████| 7/7 [00:00<00:00, 13.77it/s]


Loss: 0.2772, Accuracy: 0.9811


Some weights of BertModel were not initialized from the model checkpoint at /home/liorkob/M.Sc/thesis/similarity-model/hebert-mlm-verdicts/final and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test Accuracy: 0.5185

--- Fold 2 ---


Epoch 1: 100%|██████████| 7/7 [00:00<00:00, 13.36it/s]


Loss: 0.9119, Accuracy: 0.3889


Epoch 2: 100%|██████████| 7/7 [00:00<00:00, 13.53it/s]


Loss: 0.8687, Accuracy: 0.8148


Epoch 3: 100%|██████████| 7/7 [00:00<00:00, 13.54it/s]


Loss: 0.8485, Accuracy: 0.7778


Epoch 4: 100%|██████████| 7/7 [00:00<00:00, 13.55it/s]


Loss: 0.8042, Accuracy: 0.8889


Epoch 5: 100%|██████████| 7/7 [00:00<00:00, 13.49it/s]


Loss: 0.7381, Accuracy: 0.8889


Epoch 6: 100%|██████████| 7/7 [00:00<00:00, 13.57it/s]


Loss: 0.6335, Accuracy: 0.9074


Epoch 7: 100%|██████████| 7/7 [00:00<00:00, 13.55it/s]


Loss: 0.5538, Accuracy: 0.9074


Epoch 8: 100%|██████████| 7/7 [00:00<00:00, 13.60it/s]


Loss: 0.4585, Accuracy: 0.9444


Epoch 9: 100%|██████████| 7/7 [00:00<00:00, 13.48it/s]


Loss: 0.4168, Accuracy: 0.9259


Epoch 10: 100%|██████████| 7/7 [00:00<00:00, 13.65it/s]


Loss: 0.3385, Accuracy: 0.9444


Epoch 11: 100%|██████████| 7/7 [00:00<00:00, 13.64it/s]


Loss: 0.2907, Accuracy: 0.9630


Epoch 12: 100%|██████████| 7/7 [00:00<00:00, 13.58it/s]


Loss: 0.2503, Accuracy: 0.9630


Epoch 13: 100%|██████████| 7/7 [00:00<00:00, 13.49it/s]


Loss: 0.2108, Accuracy: 1.0000


Epoch 14: 100%|██████████| 7/7 [00:00<00:00, 13.36it/s]


Loss: 0.2118, Accuracy: 0.9630


Epoch 15: 100%|██████████| 7/7 [00:00<00:00, 13.45it/s]


Loss: 0.1594, Accuracy: 0.9815
Test Accuracy: 0.7358

Average k-Fold Accuracy: 0.6272
Test Accuracy: 0.7358

Average k-Fold Accuracy: 0.6634
AUC-ROC: 0.7712
F1 Score: 0.5625
Precision: 0.7500
Recall: 0.4500


In [14]:
import shap
import torch.nn.functional as F

# נבחר batch לבדיקה
batch = next(iter(test_loader))
for key in batch:
    batch[key] = batch[key].to(device)

# יצירת embedding של input_ids_a
with torch.no_grad():
    embedded_inputs = model.encoder.embeddings(batch['input_ids_a'])  # shape: [batch, seq_len, hidden]

class EmbeddingWrapper(nn.Module):
    def __init__(self, base_model):
        super().__init__()
        self.encoder = base_model.encoder  # HeBERT
        self.pool = base_model.mean_pooling
        self.classifier = base_model.classifier

    def forward(self, embedded_input):
        # צרי attention mask שמתאים ל־embedding
        attention_mask = (embedded_input.abs().sum(-1) > 0).bool().to(embedded_input.device)

        # העבר את ה־embedding ל־AutoModel במקום input_ids
        output = self.encoder(inputs_embeds=embedded_input, attention_mask=attention_mask)
        pooled = self.pool(output, attention_mask)

        # כמו קודם - צד B אפס
        combined = torch.cat([pooled, torch.zeros_like(pooled)], dim=1)
        return self.classifier(combined)

# עטיפת המודל
wrapped_model = EmbeddingWrapper(model).to(device)

# הפעלת SHAP
explainer = shap.DeepExplainer(wrapped_model, embedded_inputs)
shap_values = explainer.shap_values(embedded_inputs, check_additivity=False)
print(np.array(shap_values).shape)


all_explanations = []

# נניח שיש לך batch בגודל 8 כמו בדוגמה שלך
for i in range(len(batch['input_ids_a'])):
    tokens = tokenizer.convert_ids_to_tokens(batch['input_ids_a'][i].cpu())
    attention_mask = batch['attention_mask_a'][i].cpu().bool().numpy()

    shap_tensor_i = shap_values[i][:, :, 0]  # צורה: [seq_len, embedding_dim]
    shap_tensor_valid = shap_tensor_i[attention_mask, :]
    token_level_values = shap_tensor_valid.mean(axis=1)

    valid_tokens = np.array(tokens)[attention_mask]

    all_explanations.append((valid_tokens, token_level_values))




# הנחת צורת SHAP values: [batch, seq_len, embedding_dim, 1]
shap_tensor = shap_values[0][:, :, 0]  # צורה: [128, 768]
print("Fixed SHAP shape:", shap_tensor.shape)

# ניקוי padding
attention_mask = batch['attention_mask_a'][0]
valid_indices = attention_mask.bool().cpu().numpy()

# סינון טוקנים אמיתיים
shap_tensor_valid = shap_tensor[valid_indices, :]  # [valid_seq_len, 768]
token_level_values = shap_tensor_valid.mean(axis=1)  # ציון אחד לכל טוקן

# טוקנים
tokens = tokenizer.convert_ids_to_tokens(batch['input_ids_a'][0].cpu())
valid_tokens = np.array(tokens)[valid_indices]

# יצירת הסבר והצגה
explanation = shap.Explanation(values=token_level_values, data=valid_tokens, base_values=0)
shap.plots.text(explanation)





(8, 128, 768, 1)
Fixed SHAP shape: (128, 768)


In [5]:
from collections import defaultdict
import numpy as np

token_scores = defaultdict(list)

# נניח שיש לך רשימה של זוגות: (valid_tokens, token_level_values)
for tokens, shap_vals in all_explanations:  # כל דוגמה
    for token, val in zip(tokens, shap_vals):
        token_scores[token].append(val)

# חשב ממוצע והשפעה כוללת
token_stats = {
    token: {
        'mean': np.mean(vals),
        'count': len(vals),
        'abs_mean': np.mean(np.abs(vals)),
    }
    for token, vals in token_scores.items()
}

# מיון לפי השפעה ממוצעת חיובית או שלילית
most_positive = sorted(token_stats.items(), key=lambda x: x[1]['mean'], reverse=True)[:10]
most_negative = sorted(token_stats.items(), key=lambda x: x[1]['mean'])[:10]
most_influential = sorted(token_stats.items(), key=lambda x: x[1]['abs_mean'], reverse=True)[:10]
print("🔴 Top Positive Tokens:")
for tok, stats in most_positive:
    print(f"{tok}: mean={stats['mean']:.4f}, count={stats['count']}")

print("\n🔵 Top Negative Tokens:")
for tok, stats in most_negative:
    print(f"{tok}: mean={stats['mean']:.4f}, count={stats['count']}")

print("\n🟡 Top Influential Tokens (abs):")
for tok, stats in most_influential:
    print(f"{tok}: abs_mean={stats['abs_mean']:.4f}, count={stats['count']}")


🔴 Top Positive Tokens:
01: mean=0.0019, count=1
הוע: mean=0.0011, count=1
בהתאם: mean=0.0006, count=5
##מד: mean=0.0003, count=1
ייבוא: mean=0.0003, count=2
אינה: mean=0.0002, count=1
ביכ: mean=0.0002, count=1
על: mean=0.0002, count=7
להתגורר: mean=0.0002, count=1
ואדם: mean=0.0002, count=1

🔵 Top Negative Tokens:
במזומן: mean=-0.0007, count=1
m: mean=-0.0006, count=1
שזהו: mean=-0.0004, count=1
פשע: mean=-0.0003, count=2
להסדר: mean=-0.0002, count=1
אותו: mean=-0.0002, count=1
סכום: mean=-0.0002, count=1
שהתקיים: mean=-0.0002, count=1
אוורור: mean=-0.0001, count=1
##בוס: mean=-0.0001, count=1

🟡 Top Influential Tokens (abs):
01: abs_mean=0.0019, count=1
הוע: abs_mean=0.0011, count=1
במזומן: abs_mean=0.0007, count=1
בהתאם: abs_mean=0.0007, count=5
m: abs_mean=0.0006, count=1
שזהו: abs_mean=0.0004, count=1
##מד: abs_mean=0.0003, count=1
פשע: abs_mean=0.0003, count=2
על: abs_mean=0.0003, count=7
ייבוא: abs_mean=0.0003, count=2


In [7]:
def merge_tokens_to_words(token_stats_list):
    """
    ממיר רשימת טוקנים עם השפעות למילים מקוריות מאוחדות
    """
    merged = []
    current_tokens = []
    current_scores = []

    for token, stats in token_stats_list:
        score = stats["mean"] if "mean" in stats else stats["abs_mean"]
        
        # התחלה של מילה חדשה
        if not token.startswith("##"):
            if current_tokens:
                # שמירה של המילה שהסתיימה
                word = tokenizer.convert_tokens_to_string(current_tokens)
                avg_score = np.mean(current_scores)
                merged.append((word, avg_score, len(current_scores)))
            # התחלה של מילה חדשה
            current_tokens = [token]
            current_scores = [score]
        else:
            # ממשיכים את אותה מילה
            current_tokens.append(token)
            current_scores.append(score)

    # הוספת המילה האחרונה
    if current_tokens:
        word = tokenizer.convert_tokens_to_string(current_tokens)
        avg_score = np.mean(current_scores)
        merged.append((word, avg_score, len(current_scores)))

    # מיון מחדש
    merged_sorted = sorted(merged, key=lambda x: x[1], reverse=True)
    return merged_sorted[:10]  # Top 10

# המרה של כל אחת מהקטגוריות
top_words_positive = merge_tokens_to_words(most_positive)
top_words_negative = merge_tokens_to_words(most_negative)
top_words_influential = merge_tokens_to_words(most_influential)

# הדפסה
print("\n🔴 Top Positive Words:")
for word, score, count in top_words_positive:
    print(f"{word}: mean={score:.4f}, parts={count}")

print("\n🔵 Top Negative Words:")
for word, score, count in top_words_negative:
    print(f"{word}: mean={score:.4f}, parts={count}")

print("\n🟡 Top Influential Words (abs):")
for word, score, count in top_words_influential:
    print(f"{word}: abs_mean={score:.4f}, parts={count}")



🔴 Top Positive Words:
01: mean=0.0019, parts=1
הוע: mean=0.0011, parts=1
בהתאםמד: mean=0.0004, parts=2
ייבוא: mean=0.0003, parts=1
אינה: mean=0.0002, parts=1
ביכ: mean=0.0002, parts=1
על: mean=0.0002, parts=1
להתגורר: mean=0.0002, parts=1
ואדם: mean=0.0002, parts=1

🔵 Top Negative Words:
אוורורבוס: mean=-0.0001, parts=2
שהתקיים: mean=-0.0002, parts=1
סכום: mean=-0.0002, parts=1
אותו: mean=-0.0002, parts=1
להסדר: mean=-0.0002, parts=1
פשע: mean=-0.0003, parts=1
שזהו: mean=-0.0004, parts=1
m: mean=-0.0006, parts=1
במזומן: mean=-0.0007, parts=1

🟡 Top Influential Words (abs):
01: abs_mean=0.0019, parts=1
הוע: abs_mean=0.0011, parts=1
בהתאם: abs_mean=0.0006, parts=1
ייבוא: abs_mean=0.0003, parts=1
על: abs_mean=0.0002, parts=1
שזהומד: abs_mean=-0.0000, parts=2
פשע: abs_mean=-0.0003, parts=1
m: abs_mean=-0.0006, parts=1
במזומן: abs_mean=-0.0007, parts=1
