In [None]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the tokenizer and encoder that the rest of this notebook refers to as
# "KBERT". The checkpoint id is kept in one constant so the tokenizer and the
# model can never drift apart.
# NOTE(review): the id below names the chinese-bert-wwm-ext checkpoint, not
# something called KBERT — confirm this is the intended backbone.
KBERT_CHECKPOINT = "hfl/chinese-bert-wwm-ext"
tokenizer = BertTokenizer.from_pretrained(KBERT_CHECKPOINT)
model = BertModel.from_pretrained(KBERT_CHECKPOINT)

In [None]:
def get_kbert_embedding(sentence):
    """Return the first-token ([CLS]) embedding of ``sentence`` as a NumPy array.

    The text is encoded with the module-level ``tokenizer`` (padded, and
    truncated to at most 512 tokens) and run through the module-level
    ``model`` with gradient tracking disabled.

    Args:
        sentence: Text handed directly to ``tokenizer``.

    Returns:
        numpy.ndarray: ``last_hidden_state[:, 0, :]`` — a 2-D array whose first
        axis is the batch and whose second axis is the hidden dimension.
    """
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=512)

    # Put the encoded tensors on the same device as the model so the function
    # keeps working after the model has been moved off the CPU.
    device = model.device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # Position 0 of the sequence axis is the [CLS] token, used here as the
    # sentence-level representation.
    cls_embedding = outputs.last_hidden_state[:, 0, :]

    # detach() + cpu() make the NumPy conversion valid regardless of where the
    # tensor lives or whether it still carries autograd history.
    return cls_embedding.detach().cpu().numpy()

In [None]:
def kbert_distance(sent1, sent2):
    """Mean squared error between the KBERT embeddings of two sentences.

    Args:
        sent1: First text; embedded via ``get_kbert_embedding``.
        sent2: Second text; embedded via ``get_kbert_embedding``.

    Returns:
        The mean of the element-wise squared differences of the two embeddings.

    Raises:
        ValueError: If the two embeddings do not have the same shape.
    """
    first_vec, second_vec = (get_kbert_embedding(text) for text in (sent1, sent2))

    # An element-wise difference is only meaningful for equally shaped arrays.
    shapes_match = first_vec.shape == second_vec.shape
    if not shapes_match:
        raise ValueError("Embeddings must have the same shape to calculate MSE distance")

    residual = first_vec - second_vec
    return np.square(residual).mean()

In [None]:
def kbert_based_divergence(Dt1, Dt2, Ut1, Ut2):
    """Compute a divergence score from four pairwise KBERT distances.

    The score is::

        (d(Dt1, Dt2) / d(Ut1, Ut2)) * 0.5 * (d(Dt2, Ut2) / d(Dt1, Ut1))

    where ``d`` is ``kbert_distance``.

    Args:
        Dt1: Document text at step 1.
        Dt2: Document text at step 2.
        Ut1: User summary at step 1.
        Ut2: User summary at step 2.

    Returns:
        The divergence value, or ``0`` when either denominator
        (``d(Ut1, Ut2)`` or ``d(Dt1, Ut1)``) is zero.
    """
    # Distance of each document to its own summary, and between the two
    # documents / the two summaries.
    document_dist1 = kbert_distance(Dt1, Ut1)
    document_dist2 = kbert_distance(Dt2, Ut2)
    doc_dist_t1_t2 = kbert_distance(Dt1, Dt2)
    user_dist_t1_t2 = kbert_distance(Ut1, Ut2)

    # Both distances below are used as divisors; a zero in either one would
    # yield inf/nan (NumPy scalars) or ZeroDivisionError (plain floats), so
    # treat either case as "no divergence".
    if user_dist_t1_t2 == 0 or document_dist1 == 0:
        return 0

    return (doc_dist_t1_t2 / user_dist_t1_t2) * 0.5 * (document_dist2 / document_dist1)


In [None]:
# Sum the KBERT divergence over the 'gen_summ' steps of every user trajectory,
# then average over the number of rows in `dataset`.
# NOTE(review): `tqdm`, `dataset` and `summ_df` are not defined in the cells
# shown here — presumably an earlier cell provides them; confirm.

# Start from zero inside this cell so it works on a fresh kernel and so that
# re-running the cell does not keep adding to a previous result.
overall_divergence = 0.0

for index, row in tqdm(dataset.iterrows(), total=len(dataset), desc="Processing user trajectories"):
    actions = row['Action'].split(',')
    docs = row['Docs'].split(',')
    # Entries are comma-separated and may carry surrounding spaces/quotes.
    cleaned_actions = [act.strip(" '") for act in actions]
    docs = [d.strip(" '") for d in docs]

    for i, (action, doc) in enumerate(zip(cleaned_actions, docs)):
        if action == 'gen_summ':
            # The summary id is read from the entry *after* the action's own
            # position; skip when that entry does not exist.
            if i + 1 >= len(docs):
                continue

            summ_id = docs[i+1]
            summ_row = summ_df[summ_df['SummID'] == summ_id]

            if summ_row.empty:
                continue

            Ut1 = summ_row['Summary'].values[0]
            # NOTE(review): this passes the NewsID value itself as the
            # "document text" — confirm it should not be looked up to the
            # article body first.
            Dt1 = summ_row['NewsID'].values[0]

            # NOTE(review): this inner scan chains every later 'gen_summ' pair
            # starting from step i, and the outer loop then restarts it from
            # each subsequent 'gen_summ', so later pairs appear to be counted
            # more than once — confirm this is intended.
            for j in range(i + 1, len(cleaned_actions)):
                next_action = cleaned_actions[j]
                if next_action == 'gen_summ':
                    # No entry after position j means no summary id to read,
                    # and the same holds for every later j.
                    if j + 1 >= len(docs):
                        break

                    next_summ_id = docs[j+1]
                    next_summ_row = summ_df[summ_df['SummID'] == next_summ_id]

                    if next_summ_row.empty:
                        break

                    Ut2 = next_summ_row['Summary'].values[0]
                    Dt2 = next_summ_row['NewsID'].values[0]

                    # Calculate KBERT-based document divergence
                    document_divergence = kbert_based_divergence(Dt1, Dt2, Ut1, Ut2)
                    overall_divergence += document_divergence

                    # Update Ut1 and Dt1 for next iteration
                    Ut1, Dt1 = Ut2, Dt2

# Average per trajectory; an empty dataset yields 0.0 instead of dividing by zero.
overall_divergence = overall_divergence / len(dataset) if len(dataset) else 0.0