In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import mean_squared_error

In [2]:
def sigma(u, v):
    """Return the L1 distance between ``u`` and ``v``.

    The element-wise difference ``u - v`` is taken, each entry is replaced
    by its absolute value, and all entries are summed into one scalar.
    This is the divergence measure sigma(e1, e2) used by the DegreeD loops.
    """
    elementwise_gap = np.abs(u - v)
    return elementwise_gap.sum()

In [3]:
# Load the three pre-computed embedding dictionaries used by the DegreeD loops.
# SECURITY: pickle.load can execute arbitrary code contained in the file, so
# these pickles must come from a trusted source.

# Document embeddings; looked up below by the ids found in the 'Docs' column.
with open('doc_embedding_dict.pkl', 'rb') as pickle_file:
    doc_embeddings = pickle.load(pickle_file)

# Summary embeddings, bound to the name `user_embeddings`. The trajectory loops
# look this dict up by the id of the document that follows a 'gen_summ' step.
# NOTE(review): the key type is not visible from this notebook - TODO confirm.
with open('summary_embedding_dict.pkl', 'rb') as pickle_file:
    user_embeddings = pickle.load(pickle_file)

# Title embeddings; looked up below by the id of the first clicked document.
with open('title_embedding_dict.pkl', 'rb') as pickle_file:
    title_embeddings = pickle.load(pickle_file)

In [4]:
from tqdm import tqdm

In [5]:
# Read the user-interaction trajectories from CSV (adjust the path to your file).
# The cells below read the 'UserID', 'Action' and 'Docs' columns of this frame.
interactions_csv = 'synthetic-original_D1_150_40.csv'
user_data = pd.read_csv(interactions_csv)

In [None]:
from ast import literal_eval

# 'Action' and 'Docs' hold Python list literals stored as text in the CSV.
# Every string cell is parsed into the real object; non-string cells are left
# untouched, so re-running this cell on already-parsed data is harmless.
tqdm.pandas(desc="Processing Action and Docs columns")
for list_column in ('Action', 'Docs'):
    user_data[list_column] = user_data[list_column].progress_apply(
        lambda cell: literal_eval(cell) if isinstance(cell, str) else cell
    )


Processing Action and Docs columns: 100%|████████████████████████████████████████████████████████████████████████| 399994/399994 [03:00<00:00, 2219.05it/s]
Processing Action and Docs columns:  70%|██████████████████████████████████████████████████▍                     | 280334/399994 [03:36<00:52, 2264.55it/s]

In [None]:
# Count how many different values appear in the 'UserID' column; this count is
# the divisor used when the DegreeD score is averaged further down.
user_id_column = user_data['UserID']
distinct_user_ids = user_id_column.nunique()

print("Number of distinct user IDs:", distinct_user_ids)


In [None]:
# Compute the dataset-level DegreeD score from the user trajectories.
#
# Each row of `user_data` supplies an 'Action' list and a parallel 'Docs' list.
# Document embeddings are accumulated along the trajectory; at every 'gen_summ'
# step their mean (Dt2) and the embedding looked up for the following document
# id (Ut2) are compared, via sigma, with the previous reference pair (Dt1, Ut1).
embedding_dim = 384  # length of the zero vector substituted when an id has no embedding
epsilon = 1e-7       # keeps the ratios finite when a distance is exactly zero
missing_embedding = np.zeros(embedding_dim)  # shared fallback; never modified in place

overall_divergence = 0
total_users = distinct_user_ids

# Only the two list columns are needed, so iterate over them directly instead
# of materialising a Series per row with iterrows().
trajectories = zip(user_data['Action'], user_data['Docs'])
for actions, doc_ids in tqdm(trajectories, total=len(user_data), desc="Processing User Trajectories"):
    # Reference pair for this trajectory. It is reset for every row so that a
    # 'gen_summ' occurring before any click can never reuse embeddings that
    # were left over from the previous row.
    Dt1 = None
    Ut1 = None
    seen_first_click = False
    doc_embeddings_up_to_t1 = []  # document embeddings since the last reference update
    total_divergence = 0

    # The last action has no following document, so it is never processed.
    num_steps = len(actions) - 1
    for t1 in range(num_steps):
        action = actions[t1]
        doc_id_t1 = doc_ids[t1]
        doc_id_t2 = doc_ids[t1 + 1]

        if action == 'click' and not seen_first_click:
            # The first click of the trajectory fixes the initial reference
            # pair: the document embedding and the title embedding of that doc.
            Dt1 = doc_embeddings.get(doc_id_t1, missing_embedding)
            Ut1 = title_embeddings.get(doc_id_t1, missing_embedding)
            seen_first_click = True
            doc_embeddings_up_to_t1.append(Dt1)

        elif action in ('click', 'skip'):
            # Later clicks and all skips only contribute to the running window.
            doc_embeddings_up_to_t1.append(doc_embeddings.get(doc_id_t1, missing_embedding))

        elif action == 'gen_summ':
            doc_embeddings_up_to_t1.append(doc_embeddings.get(doc_id_t1, missing_embedding))
            # The window holds at least the embedding appended just above.
            Dt2 = np.mean(doc_embeddings_up_to_t1, axis=0)
            Ut2 = user_embeddings.get(doc_id_t2, missing_embedding)

            if Dt1 is not None:
                doc_shift = sigma(Dt1, Dt2)
                summary_shift = sigma(Ut2, Ut1)

                # Ratio of the smaller shift to the larger one.
                doc_divergence = (min(doc_shift, summary_shift) + epsilon) / (max(doc_shift, summary_shift) + epsilon)

                # Penalty: previous document/summary gap relative to the current one.
                penalty = sigma(Dt1, Ut1) / (sigma(Dt2, Ut2) + epsilon)
                penalized_deps = doc_divergence * penalty

                total_divergence += penalized_deps * summary_shift
            # else: no reference pair exists yet (no click seen before this
            # 'gen_summ'), so there is nothing to compare against. The step
            # contributes no divergence and only establishes the reference.

            # Start a fresh window and move the reference pair forward.
            doc_embeddings_up_to_t1 = []
            Dt1 = Dt2
            Ut1 = Ut2

    # Average over the processed steps. Trajectories with fewer than two
    # actions have no steps and contribute zero instead of dividing by zero or
    # silently re-adding the previous row's value.
    row_divergence = total_divergence / num_steps if num_steps > 0 else 0.0

    overall_divergence += row_divergence

# NOTE(review): the sum above runs over rows while the divisor is the number of
# distinct UserID values; the two differ when a user has several rows - confirm
# that this is the intended normalisation.
# 0.51 is a fixed scaling constant carried over unchanged.
degree_d = 0.51 * overall_divergence / total_users if total_users else float('nan')

# NOTE(review): the label below says D2_150_20, but the CSV read earlier in this
# notebook is 'synthetic-original_D1_150_40.csv' - confirm which one is right.
print("Overall DegreeD in the dataset D2_150_20:", degree_d)


In [6]:
from ast import literal_eval

# Read the user-interaction trajectories from CSV (adjust the path to your file).
# The cells below read the 'UserID', 'Action' and 'Docs' columns of this frame.
interactions_csv = 'synthetic-original_D1_150_10.csv'
user_data = pd.read_csv(interactions_csv)

# 'Action' and 'Docs' hold Python list literals stored as text in the CSV.
# Every string cell is parsed into the real object; non-string cells are left
# untouched.
tqdm.pandas(desc="Processing Action and Docs columns")
for list_column in ('Action', 'Docs'):
    user_data[list_column] = user_data[list_column].progress_apply(
        lambda cell: literal_eval(cell) if isinstance(cell, str) else cell
    )

Processing Action and Docs columns: 100%|████████████████████████████████████| 399994/399994 [04:17<00:00, 1554.69it/s]
Processing Action and Docs columns: 100%|█████████████████████████████████████| 399994/399994 [08:14<00:00, 808.57it/s]


In [7]:
# Count how many different values appear in the 'UserID' column; this count is
# the divisor used when the DegreeD score is averaged further down.
user_id_column = user_data['UserID']
distinct_user_ids = user_id_column.nunique()

print("Number of distinct user IDs:", distinct_user_ids)


Number of distinct user IDs: 363285


In [8]:
# Compute the dataset-level DegreeD score from the user trajectories.
#
# Each row of `user_data` supplies an 'Action' list and a parallel 'Docs' list.
# Document embeddings are accumulated along the trajectory; at every 'gen_summ'
# step their mean (Dt2) and the embedding looked up for the following document
# id (Ut2) are compared, via sigma, with the previous reference pair (Dt1, Ut1).
embedding_dim = 384  # length of the zero vector substituted when an id has no embedding
epsilon = 1e-7       # keeps the ratios finite when a distance is exactly zero
missing_embedding = np.zeros(embedding_dim)  # shared fallback; never modified in place

overall_divergence = 0
total_users = distinct_user_ids

# Only the two list columns are needed, so iterate over them directly instead
# of materialising a Series per row with iterrows().
trajectories = zip(user_data['Action'], user_data['Docs'])
for actions, doc_ids in tqdm(trajectories, total=len(user_data), desc="Processing User Trajectories"):
    # Reference pair for this trajectory. It is reset for every row so that a
    # 'gen_summ' occurring before any click can never reuse embeddings that
    # were left over from the previous row.
    Dt1 = None
    Ut1 = None
    seen_first_click = False
    doc_embeddings_up_to_t1 = []  # document embeddings since the last reference update
    total_divergence = 0

    # The last action has no following document, so it is never processed.
    num_steps = len(actions) - 1
    for t1 in range(num_steps):
        action = actions[t1]
        doc_id_t1 = doc_ids[t1]
        doc_id_t2 = doc_ids[t1 + 1]

        if action == 'click' and not seen_first_click:
            # The first click of the trajectory fixes the initial reference
            # pair: the document embedding and the title embedding of that doc.
            Dt1 = doc_embeddings.get(doc_id_t1, missing_embedding)
            Ut1 = title_embeddings.get(doc_id_t1, missing_embedding)
            seen_first_click = True
            doc_embeddings_up_to_t1.append(Dt1)

        elif action in ('click', 'skip'):
            # Later clicks and all skips only contribute to the running window.
            doc_embeddings_up_to_t1.append(doc_embeddings.get(doc_id_t1, missing_embedding))

        elif action == 'gen_summ':
            doc_embeddings_up_to_t1.append(doc_embeddings.get(doc_id_t1, missing_embedding))
            # The window holds at least the embedding appended just above.
            Dt2 = np.mean(doc_embeddings_up_to_t1, axis=0)
            Ut2 = user_embeddings.get(doc_id_t2, missing_embedding)

            if Dt1 is not None:
                doc_shift = sigma(Dt1, Dt2)
                summary_shift = sigma(Ut2, Ut1)

                # Ratio of the smaller shift to the larger one.
                doc_divergence = (min(doc_shift, summary_shift) + epsilon) / (max(doc_shift, summary_shift) + epsilon)

                # Penalty: previous document/summary gap relative to the current one.
                penalty = sigma(Dt1, Ut1) / (sigma(Dt2, Ut2) + epsilon)
                penalized_deps = doc_divergence * penalty

                total_divergence += penalized_deps * summary_shift
            # else: no reference pair exists yet (no click seen before this
            # 'gen_summ'), so there is nothing to compare against. The step
            # contributes no divergence and only establishes the reference.

            # Start a fresh window and move the reference pair forward.
            doc_embeddings_up_to_t1 = []
            Dt1 = Dt2
            Ut1 = Ut2

    # Average over the processed steps. Trajectories with fewer than two
    # actions have no steps and contribute zero instead of dividing by zero or
    # silently re-adding the previous row's value.
    row_divergence = total_divergence / num_steps if num_steps > 0 else 0.0

    overall_divergence += row_divergence

# NOTE(review): the sum above runs over rows while the divisor is the number of
# distinct UserID values; the two differ when a user has several rows - confirm
# that this is the intended normalisation.
# 0.51 is a fixed scaling constant carried over unchanged.
degree_d = 0.51 * overall_divergence / total_users if total_users else float('nan')

# NOTE(review): the label below says D2_150_10, but the CSV read earlier in this
# notebook is 'synthetic-original_D1_150_10.csv' - confirm which one is right.
print("Overall DegreeD in the D2_150_10 dataset:", degree_d)


Processing User Trajectories: 100%|███████████████████████████████████████████| 399994/399994 [11:16<00:00, 590.91it/s]


Overall DegreeD in the D2_25_50 dataset: 0.1258991744919089
