In [1]:
import random
import torch
import numpy as np 
import pandas as pd
from transformers import AutoModel
from time import perf_counter as timer
from sentence_transformers import util, SentenceTransformer
import torch
import time

import textwrap

from is_tpt_ref import ReferenceClassifier

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load dataframe
text_chunks_and_embeddings_df = pd.read_pickle("ajp_perc_prper_tpt_text_chunks_embeddings_jinav3_5.pkl")

# Convert texts and embedding df to list of dicts
doi_and_chunks = text_chunks_and_embeddings_df.to_dict(orient="records")

# Load embeddings onto GPU
embeddings = torch.tensor(np.array(text_chunks_and_embeddings_df["embedding"].tolist()), dtype=torch.float32).to(device)

# Load model
embedding_model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3", trust_remote_code=True, device_map=device)



  from .autonotebook import tqdm as notebook_tqdm


In [2]:

#Clean the labels on the data
csv=pd.read_excel("paper_labels_M_S_2.xlsx")

csv['Category (M)']=csv['Category (M)'].str.replace("*", "").replace("Journal business", "journal business").replace("Teacher", "teacher").replace('teaching', 'teacher').replace('Content', 'content').replace('content  ', 'content').replace('Student', 'student').replace("Content    ", "content").replace("Content  ", "content").replace("Teacher  ", "teacher").replace("Teacher ", "teacher").replace("Teaching", "teacher")


In [3]:

#Define the title of the groups, used for quick/human reading
t1 = "Teaching students."
t2 = "Student focus."
t3 = "Physics content."
t4= "Journal business."

#Make the list of queries titles. 
query_list = [t1, t2, t3, t4]

#Describe the content of each group. This is the actual text that the "user" would enter to separate the papers
s1="Teaching. Laboratory equipment. Teaching methods."
s2="Student belonging. Student focused. Student agency."
s3="Physics content. Physics material. Math. Derivations."
s4="Editorials, book reviews, announcements, obituaries. Journal business. Reports on business. "

#Put them into a list
query_list_verbose=[s1, s2, s3, s4]
query_embedding = embedding_model.encode(query_list_verbose, convert_to_tensor=True)

print(f"Query: {query_list}")

# Compute dot product similarity
start_time = timer()
dot_scores_1 = util.dot_score(a=query_embedding, b=embeddings).T  # Shape: (n_embeddings, 3)
end_time = timer()

print(f"Time take to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

# Add columns for s1, s2, s3, s4 similarity to dataframe
for n, t in enumerate(query_list):
    text_chunks_and_embeddings_df[t]= dot_scores_1[:, n].cpu().numpy() #Loaded into the CPU at first because that's how the tensors are loaded in


Query: ['Teaching students.', 'Student focus.', 'Physics content.', 'Journal business.']
Time take to get scores on 1180219 embeddings: 0.25976 seconds.


In [4]:
##But first we're going to calculate the topic scores for sentence chunk. For more on this function, see Odden et al. (2024)

# Default value of 'a'
a = -10

# Precompute the exponentials for efficiency
exp_values = np.exp(a * (1-text_chunks_and_embeddings_df[query_list]))

# Compute the denominator for softmax-like normalization
denominator = exp_values.sum(axis=1)

# Create new score columns

for col in query_list:
    score_col = f"{col.strip()}_score"
    text_chunks_and_embeddings_df[score_col] = exp_values[col] / denominator


In [5]:
# Weight mask was a concept that was initially oging to be used to cut out sections of the text we found to be unhelpful
# This never ended up being used, but we did code it to format our data so it needs to remain in. The weights should work if 
# you ever want to use it. The format is simple, [0, 50, 200, 100] means the first sentence chunk is not considered, the second chunk is 
# weighted half, the tird chunk is weighted twice as much as everything else, and the fourth chunk is weighted as much as the rest of the paper
def apply_weight_mask(df, query_list, chunk_weights):
    """
    Applies weighted averaging to chunks (sentence blocks) for each DOI and computes weighted embeddings.
    
    This function processes a DataFrame containing document chunks grouped by DOI, applying 
    specified weights to each chunk position across all DOIs. It computes weighted averages
    for query-related columns and creates weighted centroid embeddings for each DOI.
    
    Parameters
    ----------
    df : pandas.DataFrame
        Input DataFrame containing document chunks with the following expected columns:
        - 'doi': Document identifier for grouping chunks
        - 'embedding': PyTorch tensor representing the embedding for each chunk
        - Query columns specified in query_list (e.g., similarity scores)
        - Corresponding '_score' columns for each query column
        - Optional 'weight' column (will be created if not present)
    
    query_list : list of str
        List of column names representing query-related metrics to be weighted.
        The function expects corresponding '{col}_score' columns to exist in the DataFrame.
    
    chunk_weights : list of float
        List of weight values to apply to chunks, where:
        - Each value corresponds to the weight for a chunk at that index position
        - Length determines how many chunk positions receive custom weights
        - Values are normalized by dividing by 100.0
        - Chunks beyond this length retain default weight of 1.0
    
    Returns
    -------
    pandas.DataFrame
        Processed DataFrame with the following columns:
        - 'doi': Document identifier
        - '{col}_weighted': Weighted average for each query column
        - '{col}_score_weighted': Weighted average for each query score column  
        - 'Weighted Centroid': PyTorch tensor representing weighted centroid embedding
        - 'Centroid': PyTorch tensor representing unweighted mean embedding
    
    Notes
    -----
    - Chunks are indexed by their order within each DOI group (0-indexed)
    - Weights are normalized by dividing input values by 100.0
    - For embeddings, weighted centroids are computed as: sum(embedding * weight) / sum(weights)
    - Unweighted centroids are computed as simple mean of all embeddings per DOI
    - Function includes timing prints for performance monitoring
    - Requires PyTorch for tensor operations
    
    Examples
    --------
    >>> import pandas as pd
    >>> import torch
    >>> 
    >>> # Sample data
    >>> df = pd.DataFrame({
    ...     'doi': ['10.1000/1', '10.1000/1', '10.1000/2'],
    ...     'query1': [0.8, 0.6, 0.9],
    ...     'query1_score': [0.85, 0.65, 0.95],
    ...     'embedding': [torch.randn(128), torch.randn(128), torch.randn(128)]
    ... })
    >>> 
    >>> result = apply_weight_mask(df, ['query1'], [80, 60])
    >>> print(result.columns)
    Index(['doi', 'query1_weighted', 'query1_score_weighted', 
           'Weighted Centroid', 'Centroid'], dtype='object')
    
    Raises
    ------
    KeyError
        If expected columns (doi, embedding, or query columns) are missing from input DataFrame
    IndexError  
        If chunk_weights list is empty
    ValueError
        If embeddings cannot be stacked (inconsistent tensor dimensions)
    """

    # Normalize weight values to match the scale of the weight system
    normalized_weights = [weight / 100.0 for weight in chunk_weights]
    
    df = df.copy()
    df["chunk_index"] = df.groupby("doi").cumcount()

    # Create a weight column
    if "weight" not in df.columns:
        df["weight"] = 1.0  # Default weight for non-specified chunks
    
    # Apply the provided weights to each chunk index
    max_chunk_index = len(chunk_weights)
    for idx in range(min(max_chunk_index, df["chunk_index"].max() + 1)):
        df.loc[df["chunk_index"] == idx, "weight"] = normalized_weights[idx]
    
    # Multiply each of the query columns by the weight
    for col in query_list:
        df[col + "_weighted"] = df[col] * df["weight"]
    for col in query_list:
        df[col + "_score_weighted"] = df[f"{col}_score"] * df["weight"]

    ##Helper function to compute the similarity scores given the inputted weights
    def compute_weighted_scores(g):
        result = {}
        for col in query_list:
            result[f"{col}_weighted"] = g[col + "_weighted"].sum() / g["weight"].sum()
            result[f"{col}_score_weighted"] = g[col + "_score_weighted"].sum() / g["weight"].sum()
        return pd.Series(result)

    grouped_weighted = df.groupby("doi").apply(compute_weighted_scores).reset_index()
    print("grouped_weighted keys")
    print(grouped_weighted.keys())
    # Step 1: Apply weighting to embeddings
    df["weighted_embedding"] = df.apply(lambda row: row["embedding"] * row["weight"], axis=1)
    # Step 2: Group manually and compute sums
    grouped_rows = []

    start_time = time.time()
    
    for doi, group in df.groupby("doi"):
        embeddings = group["weighted_embedding"].tolist()
        weights = group["weight"].tolist()
        emb_unweighted= group['embedding'].tolist()
        
        summed_embedding = torch.stack(embeddings).sum(dim=0)
        total_weight = sum(weights)
        mean_embedding= torch.stack(emb_unweighted).mean(dim=0)
        grouped_rows.append({
            "doi": doi,
            "weighted_embedding": summed_embedding,
            "weight": total_weight,
            "unweighted_centroid": mean_embedding
        })

    end_time = time.time()
    print(f"Time it took to complete the averaging: {end_time-start_time}")
    
    grouped_df = pd.DataFrame(grouped_rows)
    start=time.time()
    # Step 3: Compute weighted centroids
    grouped_df["Weighted Centroid"] = grouped_df.apply(
        lambda row: row["weighted_embedding"] / row["weight"]
        if row["weight"] != 0 else torch.zeros_like(row["weighted_embedding"]),
        axis=1
    )
    
    end=time.time()
    print(f"Time it took to complete weighted centroiding: {end-start}")
    print("test1")
    #print(grouped_rows)

    start=time.time()
    
    # Merge to add the 'Weighted Centroid' to grouped_weighted
    grouped_weighted = grouped_weighted.merge(
        grouped_df[["doi", "Weighted Centroid", "unweighted_centroid"]],
        on="doi",
        how="left"
    )
    end=time.time()
    grouped_weighted=grouped_weighted.rename(columns={'unweighted_centroid':"Centroid"})
    print(f"Time to add weighted centroids to grouped_weighted: {end-start}")
    return grouped_weighted
# Apply weighting. The following implies that the first three sentence chunks are to be weighted equally (no weighting). 
weight = [100, 100, 100 ]         

# Apply the functions
df=text_chunks_and_embeddings_df
grouped = apply_weight_mask(df, query_list, weight)

#Make a new column for the topic scores if weighting is applied. 
weighted_ql=[]
for query in query_list:
    weighted_ql.append(f"{query}_weighted")


# Determine dominant group, insert that label as a new column
grouped["Main Group DotP Weighted"] = grouped[weighted_ql].idxmax(axis=1)


#This grabs items like "title", "doi", "Journal", and "year" from the dataframe and adds that relevant information into
#the dataframe "grouped". 
t=df.drop_duplicates('doi', keep='last').reset_index()
t2=t[['title','doi', 'journal', 'year']]

grouped=t2.merge(grouped, how='right', on='doi')


  grouped_weighted = df.groupby("doi").apply(compute_weighted_scores).reset_index()


grouped_weighted keys
Index(['doi', 'Teaching students._weighted',
       'Teaching students._score_weighted', 'Student focus._weighted',
       'Student focus._score_weighted', 'Physics content._weighted',
       'Physics content._score_weighted', 'Journal business._weighted',
       'Journal business._score_weighted'],
      dtype='object')
Time it took to complete the averaging: 17.305710554122925
Time it took to complete weighted centroiding: 2.4801182746887207
test1
Time to add weighted centroids to grouped_weighted: 0.012016057968139648
Here 


Index(['doi', 'Teaching students._weighted',
       'Teaching students._score_weighted', 'Student focus._weighted',
       'Student focus._score_weighted', 'Physics content._weighted',
       'Physics content._score_weighted', 'Journal business._weighted',
       'Journal business._score_weighted', 'Weighted Centroid', 'Centroid'],
      dtype='object')


In [6]:
#Grabs labels form the CSV and inserts them into the dataframe. Renames the dataframe df_labels. 

weighted_score_ql=[]
for query in query_list:
    weighted_score_ql.append(f"{query}_score_weighted")


csv=csv.rename(columns={"Doi": "doi"})
csv['doi']=csv['doi'].str.replace(" ", "")
csv['doi']=csv['doi'].str.replace("\'", "")
csv['doi']=csv['doi'].str.replace("https://doi.org/", "")
t=grouped[["doi", "year", "title", "Weighted Centroid", "Centroid", "journal" ] + weighted_ql + weighted_score_ql]
t_=t.reset_index()
df_labels=pd.merge(t_, csv[['Category (M)', 'doi']], on='doi', how="left")

##At this point, the dataframe has handmade labels for the items that have received them by matching DOI. The labels are in single letter format
##the main topic labels need to be converted to this format 

In [7]:
##Calcualting the topic score after weights have been applied. 

# Default value of 'a'
#a = -10

# Precompute the exponentials for efficiency
exp_values = np.exp(a * (1-df_labels[weighted_ql]))

# Compute the denominator for softmax-like normalization
denominator = exp_values.sum(axis=1)

# Create new score columns

for col in weighted_ql:
    score_col = f"{col.strip()}_score_avg_cos"
    df_labels[score_col] = exp_values[col] / denominator

# Step 2: Get main_group column based on max score
score_cols = [f"{col.strip()}_score_avg_cos" for col in weighted_ql]
df_labels['MG Score Avg Cos'] = df_labels[score_cols].idxmax(axis=1).str.replace('_weighted_score_avg_cos', '')

# Step 3: Define evaluator function


In [8]:
#This is the topic score applied to each paper by averaging together the cosine similarities 

#a = -10


# Create new score columns
#score_cols = [f"{col.strip()}_score_weighted" for col in weighted_ql]
df_labels['MG Score Avg Score'] = df_labels[weighted_ql].idxmax(axis=1).str.replace('_weighted', '')


# Display the updated dataframe structure
#print(df_labels.head())

##Want to also calculate here the labels based on assigning a topic score to each sentence chunk, then averaging that
##and seeing which one of those works better. 

for n,  (cos, score) in enumerate(zip(df_labels['MG Score Avg Cos'], df_labels['MG Score Avg Cos'])):
    same_val= cos==score
    if same_val==False:
        print(f"{n}: {cos} {score}")

for element, title in zip(df_labels.iloc[766], df_labels.keys()):
    print(f"label: {title} Element: {element}")

grouped[grouped['doi']=='10.1103/PhysRevPhysEducRes.13.019901']

In [16]:
#Load in file with embeddings of the whole paper pre-computed
full_text_and_embeddings = pd.read_pickle("ajp_perc_prper_tpt_full_text_embeddings_2.pkl")


In [18]:
full_text_and_embeddings

Unnamed: 0,full_text,doi,year,journal,char_count,token_count,sentences,sentences_count,sentence_chunks,num_chunks,Full Text Embedding
0,Make a Mystery Circuit with a Bar Light Fixtur...,10.1119/1.2715425,,tpt,7234,1808.50,[Make a Mystery Circuit with a Bar Light Fixtu...,81,[[Make a Mystery Circuit with a Bar Light Fixt...,17,"[tensor(-0.0012), tensor(0.0354), tensor(0.108..."
1,AGOLDEN OLDIE-ABLAOK BOX OIROUIT \r\nClifton K...,10.1119/1.2343976,,tpt,4633,1158.25,[AGOLDEN OLDIE-ABLAOK BOX OIROUIT \r\nClifton ...,58,[[AGOLDEN OLDIE-ABLAOK BOX OIROUIT \r\nClifton...,12,"[tensor(0.0039), tensor(-0.0006), tensor(0.114..."
2,Modeling Electricity: Model-Based Inquiry with...,10.1119/1.4745686,,tpt,17496,4374.00,[Modeling Electricity: Model-Based Inquiry wit...,140,[[Modeling Electricity: Model-Based Inquiry wi...,28,"[tensor(0.0564), tensor(-0.0816), tensor(0.150..."
3,"Two Approaches to Learning Physics \r\n""I look...",10.1119/1.2342910,,tpt,37322,9330.50,"[Two Approaches to Learning Physics \r\n""I loo...",324,"[[Two Approaches to Learning Physics \r\n""I lo...",65,"[tensor(0.0499), tensor(-0.1132), tensor(0.178..."
4,"\r\nJochen Kuhn and Patrik Vogt, Column Editor...",10.1119/1.4865529,,tpt,6621,1655.25,"[\r\nJochen Kuhn and Patrik Vogt, Column Edito...",76,"[[\r\nJochen Kuhn and Patrik Vogt, Column Edit...",16,"[tensor(0.0292), tensor(-0.0519), tensor(0.055..."
...,...,...,...,...,...,...,...,...,...,...,...
43602,Examining faculty choices while implementing t...,10.1119/perc.2023.pr.Willison,2023.0,perc,24881,6220.25,[Examining faculty choices while implementing ...,203,[[Examining faculty choices while implementing...,41,"[tensor(0.1635), tensor(-0.1263), tensor(0.057..."
43603,Analyzing the dimensionality of the Energy and...,10.1119/perc.2023.pr.Wu,2023.0,perc,24460,6115.00,[Analyzing the dimensionality of the Energy an...,227,[[Analyzing the dimensionality of the Energy a...,46,"[tensor(0.0763), tensor(-0.1726), tensor(0.054..."
43604,Students’ use of symmetry as a tool for sensem...,10.1119/perc.2023.pr.Young,2023.0,perc,27598,6899.50,[Students’ use of symmetry as a tool for sense...,264,[[Students’ use of symmetry as a tool for sens...,53,"[tensor(0.1260), tensor(-0.0853), tensor(0.105..."
43605,Analyzing Physics Majors’ Specialization Low I...,10.1119/perc.2023.pr.Zohrabi_Alaee,2023.0,perc,27954,6988.50,[Analyzing Physics Majors’ Specialization Low ...,222,[[Analyzing Physics Majors’ Specialization Low...,45,"[tensor(0.0705), tensor(-0.1829), tensor(0.015..."


In [19]:
#Making a new dataframe for the whole-paper-embeddings to be inserted into

df_labels_new=df_labels.merge(full_text_and_embeddings[['doi', 'Full Text Embedding']], how="left", on="doi")
df_full_text=df_labels_new.copy()[['doi', 'year', 'title', 'journal','Full Text Embedding']]

#Load the tensors onto the GPU for faster processing
embeddings = torch.tensor(np.array(df_full_text['Full Text Embedding'].tolist()), dtype=torch.float32).to(device)


#Compute dot product similarity
start_time = timer()
dot_scores_1 = util.dot_score(a=query_embedding, b=embeddings).T  # Shape: (n_embeddings, 3)
end_time = timer()


print(f"Time take to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")


# Add columns for s1, s2, s3, s4 similarity to dataframe. Load onto the CPU for now
for n, t in enumerate(query_list):
    df_full_text[t]= dot_scores_1[:, n].cpu().numpy()


#a = -10

# Precompute the exponentials for efficiency
exp_values = np.exp(a * (1-df_full_text[query_list]))

# Compute the denominator for softmax-like normalization
denominator = exp_values.sum(axis=1)

# Create new score columns

for col in query_list:
    score_col = f"{col.strip()}_score"
    df_full_text[score_col] = exp_values[col] / denominator

# Step 2: Get main_group column based on max score
score_cols = [f"{col.strip()}_score" for col in query_list]
df_full_text['MG Score Full Embedding'] = df_full_text[score_cols].idxmax(axis=1).str.replace("_score", "")


Time take to get scores on 35376 embeddings: 0.02628 seconds.


In [20]:
#Combine the two dataframes for simplicity 
df_labels_new=df_labels_new.merge(df_full_text[['doi', 'MG Score Full Embedding']], how="left", on="doi")

##Evaluating the labeling process for chunked labels

def evaluate_labeling(df, teacher_col, student_col, content_col, journal_col, main_group):
    # Mapping of category labels to full column names
        """
    Evaluate the accuracy of automated labeling by comparing against manual categorization.
    
    This function assesses how well an automated labeling system performs by comparing
    its predictions against manual category labels. It provides detailed accuracy metrics
    both overall and per category, along with label distribution statistics.
    
    Parameters
    ----------
    df : pandas.DataFrame
        Input dataframe containing the data to evaluate. Must include columns:
        - 'Category (M)': Manual category labels ('t', 's', 'c', 'jb', or variations)
        - 'doi': Document identifiers for tracking results
        - A column specified by `main_group` containing automated predictions
    teacher_col : str
        Full name/label corresponding to teacher category ('t')
    student_col : str
        Full name/label corresponding to student category ('s')  
    content_col : str
        Full name/label corresponding to content category ('c')
    journal_col : str
        Full name/label corresponding to journal/book category ('jb')
    main_group : str
        Column name in df containing the automated labeling predictions to evaluate
        
    Returns
    -------
    pandas.DataFrame
        DataFrame with two columns:
        - 'doi': Document identifiers from input data
        - 'Correctly labeled': Boolean values indicating whether each entry
          was correctly labeled by the automated system
        Only includes rows that had valid manual categories.
        
    Side Effects
    ------------
    Prints comprehensive evaluation metrics to stdout:
    1. Total number of entries with valid manual categories
    2. Overall percentage of correctly labeled entries
    3. Per-category accuracy breakdown
    4. Label distribution for both manual and automated categories
    
    Notes
    -----
    - Manual category labels are case-insensitive and whitespace is stripped
    - Entries with missing/NaN values in 'Category (M)' are excluded from evaluation
    - Category mapping uses short codes: 't'->teacher, 's'->student, 'c'->content, 'jb'->journal
    - Automated label names are truncated to 40 characters in output for readability
    
    Examples
    --------
    ... 
    >>> result = evaluate_labeling(df, 'Teacher Focus', 'Student Focus', 
    ...                           'Content Focus', 'Journal Business', "MG Score Avg Score")
    """

    
    category_map = {
        't': teacher_col.strip(),
        's': student_col.strip(),
        'c': content_col.strip(),
        'jb': journal_col.strip()
    }

    def compare_labels(row):
        category = row['Category (M)']
        if pd.isna(category):
            return np.nan
        expected_group = category_map.get(category.strip().lower())
        return expected_group == row[main_group]

    df['Correctly labeled'] = df.apply(compare_labels, axis=1)

    # Only keep rows with a valid Category (M)
    valid = df[~df['Category (M)'].isna()]

    print("Here: ")
    print(len(valid))

    total = len(valid)
    correct = valid['Correctly labeled'].sum()
    percent_correct = 100 * correct / total if total > 0 else 0

    print(f"1. Total number of entries with a value in 'Category (M)': {total}")
    print(f"2. Percent correctly labeled: {percent_correct:.2f}%")

    # 3. Per-category accuracy
    print("3. Accuracy per category:")
    for cat, label in category_map.items():
        group = valid[valid['Category (M)'].str.lower() == cat]
        if not group.empty:
            correct_in_group = group['Correctly labeled'].sum()
            total_in_group = len(group)
            frac = 100 * correct_in_group / total_in_group
            print(f"   {cat.title()}: {frac:.2f}%")

    # 4. Fraction of papers in each label
    print("4. Label distribution:")
    cat_counts = valid['Category (M)'].str.lower().value_counts(normalize=True)
    main_counts = valid[main_group].value_counts(normalize=True)
    
    print("Category (M):")
    for k, v in cat_counts.items():
        print(f"     {k.title()}: {v:.2%}")
    print("Automated:")
    for k, v in main_counts.items():
        print(f"     {k[:40].strip()}: {v:.2%}")  # limit label length for neatness

    # 5. Return doi + correctness list
    result = valid[['doi', 'Correctly labeled']].reset_index(drop=True)
    return result

# Example usage:

mg="MG Score Avg Score"
#mg="MG Score Full Embedding"

results = evaluate_labeling(
    df_labels_new,
    query_list[0], #teacher_col
    query_list[1], #student_col
    query_list[2], #content_col
    query_list[3], #journal_col
    mg)

# View the returned doi + Correctly labeled result
print("\nReturned result (first few rows):")
print(results.head())

In [21]:

def evaluate_labeling_recall(df, teacher_col, student_col, content_col, journal_col, main_group):
    """
    Evaluate automated labeling performance with comprehensive classification metrics.
    
    This function provides an in-depth evaluation of automated labeling systems by
    computing standard classification metrics including accuracy, precision, recall,
    and false positive rates for each category. Unlike basic accuracy evaluation,
    this function treats each category as a binary classification problem to provide
    detailed per-class performance insights.
    
    Parameters
    ----------
    df : pandas.DataFrame
        Input dataframe containing the data to evaluate. Must include columns:
        - 'Category (M)': Manual category labels ('t', 's', 'c', 'jb', or variations)
        - 'doi': Document identifiers for tracking results
        - A column specified by `main_group` containing automated predictions
    teacher_col : str
        Full name/label corresponding to teacher category ('t')
    student_col : str
        Full name/label corresponding to student category ('s')  
    content_col : str
        Full name/label corresponding to content category ('c')
    journal_col : str
        Full name/label corresponding to journal business category ('jb')
    main_group : str
        Column name in df containing the automated labeling predictions to evaluate
        
    Returns
    -------
    pandas.DataFrame
        DataFrame with two columns:
        - 'doi': Document identifiers from input data
        - 'Correctly labeled': Boolean values indicating whether each entry
          was correctly labeled by the automated system
        Only includes rows that had valid manual categories.
        
    Side Effects
    ------------
    Prints comprehensive evaluation metrics to stdout:
    1. Total number of entries with valid manual categories
    2. Overall accuracy percentage
    3. Detailed per-category metrics:
       - Recall (sensitivity): TP / (TP + FN) - ability to find all instances
       - Precision: TP / (TP + FP) - accuracy of positive predictions
       - False Positive Rate: FP / (FP + TN) - rate of incorrect positive predictions
       - Category-specific accuracy: (TP + TN) / All - overall correctness for this category
    4. Label distribution comparison between manual and automated classifications
    
    Notes
    -----
    - Manual category labels are case-insensitive and whitespace is stripped
    - Entries with missing/NaN values in 'Category (M)' are excluded from evaluation
    - Each category is evaluated as a binary classification problem (category vs. not-category)
    - Category mapping uses short codes: 't'->teacher, 's'->student, 'c'->content, 'jb'->journal
    - Automated label names are truncated to 40 characters in output for readability
    - Confusion matrix elements (TP, TN, FP, FN) are computed for each category independently
    
    Classification Metrics Explained
    --------------------------------
    - **True Positives (TP)**: Correctly predicted as this category
    - **False Negatives (FN)**: Actually this category but predicted as another
    - **False Positives (FP)**: Predicted as this category but actually another
    - **True Negatives (TN)**: Correctly predicted as NOT this category
    - **Recall**: What fraction of actual instances were correctly identified?
    - **Precision**: What fraction of positive predictions were correct?
    - **False Positive Rate**: What fraction of negative cases were incorrectly flagged?
    
    See Also
    --------
    evaluate_labeling : Basic accuracy-only evaluation function
    sklearn.metrics.classification_report : Similar comprehensive metrics from scikit-learn
    """
    # Mapping of category labels to full column names
    category_map = {
        't': teacher_col.strip(),
        's': student_col.strip(),
        'c': content_col.strip(),
        'jb': journal_col.strip()
    }

    def compare_labels(row):
        category = row['Category (M)']
        if pd.isna(category):
            return np.nan
        expected_group = category_map.get(category.strip().lower())
        return expected_group == row[main_group]

    df['Correctly labeled'] = df.apply(compare_labels, axis=1)
    valid = df[~df['Category (M)'].isna()].copy()

    total = len(valid)
    correct = valid['Correctly labeled'].sum()
    percent_correct = 100 * correct / total if total > 0 else 0

    print(f"1. Total number of entries with a value in 'Category (M)': {total}")
    print(f"2. Percent correctly labeled (accuracy): {percent_correct:.2f}%")

    # Per-category accuracy (recall), precision, false positive rate
    print("3. Detailed metrics per category:")
    for cat, expected_val in category_map.items():
        # True Positives: predicted = expected = this category
        tp = valid[(valid['Category (M)'].str.lower() == cat) & (valid[main_group] == expected_val)]
        
        # False Negatives: actual is this category, but predicted is not
        fn = valid[(valid['Category (M)'].str.lower() == cat) & (valid[main_group] != expected_val)]

        # False Positives: predicted is this category, but actual is not
        fp = valid[(valid['Category (M)'].str.lower() != cat) & (valid[main_group] == expected_val)]

        # True Negatives: actual and predicted are both *not* this category
        tn = valid[(valid['Category (M)'].str.lower() != cat) & (valid[main_group] != expected_val)]

        tp_count = len(tp)
        fn_count = len(fn)
        fp_count = len(fp)
        tn_count = len(tn)

        recall = tp_count / (tp_count + fn_count) if (tp_count + fn_count) > 0 else 0
        precision = tp_count / (tp_count + fp_count) if (tp_count + fp_count) > 0 else 0
        fpr = fp_count / (fp_count + tn_count) if (fp_count + tn_count) > 0 else 0
        acc = (tp_count + tn_count) / (tp_count + tn_count + fp_count + fn_count)

        print(f"   {cat.title()}:")
        print(f"      Recall (TP / TP + FN): {recall:.2f}")
        print(f"      Precision (TP / TP + FP): {precision:.2f}")
        print(f"      False Positive Rate (FP / FP + TN): {fpr:.2f}")
        print(f"      Accuracy (TP + TN / All): {acc:.2f}")

    # Label distributions
    print("4. Label distribution:")
    cat_counts = valid['Category (M)'].str.lower().value_counts(normalize=True)
    main_counts = valid[main_group].value_counts(normalize=True)
    
    print("Category (M):")
    for k, v in cat_counts.items():
        print(f"     {k.title()}: {v:.2%}")
    print("Automated:")
    for k, v in main_counts.items():
        print(f"     {k[:40].strip()}: {v:.2%}")

    # Return doi and correctness
    result = valid[['doi', 'Correctly labeled']].reset_index(drop=True)
    return result

In [22]:
from sklearn.metrics.pairwise import cosine_similarity
import scipy.ndimage as nd
import numpy as np

def centroid_labeling_sentences(df, K=1, J=1):
    """
    Perform centroid-based topic labeling using sentence embeddings and archetypal papers.
    
    This function implements a sophisticated labeling approach that:
    1. Identifies archetypal papers for each topic category
    2. Extracts top-scoring sentences from these papers to create topic centroids
    3. Compares all papers against these centroids using cosine similarity
    4. Applies topic scoring to generate topic probabilities
    5. Assigns the most likely topic label to each paper
    
    Parameters
    ----------
    df : pandas.DataFrame
        Input dataframe containing papers to be labeled. Must include:
        - 'doi': Document identifiers for each paper
        - '{label}_score_weighted' columns for each label in query_list
    K : int, optional
        Number of top-scoring archetypal papers to use per topic category for 
        centroid creation (default: 1)
    J : int, optional
        Number of top-scoring sentence chunks to extract from each archetypal 
        paper for centroid calculation (default: 1)
        
    Returns
    -------
    pandas.DataFrame
        Input dataframe enhanced with additional columns:
        - 'Average Embeddings': Mean embedding vector for each paper
        - '{label}_centroid_topic_score': Topic score probability scores for each topic
        - 'Main Group Centroid': Predicted topic label (highest scoring category)
        
    Global Dependencies
    -------------------
    Requires the following global variables to be defined:
    - query_list : list
        List of topic labels/categories to classify papers into
    - text_chunks_and_embeddings_df : pandas.DataFrame
        Dataframe containing sentence-level data with columns:
        - 'doi': Document identifiers matching those in input df
        - 'embedding': Pre-computed sentence embedding vectors
        - '{label}' columns: Sentence-level scores for each topic
    - a : float
        Scaling factor for softmax temperature (controls prediction confidence)
        
    Notes
    -----
    - The function hard-codes archetypal papers for 'Journal business.' category
    - Uses cosine similarity to measure distance between paper embeddings and centroids
    - Applies topic score from Odden et al. 2024
    - Papers with no embeddings receive zero-vectors as their average embedding
    - Processing time is optimized using groupby operations for DOI lookups
    
    Algorithm Steps
    ---------------
    1. **Archetypal Selection**: Select top K papers per category based on weighted scores
    2. **Centroid Creation**: Extract top J sentences from each archetypal paper and 
       average their embeddings to create topic centroids
    3. **Paper Embedding**: Compute average embedding for each paper across all sentences
    4. **Similarity Calculation**: Measure cosine similarity between paper embeddings 
       and topic centroids
    5. **Topic Scoring**: Convert similarities to probabilities using the topic score equation from Odden et al. 2024
    6. **Label Assignment**: Assign topic with highest probability as main group
    
    Examples
    --------
    >>> # Assuming global variables are properly set up
    >>> labeled_df = centroid_labeling_sentences(papers_df, K=5, J=10)
    >>> print(labeled_df['Main Group Centroid'].value_counts())
    
    See Also
    --------
    cosine_similarity : Used for measuring embedding similarity
    numpy.mean : Used for averaging embeddings
    """
    
    df_labels=df
    print(f"K: {K} J: {J}") 
    
    # Step 1: Identify archetypal papers
    archetypal_papers = {}
    for label in query_list:
        top_k_dois = df_labels.sort_values(by=f"{label}_score_weighted", ascending=False).head(K)['doi'].tolist() #Here
        archetypal_papers[label] = top_k_dois


    # Step 2: Extract top sentences from archetypal papers
    centroids = {}
    for label, dois in archetypal_papers.items():
        top_embeddings = []
        for doi in dois:
            rows = text_chunks_and_embeddings_df[text_chunks_and_embeddings_df['doi'] == doi].copy() #Removed the _score tag on the label because that doens't exist for that df
            #print(rows.keys() )
            rows_sorted = rows.sort_values(by=f"{label}", ascending=False)
            top_j_embeddings = rows_sorted.head(J)['embedding'].tolist()
            top_embeddings.extend(top_j_embeddings)
        centroid = np.mean(np.stack(top_embeddings), axis=0)
        centroids[label] = centroid
    
    # Step 3: Average embeddings for all papers
    avg_embeddings = []
    # Create a dictionary to speed up lookup by doi
    doi_to_rows = text_chunks_and_embeddings_df.groupby('doi')
    # Pre-group the dataframe by 'doi'
    import time
    start = time.perf_counter()
    grouped_groupby = text_chunks_and_embeddings_df.groupby('doi')
    avg_embeddings = []
    for doi in df_labels['doi']:
        rows = grouped_groupby.get_group(doi) if doi in grouped_groupby.groups else None
    
        if rows is not None:
            embeddings_row = rows['embedding'].tolist()
        else:
            print(f"This paper contained no embeddings: {doi}")
            embeddings_row = []
    
        if embeddings_row:
            avg_embedding = np.mean(np.stack(embeddings_row), axis=0)
        else:
            avg_embedding = np.zeros_like(next(iter(centroids.values())))
    
        avg_embeddings.append(avg_embedding)
    
    #Average together all embeddings from each DOI and save as the centroid for that paper
    df_labels['Average Embeddings'] = avg_embeddings
    
    end = time.perf_counter()
    print(f"Time taken: {end - start} seconds")
    
    # Step 4: Cosine similarity to centroids
    centroid_names = []
    similarity_matrix = []
    
    for emb in df_labels['Average Embeddings']:
        sims = [cosine_similarity(emb.reshape(1, -1), centroids[label].reshape(1, -1))[0][0] for label in query_list]
        similarity_matrix.append(sims)
    
    similarity_matrix = np.array(similarity_matrix)
    
    # Step 5: Softmax scoring
    exp_values = np.exp(a * (1 - similarity_matrix))
    denominator = exp_values.sum(axis=1, keepdims=True)
    topic_scores = exp_values / denominator
    
    # Step 6: Add topic scores to df_labels
    for idx, label in enumerate(query_list):
        colname = f"{label}_centroid_topic_score"
        df_labels[colname] = topic_scores[:, idx]
    
    # Step 7: Identify main group
    df_labels['Main Group Centroid'] = df_labels[[f"{label}_centroid_topic_score" for label in query_list]].idxmax(axis=1)
    df_labels['Main Group Centroid'] = df_labels['Main Group Centroid'].str.replace("_centroid_topic_score", "")
    
    return df_labels
  
#Calcualte the centroids for the methods that use whole text
def centroid_labeling_whole_text(df, K=1, whole_text=True):
    """
    Perform advanced centroid-based topic labeling using full-text embeddings.
    
    This function implements an enhanced centroid-based classification approach that:
    1. Identifies archetypal papers for each topic category based on raw scores
    2. Creates topic centroids from full-text embeddings of archetypal papers
    3. Computes cosine similarity between all papers and topic centroids
    4. Applies Topic Score to generate topic probabilities
    5. Assigns the most likely topic label to each paper
    
    This approach differs from sentence-based methods by using document-level 
    embeddings, potentially capturing broader semantic themes and overall document structure.
    
    Parameters
    ----------
    df : pandas.DataFrame
        Input dataframe containing papers to be labeled. Must include:
        - 'doi': Document identifiers for each paper
        - '{label}_score' columns for each label in query_list (raw scores)
        - 'Full Text Embedding' column (when whole_text=True)
    K : int, optional
        Number of top-scoring archetypal papers to use per topic category for 
        centroid creation. Automatically adjusted if fewer papers are available
        (default: 1)
    whole_text : bool, optional
        Whether to use full-text embeddings. Currently only True is supported
        (default: True)
        
    Returns
    -------
    pandas.DataFrame
        Input dataframe enhanced with additional columns:
        - '{label}_Advanced Centroid dotp': Cosine similarity scores to each topic centroid
        - '{label}_Advanced Centroid TS': Topic probabilities
        - 'Main Group Advanced Centroid': Predicted topic label (highest scoring category)
        
    Global Dependencies
    -------------------
    Requires the following global variables to be defined:
    - query_list : list
        List of topic labels/categories to classify papers into
    - df_full_text : pandas.DataFrame
        Dataframe containing full-text embeddings with 'Full Text Embedding' column
    - a : float
        Temperature scaling factor for topic score (controls topic-mixedness)
    - nd : module
        Numerical computation module with rotate function (scipy.ndimage)
        
    Notes
    -----
    - Uses raw scores ('{label}_score') rather than weighted scores for archetypal selection
    - Automatically handles cases where K exceeds the number of available papers
    - Cosine similarity is used instead of dot products for better normalized comparison
    - The topic score transformation includes array rotation operations for proper alignment
    - Currently only supports whole_text=True mode; sentence-level fallback not implemented
    
    Algorithm Steps
    ---------------
    1. **Archetypal Selection**: Select top K papers per category based on raw scores
    2. **Centroid Creation**: Average full-text embeddings of archetypal papers 
       to create topic centroids
    3. **Similarity Calculation**: Compute cosine similarity between all paper 
       embeddings and topic centroids
    4. **Topic Scoring**: Topic score to convert similarities 
       to probabilities with array transformations
    5. **Label Assignment**: Assign topic with highest probability as main group
    
    Raises
    ------
    ValueError
        If whole_text=False (not currently supported)
    IndexError
        If required columns are missing from input dataframes
        
    Examples
    --------
    >>> # Assuming global variables are properly set up
    >>> labeled_df = advanced_centroid_labeling(papers_df, K=3, whole_text=True)
    >>> print(labeled_df['Main Group Advanced Centroid'].value_counts())
    >>> # Check similarity scores
    >>> similarity_cols = [col for col in labeled_df.columns if 'dotp' in col]
    >>> print(labeled_df[similarity_cols].describe())
    
    See Also
    --------
    cosine_similarity : Used for measuring embedding similarity
    numpy.mean : Used for averaging embeddings
    centroid_labeling_sentences : Alternative sentence-level approach
    """
    df_labels = df.copy()
    print(f"K: {K} whole_text: {whole_text}")

    # Step 1: Identify archetypal papers
    archetypal_papers = {}
    for label in query_list:
        k_actual = min(K, len(df_labels))
        top_k_dois = df_labels.sort_values(by=f"{label}_score", ascending=False).head(k_actual)['doi'].tolist()
        archetypal_papers[label] = top_k_dois
    
    # Step 2: Compute centroids
    centroids = {}
    for label, dois in archetypal_papers.items():
        top_embeddings = []

        for doi in dois:
            if whole_text:
                # Use full-text embedding directly
                paper_embedding = df_labels[df_labels['doi'] == doi]['Full Text Embedding'].values
                if len(paper_embedding) > 0:
                    top_embeddings.append(paper_embedding[0])
                else:
                    print("There appears to be no full-text embeddings")
            centroid = np.mean(np.stack(top_embeddings), axis=0)
            centroids[label] = centroid

    # Step 3: Average embeddings per paper
    avg_embeddings = []

    # Step 4: Cosine similarity to centroids
    dot_prods={}
    for label in query_list:
        new_label=label+"_Advanced Centroid dotp"
        sent_emb=[item for item in df_full_text['Full Text Embedding']]
        topic_emb=[centroids[label]]*len(sent_emb) #Copies the topic centroid embedding to be equal in length to sent_emb. For the case of the whole paper, this should just multiply it by one
        dot_prods[new_label]= np.diag(np.array(cosine_similarity(topic_emb, sent_emb)))  ##This uses cosine similarity instead of dot product. The vectors used here are normalized so that isn't a problem
    for new_label in [label+"_Advanced Centroid dotp" for label in query_list]:
        df_labels[new_label]=dot_prods[new_label]
    # Step 5: Topic Scoring 
    topic_scores=[]
    ac_dotp_labels=[label+"_Advanced Centroid dotp" for label in query_list]
    for index, row in df_labels.iterrows():
        exp_values =np.exp(a* (1-row[ac_dotp_labels].values.astype(float)))
        denominator = exp_values.sum(axis=0)
        topic_scores.append( exp_values / denominator )
    topic_scores=np.flip(nd.rotate(np.array(topic_scores),90), axis=0)
    ac_ts_labels=[label+"_Advanced Centroid TS" for label in query_list]
    for n, label in enumerate(ac_ts_labels):
        df_labels[label]=topic_scores[n]
    
    # Step 7: Identify main group
    df_labels['Main Group Advanced Centroid'] = df_labels[[f"{label}_Advanced Centroid TS" for label in query_list]].idxmax(axis=1)
    df_labels['Main Group Advanced Centroid'] = df_labels['Main Group Advanced Centroid'].str.replace("_Advanced Centroid TS", "")
    
    
    return df_labels


In [26]:
whole_text_results=centroid_labeling_whole_text(df_full_text, K=4)

K: 4 whole_text: True


In [30]:
df_full=whole_text_results.merge(df_labels_new[['doi', 'Category (M)']], how='left', on="doi")
df_labels_chunks=centroid_labeling_sentences(df_labels_new, K=4, J=10)


#These are the labels for all the main groups. I apologize that these are all different labels, just poor coding on my part while I tried to make it more convenient
mg1="Main Group Advanced Centroid" #Advanced centroids based on full text
mg2="MG Score Full Embedding" #Non advanced centroid, labels from full text
mg3="MG Score Avg Cos" #Not advanced, Chunked
mg4="Main Group Centroid" #Advanced, chunked


df1=df_full
df2=df_labels_new
df3=df_labels_new
df4=df_labels_chunks

##This is Group D, the one that matters
results = evaluate_labeling_recall(
    df1,
    query_list[0], #teacher_col
    query_list[1], #student_col
    query_list[2], #content_col
    query_list[3], #journal_col
    mg1)

results = evaluate_labeling_recall(
    df2,
    query_list[0], #teacher_col
    query_list[1], #student_col
    query_list[2], #content_col
    query_list[3], #journal_col
    mg2)


results = evaluate_labeling_recall(
    df3,
    query_list[0], #teacher_col
    query_list[1], #student_col
    query_list[2], #content_col
    query_list[3], #journal_col
    mg3)


results = evaluate_labeling_recall(
    df4,
    query_list[0], #teacher_col
    query_list[1], #student_col
    query_list[2], #content_col
    query_list[3], #journal_col
    mg4)

K: 4 J: 10
Time taken: 19.379907619208097 seconds
Here:
94
1. Total number of entries with a value in 'Category (M)': 94
2. Percent correctly labeled (accuracy): 58.51%
3. Detailed metrics per category:
   T:
      Recall (TP / TP + FN): 0.82
      Precision (TP / TP + FP): 0.42
      False Positive Rate (FP / FP + TN): 0.35
      Accuracy (TP + TN / All): 0.69
   S:
      Recall (TP / TP + FN): 0.88
      Precision (TP / TP + FP): 0.88
      False Positive Rate (FP / FP + TN): 0.03
      Accuracy (TP + TN / All): 0.96
   C:
      Recall (TP / TP + FN): 0.64
      Precision (TP / TP + FP): 0.64
      False Positive Rate (FP / FP + TN): 0.20
      Accuracy (TP + TN / All): 0.74
   Jb:
      Recall (TP / TP + FN): 0.09
      Precision (TP / TP + FP): 1.00
      False Positive Rate (FP / FP + TN): 0.00
      Accuracy (TP + TN / All): 0.78
4. Label distribution:
Category (M):
     C: 35.11%
     Jb: 24.47%
     T: 23.40%
     S: 17.02%
Automated:
     Teaching students.: 45.74%
     Physic

from tabulate import tabulate

for key in arch.keys():
    print(f"Subject: {key}")
    for item in arch[key]:
        print("Item: ")
        df_temp=df_full_text[df_full_text['doi']==item][['Teaching students._score', 'Student focus._score', 'Physics content._score', 'Journal business._score', 'doi', 'title']]
        print(tabulate(df_temp, headers='keys'))

#Optionally save the dataframes to pickle files

import pickle

file_path="full_text_refined_df.pkl"
with open(file_path, 'wb') as file:
    pickle.dump(df1, file, protocol=pickle.HIGHEST_PROTOCOL)