In [5]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
# Function to compute the maximum cosine similarity between one paper embedding and a set of keyword embeddings

def compute_similarity(paper_embedding, keyword_embeddings):
    """Return the best cosine-similarity score of a paper against a set of keywords.

    Args:
        paper_embedding: Embedding vector of a single paper.
        keyword_embeddings: Collection of keyword embedding vectors, passed
            unchanged as the second argument of ``cosine_similarity``.

    Returns:
        The largest entry of the similarity matrix computed by
        ``cosine_similarity``.
    """
    # Wrap the single paper vector in a list so it is passed as a one-row input.
    paper_row = [paper_embedding]
    score_matrix = cosine_similarity(paper_row, keyword_embeddings)
    best_score = score_matrix.max()
    return best_score

In [7]:
# Function to embed a long text: embed fixed-size word chunks, pool the chunk embeddings with mean, max, and min, then take the mean of those three pooled vectors

def get_long_text_embedding(text, model, chunk_size=200):
    """Embed a long text by pooling the embeddings of fixed-size word chunks.

    The text is split on whitespace into consecutive chunks of at most
    ``chunk_size`` words. All chunks are encoded with a single batched
    ``model.encode`` call, the chunk embeddings are pooled element-wise with
    mean, max and min, and the element-wise mean of those three pooled
    vectors is returned.

    Args:
        text: String to embed. Empty or whitespace-only text is embedded as a
            single empty-string chunk, so the result keeps the model's
            embedding size instead of failing on an empty chunk list.
        model: Object whose ``encode`` method accepts a list of strings and
            returns one embedding per string (e.g. a ``SentenceTransformer``).
        chunk_size: Maximum number of words per chunk; must be at least 1.

    Returns:
        numpy array holding the combined embedding (one value per embedding
        dimension, assuming ``encode`` yields one fixed-length vector per
        chunk).

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    # Split text into chunks of at most ``chunk_size`` words
    words = text.split()
    chunks = [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]
    if not chunks:
        # No words at all: fall back to one empty chunk so pooling below has
        # at least one row to work with.
        chunks = [""]

    # Encode every chunk in one batched call; rows are chunks, columns are
    # embedding dimensions.
    chunk_embeddings = np.asarray(model.encode(chunks))

    # Pool across chunks (axis 0) three different ways
    mean_embedding = np.mean(chunk_embeddings, axis=0)
    max_embedding = np.max(chunk_embeddings, axis=0)
    min_embedding = np.min(chunk_embeddings, axis=0)

    # Take the mean of [mean_embedding, max_embedding, min_embedding]
    combined_embedding = np.mean([mean_embedding, max_embedding, min_embedding], axis=0)
    return combined_embedding