In [None]:
import pyarrow.feather as feather

# Load the two input tables from Feather files in the working directory.
# NOTE(review): judging by the file name, the publication table presumably has
# one row per (publication, author) pair -- confirm against the data.
raw_pub_df, researcher_df = (
    feather.read_feather(path)
    for path in ("publication_author_per_row.feather", "researcher_df.feather")
)


In [None]:
import json
import re
import pandas as pd
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer
import numpy as np

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model shared by the embedding and similarity cells.
MODEL_NAME = 'sentence-transformers/all-MiniLM-L12-v2'
model = SentenceTransformer(MODEL_NAME)

In [None]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Embed every publication abstract and attach the vectors to a copy of
# raw_pub_df as an "embeddings" column (one 1-D numpy array per row).
#
# `model` is the SentenceTransformer loaded in the previous cell; it is reused
# here instead of being loaded a second time.

# 1. Parameters
CHUNK_SIZE = 5000   # number of texts handed to model.encode() per call
BATCH_SIZE = 64     # batch size forwarded to model.encode()

df = raw_pub_df.copy()   # or samp_df if you're using a smaller set
df = df.reset_index(drop=True)

# 2. Collapse the abstracts to their distinct values so each text is encoded
#    once, however many rows share it. `codes[i]` is the position of row i's
#    abstract inside `unique_abstracts`.
#    Missing abstracts are replaced by "" (same as before), so they all share
#    the embedding of the empty string.
#    NOTE(review): that empty-string vector is later averaged into the
#    researcher-level embeddings -- confirm this is intended.
codes, unique_abstracts = pd.factorize(df["pub_abstract"].fillna(""))
unique_abstracts = unique_abstracts.tolist()

# 3. Encode the distinct abstracts chunk by chunk. A single outer progress bar
#    (counted in texts) replaces the per-chunk inner bars, which flooded the
#    cell output.
unique_chunks = []
with tqdm(total=len(unique_abstracts), desc="Encoding abstracts") as progress:
    for start in range(0, len(unique_abstracts), CHUNK_SIZE):
        texts = unique_abstracts[start:start + CHUNK_SIZE]
        unique_chunks.append(
            model.encode(
                texts,
                batch_size=BATCH_SIZE,
                convert_to_numpy=True,
                show_progress_bar=False,
            )
        )
        progress.update(len(texts))

# 4. Stack the chunks; np.vstack([]) raises, so an empty input gets an
#    explicit (0, dim) matrix instead.
if unique_chunks:
    unique_embeddings = np.vstack(unique_chunks)
else:
    unique_embeddings = np.empty(
        (0, model.get_sentence_embedding_dimension()), dtype=np.float32
    )

# 5. Expand back to one vector per row of df and store it.
all_embeddings = unique_embeddings[codes]
df["embeddings"] = list(all_embeddings)

print("Done! Embedding dimension:", all_embeddings.shape[1])
print("Total rows embedded:", len(df))
print("Distinct abstracts encoded:", len(unique_abstracts))


In [None]:
import numpy as np
import pandas as pd

# ===== 1. Compute researcher-level embeddings =====
# For every researcher_id group in df, stack that group's per-row vectors from
# the "embeddings" column into a 2-D matrix and take the element-wise mean.
rows = [
    {
        "researcher_id": researcher_id,
        "embedding": np.vstack(pub_rows["embeddings"].values).mean(axis=0),
    }
    for researcher_id, pub_rows in df.groupby("researcher_id")
]

# One row per researcher: the id and its averaged vector.
researcher_emb_df = pd.DataFrame(rows)

print("Number of researchers with embeddings:", len(researcher_emb_df))
researcher_emb_df.head()


In [None]:
# Attach the averaged embedding to each researcher record. This is a left
# join, so every row of researcher_df is kept; researchers with no match in
# researcher_emb_df end up with a missing value in the "embedding" column.
researcher_df_with_emb = pd.merge(
    researcher_df,
    researcher_emb_df,
    how="left",
    on="researcher_id",
)

print("Merged researcher_df with embeddings.")
researcher_df_with_emb.head()



In [None]:
# Strip the "email" column, keep only researchers whose pub_count is not 0,
# and persist the result.
# This cell overwrites its own input, so it must survive being run twice:
# errors="ignore" makes the drop a no-op once "email" is already gone instead
# of raising KeyError on the second run.
researcher_df_with_emb = researcher_df_with_emb.drop(columns=["email"], errors="ignore")
researcher_df_with_emb = researcher_df_with_emb[researcher_df_with_emb["pub_count"] != 0].copy()
feather.write_feather(researcher_df_with_emb, "researcher_df_with_emb_new.feather")

In [None]:
import numpy as np
import pandas as pd

# Build the researcher-by-researcher similarity table, labelled on both axes
# by researcher_id.

# The embeddings were attached with a left join, so a researcher that has no
# row in the publication table carries a missing value here. np.vstack cannot
# stack those with real vectors, so they are excluded (and reported).
has_embedding = researcher_df_with_emb["embedding"].notna()
researchers_with_emb = researcher_df_with_emb[has_embedding]

n_missing = int((~has_embedding).sum())
if n_missing:
    print(f"Skipping {n_missing} researcher(s) without an embedding.")
if researchers_with_emb.empty:
    raise ValueError("No researcher has an embedding; cannot build the similarity matrix.")

# Convert embedding column to 2D numpy matrix (one row per researcher)
emb_matrix = np.vstack(researchers_with_emb["embedding"].values)

# Pairwise similarity using the model's configured similarity function.
# NOTE: the result is dense, n_researchers x n_researchers.
similarity_matrix = model.similarity(emb_matrix, emb_matrix)

# Convert to DataFrame. The tensor is turned into a NumPy array explicitly
# rather than relying on how a given pandas version ingests torch tensors.
similarity_id_df = pd.DataFrame(
    similarity_matrix.detach().cpu().numpy(),
    index=researchers_with_emb["researcher_id"],
    columns=researchers_with_emb["researcher_id"],
)

similarity_id_df.head(10)


In [None]:
# Persist the similarity table next to the notebook.
# NOTE(review): both axes are labelled with researcher_id values; whether
# to_feather accepts a non-default index / non-string column labels depends on
# the installed pandas version -- confirm on the target environment.
SIMILARITY_PATH = "similarity_id_df.feather"
similarity_id_df.to_feather(SIMILARITY_PATH)