In [1]:
import pandas as pd
import sklearn

In [3]:
authors = pd.read_hdf('/home/abhinav/workspace/code/expert-finder/LSPO_v1.h5')

In [4]:
def combine_texts(group):
    combined = (group['title'].fillna('') + ' ' + group['abstract'].fillna('')).str.strip()
    return pd.Series({
        'text': ' '.join(combined)
    })

combined_texts = authors.groupby('@path').apply(combine_texts).reset_index()

  combined_texts = authors.groupby('@path').apply(combine_texts).reset_index()


In [5]:
combined_texts

Unnamed: 0,@path,text
0,/0000-0001-5000-0736,Tectonothermal analysis of high-temperature my...
1,/0000-0001-5000-1341,New description of gradual substitution of gra...
2,/0000-0001-5000-3793,Analytical Models of the Performance of C-V2X ...
3,/0000-0001-5000-5991,Chemical aging and the hydrophobic-to-hydrophi...
4,/0000-0001-5000-6265,Pollution and economic development: an empiric...
...,...,...
125481,/0000-0003-4999-3106,Phase-space dynamics of opposition control in ...
125482,/0000-0003-4999-5734,Electromagnetic scattering from a multilayered...
125483,/0000-0003-4999-5822,Invited Article: Broadband highly efficient di...
125484,/0000-0003-4999-5996,A network approach to discerning the identitie...


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
tfidf_matrix = vectorizer.fit_transform(combined_texts['text'])

In [7]:
query = "Type 1a Supernovae"
query_vec = vectorizer.transform([query])  # shape: (1, num_terms)

In [8]:
from sklearn.metrics.pairwise import cosine_similarity

similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()  # shape: (num_authors,)

In [9]:
top_indices = similarities.argsort()[::-1][:25]
top_authors = combined_texts.iloc[top_indices]
top_authors['similarity'] = similarities[top_indices]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_authors['similarity'] = similarities[top_indices]


In [10]:
top_authors

Unnamed: 0,@path,text,similarity
110187,/0000-0003-2544-4516,Optimal Classification and Outlier Detection f...,0.3758
58012,/0000-0002-4173-193X,"Synthesis, crystal structure and computational...",0.349692
20659,/0000-0001-8257-3512,Supernovae and extragalactic astronomy with la...,0.336378
86568,/0000-0002-8743-3292,"Synthesis, X-Ray diffraction, theoretical and ...",0.331608
119288,/0000-0003-4016-9428,Sea-level constraints on the amplitude and sou...,0.315058
74401,/0000-0002-6762-5254,hERG 1b is critical for human cardiac repolari...,0.309603
114763,/0000-0003-3270-6844,Revealing Dusty Supernovae in High-redshift (U...,0.29948
35245,/0000-0002-0564-1101,Direct determination of the hubble parameter u...,0.296621
69042,/0000-0002-5920-1478,Direct determination of the hubble parameter u...,0.28705
90847,/0000-0002-9436-8871,Estimating weak lensing convergence correlatio...,0.278911


In [11]:
# Sample 3 DOIs per top author
author_info = (
    authors[['@path', 'author']] #, 'doi']
    #.dropna(subset=['doi'])  # Remove missing DOIs
    .groupby('@path')
    .agg({
        'author': 'first'
        #'doi': lambda x: list(x.unique())[:3]  # Sample up to 3 unique DOIs
    })
    .reset_index()
)

In [12]:
results = top_authors.merge(author_info, on='@path', how='left')
results = results[['@path', 'author', 'similarity']]

In [13]:
results

Unnamed: 0,@path,author,similarity
0,/0000-0003-2544-4516,"Williamson, Marc",0.3758
1,/0000-0002-4173-193X,"Hoffmann, Józef",0.349692
2,/0000-0001-8257-3512,"Kankare, Erkki",0.336378
3,/0000-0002-8743-3292,"Shihab, Mehdi Salih",0.331608
4,/0000-0003-4016-9428,"Kopp, Robert E.",0.315058
5,/0000-0002-6762-5254,"Jones, David K.",0.309603
6,/0000-0003-3270-6844,"Ma, Zhiyuan",0.29948
7,/0000-0002-0564-1101,"Potashov, M.",0.296621
8,/0000-0002-5920-1478,"Baklanov, P.",0.28705
9,/0000-0002-9436-8871,"Mitra, Ayan",0.278911


In [14]:
name_variations = (
    authors[['@path', 'author']]
    .dropna()
    .groupby('@path')['author']
    .apply(lambda names: list(sorted(set(names))))
    .reset_index()
    .rename(columns={'author': 'name_variations'})
)

In [15]:
results = results.merge(name_variations, on='@path', how='left')

In [16]:
results

Unnamed: 0,@path,author,similarity,name_variations
0,/0000-0003-2544-4516,"Williamson, Marc",0.3758,"[Williamson, Marc]"
1,/0000-0002-4173-193X,"Hoffmann, Józef",0.349692,"[Hoffmann, Józef]"
2,/0000-0001-8257-3512,"Kankare, Erkki",0.336378,"[Kankare, Erkki]"
3,/0000-0002-8743-3292,"Shihab, Mehdi Salih",0.331608,"[Shihab, Mehdi Salih]"
4,/0000-0003-4016-9428,"Kopp, Robert E.",0.315058,"[Kopp, Robert E.]"
5,/0000-0002-6762-5254,"Jones, David K.",0.309603,"[Jones, David K.]"
6,/0000-0003-3270-6844,"Ma, Zhiyuan",0.29948,"[Ma, Zhiyuan]"
7,/0000-0002-0564-1101,"Potashov, M.",0.296621,"[Potashov, M., Potashov, M. Sh.]"
8,/0000-0002-5920-1478,"Baklanov, P.",0.28705,"[Baklanov, P., Baklanov, P. V.]"
9,/0000-0002-9436-8871,"Mitra, Ayan",0.278911,"[Mitra, Ayan]"
