In [None]:
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from transformers import BertTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity 

#### Creating the Papers Corpus for Embedding and Semantic Matching

In [None]:
# Load the test papers corpus (per the original note: 100 unique paper abstracts).
with open("Test-Data/papers_corpus_test.json", mode='r', encoding='utf-8') as papers_file:
    Papers = json.load(papers_file)

# The file handle is no longer needed once the JSON has been parsed.
dfPapers = pd.DataFrame.from_dict(Papers, orient='columns')

# Keep only the columns needed for embedding and semantic matching.
paper_columns = ["paperId", "abstract"]
dfPapersCorpus = dfPapers[paper_columns]

In [None]:
# Flag every row that has a missing value in any column, keep those rows
# around for inspection, then remove them (by index label) from the corpus.
rows_with_missing = dfPapersCorpus.isna().any(axis="columns")
dfPapersNulls = dfPapersCorpus.loc[rows_with_missing]
dfPapersCorpus = dfPapersCorpus.drop(index=dfPapersNulls.index)

In [None]:
# Spot-check: print the (paperId, abstract) values for one specific paper.
matches_paper = dfPapersCorpus['paperId'] == '3ffd20f1b61313d5b17d6b5db1a144d8e664e968'
print(dfPapersCorpus.loc[matches_paper].values)

#### Creating the Authors Corpus for Embedding and Semantic Matching

In [None]:
# Load the test authors corpus (per the original note: 90 unique authors with
# ~4100 papers, i.e. about 45 papers per author).
with open("Test-Data/authors_corpus_test.json", mode='r', encoding='utf-8') as authors_file:
    Authors = json.load(authors_file)

# Flatten to one row per paper, carrying the owning author's ID on each row.
dfAuthors = pd.json_normalize(Authors, record_path=['papers'], meta='authorId')

# Keep only the columns needed for embedding and semantic matching.
author_columns = ["authorId", "abstract"]
dfAuthorsCorpus = dfAuthors[author_columns]

In [None]:
# Merge all of an author's abstracts into a single space-separated string,
# giving one row per author.
#
# Papers with a missing abstract are dropped first: `' '.join` raises a
# TypeError as soon as it meets a non-string (None/NaN) item, which would abort
# the whole aggregation. An author whose papers all lack abstracts therefore
# does not appear in the merged corpus.
dfAuthorsCorpus = (
    dfAuthorsCorpus
    .dropna(subset=['abstract'])
    .groupby(['authorId'], as_index=False)
    .agg({'abstract': ' '.join})
)

# Inspect the merged abstract text for one author.
print(dfAuthorsCorpus.loc[dfAuthorsCorpus['authorId'] == '40544263'].values)

#### Preparing Inputs for the Neural Network (BERT)

In [None]:
# Inputs for the BERT-based encoder, pulled out of the DataFrames as arrays.
# TODO (from the original note): replace this and everything below with PyTorch.
author_abstract = dfAuthorsCorpus['abstract'].values
author_labels = dfAuthorsCorpus['authorId'].values
paper_abstract = dfPapersCorpus['abstract'].values

# Sanity check: number of author texts, author labels and paper texts.
for corpus_array in (author_abstract, author_labels, paper_abstract):
    print(len(corpus_array))

In [None]:
# Settings for candidate-keyword extraction (passed to CountVectorizer below)
# and the number of top keywords averaged per author/paper pair.
n_gram_range = (1, 1)
stop_words = "english"
top_n_words = 10

# The seed was originally written as `2021-12-30`, which Python evaluates as
# integer subtraction (2021 - 12 - 30 = 1979). The effective value is spelled
# out here so the seed actually in use is obvious.
np.random.seed(1979)

# Sentence-embedding model used for both keywords and paper abstracts.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [None]:
# Score every author against every paper.
#
# For each author:
#   1. extract candidate keywords (the vocabulary of the author's merged
#      abstracts, minus stop words) and embed them;
#   2. for each paper, pick the `top_n_words` keywords most similar to the
#      paper, average their embeddings, and use the cosine similarity between
#      the paper and that averaged vector as the author-to-paper score.
#
# `auth2paper_final[a][p]` is the score of author `a` against paper `p`, where
# `p` is the position of the paper in `paper_abstract`.
#
# `n_gram_range`, `stop_words`, `top_n_words`, the random seed and `model` are
# defined in the configuration cell directly above; they are not repeated here
# so that the embedding model is only loaded once.

# Paper embeddings do not depend on the author, so encode them once up front
# instead of re-encoding every paper for every author.
paper_embeddings = model.encode(list(paper_abstract))

auth2paper_final = []
for author_text in tqdm(author_abstract):
    vectorizer = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([author_text])
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back only on older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        candidates = vectorizer.get_feature_names_out().tolist()
    else:
        candidates = vectorizer.get_feature_names()
    candidate_embeddings = model.encode(candidates)

    # Row p holds the similarity of paper p to each candidate keyword.
    paper_to_candidate_sim = cosine_similarity(paper_embeddings, candidate_embeddings)

    auth2paper_cosim = []
    for paper_embedding, candidate_sims in zip(paper_embeddings, paper_to_candidate_sim):
        # argsort is ascending, so the last `top_n_words` entries are the best matches.
        top_candidates = candidate_sims.argsort()[-top_n_words:]
        top_n_candidate_embeddings = candidate_embeddings[top_candidates].mean(axis=0, keepdims=True)
        cosine_sim_top_n = cosine_similarity(paper_embedding.reshape(1, -1), top_n_candidate_embeddings)
        # Index the single element explicitly: calling float() on a 1-element
        # array (ndim > 0) is deprecated in recent NumPy releases.
        auth2paper_cosim.append(float(cosine_sim_top_n[0, 0]))

    auth2paper_final.append(auth2paper_cosim)

In [None]:
# For every author, keep the 10 best-scoring papers: `paper_index` holds their
# positions in the author's score list and `paper_top_scores` the matching
# scores, both ordered from highest to lowest score.
paper_index = []
paper_top_scores = []

for author_scores in tqdm(auth2paper_final):
    ranked = sorted(enumerate(author_scores), key=lambda pair: pair[1], reverse=True)[:10]
    paper_index.append([position for position, _ in ranked])
    paper_top_scores.append([score for _, score in ranked])

In [None]:
# Assemble the per-author results table.
#
# 'Paper Indices' are positions in `paper_abstract`, i.e. positions in
# dfPapersCorpus *after* rows with nulls were dropped. They are not DataFrame
# index labels, so on their own they cannot be mapped back to a paper reliably.
# 'Paper IDs' therefore resolves each position to its paperId.
paper_ids = dfPapersCorpus['paperId'].values
final_df = pd.DataFrame({'Author ID': dfAuthorsCorpus['authorId'],
                         'Top 10 Cosine Similarity Scores': paper_top_scores,
                         'Paper Indices': paper_index,
                         'Paper IDs': [[paper_ids[position] for position in positions]
                                       for positions in paper_index],
                        })

In [None]:
# Display the per-author results table as the cell output.
final_df

In [None]:
# Save the results table as a CSV file (path is relative to the working directory).
final_df.to_csv("BERT-SemanticMatching_Results-Test.csv")