In [7]:
import sys
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from transformers import BertTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#print('Loading BERT tokenizer...')
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [None]:
## Create the author_abstracts DataFrame: one row per author with a single
## "Corpus" string built from every column of that author's record.

with open("AuthAbs_full.json", 'r', encoding='utf-8') as auth_file:
    auth_records = json.load(auth_file)

# The first element of the JSON payload is a mapping whose keys become the row
# index (renamed to AuthorID below).
# NOTE(review): presumably the values hold that author's abstracts - confirm.
Author_df = pd.DataFrame.from_dict(auth_records[0], orient='index')

# Missing cells become '' so the string concatenation below does not pick up
# the text "nan".
Author_df = Author_df.fillna('')

# Summing an object array of str along axis=1 concatenates the cells of each
# row (no separator is inserted between them).
Author_df['Corpus'] = Author_df.astype(str).values.sum(axis=1)
Author_df = Author_df.filter(["Corpus"]).reset_index().rename(columns={"index": "AuthorID"})

# Drop authors whose corpus is empty. The column is reassigned instead of using
# `Author_df['Corpus'].replace(..., inplace=True)`: an in-place call on a
# selected column is a chained assignment, which pandas deprecates and which
# does not modify Author_df when Copy-on-Write is enabled.
Author_df['Corpus'] = Author_df['Corpus'].replace('', np.nan)
Author_df = Author_df.dropna(subset=['Corpus'])
# TODO: decide whether digits should be stripped from the corpus.


In [None]:
# Print the column dtypes, non-null counts and memory usage of Author_df.
Author_df.info()

In [None]:
## Create the papers DataFrame: one row per paper record that has an "abstract".

# Raw string literal: the path contains backslashes followed by letters
# ("\D", "\K", "\p"). In a normal string these are invalid escape sequences,
# which Python currently keeps verbatim but warns about; the raw form yields
# the identical path without relying on that behaviour.
PAPERS_PATH = r"E:\Data\Kaggle-Covid\papers\papers2.json"

with open(PAPERS_PATH, 'r', encoding='utf-8') as papers_file:
    papers = json.load(papers_file)

paper_abstracts = []
for j in tqdm(papers):
    # Skip empty/falsy records (None, {}, '' ...).
    if not j:
        continue
    try:
        paper_abstracts.append(j["abstract"])  ## convert this to a dict
    except (KeyError, TypeError) as err:
        # KeyError: record has no "abstract"; TypeError: record is not
        # subscriptable by a string key. Report and keep going, but let
        # KeyboardInterrupt/SystemExit propagate (a bare `except:` would not).
        print("error:", type(err), err)

Paper_df = pd.DataFrame(paper_abstracts, columns=["Paper_Abstracts"])

In [None]:
# Treat empty-string abstracts as missing and drop those rows.
# The column is reassigned rather than calling replace(..., inplace=True) on
# the selected column: that chained in-place form is deprecated in pandas and
# leaves Paper_df unchanged when Copy-on-Write is enabled.
Paper_df['Paper_Abstracts'] = Paper_df['Paper_Abstracts'].replace('', np.nan)
Paper_df = Paper_df.dropna(subset=['Paper_Abstracts'])
Paper_df.info()

In [None]:
# Inputs for the embedding step (TODO from the author: replace this and the
# cells below with a PyTorch implementation).
# Each column is pulled out of its DataFrame as a plain array so the later
# loops can index it by position.
author_abstract = Author_df["Corpus"].values
author_labels = Author_df["AuthorID"].values
paper_abstract = Paper_df["Paper_Abstracts"].values

In [None]:
# --- Configuration -----------------------------------------------------------
n_gram_range = (1, 1)   # n-gram range handed to CountVectorizer (unigrams only)
stop_words = "english"  # stop-word setting handed to CountVectorizer
top_n_words = 10        # how many best-matching candidate words are averaged per paper
MAX_AUTHORS = 100       # score only the first N authors; set to None for all
MAX_PAPERS = 1000       # score only the first N papers; set to None for all

# The original call was np.random.seed(2021-12-30), which Python evaluates as
# the subtraction 2021 - 12 - 30 == 1979. The date is written as one integer.
np.random.seed(20211230)
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# Clamp the caps to the data actually available so a small input cannot raise
# IndexError.
n_authors = len(author_abstract) if MAX_AUTHORS is None else min(MAX_AUTHORS, len(author_abstract))
n_papers = len(paper_abstract) if MAX_PAPERS is None else min(MAX_PAPERS, len(paper_abstract))

# Paper embeddings do not depend on the author, so they are computed once here
# instead of once per (author, paper) pair.
# NOTE(review): batch encoding may differ from one-at-a-time encoding in the
# last decimal places - confirm this is acceptable.
paper_embeddings = np.asarray(model.encode(list(paper_abstract[:n_papers])))
paper_norms = np.linalg.norm(paper_embeddings, axis=1)

# auth2paper_final[i][k] = similarity between author i and paper k.
auth2paper_final = []
for i in tqdm(range(n_authors)):
    # Candidate keywords = vocabulary of this author's corpus.
    count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([author_abstract[i]])
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on old releases.
    if hasattr(count, "get_feature_names_out"):
        candidates = count.get_feature_names_out().tolist()
    else:
        candidates = count.get_feature_names()
    candidate_embeddings = np.asarray(model.encode(candidates))

    # (n_papers, n_candidates) similarity of every paper to every candidate word.
    sims = cosine_similarity(paper_embeddings, candidate_embeddings)
    # Per paper, the indices of the top_n_words most similar candidates
    # (all candidates if there are fewer than top_n_words).
    top_idx = np.argsort(sims, axis=1)[:, -top_n_words:]
    # Per paper, the mean embedding of those candidates: (n_papers, dim).
    top_means = candidate_embeddings[top_idx].mean(axis=1)

    # Row-wise cosine similarity between each paper and its own mean vector.
    dots = np.einsum('ij,ij->i', paper_embeddings, top_means)
    denom = paper_norms * np.linalg.norm(top_means, axis=1)
    denom[denom == 0] = 1.0  # zero vectors score 0 instead of dividing by zero
    auth2paper_cosim = (dots / denom).tolist()  # plain Python floats

    auth2paper_final.append(auth2paper_cosim)


In [None]:
# For every scored author keep the 10 papers with the highest similarity:
#   paper_index[a]      -> positions of those papers within the author's score list
#   paper_top_scores[a] -> the matching scores, highest first
paper_index = []
paper_top_scores = []

for author_scores in tqdm(auth2paper_final):
    # Stable descending sort of positions by score; ties keep their original order.
    best_positions = sorted(
        range(len(author_scores)),
        key=author_scores.__getitem__,
        reverse=True,
    )[:10]
    paper_index.append(best_positions)
    paper_top_scores.append([author_scores[pos] for pos in best_positions])

In [None]:
# df_au_id <- clean author IDs
# Take as many author IDs as there are scored authors instead of a hard-coded
# 100, so this cell stays consistent if the number of scored authors changes.
n_scored = len(paper_top_scores)
final_df = pd.DataFrame({'Author ID': Author_df['AuthorID'].head(n_scored),
                         'Top 10 Cosine Similarity Scores': paper_top_scores,
                         'Paper Indices': paper_index
                        })

In [None]:
# Display the assembled results table as the cell output.
final_df

In [None]:
# Write the results table to final.csv in the current working directory
# (the DataFrame index is written as the first column, the to_csv default).
final_df.to_csv("final.csv")