In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import json
import matplotlib.pyplot as plt
from tqdm import tqdm
from langdetect import detect
from langdetect import DetectorFactory
from pprint import pprint
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_sci_lg
import os
import gensim
import gensim.corpora as corpora
from gensim.models import ldamodel
from gensim import corpora, models

# JSON file loader to DF 

Run the following cells only if the CSV file (`AuthorAbs.csv`) isn't available and the JSON file (`AuthAbs_full.json`) is on hand.

In [None]:
# Read the author/abstract JSON dump into memory.
# The encoding is stated explicitly so the read does not depend on the platform's
# default locale encoding; this matches how papers2.json is opened further down.
with open('AuthAbs_full.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# First half of the JSON payload -> DataFrame.
# With orient='index' every top-level key of data[0] becomes a row label.
pd_authabs1 = pd.DataFrame.from_dict(orient='index', data=data[0])

In [None]:
# Display the frame built from data[0] to eyeball the result.
pd_authabs1

In [None]:
# Second half of the JSON payload -> DataFrame.
# With orient='index' every top-level key of data[1] becomes a row label.
pd_authabs2 = pd.DataFrame.from_dict(orient='index', data=data[1])

In [None]:
# Display the frame built from data[1] to eyeball the result.
pd_authabs2

In [None]:
# Stack the two halves on top of each other (row-wise) into a single frame.
pd_authabs = pd.concat((pd_authabs1, pd_authabs2), axis=0)

In [None]:
# Display the combined frame.
pd_authabs

In [None]:
#remove the rows that are entirely null/NaN
# .copy() makes new_df an independent frame: the following cells rename its columns
# and add new ones, which would otherwise operate on a frame derived from pd_authabs
# and can raise SettingWithCopyWarning.
new_df = pd_authabs.dropna(how='all').copy()

In [None]:
# Name the columns: the first one is 'Abstract', the remaining twelve are the
# strings '1' through '12'.
numbered_columns = [str(position) for position in range(1, 13)]
new_df.columns = ['Abstract'] + numbered_columns

Turn the index into an `Author ID` column: the index is first named `Author ID`, then copied into a regular column so that it is kept as data when the index is reset for the later analysis.

In [None]:
# Label the (single-level) index as 'Author ID'.
new_df.index.names = ['Author ID']

In [None]:
# Copy the index labels into a regular 'Author ID' column so they are still
# available as data once the index itself is reset.
new_df['Author ID'] = new_df.index

In [None]:
# Replace the index with a plain 0..n-1 range; the old index labels (the author
# IDs) are discarded here because they already live in the 'Author ID' column.
new_df = new_df.reset_index(drop=True)

In [None]:
# Display the frame after the index reset.
new_df

In [None]:
#create a column that joins all text columns into one string per author
# Missing cells are blanked first: casting NaN/None with astype(str) would inject the
# literal words 'nan'/'None' into the text. The pieces are joined with a single space
# so the last word of one abstract is not fused with the first word of the next one.
text_columns = ['Abstract'] + [str(position) for position in range(1, 13)]
new_df['Abstract_new'] = (
    new_df[text_columns]
    .fillna('')
    .astype(str)
    .agg(' '.join, axis=1)
    .str.strip()  # drop the padding left by trailing empty cells
)

In [None]:
# Display the frame with the new 'Abstract_new' column.
new_df

In [None]:
# Drop the thirteen source text columns now that they are merged into 'Abstract_new'.
merged_columns = ['Abstract', *(str(position) for position in range(1, 13))]
new_df = new_df.drop(columns=merged_columns)

In [None]:
# Display the frame after dropping the source text columns.
new_df

In [None]:
# Persist the cleaned frame; the row index is not written to the file.
new_df.to_csv(path_or_buf="AuthorAbs.csv", index=False)

Now that the DF cleaning is done, LDA work is next

# LDA Model Training and Testing

In [None]:
# Read the cleaned author abstracts back from disk.
author_abs_csv = 'AuthorAbs.csv'
new_df = pd.read_csv(author_abs_csv)

# Keep just the merged text as a Series (one entry per author).
text_column = 'Abstract_new'
df_text = new_df[text_column]

In [None]:
#load JSON files for the abstracts

# Collect the "abstract" field of every non-empty record in papers2.json.
paper_abstracts = []
with open("papers2.json", 'r', encoding='utf-8') as papers_file:
    papers = json.load(papers_file)

for paper in papers:
    # Empty/falsy records carry no abstract; skip them.
    if not paper:
        continue
    try:
        paper_abstracts.append(paper["abstract"])
    except (KeyError, TypeError) as err:
        # KeyError: the record has no "abstract" key; TypeError: the record cannot be
        # indexed by a string key. Report the exception class and carry on.
        # (The previous handler used sys.exc_info() without importing sys, so it
        # raised NameError instead of printing.)
        print("error:", type(err))

In [None]:
# Show a small sample only; echoing the whole list of abstracts would flood the
# notebook output.
paper_abstracts[:5]

Create an LDA model for all the authors' papers.

Logic adopted from GENSIM's official tutorial (https://radimrehurek.com/gensim/auto_examples/tutorials/run_lda.html)

In [None]:
# Vocabulary: every author text is joined and whitespace-tokenised into one token
# list, which is handed to the Dictionary as a single document.
# (word splitting adapted from https://www.geeksforgeeks.org/python-program-split-join-string/)
all_tokens = " ".join(df_text).split()
dictionary_a = corpora.Dictionary([all_tokens])
print(f'{len(dictionary_a)} different terms in the corpus')

# Bag-of-words corpus: one doc2bow result per author text.
bow_corpus_a = []
for author_text in df_text:
    bow_corpus_a.append(dictionary_a.doc2bow(author_text.split()))

# Train a 10-topic LDA model on that corpus; random_state is pinned to 47.
lda_settings = dict(
    corpus=bow_corpus_a,
    id2word=dictionary_a,
    num_topics=10,
    random_state=47,
)
lda_model_bow_a = models.LdaModel(**lda_settings)

lda_model_bow_a

In [None]:
# Topic vector of every author text, in the same order as df_text.
abs_lda_a = []

for doc_idx in tqdm(range(len(df_text))):
    # bag-of-words of this author's text, then its topic distribution from the model
    author_bow = dictionary_a.doc2bow(df_text[doc_idx].split())
    author_topics = lda_model_bow_a[author_bow]

    print (f'document {doc_idx} feature vector: ', author_topics)
    print('\n')

    abs_lda_a.append(author_topics)

In [None]:
# Number of author topic vectors collected.
len(abs_lda_a)

In [None]:
# Topic vectors of the paper abstracts, computed with the dictionary and LDA model
# that were built from the authors' texts.
abstract_lda = []

# Only the first 1000 abstracts are processed.
for paper_text in tqdm(paper_abstracts[:1000]):
    paper_bow = dictionary_a.doc2bow(paper_text.split())
    paper_topics = lda_model_bow_a[paper_bow]
    print ('document topics: ', paper_topics)
    abstract_lda.append(paper_topics)

----------------

Compare every author to all the papers: compute the cosine similarity scores and aggregate them into a list.

In [None]:
# Inputs:
#   abs_lda_a    - topic vectors of the authors' texts
#   abstract_lda - topic vectors of the paper abstracts
#
# cos_scores[a][p] is the cosine similarity between author a and paper p.
cos_scores = [
    [gensim.matutils.cossim(author_vec, paper_vec) for paper_vec in abstract_lda]
    for author_vec in tqdm(abs_lda_a)
]

In [None]:
# Preview the first ten scores of the first three authors; echoing the complete
# author x paper list of lists would flood the notebook output.
[author_scores[:10] for author_scores in cos_scores[:3]]

In [None]:
# For every author keep the ten highest-scoring papers:
#   indices[a]    - positions of those papers within the author's score list
#   top_scores[a] - the matching similarity scores, highest first
indices = []
top_scores = []

for author_scores in tqdm(cos_scores):
    ranked = sorted(
        enumerate(author_scores),
        key=lambda pair: pair[1],
        reverse=True,
    )
    best_ten = ranked[:10]
    indices.append([paper_idx for paper_idx, score in best_ten])
    top_scores.append([score for paper_idx, score in best_ten])

-------------

Create a DataFrame that contains the Author IDs, their respective top 10 cosine similarity scores and the matching paper indices, then save it.

In [None]:
# Display the 'Author ID' column that is used to build the result frame.
new_df['Author ID']

In [None]:
# Number of per-author top-score lists.
len(top_scores)

In [None]:
# One row per author: the author ID, the ten best similarity scores and the
# positions of the corresponding papers.
final_columns = {
    'Author ID': new_df['Author ID'],
    'Top 10 Cosine Similarity Scores': top_scores,
    'Paper Indices': indices,
}
final_df = pd.DataFrame(final_columns)

In [None]:
# Show the rows of the first 100 authors.
final_df.head(100)

In [None]:
# Write only the first 100 authors to disk, without the row index.
first_100_authors = final_df[:100]
first_100_authors.to_csv('final_cossim.csv', index=False)