In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import json
import matplotlib.pyplot as plt
from tqdm import tqdm
from langdetect import detect
from langdetect import DetectorFactory
from pprint import pprint
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_sci_lg
import os
import gensim
import gensim.corpora as corpora
from gensim.models import ldamodel
from gensim import corpora, models

In [None]:
# Load the author/abstract dump; later cells index the result as data[0] and data[1].
# The encoding is stated explicitly (as for papers.json) so decoding does not depend
# on the platform's default locale.
with open('AuthAbs_full.json', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# Load the first half of the JSON dump: every key of data[0] becomes one row
# (orient='index') and the values under that key fill the row's columns.
pd_authabs1 = pd.DataFrame.from_dict(data[0], orient='index')

In [None]:
# Preview only the first rows instead of rendering the whole frame
pd_authabs1.head()

In [None]:
# Load the second half of the JSON dump the same way: keys of data[1] become rows.
pd_authabs2 = pd.DataFrame.from_dict(data[1], orient='index')

In [None]:
# Preview only the first rows instead of rendering the whole frame
pd_authabs2.head()

In [None]:
# Stack the two halves row-wise; each frame keeps its own index labels (the JSON keys).

pd_authabs = pd.concat([pd_authabs1,pd_authabs2])

In [None]:
# Preview only the first rows of the combined frame
pd_authabs.head()

In [None]:
# Drop rows in which every column is null/NaN; rows with at least one value are kept.
new_df = pd_authabs.dropna(how='all')

In [None]:
# Name the columns: the first one 'Abstract', the remaining twelve '1' through '12'
new_df.columns = ['Abstract'] + [str(position) for position in range(1, 13)]

Name the index `Author ID`. The index is then copied into a regular `Author ID` column, so the IDs survive the index reset below and remain available for later analysis.

In [None]:
# Label the index (the JSON keys, which this notebook treats as author IDs).
new_df.index.name = 'Author ID'

In [None]:
# Copy the index into a regular column so the IDs survive the index reset that follows.
new_df['Author ID'] = new_df.index

In [None]:
# Replace the index (previously the author IDs) with a fresh 0-based range;
# the IDs themselves remain available in the 'Author ID' column.
new_df = new_df.reset_index(drop=True)

In [None]:
# Preview only the first rows instead of rendering the whole frame
new_df.head()

In [None]:
# Create a column that joins all text columns into one document per row.
# Missing cells are replaced by empty strings first: casting NaN/None with astype(str)
# would inject the literal words "nan"/"None" into the text. The parts are joined with a
# single space so the last word of one abstract does not fuse with the first word of the next.
text_columns = ['Abstract'] + [str(k) for k in range(1, 13)]
text_parts = new_df[text_columns].fillna('').astype(str)
new_df['Abstract_new'] = [
    ' '.join(part for part in row if part)   # skip the empty placeholders
    for row in text_parts.values
]

In [None]:
# Preview only the first rows instead of rendering the whole frame
new_df.head()

In [None]:
# The thirteen source text columns are now merged into 'Abstract_new'; drop them.
source_columns = ['Abstract'] + [str(k) for k in range(1, 13)]
new_df = new_df.drop(columns=source_columns)

In [None]:
# Preview only the first rows instead of rendering the whole frame
new_df.head()

In [None]:
# Persist the cleaned frame (Author ID + concatenated abstracts); the 0-based index is not written.
new_df.to_csv("AuthorAbs.csv", index=False)

Now that the DF cleaning is done, LDA work is next

In [None]:
# Select the concatenated abstracts only. Note this is a pandas Series (one entry per row
# of new_df), not a one-column DataFrame.
df_text = new_df['Abstract_new']

In [None]:
# Number of documents; df_text is a Series, so the shape is a 1-tuple
df_text.shape

In [None]:
#function to convert sentences to individual words

def sent_to_words(sentences):
    """Lazily tokenize every item of ``sentences``.

    Each item is converted with ``str()`` and handed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``; the resulting token
    list is yielded, one list per input item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )

In [None]:
# Plain Python list of the author documents, then one token list per document
data_w = df_text.tolist()
data_words = [tokens for tokens in sent_to_words(data_w)]

In [None]:
def extractDigits(lst):
    """Wrap every element of ``lst`` in its own single-item list.

    Despite the name, nothing digit-specific happens: ``['a', 'b']`` becomes
    ``[['a'], ['b']]``. The notebook uses this to turn a list of words into a
    list of one-word documents.
    """
    wrapped = []
    for element in lst:
        wrapped.append([element])
    return wrapped

In [None]:
from pprint import pprint

NUM_TOPICS_PER_CLUSTER = 10

# One entry per non-empty abstract: the per-word topic distributions produced by an
# LDA model that was fitted on that abstract alone.
# NOTE: empty abstracts are skipped, so positions in abs_lda stop lining up with the
# rows of new_df as soon as one abstract is empty.
abs_lda = []

for words in tqdm(data_words):
    if not words:
        continue
    # Turn the word list into a list of one-word documents
    ll_words = extractDigits(words)
    # Map every distinct word to an integer id
    id2word_t = corpora.Dictionary(ll_words)
    # Bag-of-words corpus: one (token_id, count) list per one-word document
    corpus_t = [id2word_t.doc2bow(text) for text in ll_words]

    # Fixed seed (same value as the other LDA models in this notebook) so that
    # re-running the cell reproduces the same topics.
    lda_model = gensim.models.LdaModel(corpus=corpus_t,
                                       id2word=id2word_t,
                                       num_topics=NUM_TOPICS_PER_CLUSTER,
                                       random_state=47)

    abs_lda.append(lda_model[corpus_t])

In [None]:
# Record the positions of abstracts that tokenized to an empty word list

empty = []

for i, words in enumerate(tqdm(data_words)):
    if words:
        continue
    print(f"datapoint {i} is empty")
    empty.append(i)


In [None]:
# Author IDs as a Series, one entry per row of new_df.
# NOTE(review): nothing here removes the authors whose abstracts were found empty
# (the positions collected in `empty`); confirm whether that filtering is still intended.

df_au_id = new_df['Author ID']

In [None]:
# Preview only the first author IDs
df_au_id.head()

In [None]:
# Renumber the index from 0. new_df's index was already reset earlier, so this is
# presumably a no-op unless rows were filtered in between — confirm.
df_au_id = df_au_id.reset_index(drop=True)

In [None]:
# Author ID Series after the index reset (preview of the first rows only)
df_au_id.head()

In [None]:
# Show only the first few per-abstract results instead of the whole list
abs_lda[:5]

In [None]:
# Result for the first one-word document of the first non-empty abstract
# (presumably a list of (topic_id, probability) pairs — confirm against gensim's output).
abs_lda[0][0]

In [None]:
# Number of abstracts that produced an LDA result (empty abstracts were skipped)
len(abs_lda)

In [None]:
# Results for the first non-empty abstract: one entry per one-word document.
test_abs = abs_lda[0]
for topic in test_abs:
    # printed exactly as returned by the model for that word
    print(topic)

In [None]:
# Save the per-abstract LDA outputs (the abs_lda list), not a single LDA model.
# NOTE(review): the file holds binary pickle data despite its .txt extension; only
# unpickle it from a trusted location, since pickle.load can execute arbitrary code.

import pickle

with open("abs_lda.txt","wb") as fp:
    pickle.dump(abs_lda,fp)

# Paper Topics

In [None]:
# Load the paper records and collect the abstract of every non-empty record.

paper_abstracts = []
with open("papers.json", 'r', encoding='utf-8') as papers_file:
    # Keep the file handle and the parsed records under separate names
    papers = json.load(papers_file)

for j in papers:
    if not j:
        # Skip empty/null records
        continue
    try:
        paper_abstracts.append(j["abstract"])
    except (KeyError, TypeError) as err:
        # The record has no "abstract" field or is not a mapping; report it and go on.
        print("error:", type(err).__name__, err)

In [None]:
# Show only the first few abstracts instead of the whole list
paper_abstracts[:5]

In [None]:
# Number of abstracts collected from papers.json
len(paper_abstracts)

In [None]:
# Create a one-column DataFrame ('Abstracts') from the list of paper abstracts

paper_abs_df = pd.DataFrame(paper_abstracts, columns=['Abstracts'])
# Preview only the first rows instead of rendering the whole frame
paper_abs_df.head()

In [None]:
# Sanity check: read the abstract stored under index label 0
paper_abs_df.loc[0, 'Abstracts']

In [None]:
# Tokenize each paper abstract into a list of words.
# The 'Abstracts' column is used rather than paper_abs_df.values: the latter yields
# one-element rows ([abstract]), and sent_to_words would then tokenize the repr of that
# list (quotes, brackets, backslash-escaped newlines) instead of the abstract text itself.
sentences_abs = paper_abs_df['Abstracts'].tolist()
sentences_words = list(sent_to_words(sentences_abs))

In [None]:
# Inspect the token list of the first paper abstract
sentences_words[0]

In [None]:
NUM_TOPICS_PER_CLUSTER = 10

papers_lda = []   # per paper: model output for each one-word document
test_lda = []     # stacked term-topic rows, NUM_TOPICS_PER_CLUSTER rows per processed paper
topics_lda = []   # print_topics() entries (top 5 words per topic) for all processed papers

for words in tqdm(sentences_words):
    if not words:
        continue
    # Turn the word list into a list of one-word documents
    ll_words = extractDigits(words)
    # Map every distinct word of this paper to an integer id
    id2word_t = corpora.Dictionary(ll_words)
    # Document-term representation of the one-word documents, using id2word_t
    corpus_t = [id2word_t.doc2bow(text) for text in ll_words]

    # One LDA model per paper, seeded for reproducibility
    lda_model = models.LdaModel(corpus=corpus_t,
                                id2word=id2word_t,
                                num_topics=NUM_TOPICS_PER_CLUSTER,
                                random_state=47)

    papers_lda.append(lda_model[corpus_t])

    # The matrix and the printable summaries get separate names so that neither is
    # overwritten while the other is being collected.
    # Every paper has its own dictionary, so rows collected from different papers
    # do not share a vocabulary (their columns are not comparable).
    topic_matrix = lda_model.get_topics()
    topic_summaries = lda_model.print_topics(num_words=5)
    test_lda.extend(topic_matrix)
    topics_lda.extend(topic_summaries)

In [None]:
# Number of papers that produced an LDA result (papers with no tokens were skipped)
len(papers_lda)

In [None]:
# Show only the first few per-paper results instead of the whole list
papers_lda[:5]

In [None]:
# Same count as shown above, repeated after inspecting the list
len(papers_lda)

Prints out the keywords for each topic for each paper

In [None]:
# Show only the first ten topic summaries (i.e. the topics of the first processed paper)
topics_lda[:10]

`lda_model.get_topics()` gives you the full (dense) term-topic matrix, where each row is a topic and each column a vocabulary word

https://stackoverflow.com/questions/61596101/calculating-cosine-similarity-from-a-gensim-model

In [None]:
# Total number of stacked topic rows (10 per processed paper)
len(test_lda)

# Similarity computations

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# With 10 topics per model, rows 0-9 are the topic rows of the first processed paper
# and rows 10-19 those of the second processed paper.
ttttt = test_lda[0:10]
ttttt_1 = test_lda[10:20]

In [None]:
# Sanity check: number of topic rows in the first slice
len(ttttt)

In [None]:
# Pairwise cosine similarity of the second paper's topic rows with themselves.
# NOTE(review): both arguments are ttttt_1 and `ttttt` is unused here. Comparing ttttt
# with ttttt_1 would mix rows built on different per-paper vocabularies — confirm intent.
twt = cosine_similarity(ttttt_1, ttttt_1)

In [None]:
# Topic-by-topic similarity matrix computed above
twt

In [None]:
# Wrap the similarity matrix in a DataFrame whose columns are labelled 'Word1' ... 'Word10'.
# NOTE(review): each column corresponds to a topic row of the compared input rather than
# to a word — confirm whether the labels should say so.
pd_twt = pd.DataFrame(twt, columns=[f'Word{k}' for k in range(1, 11)])

In [None]:
# Labelled similarity table built from twt
pd_twt

In [None]:
# Cosine similarity between the model outputs for the first one-word document of the
# first and of the second processed paper.
# NOTE(review): the two vectors come from two separately trained LDA models, so their
# topic ids are presumably not comparable — confirm this comparison is meaningful.
similarity_t = gensim.matutils.cossim(papers_lda[0][0],papers_lda[1][0]) 

In [None]:
# Similarity value computed above
similarity_t

# THIS IS WHERE THE REAL WORK IS

create an LDA model for all the authors' papers

In [None]:
# Vocabulary: all author documents are joined and split on whitespace into one token
# list, which is registered with the dictionary as a single document.
all_author_tokens = " ".join(df_text).split()
dictionary_a = corpora.Dictionary([all_author_tokens])
print(f'{len(dictionary_a)} different terms in the corpus')

# Bag-of-words vector for every author document (whitespace tokenization)
bow_corpus_a = [dictionary_a.doc2bow(document.split()) for document in df_text]

# Fit one 10-topic LDA model over all author documents, seeded for reproducibility
lda_settings_a = dict(corpus=bow_corpus_a, id2word=dictionary_a,
                      num_topics=10, random_state=47)
lda_model_bow_a = models.LdaModel(**lda_settings_a)

lda_model_bow_a

In [None]:
# Topic feature vector of every author document under the corpus-level author model
abs_lda_a = []

for i in tqdm(range(len(df_text))):
    # bag-of-words of this author's text, using the shared dictionary
    abs_vec = dictionary_a.doc2bow(df_text[i].split())
    # topic distribution inferred by the trained model
    abs_lda_vec = lda_model_bow_a[abs_vec]
    # same output as before: the vector line followed by two blank lines
    print(f'document {i} feature vector: ', abs_lda_vec, end='\n\n\n')
    abs_lda_a.append(abs_lda_vec)

In [None]:
# Number of author feature vectors (one per entry of df_text)
len(abs_lda_a)

In [None]:
# Feature vectors of the paper abstracts, inferred with the author-corpus LDA model
abstract_lda = []

for paper_text in tqdm(paper_abstracts):
    paper_bow = dictionary_a.doc2bow(paper_text.split())
    # A dedicated name is used for the vector: `abs_lda` already holds the per-author
    # results built earlier, and rebinding it here would silently replace that list.
    paper_topics = lda_model_bow_a[paper_bow]
    print ('document topics: ', paper_topics)
    abstract_lda.append(paper_topics)

----------------

Comparing every author to all the papers, computing the cosine similarity scores and collecting them into a list (one list of scores per author)

In [None]:
##################
# abs_lda_a: authors' abstracts feature vectors
# abstract_lda: abstract feature vectors
##################

# cos_scores[a][p] is the cosine similarity between author a and paper p
cos_scores = [
    [gensim.matutils.cossim(author_vec, paper_vec) for paper_vec in abstract_lda]
    for author_vec in tqdm(abs_lda_a)
]

In [None]:
# Show a small corner of the score matrix (first 5 authors x first 10 papers)
# instead of printing every score.
[author_scores[:10] for author_scores in cos_scores[:5]]

In [None]:
# For each author keep the ten best-scoring papers: their positions and their scores
indices = []
top_scores = []

for author_scores in tqdm(cos_scores):
    ranked = sorted(enumerate(author_scores), key=lambda pair: pair[1], reverse=True)
    best = ranked[:10]
    indices.append([paper_idx for paper_idx, score in best])
    top_scores.append([score for paper_idx, score in best])

-------------

Creating a DF that contains the Author IDs, their respective top 10 cosine similarity scores and the corresponding paper indices

In [None]:
# Preview the author IDs that will label the rows of the result table
new_df['Author ID'].head()

In [None]:
# One top-10 list per author; this must equal the number of rows in new_df
len(top_scores)

In [None]:
# Result table: author ID, that author's top-10 similarity scores and the matching
# paper positions. (df_au_id holds the same author IDs as a standalone Series.)
final_columns = {
    'Author ID': new_df['Author ID'],
    'Top 10 Cosine Similarity Scores': top_scores,
    'Paper Indices': indices,
}
final_df = pd.DataFrame(final_columns)

In [None]:
# Preview only the first rows of the result table
final_df.head()

In [None]:
# Persist the result table without the index. The two list-valued columns end up as the
# text form of each list in the CSV, so they need parsing when the file is read back.
final_df.to_csv('final_cossim.csv', index=False)

----------------------

create an LDA model for all the papers

In [None]:
# Vocabulary: all paper abstracts are joined and split on whitespace into one token
# list, which is registered with the dictionary as a single document.
all_paper_tokens = " ".join(paper_abstracts).split()
dictionary = corpora.Dictionary([all_paper_tokens])
print(f'{len(dictionary)} different terms in the corpus')

# Bag-of-words vector for every paper abstract (whitespace tokenization)
bow_corpus = [dictionary.doc2bow(abstract.split()) for abstract in paper_abstracts]

# Fit one 10-topic LDA model over all paper abstracts (single pass, seeded)
lda_settings = dict(corpus=bow_corpus, id2word=dictionary, num_topics=10,
                    passes=1, random_state=47)
lda_model_bow = models.LdaModel(**lda_settings)

lda_model_bow

In [None]:
# Top words of the topics learned by the paper-corpus model. They are the same for
# every document, so they are printed once here rather than inside the loop, and they
# are read from lda_model_bow (the model actually used below), not from the leftover
# per-paper `lda_model` of an earlier cell.
pprint(lda_model_bow.print_topics(10, num_words=5))

abs_lda_t = []

for i, paper_text in enumerate(tqdm(paper_abstracts)):
    # bag-of-words of this abstract, using the paper-corpus dictionary
    abs_vec = dictionary.doc2bow(paper_text.split())
    # topic distribution inferred by the paper-corpus model
    abs_lda_vec = lda_model_bow[abs_vec]
    print (f'document {i} feature vector: ', abs_lda_vec)
    print('\n')
    abs_lda_t.append(abs_lda_vec)

In [None]:
# Total number of paper abstracts that were processed
print(len(paper_abstracts))