In [1]:
import pandas as pd
import os
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import pickle
import nltk
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re

# Fetch the NLTK 'wordnet' and 'stopwords' packages; NLTK reports them as
# "already up-to-date" when they are present locally.
# NOTE(review): the cells visible in this notebook filter with gensim's
# STOPWORDS and stem with SnowballStemmer, so neither resource appears to be
# read here - confirm whether these downloads are still needed.
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sijieliu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sijieliu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [29]:
# Read the faculty publication table; the first CSV column becomes the index.
data = pd.read_csv(
    filepath_or_buffer='final_hdsi_faculty_updated.csv',
    index_col=0,
)

In [3]:
# Keep rows that have an abstract, cast `year` to int, and keep 2015 onwards.
# Written as a single chain that ends in .copy(): the original assigned a
# column into a boolean-filtered slice, which triggers pandas'
# SettingWithCopyWarning here and again in later cells that assign columns.
# Original author's note on the year cut-off: "aaron being excluded then".
data = (
    data.loc[data['abstract'].notna()]
    .assign(year=lambda frame: frame['year'].astype(int))
    .loc[lambda frame: frame['year'] >= 2015]
    .copy()
)

In [4]:
# Convert abstracts to lowercase.
data['abstract'] = data['abstract'].map(str.lower)

In [5]:
# Stop-word removal and stemming of the abstracts.
redundant = ['abstract', 'purpose', 'paper', 'goal']
ss = SnowballStemmer(language="english")

def preprocess_abstract(text):
    """Return `text` as a single space-joined string of stemmed tokens.

    Tokens are produced by gensim's ``simple_preprocess``. A token is kept
    only when it is not in gensim's ``STOPWORDS``, is longer than three
    characters, and is not listed in ``redundant``; every kept token is then
    stemmed with the Snowball stemmer ``ss``.
    """
    def is_kept(token):
        # The filter is applied to the raw token, i.e. before stemming.
        return not (token in STOPWORDS or len(token) <= 3 or token in redundant)

    stems = (ss.stem(token) for token in simple_preprocess(text) if is_kept(token))
    return " ".join(stems)

data['abstract_processed'] = data['abstract'].apply(preprocess_abstract)

In [6]:
# Term-count matrix fitted on the individual processed abstracts.
# NOTE(review): `counts` is reassigned in the later vectorizer cell (fitted on
# `all_docs`) before any visible code reads this value, so this fit looks
# redundant - confirm nothing outside this view depends on it.
counts = CountVectorizer().fit_transform(data['abstract_processed'])

In [7]:
# Organize each author's processed abstracts by publication year (2015-2021).
authors = {
    author: {year: list() for year in range(2015, 2022)}
    for author in data.HDSI_author.unique()
}
for author_name, pub_year, processed in zip(
    data['HDSI_author'], data['year'], data['abstract_processed']
):
    authors[author_name][pub_year].append(processed)

In [30]:
# Display the author names that key the `authors` mapping.
authors.keys()

dict_keys(['Yusu Wang', 'Babak Salimi', 'Arya Mazumdar', 'Berk Ustun', 'Gal Mishne', 'Mikhail Belkin', 'Tsu-Wei (Lily) Weng', 'Yian Ma', 'Zhiting Hu', 'Benjamin Smarr', 'Armin Schwartzman', 'R. Stuart Geiger', 'Arun Kumar', 'Barna Saha', 'Jingbo Shang', 'Yoav Freund', 'Alex Cloninger', 'Jelena Bradic', 'Rayan Saab', 'Mikio Aoi', 'David Danks', 'Margaret (Molly) Roberts', 'Bradley Voytek', 'Virginia De Sa', 'Rajesh Gupta', 'Dimitris Politis', 'Ilkay Altintas', 'Robin Knight', 'Shankar Subramaniam', 'Angela Yu', 'Eran Mukamel', 'Shannon Ellis', 'Henrik Christensen', 'Julian McAuley', 'Larry Smarr', 'Rose Yu', 'Vineet Bafna', 'Michael Pazzani', 'Tara Javidi', 'Young-Han Kim', 'Ery, Arias-Castro ', 'Michael Holst', 'Ronghui (Lily) Xu', 'Ruth Williams', 'Terry Sejnowski', 'Frank Wuerthwein', 'Albert Hsiao', 'Lucila Ohno-Machado', 'George Sugihara', 'Justin Eldridge'])

In [8]:
# One document per (author, year) bucket: the bucket's abstracts joined by
# spaces, ordered author by author and, within an author, year by year.
all_docs = [
    " ".join(yearly_docs)
    for per_year in authors.values()
    for yearly_docs in per_year.values()
]

In [9]:
# Build the document-term count matrix that the LDA model is fitted on.
countVec = CountVectorizer()
counts = countVec.fit_transform(all_docs)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases; the
# result is wrapped in list() so `names` keeps the list type it had before.
if hasattr(countVec, 'get_feature_names_out'):
    names = list(countVec.get_feature_names_out())
else:
    names = countVec.get_feature_names()

In [11]:
# Fit a 25-topic LDA model on the count matrix; `result` is the array
# returned by fit_transform for the fitted documents.
lda_params = {'n_components': 25, 'n_jobs': -1, 'random_state': 123}
modeller = LatentDirichletAllocation(**lda_params)
result = modeller.fit_transform(counts)

# Print the highest-weighted words of every topic in a fitted model.
def display_topics(model, feature_names, no_top_words):
    """Print a "Topic <idx>:" header and the top words for each topic.

    Args:
        model: object exposing ``components_``, iterated one row per topic.
        feature_names: sequence indexed by the column positions of a row.
        no_top_words: how many of the largest-weight entries to print.

    For each row, entries are ranked by descending weight and the matching
    feature names are printed on one line, separated by spaces.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        # argsort is ascending, so reverse it to put the largest weights first.
        ranked = weights.argsort()[::-1]
        top_terms = [feature_names[term_idx] for term_idx in ranked[:no_top_words]]
        print(" ".join(top_terms))
# Show the 15 highest-weighted words of each fitted topic.
display_topics(model=modeller, feature_names=names, no_top_words=15)

Topic 0:
model pregnanc outcom data women trial diseas dose studi clinic gestat event prednison adjust trajectori
Topic 1:
problem algorithm graph distribut bound approxim consid test random number propos rate base function sampl
Topic 2:
model estim method test sampl propos asymptot predict distribut data result base bootstrap linear signal
Topic 3:
microbiom microbi associ sampl studi divers sequenc communiti human data microbiota diseas method differ result
Topic 4:
data neuron method tree dimension graph structur topolog base imag metric space detect analysi time
Topic 5:
dynam causal nonlinear model seri time predict forecast structur interact stock recruit empir function approach
Topic 6:
model size imag effect method human function observ map gaussian statist estim scale threshold field
Topic 7:
model activ neural cell genom method network data result dynam human brain function signal studi
Topic 8:
cell type neuron gene brain regulatori singl methyl specif function express tran

In [12]:
# time-author_topic: document-topic table with one row per entry of
# `all_docs` and one column per topic.

# Take the topic count from the fitted output instead of repeating the
# literal 25 used when the model was built, so the two cannot drift apart.
n_topics = result.shape[1]

# column names
topicnames = ["Topic" + str(i) for i in range(n_topics)]

# index names
docnames = ["Doc" + str(i) for i in range(len(all_docs))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(result, columns=topicnames, index=docnames)

# Dominant topic = column position of the largest value in each row.
dominant_topic = np.argmax(result, axis=1)
df_document_topic['dominant_topic'] = dominant_topic


In [19]:
# Add the author and year that each document row belongs to, then export.
#
# Rows of df_document_topic follow the order in which all_docs was built:
# authors in the insertion order of `authors`, and within each author the
# year keys in their insertion order. The labels are derived from `authors`
# itself rather than from hard-coded sizes (350 rows, 7 years) and column
# positions (26, 27), which silently break if the author or year set changes.
author_list = list(authors.keys())

# Kept for backward compatibility. NOTE: despite its name this holds the
# number of year buckets per author, not a count of papers.
year_paper_count = {author: len(by_year) for author, by_year in authors.items()}

doc_authors = [author for author, by_year in authors.items() for _ in by_year]
doc_years = [str(year) for by_year in authors.values() for year in by_year]
if len(doc_authors) != len(df_document_topic):
    raise ValueError(
        "df_document_topic has %d rows but `authors` describes %d (author, year) documents"
        % (len(df_document_topic), len(doc_authors))
    )

# Assign whole columns at once; the original wrote strings cell by cell into
# NaN-initialised float columns, which newer pandas versions deprecate.
df_document_topic['author'] = doc_authors
df_document_topic['year'] = doc_years

time_author_topic = df_document_topic
# Make sure the output directory exists before writing.
os.makedirs('Data', exist_ok=True)
time_author_topic.to_csv('Data/time_author_topic.csv')

In [14]:
# Persist the fitted LDA model, the document-term matrix and the vectorizer.
# Each file is opened in a `with` block so the handle is flushed and closed
# deterministically; the original left the handles from open(...) unclosed.
# File names are kept exactly as before (including the extension-less
# 'agg_vectorizer') because existing loaders may depend on them.
os.makedirs('models', exist_ok=True)
for artifact, artifact_path in (
    (modeller, 'models/agg_author_model_25_topics.pkl'),
    (counts, 'models/agg_dtm.pkl'),
    (countVec, 'models/agg_vectorizer'),
):
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)