In [1]:
import pandas as pd
import os
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import pickle
import nltk
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re

# Download the 'wordnet' and 'stopwords' NLTK data packages; NLTK reports
# "already up-to-date" when a package is already installed.
# The value of the last call is what the cell displays as its output.
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sijieliu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sijieliu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the publication table; the first CSV column becomes the row index.
csv_path = 'final_hdsi_faculty_updated.csv'
data = pd.read_csv(csv_path, index_col=0)
# Preview the first five rows.
data.head(5)


Unnamed: 0,year,authors,title,abstract,times_cited,concepts,journal.title,HDSI_author
0,2021,"[{'raw_affiliation': [], 'first_name': 'Chen',...",Elder-Rule-Staircodes for Augmented Metric Spaces,,0,"['space', 'metric spaces']",SIAM Journal on Applied Algebra and Geometry,Yusu Wang
1,2020,[{'raw_affiliation': ['Cold Spring Harbor Labo...,Semantic segmentation of microscopic neuroanat...,Understanding of neuronal circuitry at cellula...,3,"['hybrid architecture', 'semantic segmentation...",Nature Machine Intelligence,Yusu Wang
2,2020,"[{'raw_affiliation': ['MOSEK ApS, Copenhagen, ...",On homotopy types of Vietoris–Rips complexes o...,We study Vietoris–Rips complexes of metric wed...,5,"['Vietoris–Rips complexes', 'wedge sum', 'metr...",Journal of Applied and Computational Topology,Yusu Wang
3,2020,[{'raw_affiliation': ['Computer Science and En...,Detection and skeletonization of single neuron...,Neuroscientific data analysis has traditionall...,0,"['collection of neurons', 'hand-tuned paramete...",bioRxiv,Yusu Wang
4,2020,"[{'raw_affiliation': [], 'first_name': 'Dingka...",Detection and skeletonization of single neuron...,Neuroscientific data analysis has traditionall...,0,"['collection of neurons', 'hand-tuned paramete...",arXiv,Yusu Wang


In [5]:
# Keep only rows that have an abstract and a publication year of 2015 or later.
# The steps are chained and finished with .copy() so that `data` is an
# independent frame: assigning a column into a boolean-filtered slice (as the
# step-by-step version did) goes through pandas' chained-assignment path and
# can emit SettingWithCopyWarning.
data = (
    data[data['abstract'].notna()]
    .assign(year=lambda frame: frame['year'].astype(int))  # year as integer
    .loc[lambda frame: frame['year'] >= 2015]
    .copy()
)

In [6]:
# convert abstracts to lowercase
data['abstract'] = data['abstract'].apply(lambda abstract: abstract.lower())

In [7]:
# stemming and removing stopwords
# Words dropped in addition to gensim's stopword set.
redundant = ['abstract', 'purpose', 'paper', 'goal']
ss = SnowballStemmer(language="english")


def preprocess_abstract(text):
    """Tokenize ``text``, filter the tokens, stem them, and re-join with spaces.

    A token from ``gensim.utils.simple_preprocess`` is kept only when it is not
    a gensim stopword, is longer than three characters, and is not listed in
    ``redundant``. Kept tokens are stemmed with the module-level Snowball
    stemmer ``ss``.

    Parameters
    ----------
    text : str
        Raw abstract text.

    Returns
    -------
    str
        The stemmed tokens separated by single spaces.
    """
    # NOTE(review): the `redundant` check runs on the unstemmed token, so
    # inflected forms such as "papers" are not excluded -- confirm intended.
    def is_kept(word):
        rejected = (
            word in gensim.parsing.preprocessing.STOPWORDS
            or len(word) <= 3
            or word in redundant
        )
        return not rejected

    stems = [
        ss.stem(word)
        for word in gensim.utils.simple_preprocess(text)
        if is_kept(word)
    ]
    return " ".join(stems)


data['abstract_processed'] = data['abstract'].apply(preprocess_abstract)

In [8]:
# Document-term counts with one row per processed abstract.
# `counts` is rebound further down when the aggregated documents are vectorized.
abstract_vectorizer = CountVectorizer()
counts = abstract_vectorizer.fit_transform(data['abstract_processed'])

In [9]:
# organize each author's processed abstracts by publication year
#
# `authors` maps author name -> {year -> list of processed abstracts}.
# Every author gets a bucket for each year from 2015 through at least 2021
# (empty list when there is no publication), and the range is extended to the
# latest year present in `data` so that newer publications do not raise a
# KeyError.
first_year = 2015
last_year = 2021
if not data.empty:
    last_year = max(last_year, int(data['year'].max()))
year_range = range(first_year, last_year + 1)

authors = {
    author: {year: [] for year in year_range}
    for author in data.HDSI_author.unique()
}

# Walk the three needed columns in lockstep instead of building a Series per
# row with iterrows().
for author, year, abstract in zip(
    data['HDSI_author'], data['year'], data['abstract_processed']
):
    authors[author][year].append(abstract)

In [10]:
# One aggregated document per (author, year) bucket: the bucket's processed
# abstracts joined by spaces, in the iteration order of `authors`.
all_docs = [
    " ".join(documents)
    for author_dict in authors.values()
    for documents in author_dict.values()
]

In [11]:
# build the document-term matrix that the LDA model is fitted on
countVec = CountVectorizer()
counts = countVec.fit_transform(all_docs)

# Vocabulary terms in column order of `counts`.
# `get_feature_names()` was deprecated and later removed from scikit-learn;
# prefer `get_feature_names_out()` and fall back only on old releases.
if hasattr(countVec, "get_feature_names_out"):
    names = countVec.get_feature_names_out().tolist()
else:
    names = countVec.get_feature_names()

In [107]:
# 25 topics model
# `result` holds the value returned by fit_transform on the count matrix.
lda_settings = dict(n_components=25, n_jobs=-1, random_state=123)
modeller = LatentDirichletAllocation(**lda_settings)
result = modeller.fit_transform(counts)

# display top words for each topic in the model
def display_topics(model, feature_names, no_top_words):
    """Print a header and the highest-weighted words for every topic.

    Parameters
    ----------
    model : object
        Must expose ``components_``, iterated one row per topic; each row
        must support ``argsort()``.
    feature_names : sequence
        Words, indexed by the positions that ``argsort()`` returns.
    no_top_words : int
        Number of words to print per topic.

    Returns
    -------
    None
        Output is written with ``print``: a ``Topic <n>:`` line followed by
        the selected words separated by spaces.
    """
    for topic_number, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it to rank by descending weight.
        ranked_positions = weights.argsort()[::-1]
        top_words = [feature_names[pos] for pos in ranked_positions[:no_top_words]]
        print("Topic %d:" % (topic_number))
        print(" ".join(top_words))
# Show the 15 highest-weighted words of each topic in the fitted model.
display_topics(model=modeller, feature_names=names, no_top_words=15)

Topic 0:
measur regurgit vector quantiz finit system singular differ left multiplanar right normal stress reproduc magnitud
Topic 1:
microbiom microbi studi associ sampl sequenc divers human commun data diseas microbiota method differ result
Topic 2:
trust right autonom technolog causal provid interest vehicl moral differ propos reason valu argu reveal
Topic 3:
model data patient studi clinic measur outcom trial result pregnanc women estim includ treatment diseas
Topic 4:
approxim model langevin chemic diffus time markov chain propos coupl nois system rel background function
Topic 5:
erron inflat skew instanti tendenc deleteri compel neonat subcort jitter etoc multineuron overinfl psd timelin
Topic 6:
model adapt polici human state task predict learn decis face stop inform bayesian target behavior
Topic 7:
variabl femal circadian male temperatur rhythm cycl sentiment disrupt word earli dwcox theme bodi stress
Topic 8:
code scheme capac channel rate decod achiev bound problem gener mult

In [12]:
# time-author_topic
#
# Requires `result` (the matrix returned by the LDA fit_transform call) and
# `all_docs` from the cells above to exist in the current kernel session.

# column names: one per column of `result`, rather than a hard-coded 25, so
# this cell stays correct if the model's number of topics changes
n_topics = result.shape[1]
topicnames = ["Topic" + str(i) for i in range(n_topics)]

# index names: one per aggregated document; pandas raises if this does not
# match the number of rows in `result`
docnames = ["Doc" + str(i) for i in range(len(all_docs))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(result, columns=topicnames, index=docnames)

# Get dominant topic for each document (column index of the row maximum)
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

NameError: name 'result' is not defined

In [None]:
# Persist the fitted model, the document-term matrix and the vectorizer under
# the 'models' directory.
# - os.path.join replaces the hard-coded Windows '\\' separator, which on
#   POSIX systems would create files literally named 'models\...' in the
#   working directory.
# - `with` closes each file handle; the bare open(...) calls never did.
os.makedirs('models', exist_ok=True)
for artifact, filename in (
    (modeller, 'agg_author_model_25_topics.pkl'),
    (counts, 'agg_dtm.pkl'),
    (countVec, 'agg_vectorizer'),
):
    with open(os.path.join('models', filename), 'wb') as handle:
        pickle.dump(artifact, handle)