# LDA (15 Topics)

Tutorial based off of: https://www.analyticsvidhya.com/blog/2021/06/part-3-topic-modeling-and-latent-dirichlet-allocation-lda-using-gensim-and-sklearn/

In [1]:
import pandas as pd

# for text preprocessing
import re
# import spacy
# nlp = spacy.load('en_core_web_sm')

from nltk.corpus import stopwords 
import nltk
nltk.download('stopwords')
from nltk.stem.wordnet import WordNetLemmatizer
import string
from nltk.stem.porter import *
import gensim
from gensim.utils import simple_preprocess

# import vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# import numpy for matrix operation
import numpy as np

# import LDA from sklearn
from sklearn.decomposition import LatentDirichletAllocation

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/siddhipatel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# to suppress warnings
from warnings import filterwarnings
filterwarnings('ignore')

In [3]:
# import data

hdsi_faculty = pd.read_csv('final_hdsi_faculty_updated.csv')
print(hdsi_faculty.columns)
hdsi_faculty.head(3)

Index(['Unnamed: 0', 'year', 'authors', 'title', 'abstract', 'times_cited',
       'concepts', 'journal.title', 'HDSI_author'],
      dtype='object')


Unnamed: 0.1,Unnamed: 0,year,authors,title,abstract,times_cited,concepts,journal.title,HDSI_author
0,0,2021,"[{'raw_affiliation': [], 'first_name': 'Chen',...",Elder-Rule-Staircodes for Augmented Metric Spaces,,0,"['space', 'metric spaces']",SIAM Journal on Applied Algebra and Geometry,Yusu Wang
1,1,2020,[{'raw_affiliation': ['Cold Spring Harbor Labo...,Semantic segmentation of microscopic neuroanat...,Understanding of neuronal circuitry at cellula...,3,"['hybrid architecture', 'semantic segmentation...",Nature Machine Intelligence,Yusu Wang
2,2,2020,"[{'raw_affiliation': ['MOSEK ApS, Copenhagen, ...",On homotopy types of Vietoris–Rips complexes o...,We study Vietoris–Rips complexes of metric wed...,5,"['Vietoris–Rips complexes', 'wedge sum', 'metr...",Journal of Applied and Computational Topology,Yusu Wang


In [4]:
# data preprocssing
hdsi_faculty["abstract"].fillna(hdsi_faculty["title"], inplace=True) # if no abstract, replace w/ title of article
hdsi_faculty = hdsi_faculty[hdsi_faculty["year"] > 2014]
#combining all the documents into a list by author and year:

authors = {}
for author in hdsi_faculty['HDSI_author'].unique():
    authors[author] = {
        2015 : list(),
        2016 : list(),
        2017 : list(),
        2018 : list(),
        2019 : list(),
        2020 : list(),
        2021 : list()
    }
    
for i, row in hdsi_faculty.iterrows():
    authors[row['HDSI_author']][row['year']].append(row['abstract'])
    
corpus = []
for author, author_dict in authors.items():
    for year, documents in author_dict.items():
        corpus.append(" ".join(documents))

corpus # corpus of abstracts

['',
 'The geometrical tree structures of axonal and dendritic processes play important roles in determining the architecture and capabilities of neuronal circuitry. Morphological features based on this tree structure have played a central role in classifying neurons for over a century. Yet geometrical trees are not automatically adapted to the basic mathematical tool used widely in data analysis, namely vector spaces and linear algebra, since tree geometries cannot be naturally added and subtracted. Current methods for analysis reduce trees to feature vectors in more or less ad hoc ways. A more natural mathematical object suited to characterizing neuronal tree geometries, is a metric space, where only distances between objects need be defined. In recent years, there have been significant developments in the fields of computational topology and geometry that promise to be useful for the analysis of neuronal geometries. In this paper, we adapt these tools to the problem of characterizin

In [7]:
# text preprocessing on corpus

# stop loss words 
stop = set(stopwords.words('english'))

exclude = set(string.punctuation)

# lemmatization
lemma = WordNetLemmatizer()

# one function for all the steps:
def clean(doc):
    
    # convert text into lower case + split into words
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    
    # remove any stop words present
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)  
    
    # remove punctuations + normalize the text
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())  
    
    return normalized

# clean data stored in a new list

clean_corpus = [clean(doc).split() for doc in corpus]
    
# clean_corpus
clean_corpus

[[],
 ['geometrical',
  'tree',
  'structure',
  'axonal',
  'dendritic',
  'process',
  'play',
  'important',
  'role',
  'determining',
  'architecture',
  'capability',
  'neuronal',
  'circuitry',
  'morphological',
  'feature',
  'based',
  'tree',
  'structure',
  'played',
  'central',
  'role',
  'classifying',
  'neuron',
  'century',
  'yet',
  'geometrical',
  'tree',
  'automatically',
  'adapted',
  'basic',
  'mathematical',
  'tool',
  'used',
  'widely',
  'data',
  'analysis',
  'namely',
  'vector',
  'space',
  'linear',
  'algebra',
  'since',
  'tree',
  'geometry',
  'cannot',
  'naturally',
  'added',
  'subtracted',
  'current',
  'method',
  'analysis',
  'reduce',
  'tree',
  'feature',
  'vector',
  'le',
  'ad',
  'hoc',
  'way',
  'natural',
  'mathematical',
  'object',
  'suited',
  'characterizing',
  'neuronal',
  'tree',
  'geometry',
  'metric',
  'space',
  'distance',
  'object',
  'need',
  'defined',
  'recent',
  'year',
  'significant',
  'deve

In [8]:
# covert text into numerical representation

# Converting text into numerical representation
cv_vectorizer = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False)
print(cv_vectorizer)

# Array from Count Vectorizer 
cv_arr = cv_vectorizer.fit_transform(clean_corpus)

# Creating vocabulary array which will represent all the corpus 
vocab_cv = cv_vectorizer.get_feature_names()

# get the vocab list
vocab_cv 

CountVectorizer(lowercase=False,
                tokenizer=<function <lambda> at 0x7fdada20eca0>)


['0',
 '00',
 '000',
 '00001',
 '00002',
 '00003',
 '00005',
 '00006',
 '00007',
 '0001',
 '00014',
 '00017',
 '0001–005',
 '0002',
 '00041',
 '00042',
 '00043',
 '00045',
 '0004900006',
 '0005',
 '00085',
 '0009',
 '00096',
 '000±07',
 '000×',
 '001',
 '0011',
 '00111',
 '0013',
 '0014',
 '00150032',
 '0017',
 '002',
 '0020conclusiona',
 '0022',
 '00227',
 '0029',
 '003',
 '00302',
 '0033',
 '0035',
 '0037',
 '0039',
 '004',
 '0044',
 '00459',
 '0048',
 '00498conclusionscompared',
 '005',
 '006',
 '0061',
 '0068',
 '007',
 '008',
 '008–065',
 '009',
 '01',
 '010',
 '011',
 '0110',
 '011068',
 '012',
 '012234',
 '013',
 '013014',
 '013036',
 '013°',
 '014',
 '015',
 '016',
 '017',
 '018',
 '0185',
 '0188',
 '019',
 '0195',
 '01d',
 '01dots',
 '01n',
 '02',
 '0203',
 '02033',
 '021',
 '0213',
 '022',
 '022094',
 '022784',
 '0239',
 '024',
 '0251',
 '025°',
 '026',
 '0274',
 '028',
 '029',
 '03',
 '030',
 '0304',
 '03051',
 '031',
 '0311',
 '032',
 '033',
 '0347',
 '0347103819',
 '035',


In [9]:
# implementation of LDA:
    
# Create object for the LDA class 
# Inside this class LDA: define the components:
lda_model = LatentDirichletAllocation(n_components = 15, n_jobs=-1, random_state=123)

# fit transform on model on our count_vectorizer : running this will return our topics 
X_topics = lda_model.fit_transform(cv_arr)

# .components_ gives us our topic distribution 
topic_words = lda_model.components_

In [10]:
n_top_words = 15

for i, topic_dist in enumerate(topic_words):
    
    # np.argsort to sorting an array or a list or the matrix acc to their values
    sorted_topic_dist = np.argsort(topic_dist)
    
    # Next, to view the actual words present in those indexes we can make the use of the vocab created earlier
    topic_words = np.array(vocab_cv)[sorted_topic_dist]
    
    # so using the sorted_topic_indexes we ar extracting the words from the vocabulary
    # obtaining topics + words
    # this topic_words variable contains the Topics  as well as the respective words present in those Topics
    topic_words = topic_words[:-n_top_words:-1]
    print ("Topic", str(i+1), topic_words)

Topic 1 ['model' 'network' 'method' 'data' 'neural' 'learning' 'dynamic' 'system'
 'time' 'using' 'brain' 'show' 'algorithm' 'result']
Topic 2 ['problem' 'algorithm' 'graph' 'bound' 'rate' 'scheme' 'show' 'proposed'
 'method' 'distribution' 'paper' 'model' 'matrix' 'number']
Topic 3 ['±' 'flow' 'data' 'imaging' 'individual' 'female' 'study' 'variability'
 'cardiac' 'circadian' 'across' 'control' 'mri' 'rv']
Topic 4 ['system' 'r' 'ur' 'across' 'max' '‖' 'difference' 'singular' 'population'
 'coupling' 'information' 'sjl' 'left' 'network']
Topic 5 ['recruitment' 'perturbation' 'stock' 'certified' 'robustness' 'salmon'
 'verification' 'semantic' 'bycatch' 'handwriting' 'interaction' 'current'
 'size' 'radius']
Topic 6 ['cell' 'gene' 'type' 'data' 'analysis' 'expression' 'neuron' 'brain'
 'method' 'network' 'using' 'dna' 'study' 'regulatory']
Topic 7 ['robot' 'data' 'model' 'environment' 'dynamic' 'algorithm' 'system' 'edm'
 'ecosystem' 'year' 'software' 'time' 'using' 'task']
Topic 8 ['pa

In [11]:
# To view what topics are assigned to the douments:

doc_topic = lda_model.transform(cv_arr)  

# iterating over ever value till the end value
for n in range(doc_topic.shape[0]):
    
    # argmax() gives maximum index value
    topic_doc = doc_topic[n].argmax()
    
    # document is n+1  
    print ("Document", n+1, " -- Topic:" ,topic_doc)

Document 1  -- Topic: 0
Document 2  -- Topic: 5
Document 3  -- Topic: 5
Document 4  -- Topic: 1
Document 5  -- Topic: 14
Document 6  -- Topic: 12
Document 7  -- Topic: 10
Document 8  -- Topic: 0
Document 9  -- Topic: 11
Document 10  -- Topic: 11
Document 11  -- Topic: 0
Document 12  -- Topic: 11
Document 13  -- Topic: 11
Document 14  -- Topic: 11
Document 15  -- Topic: 0
Document 16  -- Topic: 1
Document 17  -- Topic: 1
Document 18  -- Topic: 1
Document 19  -- Topic: 1
Document 20  -- Topic: 1
Document 21  -- Topic: 1
Document 22  -- Topic: 0
Document 23  -- Topic: 11
Document 24  -- Topic: 12
Document 25  -- Topic: 1
Document 26  -- Topic: 11
Document 27  -- Topic: 0
Document 28  -- Topic: 0
Document 29  -- Topic: 0
Document 30  -- Topic: 0
Document 31  -- Topic: 1
Document 32  -- Topic: 1
Document 33  -- Topic: 0
Document 34  -- Topic: 1
Document 35  -- Topic: 0
Document 36  -- Topic: 0
Document 37  -- Topic: 0
Document 38  -- Topic: 0
Document 39  -- Topic: 0
Document 40  -- Topic: 

In [21]:
#### below is taken from Irene's code

# column names
topicnames = ["Topic" + str(i) for i in range(15)]

# index names
docnames = ["Doc" + str(i) for i in range(350)]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(doc_topic, columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

In [22]:
#### below is taken from Irene's code

df_document_topic['author'] = np.nan
df_document_topic['year'] = np.nan
df_document_topic.shape

year_paper_count = {}
for author in authors.keys():
    if author not in year_paper_count.keys():
        year_paper_count[author] = 0
    year_paper_count[author] += len(authors[author])

author_list = list(year_paper_count.keys())
for i in range(0, 350, 7):
    df_document_topic.iloc[i:i+7, 16] = author_list[i//7]
    year = 2015
    for j in range(i, i+7):
        df_document_topic.iloc[j, 17] = str(year)
        year += 1
time_author_topic = df_document_topic

# Sankey Diagram

Using Irene's code: https://github.com/IreneLiu2018/textmining/blob/master/sankey_viz.ipynb

In [26]:
# how each author is related to each topic overall

data = time_author_topic

averaged = data.groupby('author').mean().drop(['dominant_topic'], axis=1)

# filter the data by dropping value outside .95 percentile
filtered = averaged.mask(averaged < averaged.quantile(.95), other=0)

# get labels, sources, targets, values prepared for developing sankey diagram
# read sankey diagram library for the purposes of them

labels = filtered.index.to_list() #name of faculty & topics
labels.extend(filtered.columns.to_list())

sources = []
targets = []
values = [] # proportions
index_counter = 0
for index, row in filtered.iterrows():
    for i, value in enumerate(row):
        if value != 0:
            sources.append(index_counter)
            targets.append(50 + i)
            values.append(value)
    index_counter += 1
    
# split those average value that representing the relationships into ranks [1, 10]
def split_into_ranks(array):
    ranks = []
    for value in array:
        for i, percentage in enumerate(np.arange(.1, 1.1, .1)):
            if value <= np.quantile(array, percentage):
                ranks.append(i + 1)
                break
    return ranks

import numpy as np
values_array = np.array(values)
values_final = split_into_ranks(values_array)


In [27]:
import plotly.graph_objects as go

fig = go.Figure(data=[go.Sankey(
    node = dict(
      pad = 15,
      thickness = 20,
      line = dict(color = "black", width = 0.5),
      label = labels, # flow name
      color = 'purple',
      # customdata = link_labels,
      hovertemplate='%{customdata} Total Flow: %{value}<extra></extra>'
    ),
    link = dict(
      source = sources, # indices correspond to labels, eg A1, A2, A1, B1, ...
      target = targets, # targets nodes
      value = values_final # flow volumn 
  ))])

fig.update_layout(title_text="Author Topic Connections", font_size=10, height=2000, paper_bgcolor="LightSteelBlue")
fig.show()