## Import Data

In [None]:
import re
from pathlib import Path
from pprint import pprint

import gdown
import gensim
import gensim.corpora as corpora
import nltk
import pandas as pd
from gensim.models.coherencemodel import CoherenceModel
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords

# Fetch the stop-word corpus; this only has to happen before stopwords.words() is called.
nltk.download('stopwords')


# Google Drive id of the corpus CSV and the local path it is saved to.
file_id = "1u7rvNf84a__E3HaPfhcC83MLENOieBZe"
url = f'https://drive.google.com/uc?id={file_id}'
output = 'data.csv'

# The file is large (~1.3 GB), so only download it when no local copy exists;
# this keeps "Restart & Run All" from re-fetching it every time.
if not Path(output).exists():
    downloaded_path = gdown.download(url, output, quiet=False)
    # gdown signals a failed download by returning None in some versions.
    if downloaded_path is None:
        raise RuntimeError(f'Could not download {url} to {output}')

# Read from the same path that was written above instead of repeating the literal.
df = pd.read_csv(output)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Downloading...
From: https://drive.google.com/uc?id=1u7rvNf84a__E3HaPfhcC83MLENOieBZe
To: /content/data.csv
100%|██████████| 1.31G/1.31G [00:12<00:00, 101MB/s] 


### View the Data

In [None]:
# Display the loaded frame; the rendered output is truncated to the first and last rows.
df

Unnamed: 0,DocID,Title,Abstract,BodyText
0,S0001457513002972,A system of safety management practices and wo...,Objective The overall research objective was t...,"In particular, the individual worker interface..."
1,S0001457513004806,Network-level accident-mapping: Distance based...,The objective of an accident-mapping algorithm...,The location coordinates of the accident is de...
2,S0001457514003091,Measuring errors and violations on the road: A...,The Driver Behavior Questionnaire (DBQ) is a s...,"Briefly, the Cohort II study randomly sampled ..."
3,S0001457515001098,Operating under the influence: Three year reci...,Operating a motor vehicle under the influence ...,"In addition to loss of life, the economic cost..."
4,S000145751500127X,Real-time driver drowsiness feedback improves ...,Driver drowsiness has been implicated as a maj...,"However, the effects of feedback on other appr..."
...,...,...,...,...
40086,S8756328219302856,PYY is a negative regulator of bone mass and s...,Objective: Bone loss in anorexia nervosa and f...,"Scans were performed at 50 kV, 200 μA, 0.5 mm ..."
40087,S8756328219304004,Development of protocols for the first serial ...,There is an unmet need for a high-resolution t...,We have generated and compared SBF SEM data fr...
40088,S8756328219304715,Overexpression of Pitx1 attenuates the senesce...,To explore the role of low expression of Pitx1...,"However, this process needs to be confirmed. C..."
40089,S8756328219304739,"The effect of pubertal timing, as reflected by...",Objective: To examine the relationship between...,"In terms of limitations, whereas timing of the..."


## Data Cleaning

In [None]:
# Work on a copy so the raw frame stays available for a different (GPT-based) preparation.
# Only the first MAX_DOCS papers are kept to avoid crashing Colab.
MAX_DOCS = 1000
lda_df = df.head(MAX_DOCS).copy()

# Remove basic punctuation and lowercase.
# - The pattern is a raw string: '\.' in a normal string is an invalid escape sequence.
# - Missing or non-string bodies are coerced to text instead of raising inside re.sub.
lda_df['BodyText'] = (
    lda_df['BodyText']
    .fillna('')
    .astype(str)
    .str.replace(r'[,.!?]', '', regex=True)
    .str.lower()
)

# Remove stop words as well as a custom vocabulary: words that show up frequently
# in the papers but add little information about the topic.
# (A duplicate 'used' and a misspelled 'signficant' entry were dropped; the correctly
# spelled 'significant' is in the list.)
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'et', 'al', 'using', 'also', 'fig', 'study', 'may', 'however', 'two', 'one', 'used', 'within', 'data',
                   'time', 'different', 'analysis', 'model', 'table', 'studies', 'high', 'effect', 'results', 'found', 'based', 'observed', 'could', 'well',
                   'number', 'effects', 'first', 'eg', 'shown', 'associated', 'reported', 'compared', 'three', 'sample', 'would', 'similar', 'see', 'individuals', 'higher', 'due',
                   'non', 'min', 'low', 'increased', 'therefore', 'significant', 'range', 'related', 'change', 'mean', 'mm', 'thus', 'performed', 'likely', 'potential', 'specific',
                   'size', 'hz', 'showed', 'levels', 'models', 'ms'])

def sent_to_words(sentences):
    """Lazily tokenize each document with gensim's ``simple_preprocess``.

    Every item of ``sentences`` is coerced to ``str`` and tokenized; one list of
    tokens is yielded per item. ``deacc=True`` asks gensim to strip accent marks
    from the tokens.
    """
    yield from (
        gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        for raw_text in sentences
    )

def remove_stopwords(texts):
    """Remove stop words from every document.

    Args:
        texts: iterable of documents. A document may be an already-tokenized
            list/tuple of words (used as-is) or any other object, which is
            converted with ``str`` and tokenized with ``simple_preprocess``.

    Returns:
        A list with one list of tokens per document, without any token that
        appears in the module-level ``stop_words``.
    """
    # Snapshot into a set once: testing membership against the list is a linear
    # scan for every single token.
    blocked = frozenset(stop_words)

    cleaned = []
    for doc in texts:
        if isinstance(doc, (list, tuple)):
            # Already tokenized: do not stringify the list and re-tokenize its repr.
            tokens = doc
        else:
            tokens = simple_preprocess(str(doc))
        cleaned.append([word for word in tokens if word not in blocked])
    return cleaned


# Raw (cleaned) body texts, one string per paper.
data = lda_df['BodyText'].tolist()

# Tokenize every paper, then drop the stop words.
data_words = remove_stopwords(list(sent_to_words(data)))

# The tokenized documents are the corpus texts.
texts = data_words

# Dictionary mapping each token to an integer id.
id2word = corpora.Dictionary(texts)

# Bag-of-words representation (token id, count) of every document.
corpus = list(map(id2word.doc2bow, texts))

## Make the LDA Model

In [None]:
# number of topics
num_topics = 50

# Fixed seed so that topic ids, keywords and the coherence score are reproducible
# across "Restart & Run All".
RANDOM_STATE = 42

# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       iterations=50,
                                       random_state=RANDOM_STATE)

# Print the top keywords of the topics. Called without arguments, print_topics()
# only shows a subset of the topics (20 by default per the gensim docs), not all
# `num_topics` of them.
pprint(lda_model.print_topics())

# Per-document topic distribution for the training corpus.
doc_lda = lda_model[corpus]



[(40,
  '0.004*"species" + 0.003*"cells" + 0.002*"cell" + 0.002*"mice" + '
  '0.002*"group" + 0.002*"protein" + 0.002*"samples" + 0.002*"control" + '
  '0.002*"including" + 0.001*"expression"'),
 (29,
  '0.003*"cells" + 0.002*"protein" + 0.002*"group" + 0.002*"cell" + '
  '0.002*"level" + 0.002*"participants" + 0.002*"species" + 0.001*"values" + '
  '0.001*"treatment" + 0.001*"method"'),
 (1,
  '0.002*"cell" + 0.002*"cells" + 0.002*"protein" + 0.002*"participants" + '
  '0.002*"level" + 0.002*"activity" + 0.002*"species" + 0.002*"patients" + '
  '0.002*"including" + 0.001*"although"'),
 (27,
  '0.003*"cells" + 0.002*"protein" + 0.002*"control" + 0.002*"cell" + '
  '0.002*"figure" + 0.002*"participants" + 0.002*"test" + 0.002*"function" + '
  '0.002*"species" + 0.001*"process"'),
 (45,
  '0.002*"participants" + 0.002*"samples" + 0.002*"cells" + 0.002*"protein" + '
  '0.002*"information" + 0.001*"structure" + 0.001*"energy" + 0.001*"factors" '
  '+ 0.001*"level" + 0.001*"control"'),
 (44

In [None]:
# Score the fitted topics with the c_v coherence measure, using the tokenized texts.
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
# The label had lost the backslash of its leading newline and printed "nCoherence Score".
print('\nCoherence Score: ', coherence_lda)


nCoherence Score:  0.23275802816420865


In [None]:
!pip install funcy
!pip install tzdata
!pip install --no-deps pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

Collecting funcy
  Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy
Successfully installed funcy-2.0
Collecting tzdata
  Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m341.8/341.8 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tzdata
Successfully installed tzdata-2023.3
Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyLDAvis
Successfully installed pyLDAvis-3.4.1


In [None]:
# Sanity check: display the repr of the fitted model object.
lda_model

  and should_run_async(code)


<gensim.models.ldamulticore.LdaMulticore at 0x7a03387be0b0>

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()

# Build the visualization data from the fitted model, the bag-of-words corpus
# and the token dictionary.
lda_viz = gensimvis.prepare(topic_model=lda_model, corpus=corpus, dictionary=id2word)


  and should_run_async(code)


## Final Result: Visualize the topics

In [None]:
# You can select a topic by clicking a topic bubble, or by typing the topic number in the "Selected Topic" box.
# The lambda slider controls a custom relevance metric created by LDAvis:
# lambda = 1 is normal relevance, i.e., how frequently the word appears in that topic
# lambda = 0 is lift, i.e., likelihood in the topic divided by likelihood in the corpus (this will highlight words that are unique to this topic)

# We could begin to determine which topics represent which families by using lambda = 0.
# Check topic 8 for instance: given that apatite, fluoroapatite and chloroapatite come up first,
# this topic assuredly deals with clay, and is likely about geology.
# Topic 43 appears to deal with wind energy and sound, etc.
lda_viz

  and should_run_async(code)
