# Project ToDo List: Week 1

In [None]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

## Explore LDA with Gensim

In [3]:
# download NLTK stopwords
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ethanpotthoff/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# NLTK stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

### Import and Preprocess Data

Kaggle Dataset:
https://www.kaggle.com/datasets/abisheksudarshan/topic-modeling-for-research-articles?resource=download

In [5]:
# import dataset
df = pd.read_csv("project_data/Test.csv", usecols = ['ABSTRACT','Computer Science', 'Mathematics', 'Physics', 'Statistics'])
df.head()

Unnamed: 0,ABSTRACT,Computer Science,Mathematics,Physics,Statistics
0,fundamental frequency (f0) approximation from ...,0,0,0,1
1,"this large-scale study, consisting of 24.5 mil...",1,0,0,1
2,we present a stability analysis of the plane c...,0,0,1,0
3,we construct finite time blow-up solutions to ...,0,1,0,0
4,planetary nebulae (pne) constitute an importan...,0,0,1,0


In [11]:
# tokenize words and clean text using Gensim's simple_preprocess(). deacc=True removes punctuation.

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
        
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

data = df.ABSTRACT.values.tolist()
data_words = list(sent_to_words(data))
data_words = remove_stopwords(data_words)

print(data_words[:1])

[['fundamental', 'frequency', 'approximation', 'polyphonic', 'music', 'includes', 'tasks', 'multiple', 'melody', 'vocal', 'bass', 'line', 'estimation', 'historically', 'problems', 'approached', 'separately', 'recently', 'help', 'learning', 'based', 'approaches', 'present', 'multitask', 'deep', 'learning', 'architecture', 'jointly', 'estimates', 'outputs', 'considering', 'various', 'tasks', 'including', 'multiple', 'melody', 'vocal', 'bass', 'line', 'estimation', 'trained', 'help', 'large', 'semi', 'automatically', 'annotated', 'dataset', 'show', 'multitask', 'model', 'outperforms', 'single', 'task', 'counterparts', 'explore', 'effect', 'various', 'design', 'decisions', 'inside', 'approach', 'show', 'performs', 'better', 'least', 'competitively', 'compared', 'strong', 'baseline', 'methods']]


In [13]:
# calculate word frequencies (mapped generated word ids to word frequencies) - to access word an id represents, call dictionary[id]
# create Gensim Dictionary
dictionary = corpora.Dictionary(data_words)

# create corpus
corpus = [dictionary.doc2bow(word) for word in data_words]

print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 2), (22, 1), (23, 1), (24, 1), (25, 2), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 2), (33, 1), (34, 2), (35, 2), (36, 1), (37, 1), (38, 2), (39, 2), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 2), (51, 1), (52, 1), (53, 1), (54, 2), (55, 1), (56, 2), (57, 2)]]


In [19]:
# Human readable format of corpus (term-frequency)
[[(dictionary[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('annotated', 1),
  ('approach', 1),
  ('approached', 1),
  ('approaches', 1),
  ('approximation', 1),
  ('architecture', 1),
  ('automatically', 1),
  ('based', 1),
  ('baseline', 1),
  ('bass', 2),
  ('better', 1),
  ('compared', 1),
  ('competitively', 1),
  ('considering', 1),
  ('counterparts', 1),
  ('dataset', 1),
  ('decisions', 1),
  ('deep', 1),
  ('design', 1),
  ('effect', 1),
  ('estimates', 1),
  ('estimation', 2),
  ('explore', 1),
  ('frequency', 1),
  ('fundamental', 1),
  ('help', 2),
  ('historically', 1),
  ('includes', 1),
  ('including', 1),
  ('inside', 1),
  ('jointly', 1),
  ('large', 1),
  ('learning', 2),
  ('least', 1),
  ('line', 2),
  ('melody', 2),
  ('methods', 1),
  ('model', 1),
  ('multiple', 2),
  ('multitask', 2),
  ('music', 1),
  ('outperforms', 1),
  ('outputs', 1),
  ('performs', 1),
  ('polyphonic', 1),
  ('present', 1),
  ('problems', 1),
  ('recently', 1),
  ('semi', 1),
  ('separately', 1),
  ('show', 2),
  ('single', 1),
  ('strong', 1),


### Build Gensim LDA Model

https://radimrehurek.com/gensim/models/ldamodel.html

In [20]:
# build LDA model with Gensim

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=4, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [26]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics(num_words=20))

[(0,
  '0.025*"inside" + 0.018*"considering" + 0.011*"network" + 0.009*"method" + '
  '0.009*"networks" + 0.009*"learning" + 0.008*"algorithm" + 0.008*"sampling" '
  '+ 0.008*"performance" + 0.007*"neural" + 0.007*"state" + 0.007*"algorithms" '
  '+ 0.007*"system" + 0.007*"time" + 0.006*"computational" + 0.006*"propose" + '
  '0.006*"problem" + 0.005*"based" + 0.005*"proposed" + 0.005*"control"'),
 (1,
  '0.034*"inside" + 0.028*"considering" + 0.012*"hamiltonians" + 0.010*"monte" '
  '+ 0.009*"hmc" + 0.009*"carlo" + 0.008*"hamiltonian" + 0.007*"metropolis" + '
  '0.007*"sampling" + 0.006*"generalized" + 0.006*"mmhmc" + 0.006*"modified" + '
  '0.005*"problem" + 0.005*"show" + 0.005*"algorithm" + 0.005*"paper" + '
  '0.005*"dimensional" + 0.004*"two" + 0.004*"statistics" + 0.004*"linear"'),
 (2,
  '0.036*"inside" + 0.011*"considering" + 0.009*"magnetic" + 0.009*"coupling" '
  '+ 0.009*"field" + 0.005*"potential" + 0.005*"anisotropic" + 0.005*"nature" '
  '+ 0.005*"momentum" + 0.005*"obse

#### Model Evaluation Metrics

https://radimrehurek.com/gensim/models/coherencemodel.html

In [27]:
# model evaluation

# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
# https://radimrehurek.com/gensim/models/coherencemodel.html
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.211530593778528

Coherence Score:  0.4246014285352172


## Explore Conceptnet

Using Conceptnet API: https://github.com/commonsense/conceptnet5/wiki/API

There are three methods for accessing data through the ConceptNet 5 API: lookup, search, and association.
- __Lookup__ is for when you know the URI of an object in ConceptNet, and want to see a list of edges that include it.
- __Search__ finds a list of edges that match certain criteria.
- __Association__ is for finding concepts similar to a particular concept or a list of concepts.

In [32]:
import requests
obj = requests.get('http://api.conceptnet.io/c/en/dog').json()

In [34]:
obj.keys()

dict_keys(['@context', '@id', 'edges', 'version', 'view'])

In [40]:
obj['edges'][2]

{'@id': '/a/[/r/RelatedTo/,/c/en/dog/,/c/en/pet/]',
 '@type': 'Edge',
 'dataset': '/d/verbosity',
 'end': {'@id': '/c/en/pet',
  '@type': 'Node',
  'label': 'pet',
  'language': 'en',
  'term': '/c/en/pet'},
 'license': 'cc:by/4.0',
 'rel': {'@id': '/r/RelatedTo', '@type': 'Relation', 'label': 'RelatedTo'},
 'sources': [{'@id': '/and/[/s/process/split_words/,/s/resource/verbosity/]',
   '@type': 'Source',
   'contributor': '/s/resource/verbosity',
   'process': '/s/process/split_words'},
  {'@id': '/s/resource/verbosity',
   '@type': 'Source',
   'contributor': '/s/resource/verbosity'}],
 'start': {'@id': '/c/en/dog',
  '@type': 'Node',
  'label': 'dog',
  'language': 'en',
  'term': '/c/en/dog'},
 'surfaceText': '[[dog]] is related to [[pet]]',
 'weight': 9.82975075981075}