In [22]:
### Topic Modelling in Python with NLTK and Gensim ### 


In [7]:
# We will use Latent Dirichlet Allocation to model 
# We will apply LDA to convert a set of reserach papers to a set
# of topics. 

# **Research paper topic modelling** is an unsupervised ML 
#  method that helps us discover hidden semantic structures in a paper, 
# that allows us to learn topic representations of papers in a corpus. 
# The model can be applied to ay kinds of lables on documens, 
# such as tags on post on the website. 

# *****************************************************************8

### The Process ### 

    # - Pick the number of topics ahead of time even if wer're not 
    #   sure of that the topics are. 
    # - Each document is represented as a distribution over topics 
    # - Each topic is represented as a distribution over words. 
    
# The research paper text data is just a bunch of unlabeles texts
        

PATH = "C:\\Users\\jairp\\Desktop\\BackUP\\CODE-20180719T021021Z-001\\CODE\\Python\\NLP\\Towards Data Science\\Datasets\\dataset_research_texts.csv"

texts = open(PATH,"r")
texts_list = []

for line in texts: 
        texts_list.append(line)


['Innovation in Database Management: Computer Science vs. Engineering.\n', 'High performance prime field multiplication for GPU.\n', 'enchanted scissors: a scissor interface for support in cutting and interactive fabrication.\n', 'Detection of channel degradation attack by Intermediary Node in Linear Networks.\n', 'Pinning a Complex Network through the Betweenness Centrality Strategy.\n', 'Analysis and Design of Memoryless Interconnect Encoding Scheme.\n', 'Dynamic bluescreens.\n', 'A Quantitative Assured Forwarding Service.\n', 'Automatic sanitization of social network data to prevent inference attacks.\n', 'A &#916;&#931; IR-UWB radar with sub-mm ranging capability for human body monitoring systems.\n', 'Architecture of a multi-slot main memory system for 3.2 Gbps operation.\n', 'Rule-based Service Customization via Houdini.\n', 'Business Policy Modeling and Enforcement in Databases.\n', 'A high speed and high linearity OTA in 1-V power supply voltage.\n', 'PREDIcT: Towards Predictin

In [8]:
### Text Cleaning ### 
    
import nltk
import spacy 
from spacy.lang.en import English 
parser = English()

spacy.load('en_core_web_sm')

def tokenize(text): 
    lda_tokens = []
    tokens = parser(text)
    for token in tokens: 
        if token.orth_.isspace(): 
            continue
        elif token.like_url: 
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'): 
            lda_tokens.append('SCREEN_NAME')
        else :
            lda_tokens.append(token.lower_)
            
    return lda_tokens

# We use NLTK's Wordnet to find the meaning of words, synonyms, 
# antonyms, and more. In addition, we use WordNEtLemmatizer to 
# get the root word. 
    

import nltk 
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer 

# Get the lemmas for a word
def get_lemma(word): 
    lemma = wn.morphy(word)
    if lemma is None: 
        return word 
    else: 
        return lemma  
    
    
# Another lemmatizer funciton 
def get_lemma2(word): 
    return WordNetLemmatizer().lemmatize(word)


# Filter out stopwords  
en_stop = set(nltk.corpus.stopwords.words('english'))

# Now we can define a function to prepare the text 
# for topic modelling:

def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

# Open up our data, read line by line, for each line, prepare text 
# for each LDA, then add to a list. 

# Now we can see how our text data are converted: 
    
import random 

text_data = [] 
with open(PATH) as f: 
    for line in f: 
        tokens = prepare_text_for_lda(line)
        if random.random() > .99: 
            print(tokens)
            text_data.append(tokens)

['output', 'characteristic', 'class', 'amplifier', 'nonlinear', 'shunt', 'capacitance', 'versus', 'supply', 'voltage']
['energy', 'efficient', 'collaborative', 'sensing', 'mobile', 'phone']
['phase', 'noise', 'oscillator', 'implantable', 'biomedical', 'application']
['multiband', 'concurrent', 'sampling', 'base', 'front', 'biotelemetry', 'application']
['generic', 'solution', 'warehousing', 'business', 'process']
['base', 'crowd', 'generation', 'animation', 'cloth', 'rendering', '15.000', 'unique', 'human', 'character']
['html2rss', 'automatic', 'generation', 'base', 'structure', 'analysis', 'document']
['scalable', 'secret', 'generation', 'exploit', 'channel', 'phase', 'randomness', 'wireless', 'network']
['linear', 'sparse', 'array', 'synthesis', 'convex', 'optimization']
['address', 'privacy', 'management', 'crisis', 'online', 'social', 'network']
['satellite', 'satellite', 'network', 'novel', 'architecture', 'satellite', 'network']
['trajectory', 'improve', 'delivery', 'vehicular',

In [9]:
          
### LDA with Gensim ### 
            
# First, we are creating a dictionary from the data,  
# Then convert to bag of words corpus and save the dictionary
# and corpus for future use. 
         
import pickle
from gensim import corpora 

dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

# Save the thing in pickle 
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

## We are asking LDA to find 5 topics in the data: 

import gensim 

# load the LDA model 
ldamodel = gensim.models.LdaModel(corpus, num_topics = 5, 
                                  id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')

# Get the topics 
topics  = ldamodel.print_topics(num_words=4)
for topic in topics: 
    print(topic)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


(0, '0.031*"application" + 0.031*"noise" + 0.017*"optimization" + 0.017*"quantizer"')
(1, '0.055*"network" + 0.055*"satellite" + 0.038*"novel" + 0.038*"simulation"')
(2, '0.027*"processing" + 0.027*"base" + 0.027*"human" + 0.027*"generation"')
(3, '0.024*"design" + 0.024*"amplifier" + 0.024*"characteristic" + 0.024*"output"')
(4, '0.029*"base" + 0.029*"network" + 0.029*"generation" + 0.016*"system"')


In [10]:
# Let's try a different document: 
    
new_doc = "There is a huge power performance in the systems"
new_doc = prepare_text_for_lda(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
print(ldamodel.get_document_topics(new_doc_bow))

[(76, 1)]
[(0, 0.103329234), (1, 0.59359294), (2, 0.10003019), (3, 0.10002709), (4, 0.10302057)]


In [12]:
# Now we are asking LDA to find 3 topics in the data: 
ldamodel = gensim.models.LdaModel(corpus, num_topics = 3, 
                                  id2word=dictionary, passes=15)
ldamodel.save('model3.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics: 
    print(topic)

(0, '0.023*"application" + 0.023*"phase" + 0.023*"linear" + 0.023*"network"')
(1, '0.039*"network" + 0.030*"satellite" + 0.030*"system" + 0.021*"base"')
(2, '0.028*"base" + 0.027*"generation" + 0.016*"efficient" + 0.016*"processing"')


In [14]:
# We can also find 10 topics 
    
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 10, id2word=dictionary, passes=15)
ldamodel.save('model10.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)
    

(0, '0.053*"network" + 0.028*"secret" + 0.028*"scalable" + 0.028*"wireless"')
(1, '0.095*"satellite" + 0.064*"network" + 0.034*"noise" + 0.034*"application"')
(2, '0.037*"system" + 0.037*"human" + 0.037*"computing" + 0.037*"cloud"')
(3, '0.056*"optimization" + 0.056*"linear" + 0.056*"convex" + 0.056*"synthesis"')
(4, '0.036*"network" + 0.036*"nonlinear" + 0.036*"capacitance" + 0.036*"social"')
(5, '0.043*"base" + 0.043*"simulation" + 0.043*"average" + 0.043*"converter"')
(6, '0.053*"processing" + 0.053*"system" + 0.053*"efficient" + 0.053*"relational"')
(7, '0.043*"base" + 0.043*"generation" + 0.043*"html2rss" + 0.043*"document"')
(8, '0.028*"base" + 0.028*"domain" + 0.028*"animation" + 0.028*"character"')
(9, '0.031*"application" + 0.031*"successive" + 0.031*"multiband" + 0.031*"modulator"')


In [17]:
# We can now output a visualization of the possible topics

dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl','rb'))
lda = gensim.models.LdaModel.load('model5.gensim')

import pyLDAvis.gensim

lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [18]:
### pyLDAvis
    
# pyLDAvis is designed to help users interpret the topics in 
#  a topic model that has been firt to a corpus of data. 
# The package extracts information from a fitted LDA topic 
# model to infrom an interactive web-based classification 


# Visualizing 5 topics: 

lda3 = gensim.models.ldamodel.LdaModel.load('model3.gensim')
lda_display3 = pyLDAvis.gensim.prepare(lda3, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display3)

# Saliency: a measure of how much the term tells you about the topic.
# Relevance: a weighted average of the probability of the word given the topic and the word given the topic normalized by the probability of the topic.
# The size of the bubble measures the importance of the topics, relative to the data.

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [19]:
# Visualizing 3 topics: 

lda3 = gensim.models.ldamodel.LdaModel.load('model3.gensim')
lda_display3 = pyLDAvis.gensim.prepare(lda3, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display3)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [20]:
# Visualizing 10 topics 

lda10 = gensim.models.ldamodel.LdaModel.load('model10.gensim')
lda_display10 = pyLDAvis.gensim.prepare(lda10, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display10)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
