In [26]:
# Enabling print for all lines
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Checking the working directory
import os
os.getcwd()

import time

'C:\\Users\\kalya\\Python\\NLP'

**1. Create a Dictionary from a list of sentences**

- In gensim, the dictionary contains a map of all words (tokens) to its unique id
- We create a dictionary from a paragraph of sentences, from a text file that contains multiple lines of text and from multiple such text files contained in a directory
- For the second and third cases, we can do it without loading the entire file into memory

In [27]:
import gensim
from gensim import corpora
from pprint import pprint

# How to create a dictionary from a list of sentences?
documents = ["The Saudis are preparing a report that will acknowledge that", 
             "Saudi journalist Jamal Khashoggi's death was the result of an", 
             "interrogation that went wrong, one that was intended to lead", 
             "to his abduction from Turkey, according to two sources."]

documents_2 = ["One source says the report will likely conclude that", 
                "the operation was carried out without clearance and", 
                "transparency and that those involved will be held", 
                "responsible. One of the sources acknowledged that the", 
                "report is still being prepared and cautioned that", 
                "things could change."]

# Tokenize(split) the sentences into words
texts = [[text for text in doc.split()] for doc in documents]

# Create dictionary
dictionary = corpora.Dictionary(texts)

# Get information about the dictionary
print(dictionary)
type(dictionary)

Dictionary(33 unique tokens: ['Saudis', 'The', 'a', 'acknowledge', 'are']...)


gensim.corpora.dictionary.Dictionary

In [28]:
# Unique ids for each of these tokens
print(dictionary.token2id)

{'Saudis': 0, 'The': 1, 'a': 2, 'acknowledge': 3, 'are': 4, 'preparing': 5, 'report': 6, 'that': 7, 'will': 8, 'Jamal': 9, "Khashoggi's": 10, 'Saudi': 11, 'an': 12, 'death': 13, 'journalist': 14, 'of': 15, 'result': 16, 'the': 17, 'was': 18, 'intended': 19, 'interrogation': 20, 'lead': 21, 'one': 22, 'to': 23, 'went': 24, 'wrong,': 25, 'Turkey,': 26, 'abduction': 27, 'according': 28, 'from': 29, 'his': 30, 'sources.': 31, 'two': 32}


In [29]:
# If we get new documents in the future, it is also possible to update an existing dictionary to include the new words

documents_2 = ["The intersection graph of paths in trees",
               "Graph minors IV Widths of trees and well quasi ordering",
               "Graph minors A survey"]

texts_2 = [[text for text in doc.split()] for doc in documents_2]
dictionary.add_documents(texts_2)

# Dictionary should have been updated with the new words (tokens)
print(dictionary)
print(dictionary.token2id)

Dictionary(48 unique tokens: ['Saudis', 'The', 'a', 'acknowledge', 'are']...)
{'Saudis': 0, 'The': 1, 'a': 2, 'acknowledge': 3, 'are': 4, 'preparing': 5, 'report': 6, 'that': 7, 'will': 8, 'Jamal': 9, "Khashoggi's": 10, 'Saudi': 11, 'an': 12, 'death': 13, 'journalist': 14, 'of': 15, 'result': 16, 'the': 17, 'was': 18, 'intended': 19, 'interrogation': 20, 'lead': 21, 'one': 22, 'to': 23, 'went': 24, 'wrong,': 25, 'Turkey,': 26, 'abduction': 27, 'according': 28, 'from': 29, 'his': 30, 'sources.': 31, 'two': 32, 'graph': 33, 'in': 34, 'intersection': 35, 'paths': 36, 'trees': 37, 'Graph': 38, 'IV': 39, 'Widths': 40, 'and': 41, 'minors': 42, 'ordering': 43, 'quasi': 44, 'well': 45, 'A': 46, 'survey': 47}


We have successfully created a Dictionary object. Gensim will use this dictionary to create a bag-of-words corpus where the words in the documents are replaced with its respective id provided by this dictionary

**2. Create a bag of words corpus in gensim**

- Bag of Words is a corpus object that contains the word id and its frequency in each document
- Its as gensim’s equivalent of a Document-Term matrix
- Once you have the updated dictionary, we pass the tokenized list of words to the Dictionary.doc2bow() to create BoW

In [30]:
from gensim.utils import simple_preprocess

# List with 2 sentences
my_docs = ["Who let the dogs out?",
           "Who? Who? Who? Who?"]

# Tokenize the docs
tokenized_list = [simple_preprocess(doc) for doc in my_docs]

# Create the Corpus
mydict = corpora.Dictionary()
mycorpus = [mydict.doc2bow(doc, allow_update=True) for doc in tokenized_list]
pprint(mycorpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)], [(4, 4)]]


- The (0, 1) in line 1 means, the word with id=0 appears once in the 1st document
- Likewise, the (4, 4) in the second list item means the word with id 4 appears 4 times in the second document

In [31]:
# To convert the id’s to words, you will need the dictionary to do the conversion
word_counts = [[(mydict[id], count) for id, count in line] for line in mycorpus]
pprint(word_counts)

[[('dogs', 1), ('let', 1), ('out', 1), ('the', 1), ('who', 1)], [('who', 4)]]


**3. Create the TFIDF matrix (corpus) in gensim**

- The Term Frequency – Inverse Document Frequency(TF-IDF) is also a bag-of-words model but unlike the regular corpus
- TFIDF down weights tokens (words) that appears frequently across documents
- Tf-Idf is computed by multiplying a local component like term frequency (TF) with a global component, that is, inverse document frequency (IDF) and optionally normalizing the result to unit length
- Words that occur frequently across documents will get downweighted
- There are multiple variations of formulas for TF and IDF existing
- Gensim uses the SMART Information retrieval system that can be used to implement these variations`(check models.TfidfModel)`

In [32]:
from gensim import models
import numpy as np

documents = ["This is the first line",
             "This is the second sentence",
             "This third document"]

# Create the Dictionary and Corpus
mydict = corpora.Dictionary([simple_preprocess(line) for line in documents])
corpus = [mydict.doc2bow(simple_preprocess(line)) for line in documents]

# Show the Word Weights in Corpus
for doc in corpus:
    print([[mydict[id], freq] for id, freq in doc])

# Create the TF-IDF model
tfidf = models.TfidfModel(corpus, smartirs='ntc')

# Show the TF-IDF weights
for doc in tfidf[corpus]:
    print("Difference in weights", [[mydict[id], np.around(freq, decimals=2)] for id, freq in doc])

[['first', 1], ['is', 1], ['line', 1], ['the', 1], ['this', 1]]
[['is', 1], ['the', 1], ['this', 1], ['second', 1], ['sentence', 1]]
[['this', 1], ['document', 1], ['third', 1]]
Difference in weights [['first', 0.63], ['is', 0.31], ['line', 0.63], ['the', 0.31], ['this', 0.13]]
Difference in weights [['is', 0.31], ['the', 0.31], ['this', 0.13], ['second', 0.63], ['sentence', 0.63]]
Difference in weights [['this', 0.15], ['document', 0.7], ['third', 0.7]]


- There is difference in weights of the words between the original corpus and the tfidf weighted corpus
- The words ‘is’ and ‘the’ occur in two documents and were weighted down
- The word ‘this’ appearing in all three documents was removed altogether
- In simple terms, words that occur more frequently across the documents get smaller weights

In [33]:
# Loading in-built data using gensim
import gensim.downloader as api

# Get information about the model or dataset
api.info('glove-wiki-gigaword-50')
# w2v_model = api.load("glove-wiki-gigaword-50")

{'num_records': 400000,
 'file_size': 69182535,
 'base_dataset': 'Wikipedia 2014 + Gigaword 5 (6B tokens, uncased)',
 'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/glove-wiki-gigaword-50/__init__.py',
 'license': 'http://opendatacommons.org/licenses/pddl/',
 'parameters': {'dimension': 50},
 'description': 'Pre-trained vectors based on Wikipedia 2014 + Gigaword, 5.6B tokens, 400K vocab, uncased (https://nlp.stanford.edu/projects/glove/).',
 'preprocessing': 'Converted to w2v format with `python -m gensim.scripts.glove2word2vec -i <fname> -o glove-wiki-gigaword-50.txt`.',
 'read_more': ['https://nlp.stanford.edu/projects/glove/',
  'https://nlp.stanford.edu/pubs/glove.pdf'],
 'checksum': 'c289bc5d7f2f02c6dc9f2f9b67641813',
 'file_name': 'glove-wiki-gigaword-50.gz',
 'parts': 1}

**4. Create bigrams and trigrams using Phraser models**

In [34]:
# text8 dataset is the 'First 100,000,000 bytes of plain text from Wikipedia'
dataset = api.load("text8")
dataset = [wd for wd in dataset]

dct = corpora.Dictionary(dataset)
corpus = [dct.doc2bow(line) for line in dataset]

# Build the bigram models
bigram = gensim.models.phrases.Phrases(dataset, min_count=3, threshold=10)

# Construct bigram
print(bigram[dataset[0]])

['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working_class', 'radicals', 'including', 'the', 'diggers', 'of', 'the', 'english', 'revolution', 'and', 'the', 'sans_culottes', 'of', 'the', 'french_revolution', 'whilst', 'the', 'term', 'is', 'still', 'used', 'in', 'a', 'pejorative_way', 'to', 'describe', 'any', 'act', 'that', 'used', 'violent', 'means', 'to', 'destroy', 'the', 'organization', 'of', 'society', 'it', 'has', 'also', 'been', 'taken_up', 'as', 'a', 'positive', 'label', 'by', 'self', 'defined', 'anarchists', 'the', 'word', 'anarchism', 'is', 'derived_from', 'the', 'greek', 'without', 'archons', 'ruler', 'chief', 'king', 'anarchism', 'as', 'a', 'political_philosophy', 'is', 'the', 'belief_that', 'rulers', 'are', 'unnecessary', 'and', 'should_be', 'abolished', 'although', 'there_are', 'differing_interpretations', 'of', 'what', 'this', 'means', 'anarchism', 'also', 'refers_to', 'related', 'social_movements', 'that', 'advocate',

In [54]:
import nltk

dataset = [wd for wd in dataset]
for word in dataset:
    print(word)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [17]:
# Same procedure like bigram model, except output of bigram is the input for trigram
# Build the trigram models
trigram = gensim.models.phrases.Phrases(bigram[dataset], threshold=10)

# Construct trigram
print(trigram[bigram[dataset[0]]])

['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working_class', 'radicals', 'including', 'the', 'diggers', 'of', 'the', 'english', 'revolution', 'and', 'the', 'sans_culottes', 'of', 'the', 'french_revolution', 'whilst', 'the', 'term', 'is', 'still', 'used', 'in', 'a', 'pejorative_way', 'to_describe', 'any', 'act', 'that', 'used', 'violent', 'means', 'to', 'destroy', 'the', 'organization', 'of', 'society', 'it', 'has', 'also_been', 'taken_up', 'as', 'a', 'positive', 'label', 'by', 'self', 'defined', 'anarchists', 'the', 'word', 'anarchism', 'is', 'derived_from', 'the', 'greek', 'without', 'archons', 'ruler', 'chief', 'king', 'anarchism', 'as', 'a', 'political_philosophy', 'is', 'the', 'belief_that', 'rulers', 'are', 'unnecessary', 'and', 'should_be', 'abolished', 'although', 'there_are', 'differing_interpretations', 'of', 'what', 'this', 'means', 'anarchism', 'also', 'refers_to', 'related', 'social_movements', 'that', 'advocate', 'the'

**5. Topic models with Latent Dirichlet Allocation (LDA) and Latent Semantic Indexing (LSI)**

- Each document in the text is considered as a combination of topics
- Each topic is considered as a combination of related words
- In both cases, number of topics as input needs to be provided
- Model will provide the topic keywords for each topic and the percentage contribution of topics in each document
- Quality of topics is highly dependent on the quality of text processing and the number of topics you provide to the algorithm

In [35]:
# Import necessary packages
from gensim.models import LdaModel, LdaMulticore
import gensim.downloader as api
from gensim.utils import simple_preprocess, lemmatize
import pattern
from nltk.corpus import stopwords
import re
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.root.setLevel(level=logging.INFO)
stop_words = stopwords.words('english')

In [36]:
# !pip install pattern

In [37]:
# Import the dataset and get the text and real topic of each news article
dataset = api.load("text8")
data = [d for d in dataset]

In [40]:
re.compile('(NN|JJ|RB)')

re.compile(r'(NN|JJ|RB)', re.UNICODE)

In [42]:
# Prepare the downloaded data by removing stopwords and lemmatize it
data_processed = []

for i, doc in enumerate(data[:100]):
    doc_out = []
    for wd in doc:
        if wd not in stop_words:  # remove stopwords
            lemmatized_word = lemmatize(wd)#, allowed_tags=re.compile('(NN|JJ|RB)'))  # lemmatize
            if lemmatized_word:
                doc_out = doc_out + [lemmatized_word[0].split(b'/')[0].decode('utf-8')]
    data_processed.append(doc_out)

# Print a small sample    
print(data_processed[0][:5]) 

['anarchism', 'originated', 'term', 'abuse', 'first']


In [43]:
# Create the Inputs of LDA model: Dictionary and Corpus from data_processed(list of list of words)
dct = corpora.Dictionary(data_processed)
corpus = [dct.doc2bow(line) for line in data_processed]

2020-07-12 20:30:05,741 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-07-12 20:30:06,375 : INFO : built Dictionary(42634 unique tokens: ['abacu', 'ability', 'able', 'abnormal', 'abolish']...) from 100 documents (total 503631 corpus positions)


In [44]:
# Train the LDA model with 7 topics(arbitrary). LdaMulticore() supports parallel processing
start = time.time()
lda_model = LdaMulticore(corpus=corpus, id2word=dct, random_state=100, num_topics=7, passes=10, chunksize=1000, batch=False,
                         alpha='asymmetric', decay=0.5, offset=64, eta=None, eval_every=0, iterations=100,
                         gamma_threshold=0.001, per_word_topics=True)
print(time.time()-start)

2020-07-12 20:30:09,297 : INFO : using asymmetric alpha [0.26219156, 0.19027454, 0.14931786, 0.12287004, 0.104381524, 0.090729296, 0.080235206]
2020-07-12 20:30:09,299 : INFO : using symmetric eta at 0.14285714285714285
2020-07-12 20:30:09,333 : INFO : using serial LDA version on this node
2020-07-12 20:30:09,613 : INFO : running online LDA training, 7 topics, 10 passes over the supplied corpus of 100 documents, updating every 3000 documents, evaluating every ~0 documents, iterating 100x with a convergence threshold of 0.001000
2020-07-12 20:30:09,923 : INFO : training LDA model using 3 processes
2020-07-12 20:30:11,046 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #100/100, outstanding queue size 1
2020-07-12 20:30:53,811 : INFO : topic #6 (0.080): 0.001*"also" + 0.001*"used" + 0.000*"state" + 0.000*"many" + 0.000*"first" + 0.000*"see" + 0.000*"name" + 0.000*"time" + 0.000*"work" + 0.000*"include"
2020-07-12 20:30:53,814 : INFO : topic #5 (0.091): 0.001*"first" + 0.

2020-07-12 20:30:58,321 : INFO : topic #5 (0.091): 0.009*"agave" + 0.001*"apollo" + 0.001*"asia" + 0.001*"agassi" + 0.001*"first" + 0.001*"aruba" + 0.001*"also" + 0.001*"state" + 0.001*"year" + 0.001*"used"
2020-07-12 20:30:58,324 : INFO : topic #2 (0.149): 0.001*"also" + 0.000*"first" + 0.000*"state" + 0.000*"include" + 0.000*"used" + 0.000*"many" + 0.000*"time" + 0.000*"agave" + 0.000*"american" + 0.000*"language"
2020-07-12 20:30:58,326 : INFO : topic #1 (0.190): 0.001*"american" + 0.001*"also" + 0.000*"first" + 0.000*"state" + 0.000*"used" + 0.000*"new" + 0.000*"see" + 0.000*"many" + 0.000*"year" + 0.000*"make"
2020-07-12 20:30:58,329 : INFO : topic #0 (0.262): 0.001*"also" + 0.001*"state" + 0.000*"include" + 0.000*"person" + 0.000*"many" + 0.000*"first" + 0.000*"used" + 0.000*"american" + 0.000*"call" + 0.000*"see"
2020-07-12 20:30:58,332 : INFO : topic diff=0.102904, rho=0.119438
2020-07-12 20:30:58,335 : INFO : PROGRESS: pass 7, dispatched chunk #0 = documents up to #100/100, ou

51.35553693771362


In [45]:
# Save the model
lda_model.save('lda_model.model')

# See the topics
lda_model.print_topics(-1)

2020-07-12 20:31:10,773 : INFO : saving LdaState object under lda_model.model.state, separately None
2020-07-12 20:31:10,929 : INFO : saved lda_model.model.state
2020-07-12 20:31:10,966 : INFO : saving LdaMulticore object under lda_model.model, separately ['expElogbeta', 'sstats']
2020-07-12 20:31:10,969 : INFO : storing np array 'expElogbeta' to lda_model.model.expElogbeta.npy
2020-07-12 20:31:11,034 : INFO : not storing attribute id2word
2020-07-12 20:31:11,035 : INFO : not storing attribute dispatcher
2020-07-12 20:31:11,036 : INFO : not storing attribute state
2020-07-12 20:31:11,039 : INFO : saved lda_model.model
2020-07-12 20:31:11,075 : INFO : topic #0 (0.262): 0.001*"also" + 0.001*"state" + 0.000*"include" + 0.000*"person" + 0.000*"many" + 0.000*"first" + 0.000*"used" + 0.000*"american" + 0.000*"call" + 0.000*"see"
2020-07-12 20:31:11,077 : INFO : topic #1 (0.190): 0.001*"american" + 0.000*"also" + 0.000*"first" + 0.000*"state" + 0.000*"used" + 0.000*"new" + 0.000*"see" + 0.000

[(0,
  '0.001*"also" + 0.001*"state" + 0.000*"include" + 0.000*"person" + 0.000*"many" + 0.000*"first" + 0.000*"used" + 0.000*"american" + 0.000*"call" + 0.000*"see"'),
 (1,
  '0.001*"american" + 0.000*"also" + 0.000*"first" + 0.000*"state" + 0.000*"used" + 0.000*"new" + 0.000*"see" + 0.000*"many" + 0.000*"year" + 0.000*"make"'),
 (2,
  '0.000*"also" + 0.000*"first" + 0.000*"state" + 0.000*"include" + 0.000*"used" + 0.000*"many" + 0.000*"time" + 0.000*"agave" + 0.000*"american" + 0.000*"language"'),
 (3,
  '0.001*"also" + 0.001*"see" + 0.001*"used" + 0.001*"first" + 0.001*"time" + 0.001*"state" + 0.001*"american" + 0.001*"year" + 0.001*"name" + 0.001*"lincoln"'),
 (4,
  '0.005*"also" + 0.004*"state" + 0.004*"american" + 0.003*"used" + 0.003*"first" + 0.003*"see" + 0.003*"time" + 0.003*"many" + 0.003*"year" + 0.002*"make"'),
 (5,
  '0.015*"agave" + 0.002*"asia" + 0.002*"agassi" + 0.002*"aruba" + 0.001*"apollo" + 0.001*"also" + 0.001*"state" + 0.001*"first" + 0.001*"var" + 0.001*"year"')

In [46]:
# In practice, LSI is much faster to train than LDA, but has lower accuracy
from gensim.models import LsiModel

# Build the LSI Model
start = time.time()
lsi_model = LsiModel(corpus=corpus, id2word=dct, num_topics=7, decay=0.5)
print(time.time()-start)

# View Topics
pprint(lsi_model.print_topics(-1))

2020-07-12 20:31:21,201 : INFO : using serial LSI version on this node
2020-07-12 20:31:21,202 : INFO : updating model with new documents
2020-07-12 20:31:21,203 : INFO : preparing a new chunk of documents
2020-07-12 20:31:21,460 : INFO : using 100 extra samples and 2 power iterations
2020-07-12 20:31:21,462 : INFO : 1st phase: constructing (42634, 107) action matrix
2020-07-12 20:31:21,584 : INFO : orthonormalizing (42634, 107) action matrix
2020-07-12 20:31:25,993 : INFO : 2nd phase: running dense svd on (107, 100) matrix
2020-07-12 20:31:26,346 : INFO : computing the final decomposition
2020-07-12 20:31:26,347 : INFO : keeping 7 factors (discarding 60.734% of energy spectrum)
2020-07-12 20:31:26,634 : INFO : processed documents up to #100
2020-07-12 20:31:26,748 : INFO : topic #0(1120.793): 0.230*"also" + 0.176*"state" + 0.165*"american" + 0.159*"used" + 0.155*"first" + 0.134*"see" + 0.132*"many" + 0.131*"time" + 0.127*"year" + 0.115*"make"
2020-07-12 20:31:26,751 : INFO : topic #1(

5.563936233520508
[(0,
  '0.230*"also" + 0.176*"state" + 0.165*"american" + 0.159*"used" + '
  '0.155*"first" + 0.134*"see" + 0.132*"many" + 0.131*"time" + 0.127*"year" + '
  '0.115*"make"'),
 (1,
  '-0.935*"agave" + -0.164*"asia" + -0.099*"aruba" + -0.063*"plant" + '
  '-0.059*"state" + -0.053*"var" + -0.046*"east" + -0.044*"congress" + '
  '-0.043*"century" + -0.042*"island"'),
 (2,
  '-0.479*"american" + -0.181*"football" + -0.179*"player" + -0.177*"war" + '
  '-0.148*"british" + 0.141*"used" + -0.127*"play" + -0.117*"ball" + '
  '-0.115*"line" + 0.113*"also"'),
 (3,
  '0.220*"apollo" + -0.218*"state" + -0.198*"lincoln" + 0.189*"player" + '
  '0.188*"football" + 0.184*"play" + 0.179*"line" + 0.160*"ball" + '
  '0.138*"used" + -0.127*"government"'),
 (4,
  '0.316*"lincoln" + 0.277*"apollo" + 0.271*"atheism" + 0.267*"god" + '
  '0.160*"atheist" + -0.154*"island" + -0.141*"aluminium" + 0.134*"abraham" + '
  '0.127*"aristotle" + -0.101*"acid"'),
 (5,
  '0.405*"apollo" + -0.170*"football

**6. Training a data set with Word2Vec**

In [1]:
from gensim.models.word2vec import Word2Vec
from multiprocessing import cpu_count
import gensim.downloader as api

In [3]:
# Download dataset
dataset = api.load("text8")
data = [d for d in dataset]

In [4]:
# Split the data into 2 parts. Part 2 will be used later to update the model
data_part1 = data[:1000]
data_part2 = data[1000:]

# Train Word2Vec model - Defaults result vector size = 100
model = Word2Vec(data_part1, min_count = 0, workers=cpu_count())

In [5]:
# Get the word vector for given word
model['topic']

# Similar word
model.most_similar('cookies')

  
  """


[('nectar', 0.854192852973938),
 ('grilled', 0.8456602692604065),
 ('chopsticks', 0.8436591625213623),
 ('bags', 0.8430781364440918),
 ('baked', 0.840359628200531),
 ('apples', 0.839759349822998),
 ('cooked', 0.8371293544769287),
 ('pipes', 0.8367469906806946),
 ('cakes', 0.8366749286651611),
 ('grapes', 0.8364530801773071)]

In [6]:
# Save and Load Model
model.save('newmodel')
model = Word2Vec.load('newmodel')

**6.1 Update an existing Word2Vec model with new data**

In [7]:
# Update the model with new data
model.build_vocab(data_part2, update=True)
model.train(data_part2, total_examples=model.corpus_count, epochs=model.iter)
model['topic']

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


array([ 0.88522404, -1.3550783 ,  2.425115  , -1.8167701 ,  0.09146538,
        2.9669452 , -0.4362503 , -0.440485  , -0.38269123, -1.3947804 ,
        0.07911377, -0.01555447,  0.5657819 ,  2.2213743 , -1.2608438 ,
       -1.5656185 ,  0.6076971 ,  0.22566587, -0.85461324, -0.8468682 ,
        0.7770127 , -0.30719855,  0.53273875, -0.17061728, -0.61228734,
       -0.11454373,  0.6451138 , -0.32120687, -2.0947063 , -0.41426334,
       -0.15256032,  0.48141328, -0.90980715,  0.9422829 ,  0.61670774,
       -0.64421004,  0.33573928, -0.09030112,  0.9555507 ,  0.47109857,
        0.7083128 , -0.47757503, -0.3714962 ,  0.3163366 , -0.603824  ,
        0.51636225,  0.4272613 ,  0.28486392,  1.2327639 ,  0.68985814,
       -0.6767952 , -0.7865079 , -0.9108312 ,  0.08205472,  1.2882267 ,
       -0.3640386 , -0.39231315,  0.41391712,  1.1377299 , -0.50432134,
        0.18047155, -0.9536308 , -1.0494094 ,  2.1461446 ,  0.2572285 ,
       -0.11528365, -0.3238931 ,  0.02178064, -0.24388723,  0.09

**7. Extract word vectors using pre-trained Word2Vec and FastText models**

In [9]:
%%time
import gensim.downloader as api

# Download the models
fasttext_model300 = api.load('fasttext-wiki-news-subwords-300')
from gensim.models.fasttext import FastText

Wall time: 5min 23s


In [2]:
%%time
word2vec_model300 = api.load('word2vec-google-news-300')
from gensim.models import Word2Vec

Wall time: 2min 26s


In [3]:
%%time
glove_model300 = api.load('glove-wiki-gigaword-300')
glove_model300

Wall time: 2min 34s


<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x21cbaddf5c8>

In [14]:
# Get word embeddings
word2vec_model300.most_similar('support')
# Word2Vec.most_similar('support')

# from gensim.models import Word2Vec
# # Word2Vec.wv.vocab
# wv = Word2Vec()
# vw_list = "breakfast cereal dinner lunch".split()
# vw_list
# wv.doesnt_match(vw_list)

[('supporting', 0.6251285076141357),
 ('suport', 0.6071149706840515),
 ('suppport', 0.6053199768066406),
 ('Support', 0.6044272780418396),
 ('supported', 0.6009396314620972),
 ('backing', 0.6007589101791382),
 ('supports', 0.5269277691841125),
 ('assistance', 0.520713746547699),
 ('sup_port', 0.5192489624023438),
 ('supportive', 0.5110025405883789)]

We have 3 different embedding models and to evaluate which one performs better using the respective model’s evaluate_word_analogies() on a standard analogies dataset

In [15]:
# Word2ec_accuracy
word2vec_model300.evaluate_word_analogies(analogies="questions-words.txt")[0]

# fasttext_accuracy
fasttext_model300.evaluate_word_analogies(analogies="questions-words.txt")[0]
# FastText.evaluate_word_analogies(analogies="questions-words.txt")[0]

# GloVe accuracy
glove_model300.evaluate_word_analogies(analogies="questions-words.txt")[0]

0.7401448525607863

0.8827876424099353

0.7195422354510931

**8. Create document vectors using Doc2Vec**

Unlike Word2Vec, a Doc2Vec model provides a vectorised representation of a group of words taken collectively as a single unit. It is not a simple average of the word vectors of the words in the sentence

In [16]:
import gensim
import gensim.downloader as api

# Download dataset
dataset = api.load("text8")
data = [d for d in dataset]

In [17]:
# The training data for Doc2Vec should be a list of TaggedDocuments. To create one, we pass a list of words and a unique
# integer as input to the models.doc2vec.TaggedDocument()

def create_tagged_document(list_of_list_of_words):
    for i, list_of_words in enumerate(list_of_list_of_words):
        yield gensim.models.doc2vec.TaggedDocument(list_of_words, [i])

train_data = list(create_tagged_document(data))

print(train_data[:1])

[TaggedDocument(words=['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working', 'class', 'radicals', 'including', 'the', 'diggers', 'of', 'the', 'english', 'revolution', 'and', 'the', 'sans', 'culottes', 'of', 'the', 'french', 'revolution', 'whilst', 'the', 'term', 'is', 'still', 'used', 'in', 'a', 'pejorative', 'way', 'to', 'describe', 'any', 'act', 'that', 'used', 'violent', 'means', 'to', 'destroy', 'the', 'organization', 'of', 'society', 'it', 'has', 'also', 'been', 'taken', 'up', 'as', 'a', 'positive', 'label', 'by', 'self', 'defined', 'anarchists', 'the', 'word', 'anarchism', 'is', 'derived', 'from', 'the', 'greek', 'without', 'archons', 'ruler', 'chief', 'king', 'anarchism', 'as', 'a', 'political', 'philosophy', 'is', 'the', 'belief', 'that', 'rulers', 'are', 'unnecessary', 'and', 'should', 'be', 'abolished', 'although', 'there', 'are', 'differing', 'interpretations', 'of', 'what', 'this', 'means', 'anarchism', 'also', 'refers'

In [18]:
# The input is prepared. To train the model, we initialize the Doc2Vec model, build the vocabulary and then finally train it

# Init the Doc2Vec model
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)

# Build the Volabulary
model.build_vocab(train_data)

# Train the Doc2Vec model
model.train(train_data, total_examples=model.corpus_count, epochs=model.epochs)

In [19]:
# To get the document vector of a sentence, pass it as a list of words to the infer_vector() method
print(model.infer_vector(['australian', 'captain', 'elected', 'to', 'bowl']))

[-0.2566483  -0.575424    0.16708931  0.1678163  -0.34758413 -0.09677021
  0.52557653 -0.17069203 -0.1173829  -0.08467584 -0.21260308 -0.05808348
 -0.05184652  0.16461983 -0.27753395 -0.27451926 -0.04151861  0.26055896
  0.24731347  0.08211682 -0.22650313 -0.17226401 -0.24536379  0.3636108
  0.01631917  0.24950188 -0.5555312   0.10363539  0.17609511  0.09978993
  0.28348967  0.30820653  0.06417907  0.55594707 -0.19024536  0.07824706
  0.10854443 -0.09870841  0.70921695  0.06350834  0.14941354  0.14258994
 -0.02939754  0.3112465   0.2247404  -0.04233663  0.03702605 -0.32701755
  0.27227554 -0.23906282]


**9. Compute similarity metrics like cosine similarity and soft cosine similarity**

Soft cosine similarity is similar to cosine similarity but in addition considers the semantic relationship between the words through its vector representation

In [20]:
# We need a word embedding model like Word2Vec or FastText. First, compute the similarity_matrix. Then convert the input
# sentences to bag-of-words corpus and pass them to the softcossim() along with the similarity matrix

from gensim.matutils import softcossim
from gensim import corpora

sent_1 = 'Sachin is a cricket player and a opening batsman'.split()
sent_2 = 'Dhoni is a cricket player too He is a batsman and keeper'.split()
sent_3 = 'Anand is a chess player'.split()

In [24]:
from gensim.utils import simple_preprocess

# Create gensim dictionary form a single tet file
dictionary = corpora.Dictionary(simple_preprocess(line, deacc=True) for line in open('sample.txt', encoding='utf-8'))

# Token to Id map
print(dictionary.token2id, end='')

{'about': 0, 'dollars': 1, 'each': 2, 'for': 3, 'hour': 4, 'million': 5, 'produce': 6, 'the': 7, 'two': 8, 'we': 9, 'work': 10, 'conservative': 11, 'estimate': 12, 'fifty': 13, 'hours': 14, 'is': 15, 'it': 16, 'rather': 17, 'takes': 18, 'time': 19, 'us': 20, 'any': 21, 'copyright': 22, 'edited': 23, 'entered': 24, 'etext': 25, 'get': 26, 'proofread': 27, 'selected': 28, 'to': 29, 'analyzed': 30, 'and': 31, 'etc': 32, 'letters': 33, 'searched': 34, 'this': 35, 'written': 36, 'audience': 37, 'hundred': 38, 'if': 39, 'one': 40, 'our': 41, 'projected': 42, 'readers': 43, 'value': 44, 'at': 45, 'dollar': 46, 'estimated': 47, 'nominally': 48, 'per': 49, 'text': 50, 'then': 51, 'as': 52, 'release': 53, 'six': 54, 'thirty': 55, 'year': 56, 'etexts': 57, 'files': 58, 'in': 59, 'month': 60, 'more': 61, 'of': 62, 'or': 63, 'total': 64, 'computerized': 65, 'just': 66, 'population': 67, 'reach': 68, 'these': 69, 'away': 70, 'billion': 71, 'given': 72, 'over': 73, 'should': 74, 'give': 75, 'goal': 7

In [25]:
# Prepare the similarity matrix
similarity_matrix = fasttext_model300.similarity_matrix(dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)

# Prepare a dictionary and a corpus.
documents = [sent_1, sent_2, sent_3]
dictionary = corpora.Dictionary(documents)

# Convert the sentences into bag-of-words vectors.
sent_1 = dictionary.doc2bow(sent_1)
sent_2 = dictionary.doc2bow(sent_2)
sent_3 = dictionary.doc2bow(sent_3)

# Compute soft cosine similarity
print(softcossim(sent_1, sent_2, similarity_matrix))
print(softcossim(sent_1, sent_3, similarity_matrix))
print(softcossim(sent_2, sent_3, similarity_matrix))

  


0.8188388234472342
0.6403088906291006
0.7023288404386088


  
  from ipykernel import kernelapp as app
  app.launch_new_instance()


In [22]:
# Which word from the given list doesn't go with the others?
print(fasttext_model300.doesnt_match(['india', 'australia', 'pakistan', 'china', 'beetroot']))

# Compute cosine distance between two words.
print(fasttext_model300.distance('king', 'queen'))

# Compute cosine distances from given word or vector to all words in `other_words`.
print(fasttext_model300.distances('king', ['queen', 'man', 'woman']))

# Compute cosine similarities
print(fasttext_model300.cosine_similarities(fasttext_model300['king'],
                                            vectors_all=(fasttext_model300['queen'],
                                                         fasttext_model300['man'], fasttext_model300['woman'],
                                                         fasttext_model300['queen'] + fasttext_model300['man'])))  
# Get the words closer to w1 than w2
print(glove_model300.words_closer_than(w1='king', w2='kingdom'))

# Find the top-N most similar words.
print(fasttext_model300.most_similar(positive='king', negative=None, topn=5, restrict_vocab=None, indexer=None))

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


beetroot
0.22957539558410645
[0.22957546 0.465837   0.547001  ]
[0.77042454 0.534163   0.45299897 0.76572555]
['prince', 'queen', 'monarch']
[('king-', 0.7838029265403748), ('boy-king', 0.7704817652702332), ('queen', 0.7704246044158936), ('prince', 0.7700967192649841), ('kings', 0.7668930292129517)]


**10. Summarize text documents**

- Gensim implements the textrank summarization using the `summarize()` function in the summarization module
- Pass in the tet string along with either the output summarization ratio or the maximum count of words in the summarized output
- There is no need to split the sentence into a tokenized list because gensim does the splitting using the built-in `split_sentences()` method in the `gensim.summarization.texcleaner` module

In [28]:
from gensim.summarization import summarize, keywords
from pprint import pprint
from smart_open import smart_open

text = " ".join((line for line in smart_open('sample.txt', encoding='utf-8')))

# Summarize the paragraph
pprint(summarize(text, word_count=20))

# Important keywords from the paragraph
print(keywords(text))

2020-07-11 02:34:06,532 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-07-11 02:34:06,534 : INFO : built Dictionary(71 unique tokens: ['dollar', 'hour', 'million', 'produc', 'work']...) from 24 documents (total 101 corpus positions)
2020-07-11 02:34:06,551 : INFO : Building graph
2020-07-11 02:34:06,552 : INFO : Filling graph
2020-07-11 02:34:06,555 : INFO : Removing unreachable nodes of graph
2020-07-11 02:34:06,556 : INFO : Pagerank graph
2020-07-11 02:34:07,375 : INFO : Sorting pagerank scores


('total should reach over 200 billion Etexts given away this year.\n'
 'The Goal of Project Gutenberg is to Give Away One Trillion Etext')
etext
etexts
projected
project
million
copyright
estimate
estimated
funding
letters
given


Refer [this](https://www.machinelearningplus.com/nlp/gensim-tutorial/#14howtotrainword2vecmodelusinggensim) for additional link

In [29]:
# NLTK lemmatizer
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

[wordnet_lemmatizer.lemmatize(words, pos='v') for words in sentence.split(' ')]

['He',
 'be',
 'run',
 'and',
 'eat',
 'at',
 'same',
 'time.',
 'He',
 'have',
 'bad',
 'habit',
 'of',
 'swim',
 'after',
 'play',
 'long',
 'hours',
 'in',
 'the',
 'Sun.']

In [47]:
import nltk
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

sentence = "He was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun."
punctuations = "?:!.,;"
sentence_words = nltk.word_tokenize(sentence)
# print(sentence_words)

for word in sentence_words:
    if word in punctuations:
        sentence_words.remove(word)

# print(sentence_words)
print("{0:20}{1:20}".format("Word","Lemma"))

for word in sentence_words:
    print ("{0:20}{1:20}".format(word,wordnet_lemmatizer.lemmatize(word, pos="v")))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kalya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Word                Lemma               
He                  He                  
was                 be                  
running             run                 
and                 and                 
eating              eat                 
at                  at                  
same                same                
time                time                
He                  He                  
has                 have                
bad                 bad                 
habit               habit               
of                  of                  
swimming            swim                
after               after               
playing             play                
long                long                
hours               hours               
in                  in                  
the                 the                 
Sun                 Sun                 


In [30]:
# Gensim lemmatizer
from gensim.utils import lemmatize
#allowed_tags=re.compile('(NN|JJ|RB)')
[lemmatize(words) for words in sentence.split(' ')]

[[],
 [b'be/VB'],
 [b'run/VB'],
 [],
 [b'eating/NN'],
 [],
 [b'same/JJ'],
 [b'time/NN'],
 [],
 [b'have/VB'],
 [b'bad/JJ'],
 [b'habit/NN'],
 [],
 [b'swimming/NN'],
 [],
 [b'play/VB'],
 [b'long/RB'],
 [b'hour/NN'],
 [],
 [],
 [b'sun/NN']]

In [14]:
import bs4 as bs
import urllib.request
import re
import nltk

scrapped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Artificial_intelligence')
article = scrapped_data .read()

parsed_article = bs.BeautifulSoup(article,'lxml')

paragraphs = parsed_article.find_all('p')

article_text = ""

for p in paragraphs:
    article_text += p.text

In [15]:
# Cleaing the text
processed_article = article_text.lower()
processed_article = re.sub('[^a-zA-Z]', ' ', processed_article )
processed_article = re.sub(r'\s+', ' ', processed_article)

# Preparing the dataset
all_sentences = nltk.sent_tokenize(processed_article)

all_words = [nltk.word_tokenize(sent) for sent in all_sentences]

# Removing Stop Words
from nltk.corpus import stopwords
for i in range(len(all_words)):
    all_words[i] = [w for w in all_words[i] if w not in stopwords.words('english')]

In [16]:
from gensim.models import Word2Vec

word2vec = Word2Vec(all_words, min_count=2)

In [25]:
vocabulary = word2vec.wv.vocab
print(vocabulary.keys())

dict_keys(['computer', 'science', 'artificial', 'intelligence', 'ai', 'sometimes', 'called', 'machine', 'demonstrated', 'machines', 'unlike', 'natural', 'displayed', 'humans', 'animals', 'leading', 'field', 'study', 'intelligent', 'agents', 'device', 'perceives', 'environment', 'takes', 'actions', 'maximize', 'chance', 'successfully', 'achieving', 'goals', 'term', 'often', 'used', 'describe', 'computers', 'mimic', 'cognitive', 'functions', 'human', 'mind', 'learning', 'problem', 'solving', 'become', 'increasingly', 'capable', 'tasks', 'considered', 'require', 'definition', 'phenomenon', 'known', 'effect', 'done', 'yet', 'instance', 'character', 'recognition', 'frequently', 'things', 'routine', 'technology', 'modern', 'capabilities', 'generally', 'classified', 'include', 'understanding', 'speech', 'competing', 'level', 'game', 'systems', 'chess', 'go', 'operating', 'cars', 'content', 'networks', 'military', 'founded', 'academic', 'discipline', 'years', 'since', 'experienced', 'several',

In [8]:
import nltk

word_data = "The best performance can bring in sky high success."
nltk_tokens = nltk.word_tokenize(word_data)

list(nltk.bigrams(nltk_tokens))

[('The', 'best'),
 ('best', 'performance'),
 ('performance', 'can'),
 ('can', 'bring'),
 ('bring', 'in'),
 ('in', 'sky'),
 ('sky', 'high'),
 ('high', 'success'),
 ('success', '.')]