In [1]:
#Gensim library
#Loading gensim

from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel


In [2]:
# Build the token <-> id mapping from the tokenised sample documents, then
# encode every document as a bag-of-words list of (token_id, count) pairs.
common_dictionary = Dictionary(common_texts)
common_corpus = list(map(common_dictionary.doc2bow, common_texts))


In [3]:
# Train the model.
# id2word attaches the vocabulary so topics can be displayed as words instead
# of bare token ids; random_state fixes the random initialisation so that a
# fresh "Restart & Run All" reproduces the same topic distribution.
lda = LdaModel(
    common_corpus,
    num_topics=10,
    id2word=common_dictionary,
    random_state=400,
)

# New corpus of unseen documents.
# doc2bow ignores tokens that are not present in common_dictionary, so words
# outside the training vocabulary contribute nothing to the inferred topics.
other_texts = [
    ['data', 'unstructured', 'time'],
    ['bigdata', 'intelligence', 'natural'],
    ['language', 'machine', 'computer']
]
other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]
unseen_doc = other_corpus[0]



In [4]:
# Get the topic probability distribution for a document: indexing the trained
# model with a bag-of-words vector yields (topic_id, probability) pairs.
vector = lda[unseen_doc]
print(vector)

[(0, 0.050000045), (1, 0.05000921), (2, 0.05000004), (3, 0.050000045), (4, 0.05000004), (5, 0.050000045), (6, 0.050000045), (7, 0.5499905), (8, 0.05000004), (9, 0.05000004)]


# Exercise 1

# Identify topics from news

Load the dataset

The dataset we'll use is the 20newsgroup dataset that is available from sklearn. 
This dataset has news articles grouped into 20 news categories.

In [5]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.datasets import fetch_20newsgroups


In [8]:
# Load the shuffled training and test splits of the 20 Newsgroups dataset.
# NOTE(review): the first call presumably downloads and caches the data, so it
# needs network access — confirm for the target environment.
newsgroups_train = fetch_20newsgroups(subset='train', shuffle = True)
newsgroups_test = fetch_20newsgroups(subset='test', shuffle = True)


In [9]:
# List the newsgroup category names of the training split.
print(list(newsgroups_train.target_names))

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


##### there are different themes here indicating news in sports, religion, politics, tech etc

Data Preprocessing

We will perform the following steps:

● Tokenization: Split the text into sentences and sentences into words. Lowercase the words and remove punctuations.

● Words that have 3 or fewer characters are removed (the code keeps only tokens with `len(token) > 3`).

● All stopwords are removed.

● Words are lemmatized in such a way that words in third person are changed to first person and verbs in past and future tenses are changed into present.

● Words are stemmed in such a way that words are reduced to their root form.



In [10]:
# Show the first two raw posts of the training split.
newsgroups_train.data[:2]
# The raw text still contains e-mail headers, punctuation and stop words, so it
# needs cleaning (stop-word removal, lemmatization, stemming) before modelling.

["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n",
 "From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 

Data Preprocessing

We will perform the following steps:

●	Tokenization: Split the text into sentences and sentences into words. Lowercase the words and remove punctuations.

●	Words that have 3 or fewer characters are removed (the code keeps only tokens with `len(token) > 3`).

●	All stopwords are removed.

●	Words are lemmatized in such a way that words in third person are changed to first person and verbs in past and future tenses are changed into present.

●	Words are stemmed in such a way that words are reduced to their root form.


In [11]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np


In [12]:
# Seed NumPy's global random number generator.
np.random.seed(400)

In [13]:
# Download the WordNet corpus through NLTK's downloader.
# NOTE(review): presumably required by WordNetLemmatizer, which this notebook uses — confirm.
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/labsuser/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [14]:
# Sanity-check the lemmatizer: lemmatize 'went' as a verb and print the result.
print(WordNetLemmatizer().lemmatize('went', pos = 'v'))

go


In [15]:
# Check that stemming works on a handful of inflected words.
import pandas as pd
stemmer = SnowballStemmer('english')
original_words = [
    'caress', 'flies', 'dies', 'mules', 'denied', 'died', 'agreed', 'owned',
    'humbled', 'sized', 'meeting', 'stating', 'siezing', 'itemization',
    'sensational', 'traditional', 'reference', 'colonizer', 'plotted',
]
# Stem every word once and show the before/after pairs side by side.
singles = list(map(stemmer.stem, original_words))

pd.DataFrame({'original word': original_words, 'stemmed': singles})



Unnamed: 0,original word,stemmed
0,caress,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [17]:
# apply lem and stemming on the whole dataset
def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb, then stem the resulting lemma.

    Uses ``WordNetLemmatizer`` and the notebook-level ``stemmer``.
    """
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)
# Tokenize and lemmatize
def preprocess(text):
    """Tokenize ``text`` and return its lemmatized, stemmed tokens.

    A token is kept only when it is not a gensim stop word and is longer
    than three characters; each kept token goes through
    ``lemmatize_stemming``.
    """
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3
    ]


In [18]:
# Preview one document before and after preprocessing.
document_num = 50  # NOTE(review): assigned but not used in this cell
doc_sample = 'This disk has failed many times. I would like to get it replaced.'

print("Original document: ")
# Splitting on single spaces already gives the list of raw words.
words = doc_sample.split(' ')
print(words)
print("\n\nTokenized and lemmatized document: ")
print(preprocess(doc_sample))

Original document: 
['This', 'disk', 'has', 'failed', 'many', 'times.', 'I', 'would', 'like', 'to', 'get', 'it', 'replaced.']


Tokenized and lemmatized document: 
['disk', 'fail', 'time', 'like', 'replac']


In [21]:
# Preprocess every document of the training split, keeping document order.
processed_docs = [preprocess(doc) for doc in newsgroups_train.data]

print(processed_docs[:2])


[['lerxst', 'thing', 'subject', 'nntp', 'post', 'host', 'organ', 'univers', 'maryland', 'colleg', 'park', 'line', 'wonder', 'enlighten', 'door', 'sport', 'look', 'late', 'earli', 'call', 'bricklin', 'door', 'small', 'addit', 'bumper', 'separ', 'rest', 'bodi', 'know', 'tellm', 'model', 'engin', 'spec', 'year', 'product', 'histori', 'info', 'funki', 'look', 'mail', 'thank', 'bring', 'neighborhood', 'lerxst'], ['guykuo', 'carson', 'washington', 'subject', 'clock', 'poll', 'final', 'summari', 'final', 'clock', 'report', 'keyword', 'acceler', 'clock', 'upgrad', 'articl', 'shelley', 'qvfo', 'innc', 'organ', 'univers', 'washington', 'line', 'nntp', 'post', 'host', 'carson', 'washington', 'fair', 'number', 'brave', 'soul', 'upgrad', 'clock', 'oscil', 'share', 'experi', 'poll', 'send', 'brief', 'messag', 'detail', 'experi', 'procedur', 'speed', 'attain', 'rat', 'speed', 'card', 'adapt', 'heat', 'sink', 'hour', 'usag', 'floppi', 'disk', 'function', 'floppi', 'especi', 'request', 'summar', 'day',

In [22]:
# Bag-of-words vocabulary for the dataset.

# Build a gensim Dictionary from the token lists in processed_docs.

dictionary = gensim.corpora.Dictionary(processed_docs)


In [23]:
# Peek at the first 11 (token id, token) entries of the dictionary.
for position, (token_id, token) in enumerate(dictionary.iteritems()):
    print(token_id, token)
    if position >= 10:
        break


0 addit
1 bodi
2 bricklin
3 bring
4 bumper
5 call
6 colleg
7 door
8 earli
9 engin
10 enlighten


Gensim filter_extremes
Filter out tokens that appear in:
1. fewer than `no_below` documents (an absolute number), or
2. more than `no_above` documents (a fraction of the total corpus size, not an absolute number).
3. After (1) and (2), keep only the `keep_n` most frequent tokens (or keep all of them if `keep_n` is None).

In [24]:
# Prune the vocabulary: drop tokens found in fewer than 15 documents or in more
# than 10% of the documents, then keep at most the 100000 most frequent tokens.
dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n= 100000)


Gensim doc2bow


Convert document (a list of words) into the bag-of-words format = list of (token_id, token_count) 2-tuples. Each word is assumed to be a tokenized and normalized string (either unicode or utf8-encoded). No further preprocessing is done on the words in the document; apply tokenization, stemming, etc. before calling this method.

● Create the bag-of-words model for each document i.e for each document we create a dictionary reporting how many words and how many times those words appear. Save this to 'bow_corpus'

In [25]:
# Encode every preprocessed document as a list of (token_id, count) pairs.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

# Inspect one document: print each token id, the token itself and its count.
document_num = 20
bow_doc_x = bow_corpus[document_num]

for token_id, token_count in bow_doc_x:
    print("Word {} (\"{}\") appears {} time.".format(token_id,
                                                     dictionary[token_id],
                                                     token_count))


Word 18 ("rest") appears 1 time.
Word 166 ("clear") appears 1 time.
Word 336 ("refer") appears 1 time.
Word 350 ("true") appears 1 time.
Word 391 ("technolog") appears 1 time.
Word 437 ("christian") appears 1 time.
Word 453 ("exampl") appears 1 time.
Word 476 ("jew") appears 1 time.
Word 480 ("lead") appears 1 time.
Word 482 ("littl") appears 3 time.
Word 520 ("wors") appears 2 time.
Word 721 ("keith") appears 3 time.
Word 732 ("punish") appears 1 time.
Word 803 ("california") appears 1 time.
Word 859 ("institut") appears 1 time.
Word 917 ("similar") appears 1 time.
Word 990 ("allan") appears 1 time.
Word 991 ("anti") appears 1 time.
Word 992 ("arriv") appears 1 time.
Word 993 ("austria") appears 1 time.
Word 994 ("caltech") appears 2 time.
Word 995 ("distinguish") appears 1 time.
Word 996 ("german") appears 1 time.
Word 997 ("germani") appears 3 time.
Word 998 ("hitler") appears 1 time.
Word 999 ("livesey") appears 2 time.
Word 1000 ("motto") appears 2 time.
Word 1001 ("order") appear

Train LDA model using gensim models LdaMulticore and save to lda_model

In [31]:
# Fit an 8-topic LDA model on the bag-of-words corpus, using the dictionary to map ids to words.
lda_model =gensim.models.LdaMulticore(bow_corpus,
                                       num_topics=8,
                                       id2word= dictionary,
                                       passes=10, # passes over the corpus (similar to epochs)
                                       workers = 2) # number of parallel workers

For each topic, we explore the words that occur in that topic and their relative weights.

In [32]:
# Show every topic together with its highest-weighted words.
for topic_id, topic_terms in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(topic_id, topic_terms))
    print("\n")

Topic: 0 
Words: 0.008*"bike" + 0.005*"game" + 0.005*"team" + 0.004*"run" + 0.004*"virginia" + 0.004*"player" + 0.004*"play" + 0.004*"homosexu" + 0.003*"pitch" + 0.003*"motorcycl"


Topic: 1 
Words: 0.009*"govern" + 0.007*"armenian" + 0.006*"israel" + 0.005*"kill" + 0.005*"isra" + 0.004*"american" + 0.004*"turkish" + 0.004*"countri" + 0.004*"weapon" + 0.004*"live"


Topic: 2 
Words: 0.016*"game" + 0.014*"team" + 0.011*"play" + 0.008*"hockey" + 0.008*"player" + 0.005*"season" + 0.005*"canada" + 0.004*"leagu" + 0.004*"score" + 0.004*"toronto"


Topic: 3 
Words: 0.010*"card" + 0.010*"window" + 0.008*"driver" + 0.007*"sale" + 0.006*"price" + 0.005*"speed" + 0.005*"appl" + 0.005*"monitor" + 0.005*"video" + 0.005*"drive"


Topic: 4 
Words: 0.014*"file" + 0.010*"program" + 0.009*"window" + 0.006*"encrypt" + 0.006*"chip" + 0.006*"data" + 0.006*"imag" + 0.006*"avail" + 0.005*"version" + 0.004*"code"


Topic: 5 
Words: 0.012*"space" + 0.009*"nasa" + 0.006*"scienc" + 0.005*"orbit" + 0.004*"resear

# Exercise 2

## Word Analogies

In this demo, we will show you how to train and evaluate Word2Vec models in your business data using NLP.



In [33]:
import warnings
warnings.filterwarnings('ignore')
from nltk.tokenize import sent_tokenize, word_tokenize 
import gensim 
from gensim.models import Word2Vec

In [38]:
# Read the whole corpus file. The with-block guarantees the file handle is
# closed afterwards, even if reading raises.
with open("word_analogy.txt", "r", encoding='cp1252') as sample:
    s = sample.read()

In [39]:
# Replace every newline with a space so the text becomes one continuous string.
f = s.replace("\n", " ") 

In [40]:
# Container for the tokenized sentences used to train the Word2Vec models.
data=[]

In [41]:
# Split the text into sentences, then each sentence into lowercased word tokens.
# The list is built from scratch here (rather than appended to a list created
# in another cell) so that re-running this cell does not duplicate sentences.
data = [
    [token.lower() for token in word_tokenize(sentence)]
    for sentence in sent_tokenize(f)
]

Create CBOW Model and print results



In [42]:
# Train a Word2Vec model on the tokenized sentences: 100-dimensional vectors, context window of 5.
# NOTE(review): `size` is the gensim 3.x parameter name; gensim 4 renamed it to
# `vector_size` — confirm against the installed gensim version.
model1 = gensim.models.Word2Vec(data, min_count = 1,  
                              size = 100, window = 5)

In [43]:
# Query similarities through the model's KeyedVectors (`.wv`). Calling
# `similarity` directly on the model is deprecated in gensim 3.x and removed in
# gensim 4, while `.wv.similarity` works in both.
print("Cosine similarity between 'alice' " +
      "and 'wonderland' - CBOW : ",
      model1.wv.similarity('alice', 'wonderland'))

print("Cosine similarity between 'alice' " +
      "and 'machines' - CBOW : ",
      model1.wv.similarity('alice', 'machines'))

Cosine similarity between 'alice' and 'wonderland' - CBOW :  0.9994715
Cosine similarity between 'alice' and 'machines' - CBOW :  0.9927303


Create a Skip Gram Model and print results

In [44]:
# Train a skip-gram Word2Vec model (sg = 1): 100-dimensional vectors, context window of 5.
# NOTE(review): `size` is the gensim 3.x parameter name; gensim 4 renamed it to
# `vector_size` — confirm against the installed gensim version.
model2 = gensim.models.Word2Vec(data, min_count = 1, size = 100, 
                                             window = 5, sg = 1) 

In [45]:
# Query similarities through the model's KeyedVectors (`.wv`). Calling
# `similarity` directly on the model is deprecated in gensim 3.x and removed in
# gensim 4, while `.wv.similarity` works in both.
print("Cosine similarity between 'alice' " +
      "and 'wonderland' - Skip Gram : ",
      model2.wv.similarity('alice', 'wonderland'))

print("Cosine similarity between 'alice' " +
      "and 'machines' - Skip Gram : ",
      model2.wv.similarity('alice', 'machines'))

Cosine similarity between 'alice' and 'wonderland' - Skip Gram :  0.89384156
Cosine similarity between 'alice' and 'machines' - Skip Gram :  0.8489359
