In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('data.csv')
document = data['English Text (Google Translate)']

In [3]:
document.head()

0         God bless him for being a great human being.
1    I love the humanity of our astronaut, a man wi...
2    Two great men making history. Thank you for al...
3    Nayib and Frank, two great examples of humilit...
4    Colombia lacks a precedent with this one, I am...
Name: English Text (Google Translate), dtype: object

In [4]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/smit/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

#### Lemmatize and remove stopwords



In [5]:
def lemmatize_stemming(text):
    stemmer = SnowballStemmer("english")
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [6]:
processed_docs = document.map(preprocess)
processed_docs[:10]

0                                [bless, great, human]
1    [love, human, astronaut, high, level, academ, ...
2    [great, make, histori, thank, allow, feel, pro...
3    [nayib, frank, great, exampl, humil, wisdom, s...
4    [colombia, lack, preced, faith, follow, presid...
5                           [great, proud, salvadoran]
6              [doubt, admir, salvadoran, feel, proud]
7    [success, salvadoran, charact, help, conquest,...
8                           [salvadoran, pride, greet]
9    [astronaut, doctor, colonel, frank, rubio, pro...
Name: English Text (Google Translate), dtype: object

#### Bag of Words

In [7]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [8]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [9]:
count = 0
for k,v in dictionary.iteritems():
    count+=1
    print(k,v)

0 bless
1 great
2 astronaut
3 love
4 pride
5 countri
6 proud
7 thank
8 exampl
9 frank
10 nayib
11 salvador
12 wisdom
13 bukel
14 presid
15 salvadoran
16 admir
17 good
18 peopl
19 greet
20 rubio
21 congratul
22 live
23 beauti
24 continu
25 like
26 famili
27 best
28 world


In [10]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

#### TF-IDF

In [11]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)

In [12]:
corpus_tfidf = tfidf[bow_corpus]

In [13]:
for doc in corpus_tfidf:
    print(doc)
    break

[(0, 0.45524926832276413), (1, 0.8903640287498075)]


#### Running LDA using BoW

In [14]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [15]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.129*"love" + 0.125*"bless" + 0.104*"thank" + 0.079*"presid" + 0.066*"bukel" + 0.062*"countri" + 0.061*"continu" + 0.056*"famili" + 0.053*"salvador" + 0.039*"peopl"
Topic: 1 
Words: 0.150*"peopl" + 0.149*"salvadoran" + 0.141*"great" + 0.108*"pride" + 0.081*"bless" + 0.045*"exampl" + 0.044*"congratul" + 0.041*"countri" + 0.037*"world" + 0.032*"salvador"
Topic: 2 
Words: 0.161*"presid" + 0.147*"bless" + 0.130*"salvador" + 0.072*"countri" + 0.066*"admir" + 0.058*"continu" + 0.048*"bukel" + 0.040*"beauti" + 0.039*"wisdom" + 0.037*"like"
Topic: 3 
Words: 0.179*"good" + 0.135*"thank" + 0.127*"bless" + 0.083*"world" + 0.080*"peopl" + 0.063*"salvadoran" + 0.053*"presid" + 0.050*"live" + 0.040*"love" + 0.040*"bukel"
Topic: 4 
Words: 0.117*"countri" + 0.115*"presid" + 0.096*"bukel" + 0.092*"bless" + 0.079*"like" + 0.074*"nayib" + 0.057*"thank" + 0.045*"great" + 0.041*"proud" + 0.040*"salvadoran"
Topic: 5 
Words: 0.317*"presid" + 0.145*"bukel" + 0.093*"bless" + 0.055*"famili" + 

#### Running LDA using TF-IDF

In [16]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)


for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.185*"beauti" + 0.146*"good" + 0.140*"famili" + 0.065*"presid" + 0.055*"bless" + 0.051*"like" + 0.048*"salvador" + 0.037*"continu" + 0.032*"bukel" + 0.029*"world"
Topic: 1 Word: 0.158*"world" + 0.106*"continu" + 0.095*"peopl" + 0.082*"thank" + 0.077*"salvador" + 0.062*"bless" + 0.058*"countri" + 0.041*"presid" + 0.034*"good" + 0.034*"pride"
Topic: 2 Word: 0.172*"great" + 0.138*"peopl" + 0.129*"congratul" + 0.099*"exampl" + 0.089*"bless" + 0.064*"salvador" + 0.041*"salvadoran" + 0.034*"presid" + 0.034*"wisdom" + 0.031*"rubio"
Topic: 3 Word: 0.193*"like" + 0.153*"proud" + 0.073*"great" + 0.062*"salvadoran" + 0.054*"bless" + 0.051*"countri" + 0.048*"presid" + 0.042*"famili" + 0.041*"pride" + 0.040*"peopl"
Topic: 4 Word: 0.158*"bukel" + 0.128*"love" + 0.095*"salvador" + 0.090*"presid" + 0.084*"bless" + 0.078*"nayib" + 0.062*"thank" + 0.036*"like" + 0.030*"world" + 0.028*"continu"
Topic: 5 Word: 0.227*"bless" + 0.175*"countri" + 0.102*"presid" + 0.055*"bukel" + 0.055*"love" 