# Topic modeling

Now we worked on Simon Sinek's posts to understand how to use the gensim library & LDA, let's generalize the method to the entire authors corpus.

## Libraries

In [8]:
import pandas as pd
import pickle
from gensim import matutils, models
import numpy
import scipy.sparse
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer

## Pre-processing

In [2]:
#Import the corpus
data = pd.read_pickle("contentCorpus.pkl")
#Remove useless columns for this analysis. 
data.drop(['#Reactions','#Comments','Location','Followers','Time_spent','Media_type'],axis=1,inplace=True)

In [5]:
#Because we want to define topics across all authors, we don't care about each post individually
#Thus let's group posts content by author
authorCorpus = pd.read_pickle('contentAuthorCorpus.pkl')
authorCorpus.drop(['Followers'],axis=1,inplace=True)

In [6]:
#Transform list of authors into an index
authorCorpus.set_index('Name',inplace=True)

In [7]:
#Let's transform the dataframe into a serie
authorCorpus = authorCorpus.squeeze()

In [32]:
#Let's keep nouns & adjectives only
#Let's create a function to pull out nouns & adjectives from a string of text

from nltk import word_tokenize, pos_tag

def nouns_adj(text):
    '''Given a string of text, tokenize the text and pull out only the nouns and adjectives.'''
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(nouns_adj)

In [33]:
#We transform our list of content with the function we have just created
CorpusNounsAdj = authorCorpus.apply(nouns_adj)
CorpusNounsAdj

Name
Nicholas Wyman               robert lerman healthy future work employees sk...
Jonathan Wolfer              proud new feature douglass year amazing specia...
Karen Gross                  piece i suggestions educators central part liv...
Kaia Niambi Shivers Ph.D.    i native read low re high note wonderful ark r...
Daniel Cohen-I'm Flyering    passion qualities t spot cv sourcers activitie...
                                                   ...                        
Quentin Michael Allums       career someone ’ s jog morning everyone run ’ ...
AJ Wilcox                    i excited part webinar things advertising im s...
Kevin O'Leary                crypto currencies winner demand digital curren...
Amy Blaschka                 news ’ career progress isn ’ external s intern...
Simon Sinek                  charge willing others charge people dangers or...
Name: Content, Length: 68, dtype: object

In [8]:
# We create a document-term matrix using CountVectorizer, and exclude common English stop words

cv = CountVectorizer(stop_words='english')
dataCv = cv.fit_transform(CorpusNounsAdj)
dataDtm = pd.DataFrame(dataCv.toarray(), columns=cv.get_feature_names_out(),index=CorpusNounsAdj.index)

print(dataDtm.shape)
dataDtm

(68, 45779)


Unnamed: 0_level_0,aaa,aaas,aai,aamkt,aams,aandetelevision,aapl,aaplillustrate,aarogyasetu,aaron,...,𝐲𝐨𝐮,𝗖𝗼𝗻𝗻𝗲𝗰𝘁𝗶𝗻𝗴,𝗛𝘂𝗺𝗮𝗻𝘀,𝗟𝗶𝘃𝗲,𝘼𝙍𝙀,𝘾𝙤𝙢𝙢𝙚𝙣𝙩,𝙈𝙮,𝙖𝙘𝙘𝙚𝙥𝙩𝙖𝙣𝙘𝙚,𝙗𝙚𝙡𝙤𝙬,𝙥𝙚𝙧𝙨𝙤𝙣𝙖𝙡
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Nicholas Wyman,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Jonathan Wolfer,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Karen Gross,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Kaia Niambi Shivers Ph.D.,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Daniel Cohen-I'm Flyering,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Quentin Michael Allums,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
AJ Wilcox,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,1,1,1,1,1
Kevin O'Leary,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
Amy Blaschka,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,0,0


## 1st attempt

In [9]:
# We're going to put the term-document matrix into a new gensim format, from df --> sparse matrix --> gensim corpus
sparse_counts = scipy.sparse.csr_matrix(dataDtm)
corpus = matutils.Sparse2Corpus(sparse_counts) #It's actually a dtm

In [10]:
# Gensim also requires dictionary of the all terms and their respective location in the term-document matrix
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

In [11]:
# Now that we have the corpus (term-document matrix) and id2word (dictionary of location: term),
# we need to specify two other parameters as well - the number of topics and the number of passes
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=10)
lda.print_topics()

[(0,
  '0.204*"aarp" + 0.128*"aatmanirbharapp" + 0.127*"abdi" + 0.124*"aams" + 0.045*"abacusagency" + 0.044*"abdomen" + 0.026*"aasciences" + 0.023*"aaron" + 0.022*"aback" + 0.019*"abandonment"'),
 (1,
  '0.068*"abbacchi" + 0.053*"abadikorek" + 0.052*"aberrant" + 0.049*"aberman" + 0.045*"abhay" + 0.043*"abidjan" + 0.041*"aai" + 0.040*"abhishant" + 0.037*"abbass" + 0.036*"abhishek"')]

In [12]:
#4topics
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=4, passes=5)
lda.print_topics()

[(0,
  '0.194*"abadikorek" + 0.164*"abacusagency" + 0.114*"abdomen" + 0.113*"abdus" + 0.061*"abhishek" + 0.031*"abibev" + 0.031*"abia" + 0.028*"abhisek" + 0.017*"abhi" + 0.016*"aas"'),
 (1,
  '0.122*"aarp" + 0.068*"abbacchi" + 0.052*"aberrant" + 0.048*"aberman" + 0.045*"abhay" + 0.042*"abidjan" + 0.039*"abhishant" + 0.037*"abbass" + 0.030*"ab" + 0.028*"abadesi"'),
 (2,
  '0.474*"abdi" + 0.104*"aasciences" + 0.056*"aberration" + 0.031*"aarogyasetu" + 0.024*"abe" + 0.022*"abagun" + 0.019*"aandetelevision" + 0.017*"aamkt" + 0.012*"abdication" + 0.006*"abhisek"'),
 (3,
  '0.289*"aams" + 0.279*"aatmanirbharapp" + 0.048*"aai" + 0.039*"aback" + 0.035*"abercrombie" + 0.033*"abdominal" + 0.031*"abandonment" + 0.028*"abdulkader" + 0.025*"abadesi" + 0.025*"abdullah"')]

## 2nd attempt

Let's remove some words from this topics

In [42]:
add_stop_words = ["abadesi","abbi","abdullah","abhishant","aback","aarogyasetu","abeid","aaa","abigail","aams"
                 ,"aberman","abidjan","abhishek","abbass","abhishant","abacusagency","abi","ab","abdi"]
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

In [14]:
# Recreate a document-term matrix with this correction
cv = CountVectorizer(stop_words=stop_words)
data_cv = cv.fit_transform(CorpusNounsAdj)
dataDtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
dataDtm



Unnamed: 0,aaas,aai,aamkt,aandetelevision,aapl,aaplillustrate,aaron,aaronberson,aarp,aarps,...,𝐲𝐨𝐮,𝗖𝗼𝗻𝗻𝗲𝗰𝘁𝗶𝗻𝗴,𝗛𝘂𝗺𝗮𝗻𝘀,𝗟𝗶𝘃𝗲,𝘼𝙍𝙀,𝘾𝙤𝙢𝙢𝙚𝙣𝙩,𝙈𝙮,𝙖𝙘𝙘𝙚𝙥𝙩𝙖𝙣𝙘𝙚,𝙗𝙚𝙡𝙤𝙬,𝙥𝙚𝙧𝙨𝙤𝙣𝙖𝙡
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
64,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,1,1,1,1,1
65,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
66,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,0,0


In [15]:
# We're going to put the term-document matrix into a new gensim format, from df --> sparse matrix --> gensim corpus
sparse_counts = scipy.sparse.csr_matrix(dataDtm)
corpus = matutils.Sparse2Corpus(sparse_counts) #It's actually a dtm

In [18]:
#4topics
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=4, passes=10)
lda.print_topics()

[(0,
  '0.486*"aams" + 0.155*"aai" + 0.045*"abad" + 0.026*"abbey" + 0.023*"abia" + 0.021*"abagun" + 0.005*"abcsharktank" + 0.002*"aarti" + 0.001*"aaas" + 0.000*"abbott"'),
 (1,
  '0.288*"aarp" + 0.195*"abdi" + 0.083*"aberrant" + 0.057*"aaron" + 0.046*"aasciences" + 0.037*"abadesi" + 0.030*"aback" + 0.030*"abdullah" + 0.027*"aberration" + 0.024*"abbi"'),
 (2,
  '0.071*"abbacchi" + 0.063*"abadikorek" + 0.058*"aberman" + 0.054*"abhay" + 0.051*"abidjan" + 0.048*"abhishant" + 0.045*"abbass" + 0.043*"abhishek" + 0.037*"ab" + 0.036*"abdus"'),
 (3,
  '0.420*"aatmanirbharapp" + 0.118*"abdomen" + 0.062*"abercrombie" + 0.049*"abdominal" + 0.046*"abandonment" + 0.045*"abdulkader" + 0.021*"abe" + 0.017*"abdul" + 0.015*"abdullah" + 0.008*"abhi"')]

Ok.  
So there are a lot of words with no sense.

## 3rd attempt

We will try to remove words with no sense from the corpus.

To do so, we will use the dictionary present in NLTK : nltk.words.

It contains nearly all english words. 


In [5]:
import nltk
nltk.download('words')

englishWordsDic = nltk.corpus.words.words()
print ("There are {} words in this dictionary".format(len(englishWordsDic)))

There are 236736 words in this dictionary


[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Jeremy\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [90]:
#Let's create the function to filter only english words
def isEnglish(text):
    '''Given a string of text, tokenize the text and check if each word is an english word.'''
    tokenized = word_tokenize(text)
    englishWord = [word for word in tokenized if (word in englishWordsDic) == True] 
    return ' '.join(englishWord)

Now we have defined the function, we can apply it to the corpus.  
However, because it take a very long time to process, we will focus on two authors content at first and apply LDA to it.

In [92]:
corpusTest = CorpusNounsAdj.iloc[[6,67]]
corpusTest

Name
Dale Corley    true words nathan stephens true true proud emp...
Simon Sinek    charge willing others charge people dangers or...
Name: Content, dtype: object

In [93]:
corpusTestCleaned = corpusTest.apply(isEnglish)
corpusTestCleaned

Name
Dale Corley    true true true business brand true true true b...
Simon Sinek    charge willing charge people organization orga...
Name: Content, dtype: object

In [94]:
#We define stop words
stop_words = text.ENGLISH_STOP_WORDS

In [21]:
#Document term matrix
cv = CountVectorizer(stop_words=stop_words)
data_cv = cv.fit_transform(corpusTest)
dataDtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
dataDtm

NameError: name 'stop_words' is not defined

In [96]:
# Gensim also requires dictionary of the all terms and their respective location in the term-document matrix
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

In [97]:
# We're going to put the term-document matrix into a new gensim format, from df --> sparse matrix --> gensim corpus
sparse_counts = scipy.sparse.csr_matrix(dataDtm)
corpus = matutils.Sparse2Corpus(sparse_counts) #It's actually a dtm

In [98]:
#4topics
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=4, passes=20)
lda.print_topics()

[(0,
  '0.001*"abitofoptimism" + 0.001*"ability" + 0.001*"polarized" + 0.001*"possibilities" + 0.001*"positivity" + 0.001*"positive" + 0.001*"porqué" + 0.001*"popular" + 0.001*"poet" + 0.001*"players"'),
 (1,
  '0.001*"abitofoptimism" + 0.001*"ability" + 0.001*"polarized" + 0.001*"possibilities" + 0.001*"positivity" + 0.001*"positive" + 0.001*"porqué" + 0.001*"popular" + 0.001*"poet" + 0.001*"players"'),
 (2,
  '0.916*"abitofoptimism" + 0.000*"ability" + 0.000*"polarized" + 0.000*"possibilities" + 0.000*"positivity" + 0.000*"positive" + 0.000*"porqué" + 0.000*"popular" + 0.000*"poet" + 0.000*"players"'),
 (3,
  '0.077*"ability" + 0.001*"abitofoptimism" + 0.001*"polarized" + 0.001*"possibilities" + 0.001*"positivity" + 0.001*"positive" + 0.001*"porqué" + 0.001*"popular" + 0.001*"poet" + 0.001*"players"')]

In [85]:
"abitofoptimism" in englishWordsDic

False

### Partial conclusions

The LDA algorithm doesn't work.  
It's probably due to the fact that we have only few document (we gather posts by author).  
So let's use each post individually and do LDA on that.

## 4th attempt

In [3]:
#Import the corpus
contentCorpus = pd.read_pickle("contentCorpus.pkl")
#Remove useless columns for this analysis. 
contentCorpus.drop(['Name','#Reactions','#Comments','Location','Followers','Time_spent','Media_type'],axis=1,inplace=True)
contentCorpus

Unnamed: 0,Content
0,robert lerman writes that achieving a healthy...
1,national disability advocate sara hart weir m...
3,exploring in this months talent management hr...
4,i count myself fortunate to have spent time wi...
5,online job platforms are a different way of wo...
...,...
34007,igniter of the year well i know that im an op...
34008,executives who prioritize the shareholder are ...
34009,like many i too have been reflecting as we nea...
34010,if you say customer first that means your empl...


In [4]:
import nltk
from nltk import word_tokenize, pos_tag

In [4]:
nltk.download('words')
englishWordsDic = nltk.corpus.words.words()
print ("There are {} words in this dictionary".format(len(englishWordsDic)))

NameError: name 'nltk' is not defined

Let's create the function to filter only english nouns & adjectives

In [17]:
#Let's keep nouns & adjectives only
#Let's create a function to pull out nouns & adjectives from a string of text

def corpusCleaning(text):
    '''Given a string of text, tokenize the text and pull out only the nouns and adjectives belonging to english dictionary.'''
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    is_english = lambda word : word in englishWordsDic
    tokenized = word_tokenize(text)
    cleanedtext = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos) and is_english(word)] 
    return ' '.join(cleanedtext)

In [9]:
#Transform the corpus into a Serie
contentCorpus = contentCorpus.squeeze()
contentCorpus

0        robert lerman  writes that achieving a healthy...
1        national disability advocate  sara hart weir m...
3        exploring in this months talent management  hr...
4        i count myself fortunate to have spent time wi...
5        online job platforms are a different way of wo...
                               ...                        
34007    igniter of the year  well i know that im an op...
34008    executives who prioritize the shareholder are ...
34009    like many i too have been reflecting as we nea...
34010    if you say customer first that means your empl...
34011    the small work hard to serve themselves in a b...
Name: Content, Length: 31996, dtype: object

In [26]:
testCorpus = contentCorpus[2:20]
testCorpus

3     exploring in this months talent management  hr...
4     i count myself fortunate to have spent time wi...
5     online job platforms are a different way of wo...
6     between the burgeoning unemployment rates and ...
7     this years national apprenticeship week comes ...
8     coaching   learninganddevelopment   workforcet...
9     look forward to joining the conversation with ...
10    learning pods aren’t just a group of people co...
11    congratulations to dr mark goulston on recogni...
12    i wanted to hear from companies that take out ...
13    lexington ky — the us bureau of labor statisti...
14    agree with  zach boren  johns post highlights ...
15    its been a bumpy road to a  recovery countries...
16    the rapidly changing nature of work requires b...
17    recessions are tough on everyone but they hit ...
18    the imf predicts we will be entering the worst...
19              some  practicaltips  from kathryn vasel
21    the health care sector is multifaceted and

In [27]:
cleanedCorpus = testCorpus.apply(corpusCleaning)
cleanedCorpus

3     talent management company modern apprenticeshi...
4     i count time packer assistant secretary policy...
5     job different way times international example ...
6     unemployment stillness more more right time covid
7     national apprenticeship week time creative way...
8                                                      
9     forward conversation upcoming summit question ...
10    t group people everyone focus particular proje...
11    mark recognition series call inspiration mark ...
12                        best diversity i couple cisco
13    bureau labor statistics unemployment more perc...
14                   agree post importance young people
15    bumpy road recovery free action second wave ap...
16    nature work creative recruitment talent develo...
17    tough everyone young people hard youth economy...
18    worst global recession downturn par great depr...
19                                                     
21    health care sector complex let pivot such 

In [28]:
#We define stop words
stop_words = text.ENGLISH_STOP_WORDS
#Document term matrix
cv = CountVectorizer(stop_words=stop_words)
data_cv = cv.fit_transform(cleanedCorpus)
dataDtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
# Gensim also requires dictionary of the all terms and their respective location in the term-document matrix
id2word = dict((v, k) for k, v in cv.vocabulary_.items())
# We're going to put the term-document matrix into a new gensim format, from df --> sparse matrix --> gensim corpus
sparse_counts = scipy.sparse.csr_matrix(dataDtm)
corpus = matutils.Sparse2Corpus(sparse_counts) #It's actually a dtm
#4topics
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=80)
lda.print_topics()

[(0,
  '0.201*"assistant" + 0.091*"conversation" + 0.077*"agree" + 0.062*"carter" + 0.055*"best" + 0.033*"bureau" + 0.005*"cisco" + 0.004*"administration" + 0.004*"apprentice" + 0.004*"care"'),
 (1,
  '0.186*"administration" + 0.089*"action" + 0.084*"company" + 0.057*"business" + 0.051*"apprentice" + 0.046*"bumpy" + 0.045*"cisco" + 0.040*"coaching" + 0.030*"anxiety" + 0.029*"care"')]

And now for the entire dataset : contentCorpus

In [29]:
cleanedCorpus = contentCorpus.apply(corpusCleaning)
cleanedCorpus

0        healthy future work productive ways apprentice...
1        national disability advocate hart weir congres...
3        talent management company modern apprenticeshi...
4        i count time packer assistant secretary policy...
5        job different way times international example ...
                               ...                        
34007    igniter year optimist idealist i world differe...
34008            shareholder coach fair weather needs team
34009    many end year golden circle origin i share jou...
34010                         customer least second people
34011          small work big way big work world small way
Name: Content, Length: 31996, dtype: object

In [30]:
#32 minutes to process (due to "in English Dic")
cleanedCorpus.to_pickle('cleanedPostsCorpus.pkl')

In [67]:
reducedCorpus = cleanedCorpus.iloc[:10000]
reducedCorpus

0        healthy future work productive ways apprentice...
1        national disability advocate hart weir congres...
3        talent management company modern apprenticeshi...
4        i count time packer assistant secretary policy...
5        job different way times international example ...
                               ...                        
10657                   virtual accelerator investor forum
10658                                          better seed
10659                   virtual accelerator investor forum
10660                         accelerator entrepreneurship
10661                              building strategy cloud
Name: Content, Length: 10000, dtype: object

In [68]:
#We define stop words
stop_words = text.ENGLISH_STOP_WORDS
#Document term matrix
cv = CountVectorizer(stop_words=stop_words)
data_cv = cv.fit_transform(reducedCorpus)
dataDtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
# Gensim also requires dictionary of the all terms and their respective location in the term-document matrix
id2word = dict((v, k) for k, v in cv.vocabulary_.items())
# We're going to put the term-document matrix into a new gensim format, from df --> sparse matrix --> gensim corpus
sparse_counts = scipy.sparse.csr_matrix(dataDtm)
corpus = matutils.Sparse2Corpus(sparse_counts) #It's actually a dtm
# #4topics
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=1, passes=10)
lda.print_topics()

IndexError: index 8419 is out of bounds for axis 1 with size 7608

There is a bug in the LDA algorithm.  
Let's try to figure out what makes this bug occurs !

## 5th attempt

Let's take all authors posts dataset, and apply only Nouns & Adj function on it.  
In other words, we don't use check if a word is in the english dictionary.  
In this way, we will see if our algorithm has a problem.

In [6]:
#Import the corpus
contentCorpus = pd.read_pickle("contentCorpus.pkl")
#Remove useless columns for this analysis. 
contentCorpus.drop(['Name','#Reactions','#Comments','Location','Followers','Time_spent','Media_type'],axis=1,inplace=True)
contentCorpus

Unnamed: 0,Content
0,robert lerman writes that achieving a healthy...
1,national disability advocate sara hart weir m...
3,exploring in this months talent management hr...
4,i count myself fortunate to have spent time wi...
5,online job platforms are a different way of wo...
...,...
34007,igniter of the year well i know that im an op...
34008,executives who prioritize the shareholder are ...
34009,like many i too have been reflecting as we nea...
34010,if you say customer first that means your empl...


In [7]:
import nltk
from nltk import word_tokenize, pos_tag
nltk.download('words')
englishWordsDic = nltk.corpus.words.words()
print ("There are {} words in this dictionary".format(len(englishWordsDic)))

There are 236736 words in this dictionary


[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Jeremy\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [8]:
#Let's keep nouns & adjectives only
#Let's create a function to pull out nouns & adjectives from a string of text

def corpusCleaning(text):
    '''Given a string of text, tokenize the text and pull out only the nouns and adjectives belonging to english dictionary.'''
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    cleanedtext = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(cleanedtext)

In [10]:
#Transform the corpus into a Serie
contentCorpus = contentCorpus.squeeze()
contentCorpus

0        robert lerman  writes that achieving a healthy...
1        national disability advocate  sara hart weir m...
3        exploring in this months talent management  hr...
4        i count myself fortunate to have spent time wi...
5        online job platforms are a different way of wo...
                               ...                        
34007    igniter of the year  well i know that im an op...
34008    executives who prioritize the shareholder are ...
34009    like many i too have been reflecting as we nea...
34010    if you say customer first that means your empl...
34011    the small work hard to serve themselves in a b...
Name: Content, Length: 31996, dtype: object

In [11]:
cleanedCorpus = contentCorpus.apply(corpusCleaning)
cleanedCorpus

0        robert lerman healthy future work employees sk...
1        national disability advocate sara hart weir ms...
3        months talent management hr company modern app...
4        i count time brooklynborn arnold packer arnold...
5        online job platforms different way times workp...
                               ...                        
34007    igniter year optimist im idealist i world diff...
34008    executives shareholder coach desires fair weat...
34009    many end year years golden circle origin i sha...
34010               customer employees least second people
34011          small work big way big work world small way
Name: Content, Length: 31996, dtype: object

In [13]:
#Transform the corpus into a Serie
contentCorpus = contentCorpus.squeeze()
contentCorpus
#We define stop words
stop_words = text.ENGLISH_STOP_WORDS
#Document term matrix
cv = CountVectorizer(stop_words=stop_words)
data_cv = cv.fit_transform(cleanedCorpus)
dataDtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
# Gensim also requires dictionary of the all terms and their respective location in the term-document matrix
id2word = dict((v, k) for k, v in cv.vocabulary_.items())
# We're going to put the term-document matrix into a new gensim format, from df --> sparse matrix --> gensim corpus
sparse_counts = scipy.sparse.csr_matrix(dataDtm)
corpus = matutils.Sparse2Corpus(sparse_counts) #It's actually a dtm
#4topics
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=6, passes=20)
lda.print_topics()

[(0,
  '0.020*"airline" + 0.012*"foreseen" + 0.010*"airing" + 0.008*"ain" + 0.006*"aira" + 0.006*"aiims" + 0.005*"formulaic" + 0.005*"aicapengage" + 0.004*"airplane" + 0.004*"aislebut"'),
 (1,
  '0.007*"formation" + 0.006*"aichess" + 0.005*"ai" + 0.004*"aid" + 0.003*"aibiased" + 0.003*"hearmetoo" + 0.003*"ahmed" + 0.003*"aihealthcare" + 0.003*"aim" + 0.003*"aibased"'),
 (2,
  '0.007*"aisle" + 0.006*"airlines" + 0.005*"airtravel" + 0.005*"airquality" + 0.005*"akande" + 0.004*"ais" + 0.004*"akali" + 0.004*"akamai" + 0.003*"aithe" + 0.003*"airtime"'),
 (3,
  '0.004*"longread" + 0.002*"akinlade" + 0.002*"procrastinate" + 0.002*"processors" + 0.001*"aissata" + 0.001*"planetorplastic" + 0.001*"flies" + 0.001*"manyif" + 0.001*"marketingevent" + 0.001*"poorly"'),
 (4,
  '0.021*"haveagreatweek" + 0.007*"aibmfin" + 0.002*"airlineindustry" + 0.002*"productinno" + 0.002*"ahsan" + 0.002*"productideas" + 0.001*"akula" + 0.001*"poster" + 0.001*"pertain" + 0.001*"imagined"'),
 (5,
  '0.015*"forgetting

OK the LDA algorithm works correctly without our english words only filter.  
That means we made a mistake in our filter function.  
Let's correct it !

## 6th attempt

In [3]:
#Import the corpus
contentCorpus = pd.read_pickle("contentCorpus.pkl")
#Remove useless columns for this analysis. 
contentCorpus.drop(['Name','#Reactions','#Comments','Location','Followers','Time_spent','Media_type'],axis=1,inplace=True)
contentCorpus

Unnamed: 0,Content
0,robert lerman writes that achieving a healthy...
1,national disability advocate sara hart weir m...
3,exploring in this months talent management hr...
4,i count myself fortunate to have spent time wi...
5,online job platforms are a different way of wo...
...,...
34007,igniter of the year well i know that im an op...
34008,executives who prioritize the shareholder are ...
34009,like many i too have been reflecting as we nea...
34010,if you say customer first that means your empl...


In [5]:
import nltk
from nltk import word_tokenize, pos_tag
nltk.download('words')
englishWordsDic = nltk.corpus.words.words()
print ("There are {} words in this dictionary".format(len(englishWordsDic)))

There are 236736 words in this dictionary


[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Jeremy\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [9]:
#Let's keep nouns & adjectives only
#Let's create a function to pull out nouns & adjectives from a string of text

def corpusCleaning(text):
    '''Given a string of text, tokenize the text and pull out only the nouns and adjectives belonging to english dictionary.'''
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    is_english = lambda word : word in englishWordsDic
    tokenized = word_tokenize(text)
    NounsAdjList = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    #Then, we check if words are english
    englishList = [word for word in NounsAdjList if is_english(word)]
    return ' '.join(englishList)

In [11]:
#Transform the corpus into a Serie
contentCorpus = contentCorpus.squeeze()
contentCorpus

0        robert lerman  writes that achieving a healthy...
1        national disability advocate  sara hart weir m...
3        exploring in this months talent management  hr...
4        i count myself fortunate to have spent time wi...
5        online job platforms are a different way of wo...
                               ...                        
34007    igniter of the year  well i know that im an op...
34008    executives who prioritize the shareholder are ...
34009    like many i too have been reflecting as we nea...
34010    if you say customer first that means your empl...
34011    the small work hard to serve themselves in a b...
Name: Content, Length: 31996, dtype: object

In [12]:
cleanedCorpus = contentCorpus.apply(corpusCleaning)
cleanedCorpus

0        healthy future work productive ways apprentice...
1        national disability advocate hart weir congres...
3        talent management company modern apprenticeshi...
4        i count time packer assistant secretary policy...
5        job different way times international example ...
                               ...                        
34007    igniter year optimist idealist i world differe...
34008            shareholder coach fair weather needs team
34009    many end year golden circle origin i share jou...
34010                         customer least second people
34011          small work big way big work world small way
Name: Content, Length: 31996, dtype: object

In [31]:
#It appears that after this cleaning, some strings are empty ""
cleanedCorpus.loc[cleanedCorpus == ''].size

783

In [34]:
#Therefore we remove these posts
cleaned2Corpus = cleanedCorpus.loc[cleanedCorpus != ""]
cleaned2Corpus.size

31213

In [35]:
#Let's save this df
cleaned2Corpus.to_pickle("cleanedPostsCorpus.pkl")

In [36]:
#We define stop words
stop_words = text.ENGLISH_STOP_WORDS
#Document term matrix
cv = CountVectorizer(stop_words=stop_words)
data_cv = cv.fit_transform(cleaned2Corpus)
dataDtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
# Gensim also requires dictionary of the all terms and their respective location in the term-document matrix
id2word = dict((v, k) for k, v in cv.vocabulary_.items())
# We're going to put the term-document matrix into a new gensim format, from df --> sparse matrix --> gensim corpus
sparse_counts = scipy.sparse.csr_matrix(dataDtm)
corpus = matutils.Sparse2Corpus(sparse_counts) #It's actually a dtm
#4topics
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=6, passes=20)
lda.print_topics()



IndexError: index 30498 is out of bounds for axis 1 with size 12565

In [41]:
print(sparse_counts)

  (0, 556)	2
  (0, 620)	1
  (0, 4563)	1
  (0, 5081)	1
  (0, 5394)	1
  (0, 5778)	1
  (0, 6757)	1
  (0, 8529)	1
  (0, 8584)	1
  (0, 8590)	1
  (0, 11965)	1
  (0, 12276)	1
  (0, 12439)	1
  (1, 14)	1
  (1, 108)	1
  (1, 211)	1
  (1, 2289)	1
  (1, 3145)	1
  (1, 5036)	1
  (1, 7255)	1
  (1, 12311)	1
  (2, 556)	1
  (2, 2164)	1
  (2, 4164)	1
  (2, 5012)	1
  :	:
  (31210, 1902)	1
  (31210, 2763)	1
  (31210, 3682)	3
  (31210, 4140)	1
  (31210, 4766)	1
  (31210, 5224)	1
  (31210, 5380)	1
  (31210, 6065)	2
  (31210, 6648)	1
  (31210, 7676)	1
  (31210, 8019)	3
  (31210, 9924)	1
  (31210, 11128)	1
  (31210, 11251)	1
  (31210, 12035)	1
  (31210, 12452)	2
  (31210, 12510)	1
  (31211, 2696)	1
  (31211, 8019)	1
  (31211, 9764)	1
  (31212, 1058)	2
  (31212, 10182)	2
  (31212, 12275)	2
  (31212, 12439)	2
  (31212, 12452)	1
