In [1]:
import numpy as np
import pandas as pd
import spacy
from nltk.corpus import stopwords
from string import punctuation

In [2]:
comments_df = pd.read_csv('./comments_with_score.csv',index_col=0)

In [3]:
comments_df.head()

Unnamed: 0,article_id,comments,is_reply,neg,neu,pos,compound
0,0.0,What's the point of studying so much ended up ...,0.0,0.0,0.872,0.128,0.7096
1,0.0,No matter what kind of streaming or subject ba...,0.0,0.156,0.76,0.084,-0.8555
2,0.0,Seems to be that the purpose of this system is...,1.0,0.0,0.844,0.156,0.6322
3,0.0,This feels like just another diversion from RE...,0.0,0.045,0.797,0.159,0.8981
4,0.0,Isn’t a “real” issue the boxing of kids into s...,1.0,0.0,0.69,0.31,0.6597


## Gensim LDA with lemmatization

In [4]:
#Get english stopwords from nltk
nltk_stops = stopwords.words('english')

In [5]:
#Instantiate tokenizer
nlp = spacy.load('en_core_web_lg')

In [6]:
#Add curly double quotes to punctuation list
punctuation = punctuation + '“' + '”'

In [7]:
def clean_text(article):
    '''
    This function takes in a string and returns the string with stopwords and punctuation removed, plus words are lemmatized unless they are pronouns.
    '''
    #Remove stopwords and punctuation
    removed_stopwords = " ".join(i for i in article.lower().split() if i not in nltk_stops)
    removed_punctuation = "".join(i for i in removed_stopwords if i not in punctuation)
    
    #Tokenize words
    doc = nlp(removed_punctuation)
    #Lemmatize words
    lemmatized = " ".join(token.lemma_ if token.pos_ != 'PRON' else token.text for token in doc )
    return lemmatized

In [8]:
#Try cleaning the first comment
clean_text(comments_df['comments'][0])

'what s point study much end work clean taxi driver university graduate rest position fill foreign talent work apply singapore citizenship go back china join people liberation army drop singapore citizenship'

In [9]:
#Add the cleaned comments to the DataFrame
comments_df['cleaned comments'] = comments_df['comments'].apply(clean_text)

In [10]:
import gensim
from gensim import corpora,models

In [11]:
#Convert each comment into a list of words
text_list = [text.split() for text in comments_df['cleaned comments']]

# Create the term dictionary of our corpus, where every unique term is assigned an index. 
dictionary = corpora.Dictionary(text_list)
print(dictionary)

Dictionary(2135 unique tokens: ['apply', 'army', 'back', 'china', 'citizenship']...)


In [12]:
#Convert corpus into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in text_list]

In [13]:
#Create the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

#Run and train LDA model on the document term matrix for 2 topics
ldamodel = Lda(doc_term_matrix, num_topics=2, id2word = dictionary, passes = 100, random_state = 42)

In [14]:
from pprint import pprint

In [15]:
#Print topic number and most contributing words
pprint(ldamodel.print_topics(num_words=15))

[(0,
  '0.013*"student" + 0.011*"school" + 0.010*"stream" + 0.009*"good" + '
  '0.007*"go" + 0.006*"take" + 0.006*"education" + 0.006*"system" + '
  '0.006*"change" + 0.006*"one" + 0.005*"policy" + 0.004*"parent" + '
  '0.004*"minister" + 0.004*"child" + 0.004*"study"'),
 (1,
  '0.011*"singapore" + 0.009*"like" + 0.007*"even" + 0.006*"bad" + '
  '0.006*"many" + 0.006*"not" + 0.006*"student" + 0.006*"singaporean" + '
  '0.006*"people" + 0.006*"system" + 0.006*"stream" + 0.005*"chinese" + '
  '0.005*"year" + 0.005*"normal" + 0.005*"get"')]


In [16]:
#Run and train LDA model on the document term matrix for 3 topics
ldamodel3 = Lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes = 100, random_state = 42)

In [17]:
#Print topic number and most contributing words
pprint(ldamodel3.print_topics(num_words=15))

[(0,
  '0.013*"school" + 0.010*"student" + 0.010*"stream" + 0.007*"good" + '
  '0.007*"one" + 0.006*"go" + 0.006*"normal" + 0.005*"system" + 0.005*"class" '
  '+ 0.005*"uturn" + 0.005*"child" + 0.005*"like" + 0.005*"education" + '
  '0.004*"lose" + 0.004*"many"'),
 (1,
  '0.012*"singapore" + 0.010*"like" + 0.008*"student" + 0.007*"bad" + '
  '0.007*"system" + 0.007*"stream" + 0.007*"even" + 0.007*"year" + '
  '0.006*"chinese" + 0.006*"many" + 0.006*"singaporean" + 0.006*"people" + '
  '0.005*"education" + 0.005*"kid" + 0.005*"government"'),
 (2,
  '0.010*"good" + 0.009*"student" + 0.009*"take" + 0.008*"get" + '
  '0.007*"change" + 0.006*"stream" + 0.006*"policy" + 0.006*"school" + '
  '0.006*"education" + 0.006*"streaming" + 0.005*"people" + 0.005*"pap" + '
  '0.005*"not" + 0.005*"job" + 0.005*"need"')]


In [18]:
#Run and train LDA model on the document term matrix for 4 topics
ldamodel4 = Lda(doc_term_matrix, num_topics=4, id2word = dictionary, passes = 100, random_state = 42)

In [19]:
#Print topic number and most contributing words
pprint(ldamodel4.print_topics(num_words=15))

[(0,
  '0.016*"school" + 0.013*"student" + 0.012*"stream" + 0.008*"good" + '
  '0.008*"go" + 0.007*"normal" + 0.007*"one" + 0.006*"uturn" + 0.006*"teacher" '
  '+ 0.005*"express" + 0.005*"lose" + 0.005*"minister" + 0.005*"system" + '
  '0.005*"mix" + 0.005*"like"'),
 (1,
  '0.014*"singapore" + 0.011*"like" + 0.009*"bad" + 0.007*"year" + '
  '0.007*"even" + 0.007*"singaporean" + 0.006*"civil" + 0.006*"education" + '
  '0.006*"system" + 0.006*"many" + 0.005*"chinese" + 0.005*"or" + '
  '0.005*"government" + 0.005*"world" + 0.005*"not"'),
 (2,
  '0.011*"student" + 0.011*"take" + 0.011*"good" + 0.009*"get" + '
  '0.007*"change" + 0.007*"policy" + 0.007*"streaming" + 0.007*"need" + '
  '0.007*"stream" + 0.006*"job" + 0.006*"education" + 0.006*"people" + '
  '0.006*"pap" + 0.005*"also" + 0.004*"want"'),
 (3,
  '0.015*"student" + 0.009*"system" + 0.009*"stream" + 0.008*"class" + '
  '0.007*"not" + 0.007*"school" + 0.007*"many" + 0.007*"normal" + '
  '0.006*"nonchinese" + 0.006*"would" + 0.006

<div class='alert alert-block alert-warning'>
    There does not seem to be distinct separation of topics using gensim. There is a lot of overlap for the top contributing words for each topic.
</div>

In [None]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

In [20]:
#Visualise topics and most relevant terms
pyLDAvis.gensim.prepare(ldamodel3, doc_term_matrix, dictionary)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


## Include Bigrams

In [21]:
from gensim.models import Phrases

In [22]:
#Convert each comment into a list of unigrams
text = [line.split() for line in comments_df['comments']]

In [23]:
# Add bigrams to documents (minimum 10 occurances)
bigrams = Phrases(text, min_count=10)

for index in range(len(text)):
    #run through tokens including the bigrams 
    for word in bigrams[text[index]]:
        #Check if token is a bigram
        if '_' in word:
            #Add bigram to the doc
            text[index].append(word)

In [24]:
#Create a new column for the comments with bigrams, convert the lists of unigrams and bigrams into individual comments
comments_df['bigram_text'] = [' '.join(line) for line in text] 

In [25]:
def clean2(text):
    '''
    This function takes in a string and returns the string with stopwords and punctuation removed, plus words are lemmatized unless they are pronouns.
    '''
    #Remove stopwords and punctuation
    removed_stopwords = " ".join(i for i in text.lower().split() if i not in nltk_stops)
    removed_punctuation = "".join(i for i in removed_stopwords if i not in punctuation or i == '_')
    
    #Tokenize words
    doc = nlp(removed_punctuation)
    #Lemmatize words
    lemmatized = " ".join(token.lemma_ if token.pos_ != 'PRON' else token.text for token in doc)
    return lemmatized

In [26]:
#Add the cleaned bigram text to the DataFrame
comments_df['cleaned_bigram_text'] = comments_df['bigram_text'].apply(clean2)

In [27]:
#Convert each comment into a list of unigrams/bigrams
text_list_2 = [text.split() for text in comments_df['cleaned_bigram_text']]

In [28]:
#Create a gensim dictionary
dictionary_bigram = corpora.Dictionary(text_list_2)
print(dictionary_bigram)

Dictionary(2146 unique tokens: ['apply', 'army', 'back', 'china', 'citizenship']...)


<div class='alert alert-block alert-warning'>
    Only 11 more words than previously, seems unlikely to have much impact.
</div>

In [29]:
#Convert corpus into Document Term Matrix using dictionary prepared above.
dtm_2 = [dictionary_bigram.doc2bow(doc) for doc in text_list_2]

In [30]:
#Run and train LDA model on the document term matrix for 2 topics
lda_bg = Lda(dtm_2, num_topics=2, id2word = dictionary_bigram, passes = 100, random_state = 42)

In [31]:
#Print topic number and most contributing words
pprint(lda_bg.show_topics(num_words=15))

[(0,
  '0.016*"student" + 0.012*"stream" + 0.011*"good" + 0.009*"school" + '
  '0.007*"normal" + 0.006*"express" + 0.006*"one" + 0.006*"system" + '
  '0.006*"kid" + 0.006*"many" + 0.006*"take" + 0.006*"education" + 0.005*"get" '
  '+ 0.005*"streaming" + 0.005*"like"'),
 (1,
  '0.012*"singapore" + 0.011*"_" + 0.009*"like" + 0.007*"year" + 0.007*"bad" + '
  '0.007*"even" + 0.007*"chinese" + 0.006*"singaporean" + 0.006*"world" + '
  '0.005*"government" + 0.005*"system" + 0.005*"civil" + 0.005*"foreign" + '
  '0.005*"education" + 0.005*"do"')]


In [32]:
#Run and train LDA model on the document term matrix for 3 topics
lda_bg_3 = Lda(dtm_2, num_topics=3, id2word = dictionary_bigram, passes = 100, random_state = 42)

In [33]:
#Print topic number and most contributing words
pprint(lda_bg_3.show_topics(num_words=15))

[(0,
  '0.014*"stream" + 0.014*"student" + 0.012*"good" + 0.011*"normal" + '
  '0.009*"express" + 0.008*"school" + 0.007*"many" + 0.007*"kid" + 0.007*"one" '
  '+ 0.007*"go" + 0.006*"take" + 0.006*"like" + 0.005*"year" + 0.005*"people" '
  '+ 0.005*"study"'),
 (1,
  '0.007*"minister" + 0.007*"local" + 0.007*"foreign" + 0.006*"job" + '
  '0.006*"university" + 0.005*"issue" + 0.005*"singapore" + 0.005*"take" + '
  '0.005*"elite" + 0.005*"chinese" + 0.005*"change" + 0.005*"like" + '
  '0.004*"year" + 0.004*"do" + 0.004*"not"'),
 (2,
  '0.010*"system" + 0.010*"singapore" + 0.009*"_" + 0.009*"like" + '
  '0.008*"student" + 0.008*"school" + 0.007*"education" + 0.007*"bad" + '
  '0.006*"get" + 0.005*"people" + 0.005*"even" + 0.005*"stream" + '
  '0.005*"class" + 0.005*"not" + 0.005*"change"')]


In [34]:
#Run and train LDA model on the document term matrix for 4 topics
lda_bg_4 = Lda(dtm_2, num_topics=3, id2word = dictionary_bigram, passes = 100, random_state = 42)

In [35]:
#Print topic number and most contributing words
pprint(lda_bg_4.show_topics(num_words=15))

[(0,
  '0.014*"stream" + 0.014*"student" + 0.012*"good" + 0.011*"normal" + '
  '0.009*"express" + 0.008*"school" + 0.007*"many" + 0.007*"kid" + 0.007*"one" '
  '+ 0.007*"go" + 0.006*"take" + 0.006*"like" + 0.005*"year" + 0.005*"people" '
  '+ 0.005*"study"'),
 (1,
  '0.007*"minister" + 0.007*"local" + 0.007*"foreign" + 0.006*"job" + '
  '0.006*"university" + 0.005*"issue" + 0.005*"singapore" + 0.005*"take" + '
  '0.005*"elite" + 0.005*"chinese" + 0.005*"change" + 0.005*"like" + '
  '0.004*"year" + 0.004*"do" + 0.004*"not"'),
 (2,
  '0.010*"system" + 0.010*"singapore" + 0.009*"_" + 0.009*"like" + '
  '0.008*"student" + 0.008*"school" + 0.007*"education" + 0.007*"bad" + '
  '0.006*"get" + 0.005*"people" + 0.005*"even" + 0.005*"stream" + '
  '0.005*"class" + 0.005*"not" + 0.005*"change"')]


<div class='alert alert-block alert-warning'>
    There might be a bit of indication of politics for 2 and 3 topics, with words such as government, system, civil and minister, local, foreign respectively. However, there is still quite a bit of overlap and it feels like quite the stretch. I'll take a look a the top relevant terms and their relative contributions to get a better picture.
</div>

In [36]:
#Visualise topics and most relevant terms for 2 topics
pyLDAvis.gensim.prepare(lda_bg, dtm_2, dictionary_bigram)

In [37]:
#Visualise topics and most relevant terms for 3 topics
pyLDAvis.gensim.prepare(lda_bg_3, dtm_2, dictionary_bigram)

In [38]:
#Retrieve topics and probabilities for each, for the first comment
lda_bg_3.get_document_topics(dictionary_bigram.doc2bow(text_list_2[0]))

[(0, 0.011714848), (1, 0.97677267), (2, 0.011512484)]

In [39]:
#Create function to sort by the second item in the tuple
def sort_by_second(tuple_item):
    return tuple_item[1]

#Retrieve the most probable topic, for the first comment
sorted(lda_bg_3.get_document_topics(dictionary_bigram.doc2bow(text_list_2[0])),key=sort_by_second, reverse= True)[0][0]

1

In [40]:
#Create function to retrieve most probable topic
def get_topic(text_entry):
    return sorted(lda_bg_3.get_document_topics(dictionary_bigram.doc2bow(text_entry)),key=sort_by_second, reverse= True)[0][0]

In [41]:
#Add the topics to the DataFrame
comments_df['topics'] = [get_topic(comment) for comment in text_list_2]

In [42]:
#Print the number of comments for each topic
for i in range(3):
    print('No. of comments for topic {}: '.format(i),len(comments_df[comments_df['topics']==i]))

No. of comments for topic 0:  96
No. of comments for topic 1:  58
No. of comments for topic 2:  111


In [43]:
import random

In [44]:
def show_random_5(topic_num):
    '''
    This function takes in a topic number and prints out 5 random comments belonging to that topic
    '''
    #for reproducibility
    random.seed(99)
    #randomly pick 5 indexes
    for index in random.sample(range(len(comments_df[comments_df['topics']==topic_num])),5):
        #Print comment with the selected index
        print(comments_df[comments_df['topics']==topic_num].iloc[index]['comments'])
        print('-----'*20)
        print('\n')

In [45]:
#Show 5 random comments from topic 0
show_random_5(0)

No one takes Yahoo polls seriously. Are you new here?
----------------------------------------------------------------------------------------------------


@ Peter, most of ur tribe members cannot make it, so how to break bond?
----------------------------------------------------------------------------------------------------


No more streaming into classes but segregating students into subjects means a different way to stream!
----------------------------------------------------------------------------------------------------


China action good good
----------------------------------------------------------------------------------------------------


Look at how many students suffer mental problems. All thanks to the stressful education system
----------------------------------------------------------------------------------------------------




In [46]:
#Show 5 random comments from topic 1
show_random_5(1)

Why need to study much, at the end, the 4ligners getting the jobs
----------------------------------------------------------------------------------------------------


WHAT ABOUT SEPS SCHOOLS?
this is THE major stigma that leads to elitism that forms the distinct gaps!
such move will only distinguish this elite groups further away from the mass, holding them 'untouchable' in moral and social.
likewise, what about educators themselves????
More
----------------------------------------------------------------------------------------------------


CEOs tell Trump they are hiring more Americans without college degrees:
WASHINGTON (Reuters) - Chief executives of major companies said at a White House forum on Wednesday that they are hiring more Americans without college degrees as they search to find increasingly scarce applicants for open jobs.
----------------------------------------------------------------------------------------------------


Only 1diots like you take Yahoo polls serious

In [47]:
#Show 5 random comments from topic 2
show_random_5(2)

After election, things might change again...
----------------------------------------------------------------------------------------------------


Sounds like the GRC system!
----------------------------------------------------------------------------------------------------


Even the teachers
----------------------------------------------------------------------------------------------------


Not a problem, as long as the schools are top performing and have a challenging curriculum. I hope Singapore doesn’t marginalize their school system
----------------------------------------------------------------------------------------------------


To Everyone in this Website, Especially PAP, Opposition Parties & All Singaporean,

To improve our competitiveness in Global Economy , We really must REVAMP our entire school education system , in actual fact, it should have been Done it in over 20 years ago, during the 1990s .

From this website on “ Subject-Based Banding to replace streaming in

<div class='alert alert-block alert-warning'>
    There is still a lot of overlap and topic distribution isn't clear. Perhaps unigrams without lemmatization (as was done previously for sci-kit learn's LDA and NMF topic modelling) would provide better results.
</div>

## Minimal preprocessing (no lemmatization)

In [48]:
def remove_stopwords_punctuation(article):
    '''
    This function takes in a string and returns the string with stopwords and punctuation removed
    '''
    removed_stopwords = " ".join(word for word in article.lower().split() if word not in nltk_stops)
    removed_punctuation = ''.join(letter for letter in removed_stopwords if letter not in punctuation)
    return removed_punctuation

In [49]:
#Remove stopwaords and punctuation
text_3 = [remove_stopwords_punctuation(article) for article in comments_df['comments']]

In [50]:
#Convert each comment into a list of unigrams
text_list_3 = [comment.split() for comment in text_3]

In [51]:
#Crate gensim dictionary
dictionary_3 = corpora.Dictionary(text_list_3)
print(dictionary_3)

Dictionary(2575 unique tokens: ['apply', 'army', 'back', 'china', 'citizenship']...)


In [52]:
#Convert corpus into Document Term Matrix using dictionary prepared above.
dtm_3 = [dictionary_3.doc2bow(comment) for comment in text_list_3]

In [53]:
#Run and train LDA model on the document term matrix for 2 topics
lda_min = Lda(dtm_3,num_topics=2,id2word=dictionary_3, passes=100,random_state=42)

In [54]:
#Print topic number and most contributing words
pprint(lda_min.show_topics())

[(0,
  '0.009*"students" + 0.008*"like" + 0.007*"singapore" + 0.007*"streaming" + '
  '0.007*"even" + 0.006*"good" + 0.006*"education" + 0.006*"system" + '
  '0.006*"stream" + 0.005*"many"'),
 (1,
  '0.005*"singapore" + 0.005*"streaming" + 0.005*"students" + 0.004*"system" + '
  '0.004*"foreign" + 0.004*"education" + 0.004*"years" + 0.003*"local" + '
  '0.003*"pap" + 0.003*"one"')]


In [55]:
#Run and train LDA model on the document term matrix for 3 topics
lda_min_3 = Lda(dtm_3,num_topics=3,id2word=dictionary_3, passes=100,random_state=42)

In [56]:
#Print topic number and most contributing words
pprint(lda_min_3.show_topics())

[(0,
  '0.009*"singapore" + 0.008*"like" + 0.008*"students" + 0.007*"good" + '
  '0.007*"streaming" + 0.007*"even" + 0.007*"system" + 0.006*"education" + '
  '0.005*"many" + 0.005*"people"'),
 (1,
  '0.005*"streaming" + 0.005*"education" + 0.005*"pap" + 0.004*"social" + '
  '0.004*"polls" + 0.004*"uturn" + 0.004*"years" + 0.004*"one" + '
  '0.003*"always" + 0.003*"system"'),
 (2,
  '0.012*"normal" + 0.010*"students" + 0.010*"stream" + 0.009*"kids" + '
  '0.009*"express" + 0.006*"streaming" + 0.006*"like" + 0.005*"foreign" + '
  '0.005*"many" + 0.005*"singapore"')]


In [57]:
#Run and train LDA model on the document term matrix for 4 topics
lda_min_4 = Lda(dtm_3,num_topics=4,id2word=dictionary_3, passes=100,random_state=42)

In [58]:
#Print topic number and most contributing words
pprint(lda_min_4.show_topics())

[(0,
  '0.010*"singapore" + 0.009*"like" + 0.007*"students" + 0.007*"even" + '
  '0.006*"people" + 0.006*"system" + 0.006*"good" + 0.006*"streaming" + '
  '0.005*"singaporean" + 0.005*"many"'),
 (1,
  '0.006*"pap" + 0.004*"make" + 0.004*"education" + 0.004*"low" + 0.004*"take" '
  '+ 0.003*"better" + 0.003*"like" + 0.003*"streaming" + 0.003*"days" + '
  '0.003*"question"'),
 (2,
  '0.020*"normal" + 0.016*"express" + 0.014*"kids" + 0.014*"stream" + '
  '0.010*"students" + 0.008*"many" + 0.007*"streaming" + 0.006*"like" + '
  '0.005*"school" + 0.004*"look"'),
 (3,
  '0.007*"education" + 0.007*"streaming" + 0.007*"students" + '
  '0.007*"singapore" + 0.006*"system" + 0.006*"good" + 0.005*"schools" + '
  '0.005*"foreign" + 0.005*"like" + 0.004*"years"')]


In [59]:
pyLDAvis.gensim.prepare(lda_min, dtm_3, dictionary_3)

In [60]:
pyLDAvis.gensim.prepare(lda_min_3, dtm_3, dictionary_3)

<div class='alert alert-block alert-warning'>
    Even without lemmatization, gensim's LDA did not seem to be able to pull out distinct topics from the comments. It could be due to many the pool of comments being too small, and there being many similar terms across most of the comments.
</div>