In [1]:
import re
import numpy as np
import pandas as pd
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim import similarities
from gensim.utils import  simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
from pprint import pprint



In [2]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<h3> 1. Use topic modelling on tweet’s text data, then analyse the results and 
identify issues with short text topic modelling. </h3> 

In [3]:
data_df = pd.read_csv('tweets.csv')

In [4]:
def sent_to_words(sentences):
    for sent in sentences:
        sent = re.sub('\S*@\S*\s?', '', sent)  # remove emails
        sent = re.sub('\s+', ' ', sent)  # remove newline chars
        sent = re.sub("\'", "", sent)  # remove single quotes
        sent = gensim.utils.simple_preprocess(str(sent), deacc=True) 
        yield(sent)  
        
# Convert to list
data = data_df.text.values.tolist()
data_words = list(sent_to_words(data))
print(data_words[:1])

[['my', 'son', 'and', 'went', 'on', 'tour', 'to', 'the', 'allianz', 'arena', 'we', 'were', 'admiring', 'the', 'seat', 'arena', 'when', 'he', 'suddenly', 'pointed', 'at', 'the', 'pitch', 'dad', 'who', 'are', 'those', 'men', 'camping', 'there', 'said', 'son', 'they', 'are', 'penandes', 'amp', 'penaldo', 'they', 'live', 'in', 'that', 'penalty', 'box', 'they', 'only', 'perform', 'in', 'small', 'games']]


In [5]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

def process_words(texts, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Remove Stopwords, Form Bigrams, Trigrams and Lemmatization"""
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
    texts = [bigram_mod[doc] for doc in texts]
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]
    texts_out = []

    nlp = spacy.load("en_core_web_sm")
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    # remove stopwords once more after lemmatization
    texts_out = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts_out]    
    return texts_out

data_ready = process_words(data_words)  # processed Text Data!

In [6]:
# Create Dictionary
id2word = corpora.Dictionary(data_ready)

# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in data_ready]

In [7]:
# Build LDA model 20 topics chunksize 1000
lda_20_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=1000,
                                           passes=10,
                                           alpha='symmetric',
                                           iterations=100,
                                           per_word_topics=True)

pprint(lda_20_model.print_topics())

[(0,
  '0.089*"amp" + 0.047*"great" + 0.045*"people" + 0.034*"work" + 0.030*"much" '
  '+ 0.020*"child" + 0.019*"tonight" + 0.018*"girl" + 0.017*"care" + '
  '0.016*"issue"'),
 (1,
  '0.182*"england" + 0.055*"game" + 0.052*"euro" + 0.043*"play" + 0.040*"win" '
  '+ 0.037*"team" + 0.036*"tomorrow" + 0.033*"germany" + 0.031*"football" + '
  '0.021*"well"'),
 (2,
  '0.039*"pay" + 0.029*"sure" + 0.021*"chance" + 0.021*"point" + 0.018*"uk" + '
  '0.017*"shit" + 0.017*"low" + 0.017*"understand" + 0.017*"small" + '
  '0.016*"car"'),
 (3,
  '0.044*"miss" + 0.041*"next" + 0.034*"penalty" + 0.026*"little" + '
  '0.022*"offer" + 0.019*"send" + 0.018*"write" + 0.018*"ticket" + 0.017*"min" '
  '+ 0.014*"pick"'),
 (4,
  '0.036*"pm" + 0.022*"match" + 0.020*"happen" + 0.016*"start" + 0.015*"uk" + '
  '0.015*"july" + 0.014*"club" + 0.014*"huge" + 0.013*"evening" + '
  '0.013*"claim"'),
 (5,
  '0.064*"watch" + 0.028*"bring" + 0.024*"guy" + 0.024*"give" + 0.021*"yet" + '
  '0.020*"oxford" + 0.019*"rememb

In [13]:
# save/load model
#lda_20_model.save('lda_20.model')
#lda_20_model = gensim.models.ldamodel.LdaModel.load('lda_20.model')

In [14]:
# Build LDA model 50 topics chunksize 1000
lda_50_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=50, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=1000,
                                           passes=10,
                                           alpha='symmetric',
                                           iterations=100,
                                           per_word_topics=True)

In [15]:
#lda_50_model.save('lda_50.model')
#lda_50_model = gensim.models.ldamodel.LdaModel.load('lda_50.model')

In [16]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

# aggregate lda 20 topics model
lda_20_df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_20_model, corpus=corpus, texts=data_ready)
lda_20_df_dominant_topic = lda_20_df_topic_sents_keywords.reset_index()
lda_20_df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

In [17]:
# aggregate lda 50 topics model
lda_50_df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_50_model, corpus=corpus, texts=data_ready)
lda_50_df_dominant_topic = lda_50_df_topic_sents_keywords.reset_index()
lda_50_df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

<h3> Statistics of Original Tweets</h3>

In [20]:
doc_lens = [len(d) for d in lda_20_df_dominant_topic.Text]
tweets_df = pd.DataFrame(data=doc_lens)
original = round(tweets_df.describe(),2)
original.columns = ['Original Tweets']

display(original)

Unnamed: 0,Original Tweets
count,53399.0
mean,11.54
std,6.61
min,0.0
25%,6.0
50%,10.0
75%,16.0
max,42.0


In [23]:
# Display setting to show more characters in column
pd.options.display.max_colwidth = 100

lda_20_sent_topics_sorteddf_mallet = pd.DataFrame()
lda_20_sent_topics_outdf_grpd = lda_20_df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in lda_20_sent_topics_outdf_grpd:
    lda_20_sent_topics_sorteddf_mallet = pd.concat([lda_20_sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=False).head(1)], 
                                            axis=0)

# Reset Index    
lda_20_sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
lda_20_sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]

# Show
lda_20_sent_topics_sorteddf_mallet.head(5)

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Representative Text
0,0.0,0.81,"amp, great, people, work, much, child, tonight, girl, care, issue","[fabulous, opportunity, great, place]"
1,1.0,0.8643,"england, game, euro, play, win, team, tomorrow, germany, football, well","[england, beat, germany, tomorrow, england, win]"
2,2.0,0.8227,"pay, sure, chance, point, uk, shit, low, understand, small, car","[life, city, car, car, car, car, car, car, car, car]"
3,3.0,0.8417,"miss, next, penalty, little, offer, send, write, ticket, min, pick","[suppose, hella, sunny, london, next, week]"
4,4.0,0.81,"pm, match, happen, start, uk, july, club, huge, evening, claim","[late, uk, time, pm]"


In [22]:
lda_50_sent_topics_sorteddf_mallet = pd.DataFrame()
lda_50_sent_topics_outdf_grpd = lda_50_df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in lda_50_sent_topics_outdf_grpd:
    lda_50_sent_topics_sorteddf_mallet = pd.concat([lda_50_sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=False).head(1)], 
                                            axis=0)

# Reset Index    
lda_50_sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
lda_50_sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]

# Show
lda_50_sent_topics_sorteddf_mallet.head(5)

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Representative Text
0,0.0,0.755,"people, france, work, head, die, amazing, ill, spain, walk, almost","[look, look, amazing, gal]"
1,1.0,0.6733,"death, least, cause, deserve, uk, represent, racist, binance, lord, murder","[birch, uk, ping, mum, lifer]"
2,2.0,0.6733,"way, sure, actually, boris, people, close, family, reason, young, plan","[scotch, egg, actually, scottish]"
3,3.0,0.6733,"read, future, literally, possible, mp, excited, begin, saturday, opening, normal","[blood_clot, micro, chip]"
4,4.0,0.6733,"penalty, listen, experience, together, series, instead, nowplaye, programme, dream, fast","[nick, listen, uk, drill]"


<h3> Perplexity between LDA 20 topics and LDA 50 topics </h3>

In [24]:
per_20_topics = lda_20_model.log_perplexity(corpus)
per_50_topics = lda_50_model.log_perplexity(corpus)

In [25]:
coherence_model_lda_20 = CoherenceModel(model=lda_20_model, texts=data_ready, dictionary=id2word, coherence='c_v')
coherence_lda_20 = coherence_model_lda_20.get_coherence()

coherence_model_lda_50 = CoherenceModel(model=lda_50_model, texts=data_ready, dictionary=id2word, coherence='c_v')
coherence_lda_50 = coherence_model_lda_50.get_coherence()

In [26]:
model_select_df = pd.DataFrame([
                                np.abs([per_20_topics, per_50_topics]),
                                [coherence_lda_20, coherence_lda_50]
                                ],
                                columns=['LDA_20', 'LDA_50'],
                                index=['Perplexity', 'Coherence'])
model_select_df

Unnamed: 0,LDA_20,LDA_50
Perplexity,14.815138,24.151448
Coherence,0.310247,0.37202


The LDA with 20 topics has better perplexity score than 50 topics while coherence score 50 topics have higher value. The issues for short text topic modeling for tweets data is there are a lot of factors such as sparse words, lack of clear context, and high volume of tweets data. This factor can increase memory requirement and processing time, while reduce generalization ability of the model when chose high number of topics.

<h3> 2. Group tweets based on some criteria. The idea here is to group similar tweets content wise and/or 
from the same users. Develop topic models on them and analyse the performance. </h3> 

In [27]:
# Group by user ID since ID in dataset is unique 
print("Number of all tweets:", len(data_df))
print("Number of all tweeets group by username:", len(data_df._id.unique()))
print("\n Since ID in dataset is unique which treat as different user, so LDA from Q1 can be use as group by userbaseline")

Number of all tweets: 53399
Number of all tweeets group by username: 53399

 Since ID in dataset is unique which treat as different user, so LDA from Q1 can be use as group by userbaseline


In [28]:
# Removal punctuation/digit/stopwords/unnesseary word before put back to sentence 

from string import punctuation
from nltk.tokenize import word_tokenize

stoplist = set(stopwords.words('english'))
punc = list(punctuation)

def norm_words(sent):
    #for sent in sentences:
        sent = re.sub('\S*@\S*\s?', '', sent)  # remove emails
        sent = re.sub('\s+', ' ', sent)  # remove newline chars
        sent = re.sub("\'", "", sent)  # remove single quotes
        sent = re.sub("\”", "", sent)  # remove single quotes
        sent = re.sub("\“", "", sent)  # remove single quotes
        sent = re.sub(r'http\S+', '', sent)

        for i in punc:
            sent = re.sub('\\'+i, '', sent)  

        text_tokens = word_tokenize(sent.lower())
        tokens_without_sw = [word for word in text_tokens if not word in stoplist and not word.isdigit()]
        sent = (" ").join(tokens_without_sw)
        return sent
    
# Convert to list
list_data = data_df.text.values.tolist()
list_data_word = list()
for sen in list_data:
    list_data_word.append(norm_words(sen))

In [29]:
# TFIDF Vectorizer and Cosine similarity to compare similiarity of tweets

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

vectorizer = TfidfVectorizer()
tweet_vec = vectorizer.fit_transform(list_data_word)

a = cosine_similarity(tweet_vec[0:1], tweet_vec)
X = np.stack(( a.flatten(), np.zeros_like(a.flatten()) ), axis=1)

In [40]:
# KMeans Cluster with 20 clusters

from sklearn.cluster import KMeans
 
kmeans = KMeans(n_clusters=20, random_state=0).fit(X)

labels = kmeans.labels_
grouping_list = list()

for i in range(20):
    c_idx = np.where(labels==i)
    group_tweets = ""
    for j in c_idx[0]:
        group_tweets += " " + list_data_word[j]
    grouping_list.append(group_tweets)

In [53]:
def group_to_words(sentences):
    for sent in sentences:
        sent = gensim.utils.simple_preprocess(str(sent), deacc=True) 
        yield(sent)  
group_data_words = list(group_to_words(grouping_list))

# Create Dictionary
group_id2word = corpora.Dictionary(group_data_words)

# Create Corpus: Term Document Frequency
group_corpus = [group_id2word.doc2bow(text) for text in group_data_words]

In [412]:
# Build LDA model 20 topics chunksize 1000
group_lda_20_model = gensim.models.ldamodel.LdaModel(corpus=group_corpus,
                                           id2word=group_id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=1000,
                                           passes=10,
                                           alpha='symmetric',
                                           iterations=100,
                                           per_word_topics=True)

In [429]:
#group_lda_20_model.save('group_lda_20.model')
#lda_20_model = gensim.models.ldamodel.LdaModel.load('lda_20.model')

<h4> Statistics of Group Aggregate Tweets</h4>

In [427]:
group_doc_lens = [len(d) for d in group_df_dominant_topic.Text]
group_tweets_df = pd.DataFrame(data=group_doc_lens)
aggregate = round(group_tweets_df.describe(),2)
aggregate.columns = ['Aggregate Tweets']

display(aggregate)

Unnamed: 0,Aggregate Tweets
count,20.0
mean,34027.95
std,126111.27
min,20.0
25%,467.75
50%,3781.0
75%,10771.75
max,568986.0


In [478]:
# aggregate lda 20 topics model
group_lda_20_df_topic_sents_keywords = format_topics_sentences(ldamodel=group_lda_20_model, corpus=group_corpus, texts=group_data_words)
group_lda_20_df_dominant_topic = group_lda_20_df_topic_sents_keywords.reset_index()
group_lda_20_df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

In [484]:
# Display setting to show more characters in column
pd.options.display.max_colwidth = 100

group_sent_topics_sorteddf_mallet = pd.DataFrame()
group_topics_outdf_grpd = group_lda_20_df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in group_topics_outdf_grpd:
    group_sent_topics_sorteddf_mallet = pd.concat([group_sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=False).head(1)], 
                                            axis=0)

# Reset Index    
group_sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
group_sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]

# Show
#group_sent_topics_sorteddf_mallet

In [417]:
group_per_20_topics = group_lda_20_model.log_perplexity(group_corpus)

group_coherence_model_lda_20 = CoherenceModel(model=group_lda_20_model, texts=group_data_words, dictionary=group_id2word, coherence='c_v')
group_coherence_lda_20 = group_coherence_model_lda_20.get_coherence()

<h4> Perplexity between LDA 20 topics and Group-LDA 20 topics </h4>

In [440]:
com_model_select_df = pd.DataFrame([
                                np.abs([per_20_topics, group_per_20_topics]),
                                [coherence_lda_20, group_coherence_lda_20]
                                ],
                                columns=['LDA_20', 'GLDA_20'],
                                index=['Perplexity', 'Coherence'])
com_model_select_df

Unnamed: 0,LDA_20,GLDA_20
Perplexity,14.815109,9.548358
Coherence,0.310247,0.262563


<h2> 3. Compare the performance differences and discuss the reasons </h2>

In [439]:
overall_performance = pd.DataFrame([
                                np.abs([per_20_topics, per_50_topics, group_per_20_topics]),
                                [coherence_lda_20, coherence_lda_50, group_coherence_lda_20]
                                ],
                                columns=['LDA_20', 'LDA_50', 'G-LDA_20'],
                                index=['Perplexity', 'Coherence'])
overall_performance

Unnamed: 0,LDA_20,LDA_50,G-LDA_20
Perplexity,14.815109,24.151444,9.548358
Coherence,0.310247,0.37202,0.262563


Overall result show LDA with 20 topics as baseline have lower perplexity than LDA with 50 topics, but when Grouping LDA with 20 topics showed decrease in perplexity value mean that it perform better than baseline. Due to grouping tweets with similar context by combine them together increase size of text which help in generalization performance of topics as tweets data tend to have similar context from tfidf and cosine similarity process. 
While coherence score in Grouping LDA with 20 topics furthure decrease as size of text become larger which penalty in finding same topic in tweets.

Topic from Topic Model

In [485]:
lda_20_sent_topics_sorteddf_mallet

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Representative Text
0,0.0,0.81,"amp, great, people, work, much, child, tonight, girl, care, issue","[hugh, gaitskell, die, young, well, time]"
1,1.0,0.8643,"england, game, euro, play, win, team, tomorrow, germany, football, well","[england, beat, germany, tomorrow, england, win]"
2,2.0,0.8227,"pay, sure, chance, point, uk, shit, low, understand, small, car","[life, city, car, car, car, car, car, car, car, car]"
3,3.0,0.8417,"miss, next, penalty, little, offer, send, write, ticket, min, pick","[suppose, hella, sunny, london, next, week]"
4,4.0,0.81,"pm, match, happen, start, uk, july, club, huge, evening, claim","[late, uk, time, pm]"
5,5.0,0.81,"watch, bring, guy, give, yet, oxford, remember, police, second, mate","[people, destroy, uk, mayor, london, priti, patal]"
6,6.0,0.7625,"report, wait, hear, woman, prime_minister, north, wish, grow, pass, eat","[flower, eat, flower, fade]"
7,7.0,0.81,"scotland, feel, fan, goal, mean, move, hour, fact, high, open","[move, apartment, september, proper, place]"
8,8.0,0.7625,"break, add, happy, month, friend, complete, view, drop, currently, design","[love_island, drop, stream]"
9,9.0,0.8597,"really, hope, way, conservative, actually, turn, life, people, believe, country","[hope, well, sense, morality, donation, conservative]"


In [486]:
group_sent_topics_sorteddf_mallet

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Representative Text
0,4.0,1.0,"uk, england, new, like, york, london, people, get, one, would","[think, told, saj, tells, change, position, ethereum, looks, retake, days, london, hard, fork, b..."
1,5.0,0.9748,"amp, said, live, uk, new, us, people, england, london, would","[thank, campaigning, barriers, far, harm, good, amp, particularly, bad, record, london, students..."
2,6.0,0.9997,"amp, live, said, games, went, small, england, men, tour, penalty","[another, busy, week, media, duties, excellent, football, legend, clients, covering, games, amp,..."
3,11.0,0.9959,"amp, uk, new, london, people, us, one, get, like, england","[deadline, pm, today, apply, frontend, developer, role, youre, big, mission, flexible, working, ..."
4,17.0,0.9973,"arena, son, london, going, get, boy, long, tickets, due, break","[burna, boy, concert, london, arena, august, going, opener, arena, long, break, due, pandemic, g..."
