In [14]:
# GSDM - Gibbs Sampling Dirichlet Mixture Model is better for short text topic modeling

"""
Well, the Gibbs Sampling Dirichlet Mixture Model (GSDMM) is an “extended” LDA algorithm, that makes the initial assumption: 1 topic is 1 document.

The words within a document are generated using the same unique topic, and not from a mixture of topics as it was in the original LDA.

GSDMM is a good choice for short text topic modeling.
"""

import numpy as np
import pandas as pd

import sys

sys.path.append('../input/gsdmm-short-text-clustering')
from gsdmm import MovieGroupProcess

from gensim.models.coherencemodel import CoherenceModel
from gensim import corpora, models
from gensim.utils import simple_preprocess
import gensim, spacy
import re

from gensim.models import Phrases
from gensim.models.phrases import Phraser

In [15]:
data = pd.read_csv('./data/tripadvisor_hotel_reviews.csv')

In [16]:

# Another test of pre-process:
data['review_list'] = data.Review.values.tolist()

# remove characters
data['review_list'] = [re.sub('\s+', ' ', sent) for sent in data['review_list']]
data['review_list'] = [re.sub("\'", "", sent) for sent in data['review_list']]


def sent_to_words(sentences):
    for sentence in sentences:
        yield (gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations


# create N-grams
def make_n_grams(texts):
    bigram = gensim.models.Phrases(texts, min_count=5, threshold=100)  # higher threshold fewer phrases.
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram = gensim.models.Phrases(bigram[texts], threshold=100)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    bigrams_text = [bigram_mod[doc] for doc in texts]
    trigrams_text = [trigram_mod[bigram_mod[doc]] for doc in bigrams_text]
    return trigrams_text


tokens_reviews = list(sent_to_words(data['review_list']))

tokens_reviews = make_n_grams(tokens_reviews)

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])


# gensim stop-words and add stop-words based on texts
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if
             word not in gensim.parsing.preprocessing.STOPWORDS.union(
                 {'also', 'meanwhile', 'however', 'time', 'hour', 'soon', 'day', 'book', 'there', 'hotel', 'room',
                  'leave', 'arrive', 'place', 'stay', 'staff', 'location', 'service', 'come', 'check', 'ask', 'lot',
                  'thing', 'soooo', 'add', 'rarely', 'use', 'look', 'minute', 'bring', 'need', 'world', 'think',
                  'value', 'include'})] for doc in
            texts]


# , 'arrive', 'place', 'stay', 'staff', 'location', 'service'

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out


# do lemmatization keeping only noun, vb, adv
# because adj is not informative for reviews topic modeling
reviews_lemmatized = lemmatization(tokens_reviews, allowed_postags=['NOUN', 'VERB', 'ADV'])

# remove stop words after lemmatization
reviews_lemmatized = remove_stopwords(reviews_lemmatized)

In [17]:
# THE MODEL
mgp = MovieGroupProcess(K=6, alpha=0.01, beta=0.01, n_iters=30)

vocab = set(x for review in reviews_lemmatized for x in review)
n_terms = len(vocab)
model = mgp.fit(reviews_lemmatized, n_terms)

In stage 0: transferred 15833 clusters with 6 clusters populated
In stage 1: transferred 8736 clusters with 6 clusters populated
In stage 2: transferred 5395 clusters with 6 clusters populated
In stage 3: transferred 4019 clusters with 6 clusters populated
In stage 4: transferred 3178 clusters with 6 clusters populated
In stage 5: transferred 2850 clusters with 6 clusters populated
In stage 6: transferred 2611 clusters with 6 clusters populated
In stage 7: transferred 2404 clusters with 6 clusters populated
In stage 8: transferred 2371 clusters with 6 clusters populated
In stage 9: transferred 2234 clusters with 6 clusters populated
In stage 10: transferred 2193 clusters with 6 clusters populated
In stage 11: transferred 2151 clusters with 6 clusters populated
In stage 12: transferred 2142 clusters with 6 clusters populated
In stage 13: transferred 2079 clusters with 6 clusters populated
In stage 14: transferred 2085 clusters with 6 clusters populated
In stage 15: transferred 2055 clus

In [18]:
# This shows us the top tokens for each cluster
def top_words(cluster_word_distribution, top_cluster, values):
    for cluster in top_cluster:
        sort_dicts = sorted(mgp.cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print("\nCluster %s : %s" % (cluster, sort_dicts))


doc_count = np.array(mgp.cluster_doc_count)
print('Number of documents per topic :', doc_count)

# topics sorted by the number of document they are allocated to
top_index = doc_count.argsort()[-10:][::-1]
print('\nMost important clusters (by number of docs inside):', top_index)
# show the top 5 words in term frequency for each cluster
top_words(mgp.cluster_word_distribution, top_index, 10)

Number of documents per topic : [4390 2836 1403 4370 1168 6324]

Most important clusters (by number of docs inside): [5 0 3 1 2 4]

Cluster 5 : [('resort', 8588), ('beach', 8135), ('food', 7317), ('pool', 6596), ('night', 5718), ('restaurant', 5612), ('people', 4993), ('want', 3990), ('drink', 3574), ('bar', 3450)]

Cluster 0 : [('breakfast', 3110), ('night', 2530), ('walk', 2386), ('restaurant', 1570), ('bathroom', 1412), ('area', 1219), ('city', 1217), ('bed', 1204), ('recommend', 1170), ('price', 988)]

Cluster 3 : [('night', 3450), ('bed', 2176), ('bathroom', 1661), ('breakfast', 1649), ('floor', 1377), ('price', 1262), ('desk', 1188), ('pay', 1131), ('walk', 1127), ('view', 1082)]

Cluster 1 : [('night', 1621), ('walk', 1468), ('breakfast', 1399), ('restaurant', 1149), ('view', 990), ('area', 989), ('bed', 964), ('price', 833), ('floor', 680), ('city', 670)]

Cluster 2 : [('breakfast', 505), ('night', 478), ('love', 418), ('return', 360), ('feel', 340), ('restaurant', 332), ('reco

In [19]:
# I don`t rename the clusters

topic_dict = {}
topic_names = ['type 1',
               'type 2',
               'type 3',
               'type 4',
               'type 5',
               'type 6',
               ]
for i, topic_num in enumerate(top_index):
    topic_dict[topic_num] = topic_names[i]

In [20]:
def create_topics_dataframe(data_text=data.Review, mgp=mgp, threshold=0.3, topic_dict=topic_dict,
                            lemma_text=reviews_lemmatized):
    result = pd.DataFrame(columns=['Text', 'Topic', 'Rating', 'Lemma-text'])
    for i, text in enumerate(data_text):
        result.at[i, 'Text'] = text
        result.at[i, 'Rating'] = data.Rating[i]
        result.at[i, 'Lemma-text'] = lemma_text[i]
        prob = mgp.choose_best_label(reviews_lemmatized[i])
        if prob[1] >= threshold:
            result.at[i, 'Topic'] = topic_dict[prob[0]]
        else:
            result.at[i, 'Topic'] = 'Other'
    return result

In [21]:
result = create_topics_dataframe(data_text=data.Review, mgp=mgp, threshold=0.3, topic_dict=topic_dict,
                                 lemma_text=reviews_lemmatized)
result.head(5)

# TODO: Add some evaluations, and top extract top tokens..

Unnamed: 0,Text,Topic,Rating,Lemma-text
0,nice hotel expensive parking got good deal sta...,type 3,4,"[parking, deal, anniversary, evening, advice, ..."
1,ok nothing special charge diamond member hilto...,Other,2,"[charge, diamond_member, decide, chain, shoot,..."
2,nice rooms not 4* experience hotel monaco seat...,type 3,3,"[experience, level, positive, bathroom, bed, h..."
3,"unique, great stay, wonderful time hotel monac...",type 3,5,"[stroll, downtown, shopping, area, sign, anima..."
4,"great stay great stay, went seahawk game aweso...",Other,5,"[game, downfall, view, building, complain, web..."


In [22]:
# COHESION CALCULATIONS

# import library from gensim
from gensim.models import CoherenceModel


# define function to get words in topics
def get_topics_lists(model, top_clusters, n_words):
    '''
    Gets lists of words in topics as a list of lists.

    model: gsdmm instance
    top_clusters:  numpy array containing indices of top_clusters
    n_words: top n number of words to include

    '''
    # create empty list to contain topics
    topics = []

    # iterate over top n clusters
    for cluster in top_clusters:
        #create sorted dictionary of word distributions
        sorted_dict = sorted(model.cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[
                      :n_words]

        #create empty list to contain words
        topic = []

        #iterate over top n words in topic
        for k, v in sorted_dict:
            #append words to topic list
            topic.append(k)

        #append topics to topics list
        topics.append(topic)

    return topics


# Create Dictionary
id2word = corpora.Dictionary(reviews_lemmatized)

texts = reviews_lemmatized

# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# get topics to feed to coherence model
topics = get_topics_lists(mgp, top_index, 20)

# evaluate model using Topic Coherence score
cm_gsdmm = CoherenceModel(topics=topics,
                          dictionary=id2word,
                          corpus=corpus,
                          texts=texts,
                          coherence='c_v')

# get coherence value
c_v_score = cm_gsdmm.get_coherence()

print(c_v_score)
# Well the cohesion is kinda shit atm


0.3451827758606199
