# Research question 3 - topic detection STTM (GSDMM algorithm)

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Setup" data-toc-modified-id="Setup-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Setup</a></span><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Styles" data-toc-modified-id="Styles-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Styles</a></span></li><li><span><a href="#Load-file" data-toc-modified-id="Load-file-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Load file</a></span></li><li><span><a href="#Remove-10-most-frequent-words" data-toc-modified-id="Remove-10-most-frequent-words-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Remove 10 most frequent words</a></span></li><li><span><a href="#Sample-the-data-set" data-toc-modified-id="Sample-the-data-set-1.5"><span class="toc-item-num">1.5&nbsp;&nbsp;</span>Sample the data set</a></span></li><li><span><a href="#Create-Dictionary" data-toc-modified-id="Create-Dictionary-1.6"><span class="toc-item-num">1.6&nbsp;&nbsp;</span>Create Dictionary</a></span></li><li><span><a href="#Most-frequent-words-in-each-topic" data-toc-modified-id="Most-frequent-words-in-each-topic-1.7"><span class="toc-item-num">1.7&nbsp;&nbsp;</span>Most frequent words in each topic</a></span></li><li><span><a href="#Assign-topic-name--and-probability-of-topic-to-each-sentence" data-toc-modified-id="Assign-topic-name--and-probability-of-topic-to-each-sentence-1.8"><span class="toc-item-num">1.8&nbsp;&nbsp;</span>Assign topic name  and probability of topic to each sentence</a></span></li><li><span><a href="#Plot-topics" data-toc-modified-id="Plot-topics-1.9"><span class="toc-item-num">1.9&nbsp;&nbsp;</span>Plot topics</a></span></li><li><span><a href="#Merge-sentence-topics-with-data-frame" data-toc-modified-id="Merge-sentence-topics-with-data-frame-1.10"><span class="toc-item-num">1.10&nbsp;&nbsp;</span>Merge sentence topics with data frame</a></span></li></ul></li></ul></div>

## Setup

### Imports

In [18]:
import numpy as np
import pandas as pd

from gensim.utils import simple_preprocess

from gsdmm.gsdmm import MovieGroupProcess
from GPyM_TM import GSDMM
from GPyM_TM import GPM

# Plotting tools
import pyLDAvis
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.simplefilter('ignore', category=FutureWarning)

### Styles

In [2]:
def set_plot_styles(styles):
    mpl.rcParams.update(mpl.rcParamsDefault)
    plt.style.use(styles)
    
set_plot_styles(['mplstyle.config'])
color = sns.color_palette('tab20')

### Load file

In [3]:
data_lemmatized = pd.read_pickle('data_preprocessed.pkl')

### Remove 10 most frequent words

In [19]:
from nltk import FreqDist
def freq_words(content):
    all_words = [word for sentences in content for sentence in sentences for word in simple_preprocess(str(sentence), deacc=True)]
    freq_dist = FreqDist(all_words)
    return freq_dist

In [20]:
words_freq = freq_words(data_lemmatized.data_lemmatized_freq.tolist())

In [21]:
most_frequent_words = set([word for word, v in words_freq.most_common(10)])
most_frequent_words

{'apartment',
 'area',
 'host',
 'location',
 'place',
 'restaurant',
 'room',
 'station',
 'stay',
 'time'}

In [22]:
most_frequent_words = set([word for word, v in words_freq.most_common(10)])
def remove_most_freq_words(text):
    return [word for sentence in text for word in sentence.split() if word not in most_frequent_words]

In [23]:
data_lemmatized['data_lemmatized_freq_most'] = data_lemmatized.data_lemmatized_freq.apply(lambda review: [remove_most_freq_words(sentence) 
                                                                                     for sentence in review])

### Sample the data set

In [24]:
def stratified_sample_df(df, col, n_samples):
    n = min(n_samples, df[col].value_counts().min())
    df_ = df.groupby(col).apply(lambda x: x.sample(n))
    df_.index = df_.index.droplevel(0)
    return df_

In [25]:
data_sample = stratified_sample_df(data_lemmatized, 'city', 20000)
print(data_sample.shape)
data_sample.head()

(260000, 31)


Unnamed: 0,id,date,comments,host_id,neighbourhood_cleansed,city,latitude,longitude,number_of_reviews,first_review,...,sentiment_reviews,sentiment_reviews_textblob,comments_to_sentences,sentiment_sentences,type,review_word_bined,tokens,data_lemmatized,data_lemmatized_freq,data_lemmatized_freq_most
2686876,34668170,2019-09-19,- to use the token (key). you need to place it...,64252230,朝阳区 / Chaoyang,Beijing,39.94108,116.44448,11,2019-08-31,...,1,1,"[to use the token key., you need to place it b...","[0, 0, 0, 0, 1, 0, 0, 0]",Non-Western,Long,"[[to, use, the, token, key], [you, need, to, p...","[[key], [handle, lock], [place, distance, dong...","[[key], [handle, lock], [place, distance, dong...","[[key], [handle, lock], [distance, dongzhiman,..."
2675689,16031942,2016-12-28,"【Trustable Owner!】\nSteven is a humble man, an...",63722494,朝阳区 / Chaoyang,Beijing,39.89313,116.46772,49,2016-12-24,...,1,1,"[Trustable Owner., Steven is a humble man and ...","[1, 1, 0]",Non-Western,Medium,"[[trustable, owner], [steven, is, humble, man,...","[[trustable, owner], [man, help, apartment], [...","[[trustable, owner], [man, help, apartment], [...","[[trustable, owner], [man, help], [apt, busine..."
2671875,9352934,2019-04-17,Beautifully designed and well located apartmen...,48541753,朝阳区 / Chaoyang,Beijing,39.90954,116.49802,50,2017-06-20,...,1,1,[Beautifully designed and well located apartme...,"[1, 1, 0]",Non-Western,Short,"[[beautifully, designed, and, well, located, a...","[[apartment], [communication], []]","[[apartment], [communication], []]","[[], [communication], []]"
2684961,30512392,2019-05-27,The apartment is good and the location is perf...,214747089,朝阳区 / Chaoyang,Beijing,39.93716,116.45477,26,2018-12-15,...,0,1,[The apartment is good and the location is per...,"[1, 0, 0, 0]",Non-Western,Medium,"[[the, apartment, is, good, and, the, location...","[[apartment, location], [bathroom, door], [sho...","[[apartment, location], [bathroom, door], [sho...","[[], [bathroom, door], [shower, tap], [bathroom]]"
2680668,22955615,2019-11-07,Neal was an incredibly kind host who was very ...,58985623,东城区,Beijing,39.89127,116.40089,136,2018-01-31,...,1,1,[Neal was an incredibly kind host who was very...,"[1, 1, 0, 1]",Non-Western,Long,"[[neal, was, an, incredibly, kind, host, who, ...","[[host], [familys, need, circumstance, stay], ...","[[host], [familys, need, circumstance, stay], ...","[[], [familys, need, circumstance], [space, fa..."


In [26]:
data_sample = [sentence for review in data_sample.data_lemmatized_freq_most.tolist() for sentence in review]

In [27]:
corpus = data_sample

In [33]:
nTopics=15

In [35]:
#data_dmm = GSDMM.DMM(corpus, nTopics) # Initialize the object, with default parameters.

#data_dmm = GSDMM.DMM(corpus, nTopics, alpha = 0.25, beta = 0.15, nTopWords = 15, iters =5) # Initialize the object.

data_dmm.topicAssigmentInitialise() # Performs the inital document assignments and counts
data_dmm.inference()

psi, theta, selected_psi, selected_theta = data_dmm.worddist() # Determines and stores the psi, theta and selected_psi and selected_theta values
   
finalAssignments = data_dmm.writeTopicAssignments() # Records the final topic assignments for the documents

coherence_topwords = data_dmm.writeTopTopicalWords(finalAssignments) # Record the top words for each document

score = data_dmm.coherence(coherence_topwords, len(finalAssignments)) #Calculates and stores the coherence

print("Final number of topics found: " + str(len(finalAssignments)))

corpus=1112234, words=20584, K=15, a=0.250000, b=0.150000, nTopWords=15, iters=5
iteration: 0
iteration: 1
iteration: 2
iteration: 3
iteration: 4
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
minute city bus walk town airport taxi subway beach bike 
neighborhood aire city heart distance home subway night lot spot 
coffee breakfast kitchen water machine touch morning tea fridge food 
shop distance minute store subway walk bar lot supermarket neighborhood 
question tip trip city experience lot home information communication thing 
family home friend day experience trip house night thank week 
building night noise street floor security parking door people bit 
value money communication question price experience home trip service airport 
bed shower bathroom water air towel toilet kitchen bedroom day 
view space balcony pool bed family city picture rooftop day 
bed space bathroom bedroom people kitchen lot floor night living 
check communication question day flight arrival problem luggage

### Create Dictionary

In [37]:
K = 10
mgp = MovieGroupProcess(K=10, alpha=0.1, beta=0.1, n_iters=30)
docs = data_sample
vocab = set(x for doc in docs for x in doc)
n_terms = len(vocab)
y = mgp.fit(docs, n_terms)

In stage 0: transferred 995496 clusters with 10 clusters populated
In stage 1: transferred 975394 clusters with 10 clusters populated
In stage 2: transferred 927155 clusters with 10 clusters populated
In stage 3: transferred 830435 clusters with 10 clusters populated
In stage 4: transferred 753400 clusters with 10 clusters populated
In stage 5: transferred 704606 clusters with 10 clusters populated
In stage 6: transferred 673836 clusters with 10 clusters populated
In stage 7: transferred 654193 clusters with 10 clusters populated
In stage 8: transferred 641965 clusters with 10 clusters populated
In stage 9: transferred 634114 clusters with 10 clusters populated
In stage 10: transferred 628809 clusters with 10 clusters populated
In stage 11: transferred 624898 clusters with 10 clusters populated
In stage 12: transferred 620433 clusters with 10 clusters populated
In stage 13: transferred 617267 clusters with 10 clusters populated
In stage 14: transferred 616286 clusters with 10 clusters 

In [38]:
doc_count = np.array(mgp.cluster_doc_count)
print('Number of documents per topic :', doc_count)
print('*'*20)

fractions = (np.array(mgp.cluster_doc_count)*100. / sum(mgp.cluster_doc_count))
np.set_printoptions(precision=2)
print('% of documents per topic:', fractions)
print('*'*20)

# Topics sorted by the number of document they are allocated to
top_index = doc_count.argsort()[::-1]
print('Most important topics (by number of docs inside):', top_index)
print('*'*20)

Number of documents per topic : [189690 177439 167455  48270  72466 115826  94582  48023  46325 152158]
********************
% of documents per topic: [17.05 15.95 15.06  4.34  6.52 10.41  8.5   4.32  4.17 13.68]
********************
Most important topics (by number of docs inside): [0 1 2 9 5 6 4 3 7 8]
********************


### Most frequent words in each topic

In [39]:
import operator
def top_words(cluster_word_distribution, top_index, num_words):
    for index in top_index:
        print('Topic {} '.format(index))
        print(list(sorted(mgp.cluster_word_distribution[index].items(), key=operator.itemgetter(1), reverse=True))[:num_words])
        print('*'*20)

In [40]:
top_words(mgp.cluster_word_distribution, top_index, 9)

Topic 0 
[('minute', 14034), ('distance', 12946), ('shop', 11836), ('subway', 11175), ('walk', 10940), ('bus', 9478), ('store', 8813), ('lot', 8473), ('bar', 8249)]
********************
Topic 1 
[('experience', 10160), ('home', 9624), ('family', 8875), ('thank', 7653), ('friend', 6037), ('trip', 5971), ('day', 4924), ('night', 4148), ('week', 3770)]
********************
Topic 2 
[('communication', 10930), ('question', 10541), ('check', 10249), ('tip', 4394), ('response', 4187), ('day', 4046), ('arrival', 3829), ('airport', 3511), ('information', 3421)]
********************
Topic 9 
[('space', 10208), ('value', 7502), ('picture', 5397), ('home', 5383), ('price', 5049), ('people', 4845), ('house', 4709), ('amenity', 4342), ('bed', 3702)]
********************
Topic 5 
[('bed', 15577), ('bathroom', 10072), ('shower', 6880), ('kitchen', 6823), ('water', 5400), ('bedroom', 5024), ('space', 4406), ('towel', 4013), ('air', 3811)]
********************
Topic 6 
[('city', 7594), ('town', 4290), (

In [43]:
topic_dict = {}
topic_names = ['Location',
               'Experience',
               'Communication with host',
               'Value',
               'Rooms',
               'Area',
               'View',
               'Night life',
               'Breakfast',
               'Building'
              ]
for i, topic_num in enumerate(top_index):
    topic_dict[topic_num]=topic_names[i] 

In [44]:
topic_dict

{0: 'Location',
 1: 'Experience',
 2: 'Communication with host',
 9: 'Value',
 5: 'Rooms',
 6: 'Area',
 4: 'View',
 3: 'Night life',
 7: 'Breakfast',
 8: 'Building'}

### Assign topic name  and probability of topic to each sentence

In [45]:
def create_topics_dataframe(data_text=data_lemmatized,  mgp=mgp, threshold=0.4, topic_dict=topic_dict):
    result = pd.DataFrame(columns=['text', 'topic', 'topic_prob'])
    for i, text in enumerate(data_text):
        result.at[i, 'text'] = text
        prob = mgp.choose_best_label(data_text[i])
        if prob[1] >= threshold:
            result.at[i, 'topic'] = topic_dict[prob[0]]
            result.at[i, 'topic_prob'] = prob[1]
        else:
            if len(text) != 0:
                result.at[i, 'topic'] = 'Other'
                result.at[i, 'topic_prob'] = prob[1]
            else:
                result.at[i, 'topic'] = []
                result.at[i, 'topic_prob'] = None
    return result

In [46]:
data_lemmatized_list = [sentence for review in data_lemmatized.data_lemmatized_freq_most.tolist() for sentence in review]

In [None]:
gsdmm_output = create_topics_dataframe(data_lemmatized_list)

### Plot topics

In [41]:
import pandas as pd
import pyLDAvis
import math

def prepare_data(mgp):
    vocabulary = list(vocab)
    doc_topic_dists = [mgp.score(doc) for doc in docs]
    for doc in doc_topic_dists:
        for f in doc:
            assert not isinstance(f, complex)

    doc_lengths = [len(doc) for doc in docs]
    term_counts_map = {}
    for doc in docs:
        for term in doc:
            term_counts_map[term] = term_counts_map.get(term, 0) + 1
    term_counts = [term_counts_map[term] for term in vocabulary]
    doc_topic_dists2 = [[v if not math.isnan(v) else 1/K for v in d] for d in doc_topic_dists]
    doc_topic_dists2 = [d if sum(d) > 0 else [1/K]*K for d in doc_topic_dists2]
    for doc in doc_topic_dists2:
        for f in doc:
            assert not isinstance(f, complex)
    
    assert (pd.DataFrame(doc_topic_dists2).sum(axis=1) < 0.999).sum() == 0
    matrix = []
    for cluster in mgp.cluster_word_distribution:
        total = sum([occurance for word, occurance in cluster.items()])
        assert not math.isnan(total)
        # assert total > 0
        if total == 0:
            row = [(1 / len(vocabulary))] * len(vocabulary)   # <--- The discussed workaround is here
        else:
            row = [cluster.get(term, 0) / total for term in vocabulary]
        for f in row:
            assert not isinstance(f, complex)
        matrix.append(row)
    return matrix, doc_topic_dists2, doc_lengths, vocabulary, term_counts

def prepare_visualization_data(mgp):
    vis_data = pyLDAvis.prepare(*prepare_data(mgp), sort_topics=False)
    return vis_data

vis_data = prepare_visualization_data(mgp)

%matplotlib inline
pyLDAvis.enable_notebook()
pyLDAvis.display(vis_data)

  kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
  log_lift = np.log(topic_term_dists / term_proportion)
  log_ttd = np.log(topic_term_dists)


### Merge sentence topics with data frame 

In [42]:
data_lemmatized['sentence_count'] = data_lemmatized.tokens.apply(lambda x: len(x))

In [113]:
sentence_topics = gsdmm_output.topic.tolist()
for index, row in data_lemmatized[:100].iterrows():
    n = row['sentence_count']
    data_lemmatized.at[index, 'sentence_topic'] = sentence_topics[:n]
    sentence_topics = sentence_topics[n:]

In [131]:
sentence_topics_prob = gsdmm_output.topic_prob.tolist()
data_lemmatized['sentence_topic_prob'] = None
data_lemmatized['sentence_topic_prob'] = data_lemmatized['sentence_topic_prob'].astype('object')
for index, row in data_lemmatized[:100].iterrows():
    n = row['sentence_count']
    data_lemmatized.at[index, 'sentence_topic_prob'] = sentence_topics_prob[:n]
    sentence_topics_prob = sentence_topics_prob[n:]

In [9]:
data_lemmatized.comments_to_sentences[:1].tolist()

[['My girlfriend and I had not known Alina before we took the leap of faith to rent her flat.',
  'Alina just could not be nicer.',
  'Her flat is comfortable homey very sunny and quiet at night.',
  'Her diverse neighborhood rocks.',
  'it is full of excellent eateries of varying ethnicities good supermarkets etc.',
  'etc.',
  'Her place is about a minute walk to the Finsbury Park tube stop and there are also several buses that ply Stroud Green Road.',
  'The Piccadilly and Victoria tube lines that serve Finsbury Park are very dependable and come frequently.',
  'My only caveat is if you are unwilling to take to minute commutes to downtown London attractions and money is no object you will prefer to stay downtown.',
  'But for anyone else you will love your experience of living in a real untouristy neighborhood and dealing with a generous warm-hearted woman I am happy to call a new friend']]

In [10]:
data_lemmatized.sentence_topics[:1].tolist()

AttributeError: 'DataFrame' object has no attribute 'sentence_topics'

In [11]:
data_lemmatized.head()

Unnamed: 0,id,date,comments,host_id,neighbourhood_cleansed,city,latitude,longitude,number_of_reviews,first_review,...,sentiment_from_rating,sentiment_reviews,sentiment_reviews_textblob,comments_to_sentences,sentiment_sentences,type,review_word_bined,tokens,data_lemmatized,data_lemmatized_freq
0,13913,2010-08-18,My girlfriend and I hadn't known Alina before ...,54730,Islington,London,51.56802,-0.11121,21,2010-08-18,...,pos,1,1,[My girlfriend and I had not known Alina befor...,"[0, 0, 1, 0, 1, 0, 0, 0, 0, 1]",Western,Long,"[[my, girlfriend, and, had, not, known, alina,...","[[girlfriend, leap, faith], [], [homey, night]...","[[girlfriend, leap, faith], [], [homey, night]..."
1,13913,2011-07-11,Alina was a really good host. The flat is clea...,54730,Islington,London,51.56802,-0.11121,21,2010-08-18,...,pos,1,1,"[Alina was a really good host., The flat is cl...","[0, 0, 0]",Western,Medium,"[[alina, was, really, good, host], [the, flat,...","[[host], [park, station], []]","[[host], [park, station], []]"
2,13913,2011-09-13,Alina is an amazing host. She made me feel rig...,54730,Islington,London,51.56802,-0.11121,21,2010-08-18,...,pos,1,1,"[Alina is an amazing host., She made me feel r...","[1, 0, 1, 1, 0, 0, 1]",Western,Long,"[[alina, is, an, amazing, host], [she, made, m...","[[host], [home], [friend, stranger], [espresso...","[[host], [home], [friend, stranger], [espresso..."
3,13913,2011-10-03,"Alina's place is so nice, the room is big and ...",54730,Islington,London,51.56802,-0.11121,21,2010-08-18,...,pos,1,1,[Alina s place is so nice the room is big and ...,"[1, 1, 0, 1]",Western,Long,"[[alina, place, is, so, nice, the, room, is, b...","[[room, bed], [host, instance, towel, bed, hom...","[[room, bed], [host, instance, towel, bed, hom..."
4,13913,2011-10-09,"Nice location in Islington area, good for shor...",54730,Islington,London,51.56802,-0.11121,21,2010-08-18,...,pos,1,1,[Nice location in Islington area good for shor...,"[1, 1]",Western,Short,"[[nice, location, in, islington, area, good, f...","[[location, area, business, trip], [host]]","[[location, area, business, trip], [host]]"


In [136]:
with open('data_with_topics.pkl', 'wb') as f:
    pickle.dump(data_lemmatized, f)