In [0]:
import os
import gzip
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models import Word2Vec, LdaMulticore

from gensim.corpora import Dictionary

import numpy as np
import pyLDAvis
from pyLDAvis import gensim

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [0]:
# Peek at the first raw record (read as bytes) to see how the file is laid out.
with open('input/reviews_data.txt', 'rb') as f:
    first_record = next(f, None)  # None when the file is empty
    if first_record is not None:
        print(0, first_record)

0 b"Oct 12 2009 \tNice trendy hotel location not too bad.\tI stayed in this hotel for one night. As this is a fairly new place some of the taxi drivers did not know where it was and/or did not want to drive there. Once I have eventually arrived at the hotel, I was very pleasantly surprised with the decor of the lobby/ground floor area. It was very stylish and modern. I found the reception's staff geeting me with 'Aloha' a bit out of place, but I guess they are briefed to say that to keep up the coroporate image.As I have a Starwood Preferred Guest member, I was given a small gift upon-check in. It was only a couple of fridge magnets in a gift box, but nevertheless a nice gesture.My room was nice and roomy, there are tea and coffee facilities in each room and you get two complimentary bottles of water plus some toiletries by 'bliss'.The location is not great. It is at the last metro stop and you then need to take a taxi, but if you are not planning on going to see the historic sites in 

In [0]:
def preprocessing(sentence):
    """Tokenize ``sentence`` and drop stop words.

    The text is run through gensim's ``simple_preprocess``; every resulting
    token that is a member of ``STOPWORDS`` is discarded.

    Returns:
        list: the remaining tokens, in their original order.
    """
    kept = []
    for token in simple_preprocess(sentence):
        if token in STOPWORDS:
            continue
        kept.append(token)
    return kept

def read_sentences(filename):
    """Lazily yield one preprocessed token list per line of ``filename``.

    The file is opened in binary mode, so every raw line reaches
    ``preprocessing`` as ``bytes``. Lines are processed one at a time as the
    generator is consumed.
    """
    with open(filename, 'rb') as f:
        yield from map(preprocessing, f)

In [0]:
# Materialize the generator: one token list per line of the reviews file.
sentences = [tokens for tokens in read_sentences('input/reviews_data.txt')]

In [0]:
len(sentences)  #  number of token lists, i.e. one per line (review) of the input file

255404

## Word2Vec model

In [0]:
# Train word embeddings on the tokenized reviews.
# NOTE(review): `size` is the pre-4.0 gensim keyword (renamed `vector_size`
# in gensim 4) — confirm against the installed gensim version.
model = Word2Vec(
    sentences,
    size=100,     # dimensionality of each word vector
    window=5,     # context window around the target word
    min_count=2,  # ignore words that occur fewer than 2 times
)

## Vector representation

In [0]:
#  Ten nearest neighbours of 'good' in the embedding space
model.wv.most_similar(positive=['good'], topn=10)

[('decent', 0.8326160907745361),
 ('great', 0.8147170543670654),
 ('excellent', 0.8032379746437073),
 ('ok', 0.6541191339492798),
 ('fair', 0.6409246921539307),
 ('reasonable', 0.6199265718460083),
 ('average', 0.6138752698898315),
 ('nice', 0.6130363345146179),
 ('okay', 0.6087017059326172),
 ('terrific', 0.6073236465454102)]

In [0]:
#  Ten nearest neighbours of 'bad' in the embedding space
model.wv.most_similar(positive=['bad'], topn=10)

[('terrible', 0.7201335430145264),
 ('horrible', 0.6737171411514282),
 ('awful', 0.6487069725990295),
 ('poor', 0.6060112714767456),
 ('okay', 0.5991752743721008),
 ('negative', 0.5902986526489258),
 ('ok', 0.5737053751945496),
 ('fair', 0.5591520667076111),
 ('worse', 0.5510201454162598),
 ('lousy', 0.5434880256652832)]

In [0]:
#  Vector arithmetic query: bad + terrible - good
#  (first argument = words added, second argument = words subtracted)
model.wv.most_similar(['bad', 'terrible'], ['good'], topn=10)

[('horrible', 0.8349869251251221),
 ('awful', 0.7970746755599976),
 ('dreadful', 0.721515417098999),
 ('horrid', 0.6982079148292542),
 ('horrific', 0.688759982585907),
 ('horrendous', 0.6875596642494202),
 ('shocking', 0.6638824343681335),
 ('appalling', 0.6575877070426941),
 ('miserable', 0.6538939476013184),
 ('nasty', 0.6435579061508179)]

In [0]:
#  Ten nearest neighbours of 'hotel' in the embedding space
model.wv.most_similar(positive=['hotel'], topn=10)

[('property', 0.7065991163253784),
 ('place', 0.6075045466423035),
 ('hotels', 0.5893204212188721),
 ('resort', 0.5518630743026733),
 ('establishment', 0.5392840504646301),
 ('accommodation', 0.536628246307373),
 ('accomodation', 0.5154818296432495),
 ('travelodge', 0.5103733539581299),
 ('location', 0.49186185002326965),
 ('accomodations', 0.4703047275543213)]

In [0]:
#  Ten nearest neighbours of 'level' in the embedding space
model.wv.most_similar(positive=['level'], topn=10)

[('levels', 0.8054913282394409),
 ('grade', 0.6398859620094299),
 ('floors', 0.6064598560333252),
 ('floor', 0.591593325138092),
 ('tier', 0.5887811183929443),
 ('echelons', 0.5657998919487),
 ('class', 0.556341290473938),
 ('calibre', 0.5218075513839722),
 ('horizon', 0.5138673782348633),
 ('leve', 0.4979720413684845)]

### **Topic modeling - LDA**

In [0]:
#  Shuffle the reviews (documents, not individual words) reproducibly.
#  Indices are permuted instead of `sentences` itself: the token lists have
#  different lengths, so passing them to np.random.permutation would make NumPy
#  build a ragged object array, which recent NumPy releases reject.
#  The result is a plain list of token lists.
shuffle_rng = np.random.RandomState(42)  # fixed seed so the sample is repeatable
sentences_light = [sentences[i] for i in shuffle_rng.permutation(len(sentences))]

In [0]:
#  First review after shuffling, shown as its list of tokens
sentences_light[0]

['good',
 'hotel',
 'excellent',
 'position',
 'stayed',
 'grand',
 'extended',
 'tour',
 'china',
 'booked',
 'tour',
 'operator',
 'liked',
 'position',
 'room',
 'overlooked',
 'forbidden',
 'city',
 'smog',
 'entrance',
 'gate',
 'attractive',
 'staff',
 'attentive',
 'rooms',
 'attractive',
 'food',
 'good',
 'walked',
 'adjoining',
 'hotels',
 'shopping',
 'area',
 'worth',
 'walk',
 'compare',
 'hotels']

In [0]:
#  Keep only the first 1000 entries of the shuffled list (1000 reviews, not 1000 words)
sentences_light = sentences_light[:1000]

In [0]:
# Build the token -> integer id mapping from the sampled reviews.
dictionary = Dictionary(documents=sentences_light)

In [0]:
len(dictionary)  #  number of unique tokens in the dictionary

8731

In [0]:
#  Bag-of-words for a one-token document: a list of (token_id, count) pairs
dictionary.doc2bow(['car'])

[(765, 1)]

In [0]:
#  Integer id the dictionary assigned to the token 'car'
dictionary.token2id['car']

765

In [0]:
# Convert every sampled review into its bag-of-words representation.
bow_corpus = list(map(dictionary.doc2bow, sentences_light))

In [0]:
#  Fit LDA on the bag-of-words sample.
#  - workers=None lets gensim size the worker pool from the machine's core
#    count (cores - 1) instead of a hard-coded 8, which oversubscribes
#    smaller machines.
#  - random_state is fixed so repeated runs start from the same initialisation.
lda_model = LdaMulticore(
    bow_corpus,
    id2word=dictionary,
    num_topics=100,
    passes=20,
    workers=None,
    random_state=42,
)

In [0]:
#  Print the highest-weighted words of every topic
topic_template = 'Topic: {} \nWords: {}'
for topic_id, topic_words in lda_model.print_topics(-1):
    print(topic_template.format(topic_id, topic_words))

Topic: 0 
Words: 0.026*"valet" + 0.019*"car" + 0.014*"parking" + 0.013*"hotel" + 0.013*"room" + 0.013*"keys" + 0.011*"night" + 0.011*"said" + 0.011*"called" + 0.011*"left"
Topic: 1 
Words: 0.025*"hotel" + 0.021*"storm" + 0.014*"beijing" + 0.012*"aud" + 0.012*"great" + 0.010*"worked" + 0.010*"train" + 0.010*"reviews" + 0.008*"cruise" + 0.008*"room"
Topic: 2 
Words: 0.026*"hotel" + 0.021*"room" + 0.020*"stay" + 0.018*"staff" + 0.011*"great" + 0.011*"friendly" + 0.010*"trump" + 0.009*"stayed" + 0.008*"clean" + 0.008*"casino"
Topic: 3 
Words: 0.039*"hotel" + 0.014*"room" + 0.014*"stay" + 0.013*"great" + 0.011*"location" + 0.011*"breakfast" + 0.009*"staff" + 0.008*"central" + 0.008*"wine" + 0.007*"nice"
Topic: 4 
Words: 0.021*"room" + 0.020*"hotel" + 0.018*"night" + 0.014*"stay" + 0.012*"stayed" + 0.010*"th" + 0.009*"clean" + 0.008*"great" + 0.008*"view" + 0.008*"good"
Topic: 5 
Words: 0.031*"hotel" + 0.023*"room" + 0.014*"great" + 0.010*"nice" + 0.009*"stay" + 0.008*"area" + 0.008*"staff" 

## **Visualization of topics**

In [0]:
# Prepare the interactive topic visualization from the fitted model, then render it inline.
lda_vis = pyLDAvis.gensim.prepare(
    topic_model=lda_model,
    corpus=bow_corpus,
    dictionary=dictionary,
)
pyLDAvis.display(lda_vis)