Topic modeling is an unsupervised technique for analyzing large volumes of text data: it clusters the documents into groups based on similar characteristics.

Non-negative matrix factorization (NMF)
    Performs clustering as well as dimensionality reduction.
    It can be used in combination with the TF-IDF scheme to perform topic modeling.

In [1]:
import os
import pandas as pd
import numpy as np

# Folder holding the review CSV. A raw string keeps the Windows backslashes
# literal: "\C", "\D" and "\d" are invalid escape sequences in a normal string
# and trigger a warning on recent Python versions.
DATA_DIR = r"D:\Choogle\Data\dataset_review"
# Only the first N_REVIEWS rows of the file are used for the topic model.
N_REVIEWS = 20000

os.chdir(DATA_DIR)
# nrows stops parsing after N_REVIEWS rows instead of loading the whole file
# and then discarding everything past .head(N_REVIEWS).
reviews_datasets = pd.read_csv(r'dataset2_london.csv', nrows=N_REVIEWS)
# Display only: dropna() returns a new frame and the result is not assigned,
# so reviews_datasets itself still contains rows with missing values.
reviews_datasets.dropna()

Unnamed: 0,crayon_review_id,crayon_user_id,crayon_product_id,domain,url,type,category,date_created,gid,key,...,user_id,user_name,user_location_text,user_city,user_country,user_total_reviews,user_total_reviews_range,user_helpful_reviews,user_helpful_reviews_range,partition_0


In [2]:
# Build a TF-IDF document-term matrix from the review text; NMF is fitted on
# this matrix.
from sklearn.feature_extraction.text import TfidfVectorizer

# max_df=0.8 drops terms found in more than 80% of the reviews, min_df=2 drops
# terms found in fewer than two reviews; English stop words are removed.
tfidf_vect = TfidfVectorizer(max_df=0.8, min_df=2, stop_words='english')
# Missing reviews become empty strings. The previous .values.astype('U') turned
# NaN into the literal token "nan" and copied every review into a fixed-width
# NumPy unicode array padded to the length of the longest review.
review_texts = reviews_datasets['text'].fillna('').astype(str)
doc_term_matrix = tfidf_vect.fit_transform(review_texts)

In [3]:
# Factorise the TF-IDF matrix into N_TOPICS topics with NMF. After fitting,
# nmf.components_ holds one row per topic with a non-negative weight for every
# vocabulary term. These are weights, not probabilities: rows need not sum to 1.
from sklearn.decomposition import NMF

N_TOPICS = 5

# random_state is fixed so that re-running the cell gives the same factorisation.
nmf = NMF(n_components=N_TOPICS, random_state=42)
nmf.fit(doc_term_matrix)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=5, random_state=42, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [4]:
# Print 10 randomly chosen words from the fitted TF-IDF vocabulary.
import random

# Fetch the vocabulary once rather than rebuilding it twice per iteration.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to it on older installations.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    feature_names = tfidf_vect.get_feature_names_out()
else:
    feature_names = tfidf_vect.get_feature_names()

for i in range(10):
    # randrange excludes the upper bound. randint(0, len(...)) includes it, so
    # it could return len(...) itself and raise IndexError.
    random_id = random.randrange(len(feature_names))
    print(feature_names[random_id])

parfait
projector
chap
eatery
deeply
patrons
tuscan
helped
hour
days


In [5]:
# Row 0 of nmf.components_ holds the first topic's weight for every vocabulary
# term. argsort orders ascending, so the last ten positions are the indexes of
# the ten highest-weighted terms (strongest term last).
first_topic = nmf.components_[0]
top_topic_words = np.argsort(first_topic)[-10:]

In [6]:
# Map the term indexes in top_topic_words back to the actual words.
# Fetch the vocabulary once instead of rebuilding it on every iteration.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to it on older installations.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    feature_names = tfidf_vect.get_feature_names_out()
else:
    feature_names = tfidf_vect.get_feature_names()

for i in top_topic_words:
    print(feature_names[i])

lovely
night
excellent
cocktails
location
fantastic
service
atmosphere
food
great


In [9]:
# Print the 30 highest-weighted words of every topic (strongest word last).
# Fetch the vocabulary once; the original rebuilt it for every printed word.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to it on older installations.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    feature_names = tfidf_vect.get_feature_names_out()
else:
    feature_names = tfidf_vect.get_feature_names()

for i, topic in enumerate(nmf.components_):
    print(f'Top 30 words for topic #{i}:')
    # A separate name for the term index avoids shadowing the topic number i.
    print([feature_names[term_id] for term_id in topic.argsort()[-30:]])
    print('\n')

Top 30 words for topic #0:
['especially', 'efficient', 'love', 'time', 'really', 'amazing', 'come', 'fun', 'visited', 'inamo', 'drinks', 'friends', 'highly', 'friendly', 'recommend', 'reasonably', 'value', 'evening', 'priced', 'definitely', 'lovely', 'night', 'excellent', 'cocktails', 'location', 'fantastic', 'service', 'atmosphere', 'food', 'great']


Top 30 words for topic #1:
['eat', 'took', 'like', 'got', 'asked', 'just', 'tables', 'restaurant', 'waitress', 'wait', 'waiting', 'sushi', 'told', 'arrived', 'didn', 'menu', 'drinks', 'inamo', 'experience', 'came', 'dishes', 'quot', 'minutes', 'interactive', 'did', 'ordered', 'food', 'time', 'order', 'table']


Top 30 words for topic #2:
['interesting', 'tapas', 'like', 'pork', 'london', 'lunch', 'bread', 'natural', 'extensive', 'octopus', 'nice', 'menu', 'douro', 'dishes', 'bottle', 'charcuterie', 'good', 'terroirs', 'plates', 'selection', 'excellent', 'glass', 'portuguese', 'cheese', 'french', 'wines', 'small', 'list', 'bar', 'wine']



In [10]:
# Project every review onto the fitted topics: one row per review, one column
# per topic.
topic_values = nmf.transform(doc_term_matrix)
# Label each review with the index of its highest-weighted topic.
reviews_datasets['Topic'] = np.argmax(topic_values, axis=1)
reviews_datasets.head()

Unnamed: 0,crayon_review_id,crayon_user_id,crayon_product_id,domain,url,type,category,date_created,gid,key,...,user_name,user_location_text,user_city,user_country,user_total_reviews,user_total_reviews_range,user_helpful_reviews,user_helpful_reviews_range,partition_0,Topic
0,RR-202001000-552550429,RU-302001000-806642411,R-102001000-202456154,item_1_saturam_restaurant_review_metadata_info,https://item_1_saturam_restaurant_review_metad...,Reviews,Restaurants,2019-04-11,item_1_saturam_restaurant_review_metadata_info...,ef65c514583b9254dbd8f794c516aaca21c864076d59e3...,...,Stevetarn2014,"London, United Kingdom",London,United Kingdom,423,101 to 500,1030,1001 to 5000,item_1_restaurant_review_20190609_v10_stage4_c...,3
1,RR-202001000-550793573,RU-302001000-802600320,R-102001000-202456154,item_1_saturam_restaurant_review_metadata_info,https://item_1_saturam_restaurant_review_metad...,Reviews,Restaurants,2019-04-11,item_1_saturam_restaurant_review_metadata_info...,e766851ea18b154b154e419691d4ee7eeb4db9d80333f4...,...,T35BZcristinag,,,,141,101 to 500,9,1 to 100,item_1_restaurant_review_20190609_v10_stage4_c...,0
2,RR-202001000-533398566,RU-302001000-802928560,R-102001000-200371326,item_1_saturam_restaurant_review_metadata_info,https://item_1_saturam_restaurant_review_metad...,Reviews,Restaurants,2019-04-12,item_1_saturam_restaurant_review_metadata_info...,982dced88fd267e522b3591a402d2a3f5d3cb8bc783bdd...,...,Q1487BW,"Bristol, United Kingdom",Bristol,United Kingdom,8,1 to 100,2,1 to 100,item_1_restaurant_review_20190609_v10_stage4_c...,0
3,RR-202001000-556167355,RU-302001000-803691542,R-102001000-200371326,item_1_saturam_restaurant_review_metadata_info,https://item_1_saturam_restaurant_review_metad...,Reviews,Restaurants,2019-04-12,item_1_saturam_restaurant_review_metadata_info...,ffdeac1f47ee1b9e21e3bf54869d87e1b9ab7b9a55d02f...,...,Ian M,,,,6,1 to 100,1,1 to 100,item_1_restaurant_review_20190609_v10_stage4_c...,3
4,RR-202001000-545446145,RU-302001000-809584645,R-102001000-200371326,item_1_saturam_restaurant_review_metadata_info,https://item_1_saturam_restaurant_review_metad...,Reviews,Restaurants,2019-04-12,item_1_saturam_restaurant_review_metadata_info...,cf0e41f130196d008f51f301712f0f48e572472e4532a1...,...,gillank,"Hanoi, Vietnam",Hanoi,Vietnam,140,101 to 500,41,1 to 100,item_1_restaurant_review_20190609_v10_stage4_c...,3
