In [1]:
import pandas as pd
import numpy as np
import warnings
import re
import string

import spacy

import gensim
from gensim import corpora

# libraries for visualization
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
warnings.filterwarnings('ignore')

In [2]:
review_data= pd.read_csv("Reviews.csv")

print('Total reviews---- ',len(review_data))

print('Unique Products---- ',len(review_data.groupby('ProductId')))

print('Unique Users---- ',len(review_data.groupby('UserId')))

Total reviews----  568454
Unique Products----  74258
Unique Users----  256059


In [3]:
def clean_text(text ): 
    delete_dict = {sp_character: '' for sp_character in string.punctuation} 
    delete_dict[' '] = ' ' 
    table = str.maketrans(delete_dict)
    text1 = text.translate(table)
    #print('cleaned:'+text1)
    textArr= text1.split()
    text2 = ' '.join([w for w in textArr if ( not w.isdigit() and  ( not w.isdigit() and len(w)>3))]) 
    
    return text2.lower()

In [4]:
import nltk
nltk.download('stopwords') # run this one time

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gregormilligan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
review_data.dropna(axis = 0, how ='any',inplace=True) #removing any NAs
review_data['Text'] = review_data['Text'].apply(clean_text)
review_data['Num_words_text'] = review_data['Text'].apply(lambda x:len(str(x).split())) 

print('-------Dataset --------')
print(review_data['Score'].value_counts())
print(len(review_data))
print('-------------------------')
max_review_data_sentence_length  = review_data['Num_words_text'].max()

mask = (review_data['Num_words_text'] < 100) & (review_data['Num_words_text'] >=20)
df_short_reviews = review_data[mask]
df_sampled = df_short_reviews.groupby('Score').apply(lambda x: x.sample(n=20000)).reset_index(drop = True)

print('No of Short reviews')
print(len(df_short_reviews))



#all_sentences = train_data['text'].tolist() + test_data['text'].tolist()

-------Dataset --------
5    363111
4     80655
1     52264
3     42638
2     29743
Name: Score, dtype: int64
568411
-------------------------
No of Short reviews
373281


In [6]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# function to remove stopwords
def remove_stopwords(text):
    textArr = text.split(' ')
    rem_text = " ".join([i for i in textArr if i not in stop_words])
    return rem_text

# remove stopwords from the text
df_sampled['Text']=df_sampled['Text'].apply(remove_stopwords)

In [7]:
import en_core_web_sm
nlp = en_core_web_sm.load()
nlp.disable_pipes('parser', 'ner')
def lemmatization(texts,allowed_postags=['NOUN', 'ADJ']): 
    output = []
    for sent in texts:
        doc = nlp(sent) 
        output.append([token.lemma_ for token in doc if token.pos_ in allowed_postags ])
    return output

In [15]:
text_list=df_sampled['Text'][0:10000].tolist()
print(text_list[1])
tokenized_reviews = lemmatization(text_list)
print(tokenized_reviews[1])

lucky stock canisters last summer three left written called company several times dont listen well pity wonderful coffee sticks small wastefulbr started facebook group fans canisters want show numbers hopefully companys attentionbr amazon disallowed first review link group youre interested search nescafe facebook youll please bring back nescafe 61oz canisters
['lucky', 'stock', 'canister', 'last', 'summer', 'company', 'several', 'time', 'pity', 'wonderful', 'coffee', 'stick', 'small', 'wastefulbr', 'facebook', 'group', 'fan', 'canister', 'show', 'number', 'attentionbr', 'amazon', 'first', 'review', 'link', 'group', 'interested', 'search', 'nescafe', 'nescafe', 'oz', 'canister']


In [16]:
dictionary = corpora.Dictionary(tokenized_reviews)
doc_term_matrix = [dictionary.doc2bow(rev) for rev in tokenized_reviews]
# Creating the object for LDA model using gensim library
LDA = gensim.models.ldamodel.LdaModel

# Build LDA model
lda_model = LDA(corpus=doc_term_matrix, id2word=dictionary, num_topics=10, random_state=100,
                chunksize=1000, passes=50,iterations=100)
lda_model.print_topics()

[(0,
  '0.056*"taste" + 0.036*"flavor" + 0.029*"good" + 0.019*"product" + 0.014*"chocolate" + 0.011*"brand" + 0.011*"review" + 0.011*"money" + 0.010*"awful" + 0.010*"stuff"'),
 (1,
  '0.019*"plant" + 0.013*"year" + 0.012*"week" + 0.011*"trap" + 0.010*"day" + 0.009*"flower" + 0.009*"greenie" + 0.009*"last" + 0.008*"unhappy" + 0.008*"note"'),
 (2,
  '0.113*"coffee" + 0.017*"kcup" + 0.012*"good" + 0.012*"brand" + 0.012*"water" + 0.010*"cup" + 0.010*"ground" + 0.010*"weak" + 0.009*"cheap" + 0.009*"flavor"'),
 (3,
  '0.040*"chip" + 0.033*"water" + 0.027*"bottle" + 0.020*"coconut" + 0.016*"cereal" + 0.014*"brand" + 0.010*"product" + 0.009*"potato" + 0.009*"size" + 0.008*"case"'),
 (4,
  '0.064*"food" + 0.026*"product" + 0.022*"treat" + 0.019*"dog" + 0.015*"good" + 0.015*"chicken" + 0.013*"ingredient" + 0.012*"organic" + 0.011*"cat" + 0.011*"time"'),
 (5,
  '0.013*"popcorn" + 0.013*"batch" + 0.011*"product" + 0.009*"bread" + 0.009*"time" + 0.008*"vitamin" + 0.007*"honey" + 0.007*"year" + 0.00

In [14]:
# Visualize the topics
#https://github.com/bmabey/pyLDAvis
#https://speakerdeck.com/bmabey/visualizing-topic-models
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, doc_term_matrix, dictionary)
vis