In [1]:
import pandas as pd
import numpy as np
import re
import string
import spacy
import gensim
from gensim import corpora 
import nltk
from nltk.corpus import stopwords
import spacy
import en_core_web_md

import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from gensim.models.coherencemodel import CoherenceModel

In [2]:
# Load the raw review table from a CSV located next to the notebook,
# then print a column-by-column summary (dtypes, non-null counts, memory).
DATA_PATH = 'AmazonReviews.csv'
df = pd.read_csv(DATA_PATH)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      568454 non-null  int64 
 1   ProductId               568454 non-null  object
 2   UserId                  568454 non-null  object
 3   ProfileName             568438 non-null  object
 4   HelpfulnessNumerator    568454 non-null  int64 
 5   HelpfulnessDenominator  568454 non-null  int64 
 6   Score                   568454 non-null  int64 
 7   Time                    568454 non-null  int64 
 8   Summary                 568427 non-null  object
 9   Text                    568454 non-null  object
dtypes: int64(5), object(5)
memory usage: 43.4+ MB


In [3]:
# Report how many distinct reviewers and distinct products the raw table contains.
for label, column in (('Unique users:', 'UserId'), ('Unique products:', 'ProductId')):
    print(label)
    print(df[column].nunique())

Unique users:
256059
Unique products:
74258


In [4]:
# Translation table that deletes every ASCII punctuation character.
# Built once at definition time instead of on every call: clean_text is
# applied to each review, so rebuilding the table per row is wasted work.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalise one review string for topic modelling.

    Steps, in order:
      1. delete all characters listed in ``string.punctuation``;
      2. split on whitespace;
      3. drop tokens that consist only of digits and tokens of 3 characters
         or fewer;
      4. re-join the survivors with single spaces and lowercase the result.

    Args:
        text: the raw review text (a ``str``).

    Returns:
        The cleaned, lowercased, space-separated string. It is empty when no
        token survives the filters.
    """
    words = text.translate(_PUNCTUATION_TABLE).split()
    # Note: the length filter also removes short content words such as "tea" or "bad".
    kept = [w for w in words if len(w) > 3 and not w.isdigit()]
    return ' '.join(kept).lower()

In [5]:
# Discard every row with a missing value in any column (dropna defaults:
# axis=0, how='any'), then normalise the review text and record how many
# words each cleaned review contains.
df.dropna(inplace=True)
df['Text'] = df['Text'].map(clean_text)
df['Word_count'] = df['Text'].map(lambda review: len(review.split()))

# Rating distribution and row count after cleaning.
print(df['Score'].value_counts())
print(df.shape[0])

# Length (in words) of the longest cleaned review.
max_text_len = df['Word_count'].max()
print(max_text_len)

5    363111
4     80655
1     52264
3     42638
2     29743
Name: Score, dtype: int64
568411
2234


In [6]:
# Keep only reviews whose cleaned text has between 20 and 99 words.
# Word_count holds whole numbers, so the inclusive range [20, 99] is the
# same as 20 <= n < 100.
condition = df['Word_count'].between(20, 99)
df_short = df[condition]
print(df_short.shape[0])

# Rating distribution and row count of the length-filtered subset.
print(df_short['Score'].value_counts())
print(df_short.shape[0])

373281
5    233648
4     53413
1     36230
3     29080
2     20910
Name: Score, dtype: int64
373281


In [7]:
# Draw a class-balanced sample: the same number of reviews for each star rating.
# A fixed random_state makes the sample, and therefore every downstream
# result (tokens, dictionary, topics, metrics), reproducible across
# Restart & Run All.
SAMPLE_PER_SCORE = 20000
SAMPLE_SEED = 100
df_sampled = (
    df_short
    .groupby('Score')
    .apply(lambda group: group.sample(n=SAMPLE_PER_SCORE, random_state=SAMPLE_SEED))
    .reset_index(drop=True)  # discard the (Score, original index) MultiIndex produced by apply
)
print(len(df_sampled))

100000


In [8]:
# Lazily populated cache of the NLTK English stop-word list, so the corpus
# file is read once rather than once per review.
_ENGLISH_STOP_WORDS = None


def remove_stopwords(text, stop_words=None):
    """Remove stop words from a space-separated string.

    Args:
        text: the input string; it is split on single spaces.
        stop_words: optional collection of words to drop. When omitted, the
            NLTK English stop-word list is used; it is loaded on first use
            and cached as a frozenset for constant-time membership tests.

    Returns:
        The remaining words joined by single spaces, in their original order.
    """
    global _ENGLISH_STOP_WORDS
    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOP_WORDS
    return ' '.join(w for w in text.split(' ') if w not in stop_words)

# Strip stop words from every sampled review, overwriting the 'Text' column.
df_sampled['Text'] = df_sampled['Text'].map(remove_stopwords)

In [9]:
# Load the medium English spaCy pipeline with the dependency parser and the
# named-entity recogniser switched off.
nlp = spacy.load(
    'en_core_web_md',
    disable=['parser', 'ner'],
)

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ')):
    """Lemmatise documents, keeping only tokens with the allowed POS tags.

    Documents are fed through ``nlp.pipe`` so spaCy can process them in
    batches instead of one pipeline call per document.

    Args:
        texts: iterable of document strings.
        allowed_postags: coarse part-of-speech tags (``token.pos_`` values)
            to keep. Defaults to nouns and adjectives. An immutable tuple is
            used as the default so it cannot be mutated between calls.

    Returns:
        A list with one entry per input document, in input order; each entry
        is the list of lemmas of the tokens whose POS tag is allowed.
    """
    keep_pos = frozenset(allowed_postags)
    return [
        [token.lemma_ for token in doc if token.pos_ in keep_pos]
        for doc in nlp.pipe(texts)
    ]

# Lemmatise every sampled review and show the tokens of the second one as a
# quick sanity check.
text_list = df_sampled['Text'].tolist()
tokens = lemmatization(texts=text_list)
print(tokens[1])

['many', 'different', 'item', 'amazoncom', 'year', 'review', 'today', 'rancher', 'green', 'apple', 'one', 'gift', 'husband', 'jolly', 'rancher', 'candy', 'broken', 'crushed', 'dust', 'slivereen', 'seller', 'handling', 'box']


In [12]:
# Build the token <-> id vocabulary from the lemmatised reviews, then encode
# each review as a bag-of-words vector over that vocabulary.
dictionary = corpora.Dictionary(tokens)
doc_term_matrix = list(map(dictionary.doc2bow, tokens))

In [None]:
# Fit a 10-topic LDA model on the bag-of-words corpus.
# random_state is fixed so that training is repeatable.
LDA = gensim.models.ldamodel.LdaModel
lda_params = {
    'corpus': doc_term_matrix,
    'id2word': dictionary,
    'num_topics': 10,
    'random_state': 100,
    'chunksize': 100,
    'passes': 50,
    'iterations': 100,
}
lda_model = LDA(**lda_params)

In [None]:
# Show the topics of the fitted model; as the cell's last expression, the
# returned value is rendered as the cell output.
lda_model.print_topics()

In [None]:
# Turn on inline rendering, prepare the interactive topic visualisation for
# the fitted model, and display it as the cell output.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    topic_model=lda_model,
    corpus=doc_term_matrix,
    dictionary=dictionary,
)
vis

In [None]:
# Quality metrics for the fitted LDA model, computed on the training corpus.
#
# log_perplexity() returns the per-word likelihood bound, not the perplexity
# itself: the bound is better when higher (closer to 0), and
# perplexity = 2^(-bound) is better when lower.
# total_docs is left at its default (the number of documents passed in); it
# was previously passed to print() by mistake, which raises TypeError.
per_word_bound = lda_model.log_perplexity(doc_term_matrix)
print('Per-word likelihood bound:') # the higher the better
print(per_word_bound)
print('Perplexity:') # the lower the better
print(np.exp2(-per_word_bound))

# c_v topic coherence, computed from the tokenised reviews.
coherence_model_lda = CoherenceModel(model = lda_model, texts = tokens, dictionary = dictionary, coherence = 'c_v')
print('Coherence:') # the higher the better
print(coherence_model_lda.get_coherence())