In [189]:
from pathlib import Path

import pandas as pd

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import TfidfModel

import spacy
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

In [200]:
# Load dataset
# The path is assembled from components so it resolves on any OS; a
# backslash-separated literal is only a valid relative path on Windows.
reviews_path = Path('..') / 'scrape_reviews' / 'apple_reviews.json'
iphone_rev = pd.read_json(reviews_path)
iphone_rev.head()

Unnamed: 0,name,stars,title,review
0,Akash Sinha,3,Worst battery,Worst battery performance.Iphone 11 is far bet...
1,Amazon Customer,4,Kidney as a load balancer,"Sold kidney bought this, now not feeling well ..."
2,S.Siva Ram Kris,5,"Another Fabulous IPhone ( IPhone 12, Blue 64GB)",The media could not be loaded. Another beauty...
3,Paras,5,The Beast,This was my first switch to an ios device afte...
4,Anupam,1,This phone is a joke and the joke is on us!!! ...,Extremely disappointed with this phone. It’s a...


In [191]:
# Distribution of star ratings (missing values get their own bucket).
# positive reviews are higher
star_counts = iphone_rev['stars'].value_counts(dropna=False)
star_counts

5    742
4    137
1    122
3     62
2     23
Name: stars, dtype: int64

In [192]:
# Pull the raw review texts out of the frame as a plain Python list
review_column = iphone_rev['review']
reviews = review_column.tolist()
# Show one review as a sanity check
print(reviews[1])

Sold kidney bought this, now not feeling well but the number of days I am alive with one kidney will enjoy using this phone. Guys be careful if you rich it’s ok else sell something else but not kidney it hurts


In [193]:
# transforming words to their base form
def lemmatization(texts, allowed_postags=('NOUN', 'VERB', 'ADJ', 'ADV')):
    """Lemmatize each text, keeping only tokens with an allowed part of speech.

    Parameters
    ----------
    texts : iterable of str
        Documents to lemmatize.
    allowed_postags : collection of str
        Coarse part-of-speech tags (compared against ``token.pos_``) to keep.
        The default is an immutable tuple so it cannot be mutated between calls.

    Returns
    -------
    list of str
        One string per input text: the lemmas of the kept tokens joined by a
        single space. Returns ``[]`` for empty input.
    """
    texts = list(texts)
    if not texts:
        # Nothing to process: skip loading the spaCy model altogether.
        return []

    # The parser and NER components are disabled, as only lemmas and POS tags are read.
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

    # nlp.pipe streams the texts through the pipeline in batches instead of
    # invoking the pipeline once per document.
    return [
        ' '.join(token.lemma_ for token in doc if token.pos_ in allowed_postags)
        for doc in nlp.pipe(texts)
    ]

# Lemmatize every review with the default part-of-speech filter
lemmatized_rev = lemmatization(texts=reviews)
# Show the lemmatized form of one review for comparison with the raw text
print(lemmatized_rev[1])

sell kidney buy now feel well number day alive kidney enjoy use phone guy careful rich ’ else sell else kidney hurt


In [194]:
# basic preprocessing: run gensim's simple_preprocess over every lemmatized review
processed_rev = list(map(simple_preprocess, lemmatized_rev))

# Show the tokens of one review
print(processed_rev[1])

['sell', 'kidney', 'buy', 'now', 'feel', 'well', 'number', 'day', 'alive', 'kidney', 'enjoy', 'use', 'phone', 'guy', 'careful', 'rich', 'else', 'sell', 'else', 'kidney', 'hurt']


In [195]:
# including bigrams
# Fit the phrase model on the tokenized reviews, then wrap it in a Phraser,
# which is what gets applied to documents below.
bigram_phrases = gensim.models.Phrases(
    processed_rev,
    min_count=5,
    threshold=1,
)
bigram = gensim.models.phrases.Phraser(bigram_phrases)

def make_bigrams(texts):
    """Return a list with the module-level ``bigram`` model applied to each document in ``texts``."""
    merged_docs = []
    for doc in texts:
        merged_docs.append(bigram[doc])
    return merged_docs

# Apply the bigram model to every tokenized review
texts = make_bigrams(texts=processed_rev)

# Show one review after bigram merging
print(texts[1])

['sell', 'kidney', 'buy', 'now', 'feel', 'well', 'number', 'day', 'alive', 'kidney', 'enjoy', 'use_phone', 'guy', 'careful', 'rich', 'else', 'sell', 'else', 'kidney', 'hurt']


In [196]:
# tf-idf removal: drop low-weight terms from every bag-of-words document
# https://stackoverflow.com/questions/24688116/how-to-filter-out-words-with-low-tf-idf-in-a-corpus-with-gensim/35951190

id2word = corpora.Dictionary(texts)
corpus = [id2word.doc2bow(text) for text in texts]

tfidf = TfidfModel(corpus, id2word=id2word)

low_value = 0.03  # terms whose tf-idf weight is below this are removed
words = []        # every removed term (as a string), in order of removal

for i, bow in enumerate(corpus):
    # Weigh the document once and reuse the result.
    weighted = tfidf[bow]
    tfidf_ids = {term_id for term_id, _ in weighted}

    low_value_words = [term_id for term_id, value in weighted if value < low_value]
    # Terms present in the bag of words but absent from the tf-idf output.
    # This is computed BEFORE building `drops`: previously it was read before
    # its first assignment, which raises NameError on a fresh kernel and
    # otherwise records the previous document's missing terms.
    words_missing_in_tfidf = [term_id for term_id, _ in bow if term_id not in tfidf_ids]

    drops = low_value_words + words_missing_in_tfidf
    words.extend(id2word[term_id] for term_id in drops)

    # Keep only the (term_id, count) pairs that were not dropped.
    drop_ids = set(drops)
    corpus[i] = [entry for entry in bow if entry[0] not in drop_ids]

In [197]:
# building model
# Training settings are gathered in one place so they are easy to tweak.
lda_params = dict(
    num_topics=8,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    random_state=42,  # fixed seed
)
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, **lda_params)

In [198]:
# Visualization
# Switch pyLDAvis to notebook rendering, then build the visualization
# from the trained model, the filtered corpus and the dictionary.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(
    lda_model,
    corpus,
    id2word,
    mds='mmds',
    R=30,
)
vis

  default_term_info = default_term_info.sort_values(


- From the above visualization we can see that a lot of reviews contain the word `camera_quality`, which could indicate that the phone has a pretty great camera
- Words like `performance` and `smooth` are also frequent, which could suggest that this phone performs well
- `build_quality` and `battery` also stand out, suggesting great build quality and battery
- There are also negative aspects, such as `overpriced` and `waste_money`

In [199]:
# saving the model
# Assemble the path from components so it is valid on any OS; on POSIX a
# backslash-separated literal is treated as a single file name, not a path.
model_path = Path('model') / 'iphone_model.model'
# Create the target directory if it does not exist yet.
model_path.parent.mkdir(parents=True, exist_ok=True)
lda_model.save(str(model_path))