# Load Data

In [1]:
import re
import ssl

import pandas as pd
import numpy as np
import tqdm
from scipy.stats import chi2_contingency

import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

ssl._create_default_https_context = ssl._create_unverified_context
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

from langdetect import detect

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy

from pprint import pprint


In [2]:
# load data
star_wars = pd.read_csv('/Users/javier/Desktop/cmgt-597/steam-review-data/star_wars_df.csv') # Star Wars: Jedi Survivor

In [3]:
# filter helpful and actual reviews
# note: weighted_vote_score measures how helpful a review is. so reviews with a weighted_vote_score of 0 are filtered out

star_wars_df = star_wars[(star_wars['weighted_vote_score'] != 0.0) & # helpful reviews only
             (star_wars['playtime_at_review'] != 0.0) & # actual reviews only (can't review a game you never played!)
             (star_wars['review'].notnull()) # reviews with words only
             ]

star_wars_reviews = star_wars_df[['recommendationid', 'review', 'voted_up']]
star_wars_reviews

Unnamed: 0,recommendationid,review,voted_up
0,137493977,I'm not your Beta Tester EA.,False
1,137509326,Wait for performance fixes.,False
2,150979796,"Don't get me wrong, The PC release of this gam...",True
3,145957830,"When this game first launched, I was smart, I ...",False
4,148551958,[h1][b]May the FPS be With You[/b][/h1]\n\n[i]...,False
...,...,...,...
13249,137457062,Don't understand why people are complaining :)...,True
13250,137499558,I cannot speak for everyone but I feel like ma...,True
13251,137467901,A cinematic game does not need to run high FPS...,True
13252,137552438,I like it. The people disliking it over the ba...,True


# Text Prep

In [4]:
# First tokenize each sentence into a list of words, removing punctuations and unnecessary characters altogether

def sent_to_words(sentences): # define function sent_to_words (sentences to words), argument is the dataframe and column of your reviews.
    for sentence in sentences: # for every sentence (review) in the sentences (review df)
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations  | # use the gensim package to tokenize your reviews

star_wars_reviews['data_words'] = list(sent_to_words(star_wars_reviews['review'])) # declare a new column in the lop_review df called 'data_words'
                                                                        # then use the sent_to_words function to tokenize the review column of the lop_review df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  star_wars_reviews['data_words'] = list(sent_to_words(star_wars_reviews['review'])) # declare a new column in the lop_review df called 'data_words'


In [5]:
# After tokenization remove stoword and apply lemmatization 

stop_words = stopwords.words('english') # declare english as the stopword language
# stop_words.extend(['app', 'bank', 'account','try','issue','problem', 'face'])  #adding my own stop words

def process_words(texts, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']): # define function process_words. arguments are the column of the tokenized words,
                                                                                                 # stop words library, allowed_postags
    # remove stop words
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts] # remove stop words from tokenized reviews
    texts_out = [] # define variable texts_out. it is an empty list.

    # Lemmatization
    nlp = spacy.load("en_core_web_sm",disable=['parser', 'ner']) # define variable nlp. this is for lemmatization
    for sent in texts: # for every sentence in texts (review df)
        doc = nlp(" ".join(sent)) # declare variable doc
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags]) # append the lemmatized words to the texts_out empty list
    
    # remove stopwords once more after lemmatization
    texts_out = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts_out] # remove stopwords again
    return texts_out # return the texts_out list

star_wars_reviews['data_ready'] = process_words(star_wars_reviews['data_words'])  # processed Text Data!

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  star_wars_reviews['data_ready'] = process_words(star_wars_reviews['data_words'])  # processed Text Data!


In [6]:
# cleaning the reviews further to remove non-English text

def detect_my(text):
    try:
        return detect(text)
    except:
        return 'unknown'
    
star_wars_reviews['language'] = star_wars_reviews['review'].apply(detect_my)
star_wars_reviews


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  star_wars_reviews['language'] = star_wars_reviews['review'].apply(detect_my)


Unnamed: 0,recommendationid,review,voted_up,data_words,data_ready,language
0,137493977,I'm not your Beta Tester EA.,False,"[not, your, beta, tester, ea]","[beta, tester]",en
1,137509326,Wait for performance fixes.,False,"[wait, for, performance, fixes]","[wait, performance, fix]",en
2,150979796,"Don't get me wrong, The PC release of this gam...",True,"[don, get, me, wrong, the, pc, release, of, th...","[get, wrong, pc, release, game, terrible, choo...",en
3,145957830,"When this game first launched, I was smart, I ...",False,"[when, this, game, first, launched, was, smart...","[game, first, launch, smart, trust, buy, still...",en
4,148551958,[h1][b]May the FPS be With You[/b][/h1]\n\n[i]...,False,"[may, the, fps, be, with, you, the, overview, ...","[fps, overview, relevant, master, level, diffi...",en
...,...,...,...,...,...,...
13249,137457062,Don't understand why people are complaining :)...,True,"[don, understand, why, people, are, complainin...","[understand, people, complain, game, run, fine...",en
13250,137499558,I cannot speak for everyone but I feel like ma...,True,"[cannot, speak, for, everyone, but, feel, like...","[speak, feel, majority, people, complain, opti...",en
13251,137467901,A cinematic game does not need to run high FPS...,True,"[cinematic, game, does, not, need, to, run, hi...","[cinematic, game, run, high, fps, quit, cry, o...",en
13252,137552438,I like it. The people disliking it over the ba...,True,"[like, it, the, people, disliking, it, over, t...","[people, dislike, bad, pc, optimization, proba...",en


In [7]:
language_check = star_wars_reviews['language'].value_counts()
language_check

language
en         11918
so           174
af           115
unknown      111
de            86
tl            76
da            71
es            64
tr            56
no            53
cy            46
ca            45
sw            39
fr            37
hr            31
et            28
ro            28
pl            25
sq            24
id            24
nl            23
it            20
pt            19
sv            17
sl            16
vi            13
cs            12
fi             9
sk             8
hu             8
ru             6
lt             4
zh-cn          3
th             2
bg             1
mk             1
uk             1
Name: count, dtype: int64

In [8]:
file_name = 'star-wars-language.csv'

star_wars_reviews.to_csv(f'{file_name}', index = False)
output_filepath = f'/Users/javier/Desktop/cmgt-597/star-wars/{file_name}'
star_wars_reviews.to_csv(output_filepath, index = False)

In [8]:
drop_languages = ['bg', 'mk', 'ru', 'th', 'uk', 'zh-cn'] # these review tagged with these languages are not in English

star_wars_reviews = star_wars_reviews[(~star_wars_reviews['language'].isin(drop_languages)) &
                          ~((star_wars_reviews['language'] == 'unknown') & (star_wars_reviews['data_ready'].apply(len) == 0))
                          ]

star_wars_reviews = star_wars_reviews.drop(columns='language')

In [9]:
star_wars_reviews

Unnamed: 0,recommendationid,review,voted_up,data_words,data_ready
0,137493977,I'm not your Beta Tester EA.,False,"[not, your, beta, tester, ea]","[beta, tester]"
1,137509326,Wait for performance fixes.,False,"[wait, for, performance, fixes]","[wait, performance, fix]"
2,150979796,"Don't get me wrong, The PC release of this gam...",True,"[don, get, me, wrong, the, pc, release, of, th...","[get, wrong, pc, release, game, terrible, choo..."
3,145957830,"When this game first launched, I was smart, I ...",False,"[when, this, game, first, launched, was, smart...","[game, first, launch, smart, trust, buy, still..."
4,148551958,[h1][b]May the FPS be With You[/b][/h1]\n\n[i]...,False,"[may, the, fps, be, with, you, the, overview, ...","[fps, overview, relevant, master, level, diffi..."
...,...,...,...,...,...
13249,137457062,Don't understand why people are complaining :)...,True,"[don, understand, why, people, are, complainin...","[understand, people, complain, game, run, fine..."
13250,137499558,I cannot speak for everyone but I feel like ma...,True,"[cannot, speak, for, everyone, but, feel, like...","[speak, feel, majority, people, complain, opti..."
13251,137467901,A cinematic game does not need to run high FPS...,True,"[cinematic, game, does, not, need, to, run, hi...","[cinematic, game, run, high, fps, quit, cry, o..."
13252,137552438,I like it. The people disliking it over the ba...,True,"[like, it, the, people, disliking, it, over, t...","[people, dislike, bad, pc, optimization, proba..."


# Base LDA Model

In [10]:
# Create the Dictionary and Corpus needed for Topic Modelling

#  Dictionary
id2word = corpora.Dictionary(star_wars_reviews['data_ready'])

#  Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in star_wars_reviews['data_ready']]

In [11]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=10, 
                                            random_state=100,
                                            chunksize=100,
                                            passes=10,
                                            per_word_topics=True)

pprint(lda_model.print_topics())

[(0,
  '0.114*"good" + 0.034*"learn" + 0.033*"watch" + 0.031*"grain" + 0.029*"bad" '
  '+ 0.027*"scene" + 0.021*"long" + 0.020*"cut" + 0.019*"bug" + 0.017*"life"'),
 (1,
  '0.065*"complain" + 0.034*"suck" + 0.027*"wise" + 0.022*"anymore" + '
  '0.021*"background" + 0.020*"tab" + 0.019*"trooper" + 0.019*"lag" + '
  '0.019*"storm" + 0.013*"use"'),
 (2,
  '0.099*"med" + 0.065*"amd" + 0.055*"youtube" + 0.053*"claim" + 0.049*"moron" '
  '+ 0.049*"founder" + 0.043*"ram" + 0.034*"gb" + 0.030*"ti" + 0.029*"guess"'),
 (3,
  '0.116*"game" + 0.033*"crash" + 0.031*"play" + 0.026*"get" + 0.022*"buy" + '
  '0.021*"even" + 0.020*"time" + 0.016*"minute" + 0.016*"try" + 0.013*"fix"'),
 (4,
  '0.022*"go" + 0.017*"get" + 0.017*"make" + 0.015*"feel" + 0.015*"time" + '
  '0.014*"fight" + 0.012*"force" + 0.011*"cal" + 0.011*"story" + 0.011*"boss"'),
 (5,
  '0.125*"war" + 0.115*"star" + 0.085*"order" + 0.074*"fall" + 0.055*"game" + '
  '0.030*"survivor" + 0.025*"love" + 0.025*"good" + 0.017*"fan" + '
  '0.01

In [12]:
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

coherence_model_lda = CoherenceModel(model=lda_model, texts=star_wars_reviews['data_ready'], dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.348813367666329

Coherence Score:  0.44870359651682934


# Hyperparameter Tuning

In [14]:
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, # changed the LDA method to Rolly John's
                                                id2word=dictionary,
                                                num_topics=k, 
                                                random_state=100,
                                                chunksize=100,
                                                passes=10,
                                                alpha=a,
                                                eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=star_wars_reviews['data_ready'], dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

In [15]:
grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)), 
               corpus]

corpus_title = ['75% Corpus', '100% Corpus']

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=(len(beta)*len(alpha)*len(topics_range)*len(corpus_title)))
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, 
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
                    pbar.update(1)

    # pd.DataFrame(model_results).to_csv('./results/lda_tuning_results.csv', index=False)
                    
    lda_tuning_results = pd.DataFrame(model_results)

    file_name = 'star-wars-lda-tuning-results.csv'

    lda_tuning_results.to_csv(f'{file_name}', index = False)
    output_filepath = f'/Users/javier/Desktop/cmgt-597/star-wars/{file_name}'
    lda_tuning_results.to_csv(output_filepath, index = False)
    
    pbar.close()

100%|██████████| 540/540 [2:28:48<00:00, 16.53s/it]  


In [16]:
lda_tuning_results = pd.read_csv('/Users/javier/Desktop/cmgt-597/star-wars/star-wars-lda-tuning-results.csv')
lda_tuning_results

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
0,75% Corpus,2,0.01,0.01,0.494393
1,75% Corpus,2,0.01,0.31,0.494393
2,75% Corpus,2,0.01,0.61,0.497145
3,75% Corpus,2,0.01,0.9099999999999999,0.478383
4,75% Corpus,2,0.01,symmetric,0.494393
...,...,...,...,...,...
535,100% Corpus,10,asymmetric,0.01,0.479167
536,100% Corpus,10,asymmetric,0.31,0.469788
537,100% Corpus,10,asymmetric,0.61,0.469806
538,100% Corpus,10,asymmetric,0.9099999999999999,0.447950


# Final Model

In [13]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=4, 
                                            random_state=100,
                                            chunksize=100,
                                            passes=10,
                                            alpha=0.01,
                                            eta=0.9)
                                            #the alpha and beta will yield the highest coherence score, according to the tuning results
                                            

pprint(lda_model.print_topics())

[(0,
  '0.024*"good" + 0.009*"grind" + 0.008*"bad" + 0.007*"long" + 0.007*"average" '
  '+ 0.007*"bug" + 0.007*"price" + 0.005*"money" + 0.005*"easy" + '
  '0.005*"life"'),
 (1,
  '0.068*"game" + 0.026*"run" + 0.021*"get" + 0.019*"fps" + 0.014*"play" + '
  '0.012*"performance" + 0.012*"issue" + 0.010*"good" + 0.009*"high" + '
  '0.009*"first"'),
 (2,
  '0.010*"dislike" + 0.010*"nut" + 0.003*"woman" + 0.003*"pet" + 0.002*"gud" + '
  '0.002*"peak" + 0.002*"merrin" + 0.002*"pew" + 0.002*"steamhappy" + '
  '0.001*"mahd"'),
 (3,
  '0.037*"game" + 0.029*"goty" + 0.015*"war" + 0.014*"star" + 0.013*"story" + '
  '0.010*"combat" + 0.009*"feel" + 0.008*"order" + 0.008*"fall" + '
  '0.007*"character"')]


In [14]:
# Final Model Evaluation
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

coherence_model_lda = CoherenceModel(model=lda_model, texts=star_wars_reviews['data_ready'], dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.020583057836053

Coherence Score:  0.5893379450604936


# Topic-Review Allocation

In [15]:
def format_topics_sentences(ldamodel=None, corpus=None, texts=None):
    # Initialize an empty list to store data
    data = []

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        
        # Initialize variables to store topic information
        dominant_topic = None
        perc_contribution = None
        topic_keywords = None
        
        # Loop through topics in the row
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                dominant_topic = int(topic_num)
                perc_contribution = round(prop_topic, 4)
                break
        
        # Store the extracted information
        data.append([dominant_topic, perc_contribution, topic_keywords])

    # Convert the list of data into a DataFrame
    sent_topics_df = pd.DataFrame(data, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Add original text to the end of the output
    contents = pd.Series(texts)
    contents.reset_index(drop=True, inplace=True)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df

# Call the function with appropriate arguments
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=star_wars_reviews.review)

# Format the DataFrame
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'review']

# Display the first 5 rows of the DataFrame
display(df_dominant_topic.head(5))


Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,review
0,0,1,0.9853,"game, run, get, fps, play, performance, issue,...",I'm not your Beta Tester EA.
1,1,1,0.9901,"game, run, get, fps, play, performance, issue,...",Wait for performance fixes.
2,2,1,0.7559,"game, run, get, fps, play, performance, issue,...","Don't get me wrong, The PC release of this gam..."
3,3,1,0.9963,"game, run, get, fps, play, performance, issue,...","When this game first launched, I was smart, I ..."
4,4,3,0.8241,"game, goty, war, star, story, combat, feel, or...",[h1][b]May the FPS be With You[/b][/h1]\n\n[i]...


In [16]:
star_wars_reviews = df_dominant_topic.merge(star_wars_reviews, how='left')
star_wars_reviews.drop_duplicates(subset=['Document_No'], keep='first', inplace=True)

In [17]:
star_wars_reviews = star_wars_reviews[['Document_No', 'review', 'data_words', 'data_ready', 'Keywords', 'Dominant_Topic', 'Topic_Perc_Contrib', 'voted_up']]

In [18]:
# final dataframe
star_wars_reviews

Unnamed: 0,Document_No,review,data_words,data_ready,Keywords,Dominant_Topic,Topic_Perc_Contrib,voted_up
0,0,I'm not your Beta Tester EA.,"[not, your, beta, tester, ea]","[beta, tester]","game, run, get, fps, play, performance, issue,...",1,0.9853,False
1,1,Wait for performance fixes.,"[wait, for, performance, fixes]","[wait, performance, fix]","game, run, get, fps, play, performance, issue,...",1,0.9901,False
2,2,"Don't get me wrong, The PC release of this gam...","[don, get, me, wrong, the, pc, release, of, th...","[get, wrong, pc, release, game, terrible, choo...","game, run, get, fps, play, performance, issue,...",1,0.7559,True
3,3,"When this game first launched, I was smart, I ...","[when, this, game, first, launched, was, smart...","[game, first, launch, smart, trust, buy, still...","game, run, get, fps, play, performance, issue,...",1,0.9963,False
4,4,[h1][b]May the FPS be With You[/b][/h1]\n\n[i]...,"[may, the, fps, be, with, you, the, overview, ...","[fps, overview, relevant, master, level, diffi...","game, goty, war, star, story, combat, feel, or...",3,0.8241,False
...,...,...,...,...,...,...,...,...
17341,13099,Don't understand why people are complaining :)...,"[don, understand, why, people, are, complainin...","[understand, people, complain, game, run, fine...","game, run, get, fps, play, performance, issue,...",1,0.9977,True
17342,13100,I cannot speak for everyone but I feel like ma...,"[cannot, speak, for, everyone, but, feel, like...","[speak, feel, majority, people, complain, opti...","game, run, get, fps, play, performance, issue,...",1,0.9996,True
17343,13101,A cinematic game does not need to run high FPS...,"[cinematic, game, does, not, need, to, run, hi...","[cinematic, game, run, high, fps, quit, cry, o...","game, run, get, fps, play, performance, issue,...",1,0.5833,True
17344,13102,I like it. The people disliking it over the ba...,"[like, it, the, people, disliking, it, over, t...","[people, dislike, bad, pc, optimization, proba...","game, run, get, fps, play, performance, issue,...",1,0.7971,True


In [28]:
file_name = 'star-wars-lda.csv'

star_wars_reviews.to_csv(f'{file_name}', index = False)
output_filepath = f'/Users/javier/Desktop/cmgt-597/star-wars/{file_name}'
star_wars_reviews.to_csv(output_filepath, index = False)

# Data Analysis

In [19]:
star_wars_lda_final = pd.read_csv('/Users/javier/Desktop/cmgt-597/star-wars/star-wars-lda.csv')
star_wars_lda_final

Unnamed: 0,Document_No,review,data_words,data_ready,Keywords,Dominant_Topic,Topic_Perc_Contrib,voted_up
0,0,I'm not your Beta Tester EA.,"['not', 'your', 'beta', 'tester', 'ea']","['beta', 'tester']","game, run, get, fps, play, performance, issue,...",1,0.9853,False
1,1,Wait for performance fixes.,"['wait', 'for', 'performance', 'fixes']","['wait', 'performance', 'fix']","game, run, get, fps, play, performance, issue,...",1,0.9901,False
2,2,"Don't get me wrong, The PC release of this gam...","['don', 'get', 'me', 'wrong', 'the', 'pc', 're...","['get', 'wrong', 'pc', 'release', 'game', 'ter...","game, run, get, fps, play, performance, issue,...",1,0.7559,True
3,3,"When this game first launched, I was smart, I ...","['when', 'this', 'game', 'first', 'launched', ...","['game', 'first', 'launch', 'smart', 'trust', ...","game, run, get, fps, play, performance, issue,...",1,0.9963,False
4,4,[h1][b]May the FPS be With You[/b][/h1]\n\n[i]...,"['may', 'the', 'fps', 'be', 'with', 'you', 'th...","['fps', 'overview', 'relevant', 'master', 'lev...","game, goty, war, star, story, combat, feel, or...",3,0.8241,False
...,...,...,...,...,...,...,...,...
13099,13099,Don't understand why people are complaining :)...,"['don', 'understand', 'why', 'people', 'are', ...","['understand', 'people', 'complain', 'game', '...","game, run, get, fps, play, performance, issue,...",1,0.9977,True
13100,13100,I cannot speak for everyone but I feel like ma...,"['cannot', 'speak', 'for', 'everyone', 'but', ...","['speak', 'feel', 'majority', 'people', 'compl...","game, run, get, fps, play, performance, issue,...",1,0.9996,True
13101,13101,A cinematic game does not need to run high FPS...,"['cinematic', 'game', 'does', 'not', 'need', '...","['cinematic', 'game', 'run', 'high', 'fps', 'q...","game, run, get, fps, play, performance, issue,...",1,0.5833,True
13102,13102,I like it. The people disliking it over the ba...,"['like', 'it', 'the', 'people', 'disliking', '...","['people', 'dislike', 'bad', 'pc', 'optimizati...","game, run, get, fps, play, performance, issue,...",1,0.7971,True


In [20]:
# distribution of topics across reviews

topic_count = star_wars_lda_final['Dominant_Topic'].value_counts()
topic_count

Dominant_Topic
1    9349
3    2990
0     522
2     243
Name: count, dtype: int64

In [21]:
# Visualize topics

import pyLDAvis
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

---