# Load Data

In [1]:
import re
import ssl

import pandas as pd
import numpy as np
import tqdm
from scipy.stats import chi2_contingency

import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

ssl._create_default_https_context = ssl._create_unverified_context
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

from langdetect import detect

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy

from pprint import pprint


In [2]:
# load data
remnant = pd.read_csv('/Users/javier/Desktop/cmgt-597/steam-review-data/remnant_df.csv') # Remnant II

In [3]:
# filter helpful and actual reviews
# note: weighted_vote_score measures how helpful a review is. so reviews with a weighted_vote_score of 0 are filtered out

remnant_df = remnant[(remnant['weighted_vote_score'] != 0.0) & # helpful reviews only
             (remnant['playtime_at_review'] != 0.0) & # actual reviews only (can't review a game you never played!)
             (remnant['review'].notnull()) # reviews with words only
             ]

remnant_reviews = remnant_df[['recommendationid', 'review', 'voted_up']]
remnant_reviews

Unnamed: 0,recommendationid,review,voted_up
0,143137396,Great game. Please add a North marker to the m...,True
1,142838045,"Just like the previous entry in the franchise,...",True
2,149330673,Remnant 1 but everything more and is one of th...,True
3,143149536,"Honestly, I'm regretting not playing the first...",True
4,144174130,"Like for everyone else in these reviews, I alw...",True
...,...,...,...
7184,142812886,Dumbest f'in game ever; stupid f*s who designe...,False
7185,142781339,"It doesnt have Hold to sprint, so u have to To...",False
7186,143125779,Enemies that can pick you up and drop you off ...,False
7187,142997892,No... just no..,False


# Text Prep

In [4]:
# First tokenize each sentence into a list of words, removing punctuations and unnecessary characters altogether

def sent_to_words(sentences): # define function sent_to_words (sentences to words), argument is the dataframe and column of your reviews.
    for sentence in sentences: # for every sentence (review) in the sentences (review df)
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations  | # use the gensim package to tokenize your reviews

remnant_reviews['data_words'] = list(sent_to_words(remnant_reviews['review'])) # declare a new column in the lop_review df called 'data_words'
                                                                        # then use the sent_to_words function to tokenize the review column of the lop_review df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  remnant_reviews['data_words'] = list(sent_to_words(remnant_reviews['review'])) # declare a new column in the lop_review df called 'data_words'


In [5]:
# After tokenization remove stoword and apply lemmatization 

stop_words = stopwords.words('english') # declare english as the stopword language
# stop_words.extend(['app', 'bank', 'account','try','issue','problem', 'face'])  #adding my own stop words

def process_words(texts, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']): # define function process_words. arguments are the column of the tokenized words,
                                                                                                 # stop words library, allowed_postags
    # remove stop words
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts] # remove stop words from tokenized reviews
    texts_out = [] # define variable texts_out. it is an empty list.

    # Lemmatization
    nlp = spacy.load("en_core_web_sm",disable=['parser', 'ner']) # define variable nlp. this is for lemmatization
    for sent in texts: # for every sentence in texts (review df)
        doc = nlp(" ".join(sent)) # declare variable doc.
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags]) # append the lemmatized words to the texts_out empty list
    
    # remove stopwords once more after lemmatization
    texts_out = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts_out] # remove stopwords again
    return texts_out # return the texts_out list

remnant_reviews['data_ready'] = process_words(remnant_reviews['data_words'])  # processed Text Data!

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  remnant_reviews['data_ready'] = process_words(remnant_reviews['data_words'])  # processed Text Data!


In [6]:
# cleaning the reviews further to remove non-English text

def detect_my(text):
    try:
        return detect(text)
    except:
        return 'unknown'
    
remnant_reviews['language'] = remnant_reviews['review'].apply(detect_my)
remnant_reviews


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  remnant_reviews['language'] = remnant_reviews['review'].apply(detect_my)


Unnamed: 0,recommendationid,review,voted_up,data_words,data_ready,language
0,143137396,Great game. Please add a North marker to the m...,True,"[great, game, please, add, north, marker, to, ...","[great, game, add, north, marker, minimap]",en
1,142838045,"Just like the previous entry in the franchise,...",True,"[just, like, the, previous, entry, in, the, fr...","[previous, entry, franchise, ton, fun, especia...",en
2,149330673,Remnant 1 but everything more and is one of th...,True,"[remnant, but, everything, more, and, is, one,...","[remnant, good, game, play, coop, insane, amou...",en
3,143149536,"Honestly, I'm regretting not playing the first...",True,"[honestly, regretting, not, playing, the, firs...","[honestly, regret, play, first, game, likely, ...",en
4,144174130,"Like for everyone else in these reviews, I alw...",True,"[like, for, everyone, else, in, these, reviews...","[else, review, always, consider, remnant, ashe...",en
...,...,...,...,...,...,...
7184,142812886,Dumbest f'in game ever; stupid f*s who designe...,False,"[dumbest, in, game, ever, stupid, who, designe...","[dumb, game, ever, stupid, design, moron, joke...",en
7185,142781339,"It doesnt have Hold to sprint, so u have to To...",False,"[it, doesnt, have, hold, to, sprint, so, have,...","[hold, sprint, toggle, sprint, make, unplayabl...",en
7186,143125779,Enemies that can pick you up and drop you off ...,False,"[enemies, that, can, pick, you, up, and, drop,...","[enemy, pick, drop, ledge, easily, boss, fight...",en
7187,142997892,No... just no..,False,"[no, just, no]",[],hr


In [7]:
language_check = remnant_reviews['language'].value_counts()
language_check

language
en         6141
so          140
unknown      93
af           85
de           62
cy           55
tr           51
da           44
tl           44
es           40
no           36
id           32
fr           28
ca           27
pl           26
ro           24
it           24
sw           24
sq           23
nl           23
et           21
pt           20
ru           15
hr           11
sl           11
fi            9
sv            8
ar            7
vi            7
hu            7
lt            5
sk            4
zh-cn         4
th            4
zh-tw         2
cs            2
ko            2
lv            2
ja            1
Name: count, dtype: int64

In [None]:
file_name = 'remnant-language.csv'

remnant_reviews.to_csv(f'{file_name}', index = False)
output_filepath = f'/Users/javier/Desktop/cmgt-597/remnant-II/{file_name}'
remnant_reviews.to_csv(output_filepath, index = False)

In [8]:
drop_languages = ['ar', 'ja', 'ko', 'ru', 'th', 'zh-cn', 'zh-tw']# these review tagged with these languages are not in English

remnant_reviews = remnant_reviews[(~remnant_reviews['language'].isin(drop_languages)) &
                          ~((remnant_reviews['language'] == 'unknown') & (remnant_reviews['data_ready'].apply(len) == 0))
                          ]

remnant_reviews = remnant_reviews.drop(columns='language')

In [9]:
remnant_reviews

Unnamed: 0,recommendationid,review,voted_up,data_words,data_ready
0,143137396,Great game. Please add a North marker to the m...,True,"[great, game, please, add, north, marker, to, ...","[great, game, add, north, marker, minimap]"
1,142838045,"Just like the previous entry in the franchise,...",True,"[just, like, the, previous, entry, in, the, fr...","[previous, entry, franchise, ton, fun, especia..."
2,149330673,Remnant 1 but everything more and is one of th...,True,"[remnant, but, everything, more, and, is, one,...","[remnant, good, game, play, coop, insane, amou..."
3,143149536,"Honestly, I'm regretting not playing the first...",True,"[honestly, regretting, not, playing, the, firs...","[honestly, regret, play, first, game, likely, ..."
4,144174130,"Like for everyone else in these reviews, I alw...",True,"[like, for, everyone, else, in, these, reviews...","[else, review, always, consider, remnant, ashe..."
...,...,...,...,...,...
7184,142812886,Dumbest f'in game ever; stupid f*s who designe...,False,"[dumbest, in, game, ever, stupid, who, designe...","[dumb, game, ever, stupid, design, moron, joke..."
7185,142781339,"It doesnt have Hold to sprint, so u have to To...",False,"[it, doesnt, have, hold, to, sprint, so, have,...","[hold, sprint, toggle, sprint, make, unplayabl..."
7186,143125779,Enemies that can pick you up and drop you off ...,False,"[enemies, that, can, pick, you, up, and, drop,...","[enemy, pick, drop, ledge, easily, boss, fight..."
7187,142997892,No... just no..,False,"[no, just, no]",[]


# Base LDA Model

In [10]:
# Create the Dictionary and Corpus needed for Topic Modelling

#  Dictionary
id2word = corpora.Dictionary(remnant_reviews['data_ready'])

#  Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in remnant_reviews['data_ready']]

In [11]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=10, 
                                            random_state=100,
                                            chunksize=100,
                                            passes=10,
                                            per_word_topics=True)

pprint(lda_model.print_topics())

[(0,
  '0.042*"remnant" + 0.027*"class" + 0.020*"world" + 0.019*"hate" + '
  '0.016*"character" + 0.014*"different" + 0.012*"player" + 0.012*"slow" + '
  '0.012*"system" + 0.012*"weapon"'),
 (1,
  '0.032*"get" + 0.024*"game" + 0.017*"time" + 0.015*"go" + 0.013*"make" + '
  '0.012*"even" + 0.012*"weapon" + 0.011*"point" + 0.011*"character" + '
  '0.010*"feel"'),
 (2,
  '0.083*"boss" + 0.036*"enemy" + 0.028*"fight" + 0.024*"design" + '
  '0.018*"attack" + 0.014*"bad" + 0.013*"mechanic" + 0.013*"hit" + '
  '0.011*"dodge" + 0.011*"difficulty"'),
 (3,
  '0.077*"shit" + 0.060*"fuck" + 0.054*"garbage" + 0.040*"trash" + '
  '0.033*"control" + 0.023*"clunky" + 0.022*"unable" + 0.021*"wake" + '
  '0.021*"ledge" + 0.015*"absolute"'),
 (4,
  '0.115*"game" + 0.036*"play" + 0.026*"first" + 0.018*"good" + 0.015*"well" + '
  '0.013*"fun" + 0.010*"really" + 0.010*"make" + 0.010*"much" + 0.009*"soul"'),
 (5,
  '0.077*"game" + 0.031*"performance" + 0.028*"crash" + 0.024*"issue" + '
  '0.023*"run" + 0.022

In [12]:
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

coherence_model_lda = CoherenceModel(model=lda_model, texts=remnant_reviews['data_ready'], dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.6022939464906525

Coherence Score:  0.40425835871349525


# Hyperparameter Tuning

In [13]:
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, # changed the LDA method to Rolly John's
                                                id2word=dictionary,
                                                num_topics=k, 
                                                random_state=100,
                                                chunksize=100,
                                                passes=10,
                                                alpha=a,
                                                eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=remnant_reviews['data_ready'], dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

In [14]:
grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)), 
               corpus]

corpus_title = ['75% Corpus', '100% Corpus']

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=(len(beta)*len(alpha)*len(topics_range)*len(corpus_title)))
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, 
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
                    pbar.update(1)

    # pd.DataFrame(model_results).to_csv('./results/lda_tuning_results.csv', index=False)
                    
    lda_tuning_results = pd.DataFrame(model_results)

    file_name = 'remnant-lda-tuning-results.csv'

    lda_tuning_results.to_csv(f'{file_name}', index = False)
    output_filepath = f'/Users/javier/Desktop/cmgt-597/remnant-II/{file_name}'
    lda_tuning_results.to_csv(output_filepath, index = False)
    
    pbar.close()

100%|██████████| 540/540 [1:33:47<00:00, 10.42s/it]


In [15]:
lda_tuning_results = pd.read_csv('/Users/javier/Desktop/cmgt-597/remnant-II/remnant-lda-tuning-results.csv')
lda_tuning_results

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
0,75% Corpus,2,0.01,0.01,0.358307
1,75% Corpus,2,0.01,0.31,0.363289
2,75% Corpus,2,0.01,0.61,0.359201
3,75% Corpus,2,0.01,0.9099999999999999,0.416131
4,75% Corpus,2,0.01,symmetric,0.359396
...,...,...,...,...,...
535,100% Corpus,10,asymmetric,0.01,0.371487
536,100% Corpus,10,asymmetric,0.31,0.459079
537,100% Corpus,10,asymmetric,0.61,0.439275
538,100% Corpus,10,asymmetric,0.9099999999999999,0.421259


# Final Model

In [13]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=10, 
                                            random_state=100,
                                            chunksize=100,
                                            passes=10,
                                            alpha='symmetric',
                                            eta=0.3)
                                            #the alpha and beta will yield the highest coherence score, according to the tuning results
                                            

pprint(lda_model.print_topics())

[(0,
  '0.040*"dog" + 0.018*"community" + 0.016*"pet" + 0.008*"com" + '
  '0.005*"superior" + 0.005*"praise" + 0.005*"predecessor" + 0.005*"doubt" + '
  '0.005*"handler" + 0.004*"word"'),
 (1,
  '0.021*"get" + 0.015*"game" + 0.011*"time" + 0.011*"make" + 0.010*"enemy" + '
  '0.010*"weapon" + 0.010*"character" + 0.010*"boss" + 0.009*"go" + '
  '0.009*"feel"'),
 (2,
  '0.065*"boss" + 0.024*"enemy" + 0.023*"attack" + 0.020*"fight" + '
  '0.016*"design" + 0.016*"hit" + 0.013*"dodge" + 0.011*"bad" + '
  '0.011*"animation" + 0.010*"shoot"'),
 (3,
  '0.054*"fuck" + 0.038*"deer" + 0.034*"shit" + 0.028*"fucking" + '
  '0.025*"sprint" + 0.019*"wake" + 0.018*"chat" + 0.016*"toggle" + '
  '0.013*"cube" + 0.010*"text"'),
 (4,
  '0.093*"game" + 0.029*"play" + 0.020*"first" + 0.019*"good" + '
  '0.017*"remnant" + 0.015*"fun" + 0.013*"well" + 0.009*"boss" + 0.009*"soul" '
  '+ 0.008*"story"'),
 (5,
  '0.079*"game" + 0.020*"performance" + 0.018*"crash" + 0.017*"get" + '
  '0.016*"play" + 0.015*"run" + 

In [14]:
# Final Model Evaluation
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

coherence_model_lda = CoherenceModel(model=lda_model, texts=remnant_reviews['data_ready'], dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.266655079246949

Coherence Score:  0.48969138921402944


# Topic-Review Allocation

In [15]:
def format_topics_sentences(ldamodel=None, corpus=None, texts=None):
    # Initialize an empty list to store data
    data = []

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        
        # Initialize variables to store topic information
        dominant_topic = None
        perc_contribution = None
        topic_keywords = None
        
        # Loop through topics in the row
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                dominant_topic = int(topic_num)
                perc_contribution = round(prop_topic, 4)
                break
        
        # Store the extracted information
        data.append([dominant_topic, perc_contribution, topic_keywords])

    # Convert the list of data into a DataFrame
    sent_topics_df = pd.DataFrame(data, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Add original text to the end of the output
    contents = pd.Series(texts)
    contents.reset_index(drop=True, inplace=True)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df

# Call the function with appropriate arguments
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=remnant_reviews.review)

# Format the DataFrame
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'review']

# Display the first 5 rows of the DataFrame
display(df_dominant_topic.head(5))


Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,review
0,0,4,0.3951,"game, play, first, good, remnant, fun, well, b...",Great game. Please add a North marker to the m...
1,1,4,0.6493,"game, play, first, good, remnant, fun, well, b...","Just like the previous entry in the franchise,..."
2,2,4,0.865,"game, play, first, good, remnant, fun, well, b...",Remnant 1 but everything more and is one of th...
3,3,4,0.5256,"game, play, first, good, remnant, fun, well, b...","Honestly, I'm regretting not playing the first..."
4,4,4,0.5218,"game, play, first, good, remnant, fun, well, b...","Like for everyone else in these reviews, I alw..."


In [16]:
remnant_reviews = df_dominant_topic.merge(remnant_reviews, how='left')
remnant_reviews.drop_duplicates(subset=['Document_No'], keep='first', inplace=True)

In [17]:
remnant_reviews = remnant_reviews[['Document_No', 'review', 'data_words', 'data_ready', 'Keywords', 'Dominant_Topic', 'Topic_Perc_Contrib', 'voted_up']]

In [18]:
# final dataframe
remnant_reviews

Unnamed: 0,Document_No,review,data_words,data_ready,Keywords,Dominant_Topic,Topic_Perc_Contrib,voted_up
0,0,Great game. Please add a North marker to the m...,"[great, game, please, add, north, marker, to, ...","[great, game, add, north, marker, minimap]","game, play, first, good, remnant, fun, well, b...",4,0.3951,True
1,1,"Just like the previous entry in the franchise,...","[just, like, the, previous, entry, in, the, fr...","[previous, entry, franchise, ton, fun, especia...","game, play, first, good, remnant, fun, well, b...",4,0.6493,True
2,2,Remnant 1 but everything more and is one of th...,"[remnant, but, everything, more, and, is, one,...","[remnant, good, game, play, coop, insane, amou...","game, play, first, good, remnant, fun, well, b...",4,0.8650,True
3,3,"Honestly, I'm regretting not playing the first...","[honestly, regretting, not, playing, the, firs...","[honestly, regret, play, first, game, likely, ...","game, play, first, good, remnant, fun, well, b...",4,0.5256,True
4,4,"Like for everyone else in these reviews, I alw...","[like, for, everyone, else, in, these, reviews...","[else, review, always, consider, remnant, ashe...","game, play, first, good, remnant, fun, well, b...",4,0.5218,True
...,...,...,...,...,...,...,...,...
8756,7042,Dumbest f'in game ever; stupid f*s who designe...,"[dumbest, in, game, ever, stupid, who, designe...","[dumb, game, ever, stupid, design, moron, joke...","game, performance, crash, get, play, run, issu...",5,0.6112,False
8757,7043,"It doesnt have Hold to sprint, so u have to To...","[it, doesnt, have, hold, to, sprint, so, have,...","[hold, sprint, toggle, sprint, make, unplayabl...","fuck, deer, shit, fucking, sprint, wake, chat,...",3,0.4090,False
8758,7044,Enemies that can pick you up and drop you off ...,"[enemies, that, can, pick, you, up, and, drop,...","[enemy, pick, drop, ledge, easily, boss, fight...","get, game, time, make, enemy, weapon, characte...",1,0.5679,False
8759,7045,No... just no..,"[no, just, no]",[],"dog, community, pet, com, superior, praise, pr...",0,0.1000,False


In [28]:
file_name = 'remnant-lda.csv'

remnant_reviews.to_csv(f'{file_name}', index = False)
output_filepath = f'/Users/javier/Desktop/cmgt-597/remnant-II/{file_name}'
remnant_reviews.to_csv(output_filepath, index = False)

# Data Analysis

In [19]:
remnant_lda_final = pd.read_csv('/Users/javier/Desktop/cmgt-597/remnant-II/remnant-lda.csv')
remnant_lda_final

Unnamed: 0,Document_No,review,data_words,data_ready,Keywords,Dominant_Topic,Topic_Perc_Contrib,voted_up
0,0,Great game. Please add a North marker to the m...,"['great', 'game', 'please', 'add', 'north', 'm...","['great', 'game', 'add', 'north', 'marker', 'm...","game, play, first, good, remnant, fun, well, b...",4,0.3951,True
1,1,"Just like the previous entry in the franchise,...","['just', 'like', 'the', 'previous', 'entry', '...","['previous', 'entry', 'franchise', 'ton', 'fun...","game, play, first, good, remnant, fun, well, b...",4,0.6493,True
2,2,Remnant 1 but everything more and is one of th...,"['remnant', 'but', 'everything', 'more', 'and'...","['remnant', 'good', 'game', 'play', 'coop', 'i...","game, play, first, good, remnant, fun, well, b...",4,0.8650,True
3,3,"Honestly, I'm regretting not playing the first...","['honestly', 'regretting', 'not', 'playing', '...","['honestly', 'regret', 'play', 'first', 'game'...","game, play, first, good, remnant, fun, well, b...",4,0.5256,True
4,4,"Like for everyone else in these reviews, I alw...","['like', 'for', 'everyone', 'else', 'in', 'the...","['else', 'review', 'always', 'consider', 'remn...","game, play, first, good, remnant, fun, well, b...",4,0.5218,True
...,...,...,...,...,...,...,...,...
7042,7042,Dumbest f'in game ever; stupid f*s who designe...,"['dumbest', 'in', 'game', 'ever', 'stupid', 'w...","['dumb', 'game', 'ever', 'stupid', 'design', '...","game, performance, crash, get, play, run, issu...",5,0.6112,False
7043,7043,"It doesnt have Hold to sprint, so u have to To...","['it', 'doesnt', 'have', 'hold', 'to', 'sprint...","['hold', 'sprint', 'toggle', 'sprint', 'make',...","fuck, deer, shit, fucking, sprint, wake, chat,...",3,0.4090,False
7044,7044,Enemies that can pick you up and drop you off ...,"['enemies', 'that', 'can', 'pick', 'you', 'up'...","['enemy', 'pick', 'drop', 'ledge', 'easily', '...","get, game, time, make, enemy, weapon, characte...",1,0.5679,False
7045,7045,No... just no..,"['no', 'just', 'no']",[],"dog, community, pet, com, superior, praise, pr...",0,0.1000,False


In [20]:
# distribution of topics across reviews

topic_count = remnant_lda_final['Dominant_Topic'].value_counts()
topic_count

Dominant_Topic
4    3669
1    1317
5    1201
0     283
2     165
7     132
9     104
6      77
3      60
8      39
Name: count, dtype: int64

In [21]:
# Visualize topics

import pyLDAvis
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

---