# Load Data

In [1]:
import re
import ssl

import pandas as pd
import numpy as np
import tqdm
from scipy.stats import chi2_contingency

import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

ssl._create_default_https_context = ssl._create_unverified_context
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

from langdetect import detect

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy

from pprint import pprint


In [2]:
# load data
lop = pd.read_csv('/Users/javier/Desktop/cmgt-597/steam-review-data/lop_df.csv') # Lies of P
lop.shape

(13901, 8)

In [3]:
# filter helpful and actual reviews
# note: weighted_vote_score measures how helpful a review is. so reviews with a weighted_vote_score of 0 are filtered out

lop_df = lop[(lop['weighted_vote_score'] != 0.0) & # helpful reviews only
             (lop['playtime_at_review'] != 0.0) & # actual reviews only (can't review a game you never played!)
             (lop['review'].notnull()) # reviews with words only
             ]

lop_reviews = lop_df[['recommendationid', 'review', 'voted_up']]
lop_reviews

Unnamed: 0,recommendationid,review,voted_up
0,149149878,the closest we're getting for a bloodborne gam...,True
1,148293801,Experience the horror of being french,True
2,154087316,all right then. keep your Bloodborne Sony.,True
3,148236957,They really went fine i'll make Bloodborne on ...,True
4,147355339,From the alternate reality where Miyazaki rea...,True
...,...,...,...
4412,146614908,No.,False
4413,146733054,fucking trash ass game DO NOT BUY UNLESS YOU W...,False
4414,150243763,A SOULS GAME IN 2023 WITH NO MULTIPLAYER. BIG L.,False
4415,146676211,----------------Edited Review after winning th...,False


# Text Prep

In [4]:
# First tokenize each sentence into a list of words, removing punctuations and unnecessary characters altogether

def sent_to_words(sentences): # define function sent_to_words (sentences to words), argument is the dataframe and column of your reviews.
    for sentence in sentences: # for every sentence (review) in the sentences (review df)
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations  | # use the gensim package to tokenize your reviews

lop_reviews['data_words'] = list(sent_to_words(lop_reviews['review'])) # declare a new column in the lop_review df called 'data_words'
                                                                        # then use the sent_to_words function to tokenize the review column of the lop_review df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lop_reviews['data_words'] = list(sent_to_words(lop_reviews['review'])) # declare a new column in the lop_review df called 'data_words'


In [6]:
# After tokenization remove stoword and apply lemmatization 
from nltk.corpus import stopwords # import the natural language toolkit stopwords processor
stop_words = stopwords.words('english') # declare english as the stopword language
# stop_words.extend(['app', 'bank', 'account','try','issue','problem', 'face'])  #adding my own stop words

def process_words(texts, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']): # define function process_words. arguments are the column of the tokenized words,
                                                                                                 # stop words library, allowed_postags
    # remove stop words
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts] # remove stop words from tokenized reviews
    texts_out = [] # define variable texts_out. it is an empty list.

    # Lemmatization
    nlp = spacy.load("en_core_web_sm",disable=['parser', 'ner']) # define variable nlp. this is for lemmatization
    for sent in texts: # for every sentence in texts (review df)
        doc = nlp(" ".join(sent)) # declare variable doc
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags]) # append the lemmatized words to the texts_out empty list
    
    # remove stopwords once more after lemmatization
    texts_out = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts_out] # remove stopwords again
    return texts_out # return the texts_out list

lop_reviews['data_ready'] = process_words(lop_reviews['data_words'])  # processed Text Data!

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lop_reviews['data_ready'] = process_words(lop_reviews['data_words'])  # processed Text Data!


In [7]:
# cleaning the reviews further to remove non-English text

def detect_my(text):
    try:
        return detect(text)
    except:
        return 'unknown'
    
lop_reviews['language'] = lop_reviews['review'].apply(detect_my)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lop_reviews['language'] = lop_reviews['review'].apply(detect_my)


In [8]:
drop_languages = ['fa', 'zh-tw', 'sk', 'mk', 'uk', 'th', 'ar', 'ko', 'ru', 'zh-cn', 'pt'] # these review tagged with these languages are not in English

lop_reviews = lop_reviews[(~lop_reviews['language'].isin(drop_languages)) &
                          ~((lop_reviews['language'] == 'unknown') & (lop_reviews['data_ready'].apply(len) == 0))
                          ]

lop_reviews = lop_reviews.drop(columns='language')

In [9]:
lop_reviews

Unnamed: 0,recommendationid,review,voted_up,data_words,data_ready
0,149149878,the closest we're getting for a bloodborne gam...,True,"[the, closest, we, re, getting, for, bloodborn...","[close, get, bloodborne, game, pc, good, game]"
1,148293801,Experience the horror of being french,True,"[experience, the, horror, of, being, french]","[experience, french]"
2,154087316,all right then. keep your Bloodborne Sony.,True,"[all, right, then, keep, your, bloodborne, sony]","[keep, bloodborne]"
3,148236957,They really went fine i'll make Bloodborne on ...,True,"[they, really, went, fine, ll, make, bloodborn...","[really, go, fine, make, bloodborne, pc]"
4,147355339,From the alternate reality where Miyazaki rea...,True,"[from, the, alternate, reality, where, miyazak...","[alternate, reality, miyazaki, read, instead]"
...,...,...,...,...,...
4411,146775644,Just finished this game. I have played Elden R...,True,"[just, finished, this, game, have, played, eld...","[finish, game, play, elden, ring, bloodborne, ..."
4413,146733054,fucking trash ass game DO NOT BUY UNLESS YOU W...,False,"[fucking, trash, ass, game, do, not, buy, unle...","[trash, ass, game, buy, want, waste, money]"
4414,150243763,A SOULS GAME IN 2023 WITH NO MULTIPLAYER. BIG L.,False,"[souls, game, in, with, no, multiplayer, big]","[game, multiplayer, big]"
4415,146676211,----------------Edited Review after winning th...,False,"[edited, review, after, winning, the, game, lo...","[edit, review, win, game, lol, lot, butt, hurt..."


# Base LDA Model

In [10]:
#  Dictionary
id2word = corpora.Dictionary(lop_reviews['data_ready'])

#  Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in lop_reviews['data_ready']]

In [11]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=10, 
                                            random_state=100,
                                            chunksize=100,
                                            passes=10,
                                            per_word_topics=True)

pprint(lda_model.print_topics())

[(0,
  '0.103*"spectre" + 0.086*"twice" + 0.085*"lie" + 0.053*"edition" + '
  '0.019*"puppet" + 0.016*"alone" + 0.011*"hot" + 0.010*"rock" + 0.009*"phone" '
  '+ 0.008*"peak"'),
 (1,
  '0.093*"boring" + 0.064*"magnificent" + 0.058*"caveat" + 0.029*"grow" + '
  '0.022*"gud" + 0.018*"size" + 0.012*"flesh" + 0.012*"route" + 0.011*"trust" '
  '+ 0.005*"iframe"'),
 (2,
  '0.079*"response" + 0.022*"hate" + 0.014*"slap" + 0.013*"ill" + '
  '0.012*"aware" + 0.011*"loot" + 0.010*"brother" + 0.009*"unforgive" + '
  '0.005*"beauty" + 0.004*"forbid"'),
 (3,
  '0.080*"deluxe" + 0.033*"hype" + 0.031*"goty" + 0.028*"sell" + '
  '0.028*"launch" + 0.025*"thumb" + 0.016*"click" + 0.012*"alternate" + '
  '0.007*"arcade" + 0.006*"objectively"'),
 (4,
  '0.123*"fucking" + 0.032*"soulsbourne" + 0.025*"child" + 0.017*"twink" + '
  '0.012*"glitch" + 0.011*"scene" + 0.010*"edge" + 0.010*"son" + '
  '0.008*"successor" + 0.008*"com"'),
 (5,
  '0.069*"weapon" + 0.022*"multiplayer" + 0.021*"item" + 0.019*"cry" + '

In [12]:
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

coherence_model_lda = CoherenceModel(model=lda_model, texts=lop_reviews['data_ready'], dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.6123104194152456

Coherence Score:  0.48523619647351834


# Hyperparameter Tuning

In [None]:
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, # changed the LDA method to Rolly John's
                                                id2word=dictionary,
                                                num_topics=k, 
                                                random_state=100,
                                                chunksize=100,
                                                passes=10,
                                                alpha=a,
                                                eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=lop_reviews['data_ready'], dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

In [None]:
grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)), 
               corpus]

corpus_title = ['75% Corpus', '100% Corpus']

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=(len(beta)*len(alpha)*len(topics_range)*len(corpus_title)))
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, 
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
                    pbar.update(1)

    # pd.DataFrame(model_results).to_csv('./results/lda_tuning_results.csv', index=False)
                    
    lop_lda_tuning_results = pd.DataFrame(model_results)

    file_name = 'lop-lda-tuning-results3.csv'

    lop_lda_tuning_results.to_csv(f'{file_name}', index = False)
    output_filepath = f'/Users/javier/Desktop/cmgt-597/{file_name}'
    lop_lda_tuning_results.to_csv(output_filepath, index = False)
    
    pbar.close()

In [None]:
lop_lda_tuning_results = pd.read_csv('/Users/javier/Desktop/cmgt-597/lies-of-p/lop-lda-tuning-results3.csv')
lop_lda_tuning_results

# Final Model

In [13]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=7, 
                                            random_state=100,
                                            chunksize=100,
                                            passes=10,
                                            alpha=0.9,
                                            eta=0.9)
                                            #the alpha and beta will yield the highest coherence score, according to the tuning results
                                            

pprint(lda_model.print_topics())

[(0,
  '0.014*"money" + 0.011*"grind" + 0.011*"ass" + 0.005*"bad" + 0.004*"average" '
  '+ 0.003*"brain" + 0.003*"life" + 0.002*"waste" + 0.002*"attract" + '
  '0.002*"paint"'),
 (1,
  '0.004*"home" + 0.002*"prompt" + 0.001*"gud" + 0.001*"mom" + 0.001*"attract" '
  '+ 0.001*"multiplayer" + 0.001*"dummy" + 0.001*"tr" + 0.001*"playstation" + '
  '0.001*"table"'),
 (2,
  '0.001*"attract" + 0.001*"wasd" + 0.001*"dummy" + 0.001*"multiplayer" + '
  '0.001*"que" + 0.001*"crybaby" + 0.001*"vous" + 0.001*"vanilla" + '
  '0.001*"rofl" + 0.001*"lame"'),
 (3,
  '0.013*"puppet" + 0.007*"edition" + 0.006*"receive" + 0.006*"deluxe" + '
  '0.006*"gepetto" + 0.005*"mischievous" + 0.005*"boy" + 0.005*"promise" + '
  '0.003*"order" + 0.002*"twink"'),
 (4,
  '0.003*"nose" + 0.003*"grow" + 0.001*"attract" + 0.001*"multiplayer" + '
  '0.001*"dummy" + 0.001*"crybaby" + 0.001*"dad" + 0.001*"rofl" + '
  '0.001*"vanilla" + 0.001*"penis"'),
 (5,
  '0.081*"game" + 0.031*"soul" + 0.027*"good" + 0.021*"play" + '
  

In [14]:
# Final Model Evaluation
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

coherence_model_lda = CoherenceModel(model=lda_model, texts=lop_reviews['data_ready'], dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.351645891184366

Coherence Score:  0.5128438546016286


# Topic-Review Allocation

In [15]:
def format_topics_sentences(ldamodel=None, corpus=None, texts=None):
    # Initialize an empty list to store data
    data = []

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        
        # Initialize variables to store topic information
        dominant_topic = None
        perc_contribution = None
        topic_keywords = None
        
        # Loop through topics in the row
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                dominant_topic = int(topic_num)
                perc_contribution = round(prop_topic, 4)
                break
        
        # Store the extracted information
        data.append([dominant_topic, perc_contribution, topic_keywords])

    # Convert the list of data into a DataFrame
    sent_topics_df = pd.DataFrame(data, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Add original text to the end of the output
    contents = pd.Series(texts)
    contents.reset_index(drop=True, inplace=True)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df

# Call the function with appropriate arguments
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=lop_reviews.review)

# Format the DataFrame
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'review']

# Display the first 5 rows of the DataFrame
display(df_dominant_topic.head(5))


Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,review
0,0,5,0.5692,"game, soul, good, play, bloodborne, story, wel...",the closest we're getting for a bloodborne gam...
1,1,5,0.203,"game, soul, good, play, bloodborne, story, wel...",Experience the horror of being french
2,2,5,0.3042,"game, soul, good, play, bloodborne, story, wel...",all right then. keep your Bloodborne Sony.
3,3,5,0.3836,"game, soul, good, play, bloodborne, story, wel...",They really went fine i'll make Bloodborne on ...
4,4,6,0.2551,"game, boss, attack, get, time, weapon, enemy, ...",From the alternate reality where Miyazaki rea...


In [16]:
lop_lda = df_dominant_topic.merge(lop_reviews, how='left')
lop_lda.drop_duplicates(subset=['Document_No'], keep='first', inplace=True)

In [17]:
lop_lda = lop_lda[['Document_No', 'review', 'data_words', 'data_ready', 'Keywords', 'Dominant_Topic', 'Topic_Perc_Contrib', 'voted_up']]

In [18]:
# final dataframe
lop_lda

Unnamed: 0,Document_No,review,data_words,data_ready,Keywords,Dominant_Topic,Topic_Perc_Contrib,voted_up
0,0,the closest we're getting for a bloodborne gam...,"[the, closest, we, re, getting, for, bloodborn...","[close, get, bloodborne, game, pc, good, game]","game, soul, good, play, bloodborne, story, wel...",5,0.5692,True
1,1,Experience the horror of being french,"[experience, the, horror, of, being, french]","[experience, french]","game, soul, good, play, bloodborne, story, wel...",5,0.2030,True
2,2,all right then. keep your Bloodborne Sony.,"[all, right, then, keep, your, bloodborne, sony]","[keep, bloodborne]","game, soul, good, play, bloodborne, story, wel...",5,0.3042,True
3,3,They really went fine i'll make Bloodborne on ...,"[they, really, went, fine, ll, make, bloodborn...","[really, go, fine, make, bloodborne, pc]","game, soul, good, play, bloodborne, story, wel...",5,0.3836,True
4,4,From the alternate reality where Miyazaki rea...,"[from, the, alternate, reality, where, miyazak...","[alternate, reality, miyazaki, read, instead]","game, boss, attack, get, time, weapon, enemy, ...",6,0.2551,True
...,...,...,...,...,...,...,...,...
4940,4302,Just finished this game. I have played Elden R...,"[just, finished, this, game, have, played, eld...","[finish, game, play, elden, ring, bloodborne, ...","game, boss, attack, get, time, weapon, enemy, ...",6,0.8913,True
4941,4303,fucking trash ass game DO NOT BUY UNLESS YOU W...,"[fucking, trash, ass, game, do, not, buy, unle...","[trash, ass, game, buy, want, waste, money]","money, grind, ass, bad, average, brain, life, ...",0,0.3126,False
4942,4304,A SOULS GAME IN 2023 WITH NO MULTIPLAYER. BIG L.,"[souls, game, in, with, no, multiplayer, big]","[game, multiplayer, big]","game, boss, attack, get, time, weapon, enemy, ...",6,0.2670,False
4943,4305,----------------Edited Review after winning th...,"[edited, review, after, winning, the, game, lo...","[edit, review, win, game, lol, lot, butt, hurt...","game, boss, attack, get, time, weapon, enemy, ...",6,0.9742,False


In [None]:
file_name = 'lop-lda-v4.csv'

lop_lda.to_csv(f'{file_name}', index = False)
output_filepath = f'/Users/javier/Desktop/cmgt-597/{file_name}'
lop_lda.to_csv(output_filepath, index = False)

# Data Analysis

In [19]:
lop_lda_final = pd.read_csv('/Users/javier/Desktop/cmgt-597/lies-of-p/lop-lda-v4.csv')
lop_lda_final

Unnamed: 0,Document_No,review,data_words,data_ready,Keywords,Dominant_Topic,Topic_Perc_Contrib,voted_up
0,0,the closest we're getting for a bloodborne gam...,"['the', 'closest', 'we', 're', 'getting', 'for...","['close', 'get', 'bloodborne', 'game', 'pc', '...","game, soul, good, play, bloodborne, story, wel...",6,0.5657,True
1,1,Experience the horror of being french,"['experience', 'the', 'horror', 'of', 'being',...","['experience', 'french']","game, soul, good, play, bloodborne, story, wel...",6,0.1949,True
2,2,all right then. keep your Bloodborne Sony.,"['all', 'right', 'then', 'keep', 'your', 'bloo...","['keep', 'bloodborne']","game, soul, good, play, bloodborne, story, wel...",6,0.2937,True
3,3,They really went fine i'll make Bloodborne on ...,"['they', 'really', 'went', 'fine', 'll', 'make...","['really', 'go', 'fine', 'make', 'bloodborne',...","game, soul, good, play, bloodborne, story, wel...",6,0.3525,True
4,4,From the alternate reality where Miyazaki rea...,"['from', 'the', 'alternate', 'reality', 'where...","['alternate', 'reality', 'miyazaki', 'read', '...","game, boss, get, attack, time, weapon, enemy, ...",4,0.2619,True
...,...,...,...,...,...,...,...,...
4302,4302,Just finished this game. I have played Elden R...,"['just', 'finished', 'this', 'game', 'have', '...","['finish', 'game', 'play', 'elden', 'ring', 'b...","game, boss, get, attack, time, weapon, enemy, ...",4,0.9015,True
4303,4303,fucking trash ass game DO NOT BUY UNLESS YOU W...,"['fucking', 'trash', 'ass', 'game', 'do', 'not...","['trash', 'ass', 'game', 'buy', 'want', 'waste...","money, grind, puppet, ass, gepetto, edition, r...",0,0.2757,False
4304,4304,A SOULS GAME IN 2023 WITH NO MULTIPLAYER. BIG L.,"['souls', 'game', 'in', 'with', 'no', 'multipl...","['game', 'multiplayer', 'big']","game, boss, get, attack, time, weapon, enemy, ...",4,0.2771,False
4305,4305,----------------Edited Review after winning th...,"['edited', 'review', 'after', 'winning', 'the'...","['edit', 'review', 'win', 'game', 'lol', 'lot'...","game, boss, get, attack, time, weapon, enemy, ...",4,0.9797,False


In [20]:
# distribution of topics across reviews

topic_count = lop_lda_final['Dominant_Topic'].value_counts()
topic_count

Dominant_Topic
6    2437
4    1422
0     286
3      86
2      38
1      22
5      16
Name: count, dtype: int64

In [21]:
# Visualize topics

import pyLDAvis
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

---