In [46]:
# Importing modules
import pandas as pd
import os
import re
# Move the working directory up one level (presumably so that relative paths
# resolve from the parent folder - TODO confirm; the load cell uses an absolute path).
# NOTE(review): not idempotent - every re-run of this cell moves up one more level.
os.chdir('..')

In [47]:
# Load the critic review data (semicolon-separated CSV) into a DataFrame.
# NOTE(review): the path is absolute and specific to one machine.
critic_reviews_csv = r'C:\Users\koenm\OneDrive\Bureaublad\Thesis\Metacritic_thesis\4 Topic_modeling\Topic modelling input\criticcleanforlda.csv'
criticreviews = pd.read_csv(critic_reviews_csv, delimiter=";")

# Show the first rows as a quick sanity check
criticreviews.head()

Unnamed: 0,Document_No,url,critic_review_body
0,0,https://www.metacritic.com/music/the-order-of-...,followup s sublime pushin stone finds valerie ...
1,1,https://www.metacritic.com/music/the-order-of-...,june ingénue girldonewrong persona creator spe...
2,2,https://www.metacritic.com/music/the-order-of-...,order time unique presents confident dynamic s...
3,3,https://www.metacritic.com/music/the-order-of-...,triumph form order time completely idiosyncrat...
4,4,https://www.metacritic.com/music/the-order-of-...,times atmospheric grounded mysticism astral pl...


In [58]:
#Prepare data for LDA analysis: the reviews have already been preprocessed, therefore no further cleaning is needed

In [48]:
import gensim
import nltk
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available locally
nltk.download('stopwords')

# English stop words from NLTK, followed by a few extra tokens to filter out
stop_words = [*stopwords.words('english'), 'from', 'subject', 're', 'edu', 'use']

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\koenm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [49]:
def sent_to_words(sentences):
    """Lazily tokenise each item of ``sentences``.

    Every item is converted to ``str`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True`` (which strips
    accent marks from the tokens); one token list is yielded per item.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )
def remove_stopwords(texts):
    """Return one token list per document with all stop words removed.

    Each document is converted to ``str`` and re-tokenised with
    ``simple_preprocess`` before filtering, so both raw strings and lists of
    tokens are accepted (a token list is tokenised again via its string
    representation).

    Parameters
    ----------
    texts : iterable
        Documents, either raw strings or lists of tokens.

    Returns
    -------
    list of list of str
        The tokens of every document that are not in the module-level
        ``stop_words`` collection, in their original order.
    """
    # Build the lookup set once: membership tests against the stop-word list
    # would otherwise scan the whole list for every single token.
    stop_word_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_word_set]
        for doc in texts
    ]
# Pull the review bodies out of the DataFrame as a plain Python list
data = criticreviews['critic_review_body'].values.tolist()

# Tokenise every review, then drop the stop words from the token lists
data_words = remove_stopwords(list(sent_to_words(data)))

# Peek at (up to) the first 30 tokens of the first review
print(data_words[0][:30])

['followup', 'sublime', 'pushin', 'stone', 'finds', 'valerie', 'june', 'expanding', 'unique', 'blend', 'blues', 'soul', 'mountain', 'music', 'create', 'distinctive', 'hybrid', 'past', 'future', 'coalesce', 'gentle', 'power']


In [50]:
import gensim.corpora as corpora
# Dictionary mapping every token in the reviews to an integer id
id2word = corpora.Dictionary(data_words)

# The tokenised reviews are the texts the corpus is built from
texts = data_words

# Bag-of-words form of every review: a list of (token_id, count) pairs
corpus = list(map(id2word.doc2bow, texts))

# Show (up to) the first 30 pairs of the first review
print(corpus[0][:30])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1)]


In [51]:
from pprint import pprint

# Number of topics the LDA model is asked to find
# (passed as `num_topics` when the model is built).
num_topics = 4

In [52]:
# Build LDA model
# A fixed random_state seeds the model's random number generator so that
# re-running the notebook does not produce a completely different set of topics.
# NOTE(review): multi-worker training may still cause small run-to-run
# differences - confirm against the gensim documentation if exact
# reproducibility is required.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=42)

In [53]:
# Print the top keywords (with their weights) of each of the `num_topics` topics
pprint(lda_model.print_topics())
# Apply the trained model to the corpus; iterating the result yields the
# topic distribution of one document at a time
doc_lda = lda_model[corpus]

[(0,
  '0.011*"album" + 0.009*"like" + 0.008*"one" + 0.007*"still" + 0.006*"songs" '
  '+ 0.005*"new" + 0.005*"sound" + 0.005*"band" + 0.005*"even" + '
  '0.005*"albums"'),
 (1,
  '0.010*"album" + 0.010*"music" + 0.008*"may" + 0.007*"record" + 0.006*"best" '
  '+ 0.006*"pop" + 0.004*"like" + 0.004*"sound" + 0.004*"yet" + '
  '0.004*"theres"'),
 (2,
  '0.012*"like" + 0.010*"album" + 0.006*"time" + 0.005*"tracks" + '
  '0.005*"record" + 0.005*"songs" + 0.004*"theres" + 0.004*"music" + '
  '0.004*"one" + 0.004*"new"'),
 (3,
  '0.020*"album" + 0.011*"songs" + 0.006*"one" + 0.006*"much" + 0.005*"record" '
  '+ 0.005*"like" + 0.005*"band" + 0.004*"sound" + 0.004*"little" + '
  '0.004*"good"')]


In [65]:
# 6. What is the dominant topic, and its percentage contribution, in each document?

In [66]:
def format_topics_sentences(ldamodel=None, corpus=None, texts=None):
    """Find the dominant topic of every document and attach the document text.

    Parameters
    ----------
    ldamodel : trained LDA model, optional
        Must support ``ldamodel[corpus]``, ``ldamodel.show_topic(topic_id)``
        and expose ``per_word_topics``. When omitted, the notebook-level
        ``lda_model`` is used.
    corpus : iterable of bag-of-words documents, optional
        When omitted, the notebook-level ``corpus`` is used.
    texts : pandas.Series or sequence, optional
        Original text of every document, in the same order as ``corpus``.
        When omitted, ``criticreviews['critic_review_body']`` is used.

    Returns
    -------
    pandas.DataFrame
        One row per document with the columns ``Dominant_Topic``,
        ``Perc_Contribution`` (rounded to 4 decimals) and ``Topic_Keywords``,
        followed by the text column. A document for which the model returns
        no topic at all gets NaN / an empty keyword string, so rows always
        stay aligned with ``texts``.
    """
    # Resolve omitted arguments at call time, so the function always sees the
    # current notebook objects rather than whatever existed when it was defined.
    if ldamodel is None:
        ldamodel = lda_model
    if corpus is None:
        corpus = globals()['corpus']
    if texts is None:
        texts = globals()['criticreviews']['critic_review_body']

    keyword_cache = {}  # topic id -> "word, word, ..." (identical for every document)
    records = []

    # Get main topic in each document
    for row_list in ldamodel[corpus]:
        # With per_word_topics the model returns a tuple whose first element
        # is the list of (topic id, probability) pairs.
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if not row:
            # Keep a placeholder so the following rows do not shift up.
            records.append((float('nan'), float('nan'), ''))
            continue

        # max() returns the first pair with the highest probability, which is
        # the same pair a stable descending sort would put first.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        if topic_num not in keyword_cache:
            keyword_cache[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        records.append((int(topic_num), round(prop_topic, 4), keyword_cache[topic_num]))

    # Build the frame once; growing a DataFrame row by row is quadratic and
    # DataFrame.append no longer exists in pandas 2.x.
    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    )

    # Add original text to the end of the output. The index is reset so the
    # texts are matched to the corpus by position, not by index label.
    contents = pd.Series(texts).reset_index(drop=True)
    return pd.concat([sent_topics_df, contents], axis=1)

In [55]:
# Dominant topic, its contribution and its keywords for every review
df_topic_sents_keywords = format_topics_sentences(
    ldamodel=lda_model,
    corpus=corpus,
    texts=criticreviews['critic_review_body'],
)

# Turn the row index into a column and give every column a readable label
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = [
    'Document_No',
    'Dominant_Topic',
    'Topic_Perc_Contrib',
    'Keywords',
    'Text',
]
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,8.0,0.9594,"mountain, expanding, future, june, unique, fin...",followup s sublime pushin stone finds valerie ...
1,1,12.0,0.9451,"sharp, world, girldonewrong, wickedly, creator...",june ingénue girldonewrong persona creator spe...
2,2,10.0,0.9333,"dynamic, performer, influences, presents, back...",order time unique presents confident dynamic s...
3,3,11.0,0.9333,"roots, completely, form, beholden, order, take...",triumph form order time completely idiosyncrat...
4,4,6.0,0.9641,"voice, time, soul, door, grounded, strings, es...",times atmospheric grounded mysticism astral pl...
