In [159]:
# --- Run configuration -------------------------------------------------------
# Number of topics the topic models are asked to extract from the data.
num_of_topics = 50
# Number of representative narratives to report for each topic.
num_of_representatives = 8
# Name of the data set (currently one per language); it selects the input and
# output sub-folders and prefixes the generated file names.
dataName = 'english'
# For output
import os
import pathlib
import csv


import re
import numpy as np
import pandas as pd
from pprint import pprint
import nltk; 
nltk.download('stopwords')

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import matplotlib.pyplot as plt
import wordcloud # Package by Andreas Mueller

# General usage
import math

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [160]:
# Prepare the output folder, according to the input data's language type.
# os.makedirs creates 'output' and the per-data-set sub-folder in one call, and
# exist_ok=True makes the cell safe to re-run (no separate exists() check that
# could go stale between the test and the mkdir).
outputPath = 'output/' + dataName
os.makedirs(outputPath, exist_ok=True)
# Keep the trailing slash: later cells build file names by plain concatenation.
outputPath = outputPath + '/'

In [161]:
# Folder holding the input data for the selected data set. The trailing slash
# is part of the value because file names are appended to it directly.
inputPath = 'input/{}/'.format(dataName)

In [162]:
%matplotlib inline

In [163]:
import logging
# Configure the root logger: timestamped "<time> : <level> : <message>" lines,
# and level=ERROR so that only ERROR and CRITICAL records are emitted.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

In [164]:
import warnings
# Silence every warning whose category is DeprecationWarning from here on.
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [165]:
from nltk.corpus import stopwords
# NLTK stores its stop-word lists under lowercase file ids ('english'). The
# capitalised 'English' only resolves on case-insensitive filesystems, so the
# lowercase id is used to keep the notebook portable.
stop_words = stopwords.words('english')
# Extra tokens to drop in addition to NLTK's list.
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [166]:
# Load '<dataName>_data.json' from the input folder into a DataFrame.
df = pd.read_json('{}{}_data.json'.format(inputPath, dataName))
# Show the distinct narrative texts, then preview the first rows of the frame.
print(df['Narrative'].unique())
df.head()

['a feeling of overwhelmness and a feeling of spiritual sensation, a feeling of calm and serenity and being the only being in the whole universe'
 'A beautiful landscape on a fine day looking across at old vintage planes flying through the air'
 'The day my father was arrested by the police. It was an unpleasant experience. He was an alcoholic. He had verbally threatened to kill my mother repeatedly, I had no choice but to call n the authorities. My brothers were all busy with their own lives, working. In the end, I had to restrain my father to insert a handcuff on him. It was a good lesson.'
 'Graduated from university'
 'Seeing my mother hooked up to tubes after an operation for cancer. My mother had a life of pain, setbacks & hurt. She was born premature in 1931, contracted scarlet fever, rheumatic fever, whooping cough & the other"usual" childhood illnesses. She passed on rhuematic fever to her brother who was 5 years older. She adored her brother. He had a weak heart & it was weak

 'WHEN I SAW THIS BEAUTIFUL GIRL I WAS ABOUT TO BANG.  I WAS AT HOUSE APARTMENT AND SHE HAD A DECENT FACE AND A PHAT ASS.  AFTER WE TOOK OFF OUR CLOTHES WE BEGIN TO HAVE SEX.  IT WAS ONE OF THE BEST EXPERIENCES OF MY LIFE.']


Unnamed: 0,Unnamed: 1,Age,AgeNow,AgeThen,BirthPlace,Body_X1,Body_X2,Body_X3,Body_X4,Body_X5,...,X.7,X.8,X.9,code,country,gc,number,opp,rid,term
0,2,55,55,5,28,92.0,83.0,92.0,294.0,285.0,...,55,,,1,UK,1,107,Qual961-0923Countries,1309002101,
1,3,58,58,6,5,290.0,90.0,287.0,287.0,287.0,...,58,,,1,Australia,1,110,Qual961-0923Countries,1316625902,
2,4,25,25,2,22,266.0,306.0,113.0,37.0,145.0,...,25,,,1,Singapore,1,111,Qual961-0923Countries,1320724474,
3,5,35,35,4,22,88.169998,290.79,82.37,296.05,78.16,...,35,,,1,Singapore,1,112,Qual961-0923Countries,1184419179,
4,6,55,55,3,5,235.0,332.0,295.0,270.0,317.0,...,55,,,1,Australia,1,114,Qual961-0923Countries,1261773742,


In [167]:
# Convert to list: use the original narratives when the first row's X column
# equals 'English', otherwise use the translated text.
if df.X.values[0] == 'English':
    data = df.Narrative.values.tolist()
else:
    data = df.translation.values.tolist()

# The patterns below are raw strings so that \S and \s reach the regex engine
# verbatim. In a normal string they are invalid escape sequences, which Python
# flags with a warning that this notebook's DeprecationWarning filter hides.

# Remove Emails (any token containing '@', plus one trailing whitespace char)
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters (every whitespace run collapses to one space)
data = [re.sub(r'\s+', ' ', sent) for sent in data]

# Remove single quotes
data = [sent.replace("'", "") for sent in data]

pprint(data[:1])

['a feeling of overwhelmness and a feeling of spiritual sensation, a feeling '
 'of calm and serenity and being the only being in the whole universe']


In [168]:
def sent_to_words(sentences):
    """Lazily tokenise each sentence with gensim's ``simple_preprocess``.

    Every item is converted with ``str()`` first, then tokenised with
    ``deacc=True`` (gensim's option for stripping accent marks).

    Yields:
        One token list per input sentence, in input order.
    """
    for raw_text in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        yield tokens

# Materialise the generator so later cells can index and re-iterate the tokens.
data_words = [tokens for tokens in sent_to_words(data)]

# Preview the first tokenised narrative.
print(data_words[0:1])

[['feeling', 'of', 'overwhelmness', 'and', 'feeling', 'of', 'spiritual', 'sensation', 'feeling', 'of', 'calm', 'and', 'serenity', 'and', 'being', 'the', 'only', 'being', 'in', 'the', 'whole', 'universe']]


In [169]:
# Build bigram and trigram phrase detectors.
# Bigram model first, together with its Phraser wrapper.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# The trigram model is trained on the corpus after bigram merging.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Sanity check: run the first narrative through both phrasers.
first_doc = data_words[0]
print(trigram_mod[bigram_mod[first_doc]])



['feeling', 'of', 'overwhelmness', 'and', 'feeling', 'of', 'spiritual', 'sensation', 'feeling', 'of', 'calm', 'and', 'serenity', 'and', 'being', 'the', 'only', 'being', 'in', 'the', 'whole', 'universe']


In [170]:
def remove_stopwords(texts):
    """Return every document re-tokenised with stop words removed.

    Each document is converted with ``str()`` and passed through
    ``simple_preprocess`` again, then tokens found in the module-level
    ``stop_words`` list are dropped.

    Args:
        texts: iterable of documents (token lists in this notebook).

    Returns:
        A list with one filtered token list per document.
    """
    # Snapshot the list as a set once per call: membership tests become O(1)
    # instead of scanning the whole stop-word list for every token.
    blocked = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in blocked] for doc in texts]

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` phraser to every document.

    Returns a list with one transformed document per input document.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document.

    Returns a list with one transformed document per input document.
    """
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatise tokenised documents, keeping only selected parts of speech.

    Each document's tokens are joined with single spaces and run through the
    module-level spaCy pipeline ``nlp``, which must be assigned before this
    function is called. Tag names: https://spacy.io/api/annotation

    Args:
        texts: iterable of token lists.
        allowed_postags: coarse POS tags (``token.pos_``) to keep. The default
            is an immutable tuple so it cannot be altered between calls.

    Returns:
        A list with, per document, the lemmas of the tokens whose POS tag is
        in ``allowed_postags``.
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [171]:
# Remove stop words
data_words_nostops = remove_stopwords(data_words)

# Merge detected bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Load the English spaCy pipeline without the parser and NER components.
# The 'en' shortcut is tried first so existing setups behave exactly as before;
# spaCy v3 removed shortcut links and raises OSError for it, in which case the
# full package name is used instead.
# NOTE(review): assumes 'en' pointed at en_core_web_sm - confirm for this setup.
try:
    nlp = spacy.load('en', disable=['parser', 'ner'])
except OSError:
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['feel', 'overwhelmness', 'feel', 'spiritual', 'sensation', 'feel', 'calm', 'serenity', 'whole', 'universe']]


In [172]:
# The lemmatised documents are the training texts.
texts = data_lemmatized

# Dictionary built from those documents (token <-> integer id).
id2word = corpora.Dictionary(texts)

# Bag-of-words form of every document, as produced by Dictionary.doc2bow.
corpus = list(map(id2word.doc2bow, texts))

print(corpus[0:1])

[[(0, 1), (1, 3), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]]


In [173]:
# Train gensim's LDA model on the bag-of-words corpus.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_of_topics,
    passes=30,
    chunksize=100,
    update_every=1,
    alpha='auto',
    per_word_topics=False,
    random_state=100,  # fixed seed
)

In [174]:
# How many keywords to list for each topic.
num_of_words_per_topic = 15

# Topic distribution of every document under the gensim LDA model.
doc_lda = lda_model[corpus]

# Formatted '<weight>*"<word>" + ...' description of every topic.
topics = lda_model.print_topics(num_topics=num_of_topics,
                                num_words=num_of_words_per_topic)
pprint(topics)

[(0,
  '0.100*"always" + 0.095*"cry" + 0.068*"parent" + 0.062*"keep" + 0.054*"fear" '
  '+ 0.027*"mom" + 0.022*"serious" + 0.020*"recall" + 0.018*"though" + '
  '0.015*"carry" + 0.014*"petty" + 0.013*"ward" + 0.012*"mother" + 0.011*"egg" '
  '+ 0.011*"neighbour"'),
 (1,
  '0.121*"love" + 0.109*"home" + 0.081*"would" + 0.080*"hospital" + '
  '0.041*"everything" + 0.041*"come" + 0.035*"girl" + 0.029*"completely" + '
  '0.026*"forever" + 0.025*"return" + 0.017*"nearly" + 0.015*"s" + '
  '0.012*"ever" + 0.012*"watch" + 0.011*"college"'),
 (2,
  '0.074*"happy" + 0.060*"old" + 0.058*"joy" + 0.052*"sit" + 0.044*"wonderful" '
  '+ 0.030*"bring" + 0.025*"whole" + 0.025*"rock" + 0.023*"probably" + '
  '0.023*"get" + 0.021*"start" + 0.020*"different" + 0.019*"back" + '
  '0.019*"cry" + 0.018*"use"'),
 (3,
  '0.067*"early" + 0.062*"begin" + 0.059*"play" + 0.038*"watch" + '
  '0.036*"match" + 0.036*"pretty" + 0.034*"final" + 0.034*"speak" + '
  '0.029*"rest" + 0.023*"family" + 0.022*"finish" + 0.02

  '0.047*"seat" + 0.038*"sea" + 0.021*"driving" + 0.016*"film" + 0.014*"fire" '
  '+ 0.013*"south_africa" + 0.013*"form" + 0.012*"release" + 0.011*"province" '
  '+ 0.010*"reason" + 0.010*"entire" + 0.010*"speed" + 0.010*"factory" + '
  '0.009*"castle" + 0.009*"afterward"'),
 (31,
  '0.186*"not" + 0.139*"do" + 0.062*"big" + 0.044*"long" + 0.040*"know" + '
  '0.035*"would" + 0.033*"become" + 0.033*"realise" + 0.022*"can" + '
  '0.020*"remember" + 0.019*"amazed" + 0.018*"awed" + 0.018*"scar" + '
  '0.015*"important" + 0.012*"create"'),
 (32,
  '0.180*"world" + 0.104*"be" + 0.057*"state" + 0.048*"sense" + 0.034*"not" + '
  '0.031*"stop" + 0.023*"arm" + 0.022*"body" + 0.022*"immediately" + '
  '0.018*"incredible" + 0.017*"peace" + 0.016*"notify" + 0.013*"definitely" + '
  '0.013*"imagine" + 0.012*"young"'),
 (33,
  '0.119*"never" + 0.064*"find" + 0.045*"school" + 0.042*"look" + '
  '0.039*"water" + 0.034*"hit" + 0.027*"forget" + 0.026*"lose" + 0.025*"let" + '
  '0.025*"ask" + 0.021*"finall

In [175]:
# c_v coherence of the gensim LDA model, measured on the lemmatised texts.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.4462622006704351


In [176]:
# Location of the MALLET launcher. Set the MALLET_PATH environment variable to
# point at a local install; the previously hard-coded location is the fallback,
# so existing setups keep working unchanged.
mallet_path = os.environ.get('MALLET_PATH', 'C:/mallet-2.0.8/mallet-2.0.8/bin/mallet')
# NOTE(review): gensim.models.wrappers is not available in gensim >= 4.0;
# confirm the installed gensim version provides it.
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_of_topics, id2word=id2word)

In [177]:
# Compute the c_v coherence score of the MALLET model on the lemmatised texts.
coherence_model_ldamallet = CoherenceModel(
    model=ldamallet,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)


Coherence Score:  0.5156447758445595


In [178]:
# Replace `topics` with the MALLET model's formatted topic descriptions;
# the following cells parse this value.
topics = ldamallet.print_topics(num_topics=num_of_topics,
                                num_words=num_of_words_per_topic)
pprint(topics)

[(0,
  '0.148*"love" + 0.105*"feel" + 0.052*"remember" + 0.048*"thought" + '
  '0.048*"heart" + 0.048*"time" + 0.033*"turn" + 0.019*"christ" + 0.019*"tree" '
  '+ 0.014*"league" + 0.014*"prior" + 0.014*"sing" + 0.014*"center" + '
  '0.014*"wrong" + 0.010*"constant"'),
 (1,
  '0.099*"walk" + 0.077*"stop" + 0.077*"fall" + 0.059*"realize" + '
  '0.054*"morning" + 0.027*"ground" + 0.027*"leg" + 0.027*"hill" + '
  '0.023*"step" + 0.023*"leave" + 0.023*"scene" + 0.023*"vast" + '
  '0.018*"duckling" + 0.018*"competition" + 0.018*"idea"'),
 (2,
  '0.101*"parent" + 0.072*"witness" + 0.068*"year" + 0.053*"learn" + '
  '0.048*"student" + 0.039*"space" + 0.034*"achieve" + 0.034*"care" + '
  '0.029*"build" + 0.024*"study" + 0.024*"powerful" + 0.024*"wombat" + '
  '0.024*"person" + 0.019*"social" + 0.019*"nervous"'),
 (3,
  '0.217*"awe" + 0.081*"happy" + 0.045*"complete" + 0.041*"forget" + '
  '0.032*"machine" + 0.027*"awed" + 0.023*"level" + 0.018*"party" + '
  '0.018*"moon" + 0.018*"fiance" + 0.01

  '0.369*"life" + 0.095*"god" + 0.041*"church" + 0.041*"begin" + '
  '0.033*"strong" + 0.029*"change" + 0.025*"presence" + 0.025*"inspiring" + '
  '0.025*"pray" + 0.017*"couple" + 0.012*"fall" + 0.012*"lift" + '
  '0.012*"master" + 0.008*"bug" + 0.008*"lanka"'),
 (31,
  '0.109*"intense" + 0.100*"live" + 0.050*"great" + 0.045*"high" + '
  '0.040*"local" + 0.040*"film" + 0.035*"town" + 0.025*"cape" + '
  '0.020*"south_africa" + 0.020*"talent" + 0.020*"group" + 0.020*"blow" + '
  '0.015*"hook" + 0.015*"toilet" + 0.015*"release"'),
 (32,
  '0.242*"amazing" + 0.050*"fact" + 0.046*"totally" + 0.041*"travel" + '
  '0.037*"concert" + 0.027*"holiday" + 0.027*"wall" + 0.023*"fish" + '
  '0.023*"island" + 0.018*"kiss" + 0.018*"watch" + 0.018*"voice" + '
  '0.014*"happen" + 0.014*"memorial" + 0.014*"activity"'),
 (33,
  '0.102*"hospital" + 0.093*"sister" + 0.074*"cry" + 0.047*"test" + '
  '0.042*"mom" + 0.042*"arrive" + 0.028*"miracle" + 0.028*"section" + '
  '0.023*"natural" + 0.019*"previous" + 

In [179]:
# Prepare the keywords and percentages for future usage.
# allKeywords[i]    -> list of keyword strings for topic i
# allPercentages[i] -> list of weight strings for topic i (same order)
# chunks[i]         -> CSV row: [topic no., weight, keyword, weight, keyword, ...]
#
# Each weight is captured together with the keyword it multiplies
# (the '<weight>*"<keyword>"' terms of the topic string). Parsing the two
# independently would misalign them if a keyword contained digits, and would
# fail when a topic has fewer than num_of_words_per_topic terms.
weighted_term = re.compile(r'([-+]?\d*\.\d+|\d+)\*"([^"]*)"')

chunks = [None] * num_of_topics
allKeywords = [None] * num_of_topics
allPercentages = [None] * num_of_topics
for index, topic in enumerate(topics):
    chunk = topic[1]  # the formatted topic string; topic[0] is the topic number
    pairs = weighted_term.findall(chunk)
    percentages = [weight for weight, _ in pairs]
    keywords = [keyword for _, keyword in pairs]
    allKeywords[index] = keywords
    allPercentages[index] = percentages
    # Flatten to [topic no., weight, keyword, weight, keyword, ...].
    result = [str(index)] + [item for pair in pairs for item in pair]
    chunks[index] = result
print(chunks)
    


[['0', '0.148', 'love', '0.105', 'feel', '0.052', 'remember', '0.048', 'thought', '0.048', 'heart', '0.048', 'time', '0.033', 'turn', '0.019', 'christ', '0.019', 'tree', '0.014', 'league', '0.014', 'prior', '0.014', 'sing', '0.014', 'center', '0.014', 'wrong', '0.010', 'constant'], ['1', '0.099', 'walk', '0.077', 'stop', '0.077', 'fall', '0.059', 'realize', '0.054', 'morning', '0.027', 'ground', '0.027', 'leg', '0.027', 'hill', '0.023', 'step', '0.023', 'leave', '0.023', 'scene', '0.023', 'vast', '0.018', 'duckling', '0.018', 'competition', '0.018', 'idea'], ['2', '0.101', 'parent', '0.072', 'witness', '0.068', 'year', '0.053', 'learn', '0.048', 'student', '0.039', 'space', '0.034', 'achieve', '0.034', 'care', '0.029', 'build', '0.024', 'study', '0.024', 'powerful', '0.024', 'wombat', '0.024', 'person', '0.019', 'social', '0.019', 'nervous'], ['3', '0.217', 'awe', '0.081', 'happy', '0.045', 'complete', '0.041', 'forget', '0.032', 'machine', '0.027', 'awed', '0.023', 'level', '0.018', '




In [180]:
# Preview the parsed keywords and weights of topic 0, one list per line.
print(allKeywords[0], allPercentages[0], sep='\n')

['love', 'feel', 'remember', 'thought', 'heart', 'time', 'turn', 'christ', 'tree', 'league', 'prior', 'sing', 'center', 'wrong', 'constant']
['0.148', '0.105', '0.052', '0.048', '0.048', '0.048', '0.033', '0.019', '0.019', '0.014', '0.014', '0.014', '0.014', '0.014', '0.010']


In [181]:
# CSV header: 'Topic No.' followed by one ('Percentage', 'Keyword') column pair
# per keyword.
header = ['Topic No.'] + ['Percentage', 'Keyword'] * num_of_words_per_topic

# Write the header and one row per topic to '<outputPath><dataName>_topics.csv'.
topicsPath = '{}{}_topics.csv'.format(outputPath, dataName)
with open(topicsPath, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(chunks)

In [182]:
# For generating the most dominant topic for each narrative
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    """Build a table with the dominant topic of every document.

    Args:
        ldamodel: topic model; ``ldamodel[corpus]`` must yield, per document,
            a sequence of ``(topic_num, proportion)`` pairs, and
            ``ldamodel.show_topic(n)`` a sequence of ``(word, weight)`` pairs.
        corpus: bag-of-words corpus passed to the model.
        texts: the original documents, in the same order as ``corpus``.

    Returns:
        DataFrame with columns ``Dominant_Topic``, ``Perc_Contribution``
        (rounded to 4 decimals), ``Topic_Keywords`` (comma-separated) and an
        unnamed column ``0`` holding the text.
    """
    # Rows are collected in a plain list and turned into a frame once.
    # DataFrame.append was removed in pandas 2.0, and growing a frame row by
    # row copies it on every iteration.
    records = []
    for row in ldamodel[corpus]:
        if len(row) == 0:
            # Keep one record per document so rows stay aligned with `texts`.
            records.append((float('nan'), float('nan'), ''))
            continue
        # First pair with the highest proportion (same pick as a stable
        # descending sort followed by taking element 0).
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        wp = ldamodel.show_topic(topic_num)
        topic_keywords = ", ".join([word for word, prop in wp])
        # Stored as float: downstream cells look groups up with float keys and
        # the existing outputs show values such as 20.0.
        records.append((float(topic_num), round(prop_topic, 4), topic_keywords))

    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df


# Dominant topic of every narrative under the MALLET model.
df_topic_sents_keywords = format_topics_sentences(ldamodel=ldamallet, corpus=corpus, texts=data)

# reset_index() moves the row index into the first column before renaming.
# NOTE(review): that first column is labelled 'Topic No.' although it holds the
# row index of the narrative - confirm the intended header.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Topic No.', 'Dominant_Topic', 'Topic Contribution', 'Keywords', 'Text']

# Save to '<outputPath><dataName>_dominant_topic.csv'.
dominantTopicsPath = '{}{}_dominant_topic.csv'.format(outputPath, dataName)
df_dominant_topic.to_csv(dominantTopicsPath)

In [183]:
# For generating top 'num_of_representatives' most representative narratives
import operator

# repre[t] maps a narrative's position in `data` (as a string) to the weight of
# topic t in that narrative. Sized from num_of_topics rather than a literal 50,
# and every topic starts with an empty dict so a topic that never appears does
# not break the sorting below.
repre = [dict() for _ in range(num_of_topics)]
for i, row in enumerate(ldamallet[corpus]):
    for topic_num, prop_topic in row:
        repre[topic_num][str(i)] = prop_topic

# Per topic: (narrative index, weight) pairs, heaviest first.
sorted_repre = [sorted(weights.items(), key=operator.itemgetter(1), reverse=True)
                for weights in repre]

#pprint(sorted_repre)
# One CSV row per topic: [topic no., keyword list, narrative 1, narrative 2, ...].
allTheTopics = [None] * num_of_topics
for index, ranked in enumerate(sorted_repre):
    # Slicing tolerates topics with fewer than num_of_representatives narratives.
    best = ranked[:num_of_representatives]
    allTheTopics[index] = ([index, allKeywords[index]]
                           + [data[int(doc_id)] for doc_id, _ in best])

representativeNarraPath = outputPath + dataName + '_' + 'representative_narratives.csv'
header = ['Topic No.', 'Keywords'] + list(range(1, num_of_representatives + 1))
# Narratives are free text; an explicit UTF-8 encoding avoids encode errors on
# platforms whose default encoding cannot represent every character.
with open(representativeNarraPath, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(allTheTopics)

In [184]:
# Empty frame reserved for the per-topic summary table.
sent_topics_sorteddf_mallet = pd.DataFrame()

# Group the per-narrative results by their dominant topic.
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby(by='Dominant_Topic')

# Preview the leading rows of every group.
sent_topics_outdf_grpd.head()

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,0
0,20.0,0.0490,"awe, time, realise, performance, hope, play, b...",a feeling of overwhelmness and a feeling of sp...
1,36.0,0.0460,"hour, holiday, remember, stay, excited, day, d...",A beautiful landscape on a fine day looking ac...
2,25.0,0.0497,"father, law, eat, end, year, drink, stage, ban...",The day my father was arrested by the police. ...
3,27.0,0.0385,"happen, shock, suddenly, passed_away, boy, par...",Graduated from university
4,26.0,0.3418,"mother, brother, young, age, move, dad, die, d...",Seeing my mother hooked up to tubes after an o...
5,10.0,0.0625,"child, birth, bear, hospital, hold, years_ago,...",i went to Elvis Presley home in memphis.I coul...
6,12.0,0.0501,"make, feel, special, surround, worry, slowly, ...",I went to a local slate quarry that has a lago...
7,5.0,0.1714,"work, money, hard, job, experience, company, c...",my first job in which I had been in for about ...
8,16.0,0.0706,"feel, eye, hand, event, area, hair, peace, sto...",I was shopping in a garden centre with my moth...
9,15.0,0.0364,"big, plan, person, issue, lot, decide, grow, m...",watching the first star wars movie in the cinema


In [185]:
# For generating the frequency of each topic being the most dominant topic

# Number of narratives per dominant topic, keyed by the float topic number.
# Topics that are never dominant are simply absent from the groupby, so they
# are looked up with a default of 0 instead of get_group(), which would raise
# KeyError for them.
group_sizes = {float(topic): int(size)
               for topic, size in sent_topics_outdf_grpd.size().items()}

# A dictionary. The key is the string of the topic number; the value is the
# number of narratives whose dominant topic it is.
frequency = {str(topicIndex): group_sizes.get(float(topicIndex), 0)
             for topicIndex in range(num_of_topics)}

total_appearance = sum(frequency.values()) # The number of narratives that have a dominant topic

frequency_rows = [None] * num_of_topics
for i in range(num_of_topics):
    float_index = float(i) # The float version of the index, for DataFrame usage
    appearance = frequency[str(i)]
    if appearance:
        # First row of the group, third column (Topic_Keywords).
        keywords = sent_topics_outdf_grpd.get_group(float_index).iat[0, 2]
    else:
        # Topic is never dominant: build the keyword string from the model,
        # in the same comma-separated form.
        keywords = ", ".join(word for word, _ in ldamallet.show_topic(i))
    share = float(appearance) / float(total_appearance) if total_appearance else 0.0
    # Each row always has 4 elements: Topic Number, Topic Keywords, Topic Appearance, Frequency
    frequency_rows[i] = [float_index, keywords, appearance, share]


frequencyPath = outputPath + dataName + '_' + 'frequency_topics.csv' # Handle the output path

# This part handles the header
header = ["Topic No.", "Keywords", "Appearance", "Frequency"]

with open(frequencyPath, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    # writerows over the list; no loop variable that would overwrite `frequency`.
    writer.writerows(frequency_rows)

In [186]:
# For every dominant topic keep the single narrative with the highest
# contribution. The rows are collected first and concatenated once, and the
# result is assigned fresh, so re-running this cell does not append duplicates
# to a frame created elsewhere.
top_rows = [grp.sort_values(['Perc_Contribution'], ascending=False).head(1)
            for _, grp in sent_topics_outdf_grpd]
sent_topics_sorteddf_mallet = pd.concat(top_rows, axis=0).reset_index(drop=True)

sent_topics_sorteddf_mallet.columns = ['Topic Index', "Topic Contribution", "Keywords", "Text"]

sent_topics_sorteddf_mallet.head(10)

Unnamed: 0,Topic Index,Topic Contribution,Keywords,Text
0,0.0,0.122,"love, feel, remember, thought, heart, time, tu...",I was diagnosed with depression several years ...
1,1.0,0.131,"walk, stop, fall, realize, morning, ground, le...",i was standing and noticed something that look...
2,2.0,0.2372,"parent, witness, year, learn, student, space, ...","I was graduating from college , it was the day..."
3,3.0,0.2116,"awe, happy, complete, forget, machine, awed, l...",I went to a customer factory for a machine ins...
4,4.0,0.1988,"walk, close, water, trip, cruise, decide, coun...",Decided to take a trip to New Zealand and was ...
5,5.0,0.1869,"work, money, hard, job, experience, company, c...",The last time I experienced a moment of awe wa...
6,6.0,0.1627,"night, memory, light, cold, star, sky, tour, h...","2001 , in mid-November . I had recently taken ..."
7,7.0,0.1926,"experience, animal, sea, time, couldnt_believe...",The last time I felt in awe was when I was doi...
8,8.0,0.1035,"moment, awe, human, sit, rush, thing, partner,...",the moment that i gave birth to my daughter / ...
9,9.0,0.1895,"day, school, pass, receive, climb, completely,...",I was in grade 11 and the school sent me a let...


In [187]:
fontpath = 'font/SFCompact/SFCompactDisplay-Light.otf'# Use a local font.
# The object for generating word clouds.
cloud = wordcloud.WordCloud(font_path=fontpath, width=700, height=600,
                            background_color=None, mode='RGBA', relative_scaling=0.5,
                            normalize_plurals=False)

# The folder that stores these visualizations (created if missing; safe to re-run).
imgPath = outputPath + 'visualizations'
os.makedirs(imgPath, exist_ok=True)
imgPath += '/'
for topic in range(0, num_of_topics):
    # keyword -> weight for this topic. Pairing the two lists with zip uses
    # however many keywords were parsed instead of assuming exactly 15.
    cloudict = {keyword: float(weight)
                for keyword, weight in zip(allKeywords[topic], allPercentages[topic])}

    img = cloud.generate_from_frequencies(cloudict, max_font_size=None)# Generate the image.
    img.to_file(imgPath + 'Topic' + ' ' + str(topic) + '.png')