In [135]:
dataName = 'Overall' # Name of the dataset; used to build the input path (input/<dataName>/) and the output path (output/<dataName>/).
num_of_representatives = 8 # Number of representative narratives to export for each topic.
num_of_topics = 20 # Number of topics to extract from the data.
# Countries reported, in this column order, by the per-country distribution export and the per-country counts.
countryList = ['Argentina','Australia','Austria','Brazil','Canada','Chile','China', 'France','Germany','India','Indonesia','Ireland','Japan','Korea','Mexico','Netherlands','Norway','Russia','Singapore','South_Africa','Spain','Sweden','Switzerland','Turkey','UK','USA']

In [95]:
# For output
import os
import pathlib
import csv


import re
import numpy as np
import pandas as pd
from pprint import pprint
import nltk; 
nltk.download('stopwords')

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import matplotlib.pyplot as plt
import wordcloud # Package by Andreas Mueller

# General usage
import math

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [96]:
# Prepare the output folder for the selected dataset: output/<dataName>/
# os.makedirs(..., exist_ok=True) creates the missing parent as well and does
# not fail when the folder already exists, so there is no window between an
# "exists?" check and the creation in which the call could break.
outputPath = 'output/' + dataName
os.makedirs(outputPath, exist_ok=True)
# Keep the trailing separator: later cells append file names directly.
outputPath = outputPath + '/'

In [97]:
# Folder holding the input data of the selected dataset: input/<dataName>/
# (the trailing separator is kept because file names are appended directly).
inputPath = 'input/' + dataName + '/'

In [98]:
%matplotlib inline

In [99]:
import logging
# Only emit log records of level ERROR and above, each prefixed with a
# timestamp and the level name.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

In [100]:
import warnings
# Hide DeprecationWarning messages so they do not clutter the cell outputs.
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [101]:
from nltk.corpus import stopwords
# NLTK stores its stop-word lists under lowercase file ids ('english', ...).
# The capitalised 'English' only resolves on case-insensitive file systems,
# so the real id is used to keep the notebook portable.
stop_words = stopwords.words('english')
# Extra tokens to drop in addition to the NLTK list.
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [102]:
# Load the dataset from input/<dataName>/<dataName>_data.json into a DataFrame.
df = pd.read_json(inputPath + dataName + '_data.json', encoding = 'utf-8')
# Quick sanity check of the load: show the distinct narrative texts.
print(df.Narrative.unique())

["When I heard a boy sing Ave Maria perfectly. It was about three years ago. I was at the church in my cousin's baptism. I heard with awe how every note was sung with feeling, as if an angel was doing it. Later, I stayed still and was deeply moved by it"
 'I was at my house walking and suddenly sprained my ankle so abruptly that I thought I broke it. My reaction before this was exactly that expression. Once I reacted I slowly accomodated myself, and it ended up being nothing else than just the impression. Thank God it was nothing more than a sprain.'
 'I was at the field walking with my husband when a big spider appeared. I was petrified and I wanted to run, but my body would not answer.'
 ...
 "when i got paid the most money for clicking some random buttons was pretty cool. except for that part with the idiots didn't select the option to maximize everyones earnings and acted selfishly. how long does this take oh there we go bam."
 'my nephew had to goto the hospital and when he got ou

In [103]:
# Convert the narratives to a plain list of strings.
# (A disabled branch used to pick df.translation instead of df.Narrative for
# non-English data; only the Narrative column is read here.)
data = df.Narrative.values.tolist()

# The patterns are raw strings: '\S' and '\s' inside a normal string literal
# are invalid escape sequences, which Python reports as a warning. Compiling
# them once also avoids looking the pattern up again for every narrative.
email_pattern = re.compile(r'\S*@\S*\s?')
whitespace_pattern = re.compile(r'\s+')

# For every narrative, in this order:
#   1. remove e-mail-like tokens,
#   2. collapse every whitespace run (new lines included) into one space,
#   3. remove single quotes.
data = [
    whitespace_pattern.sub(' ', email_pattern.sub('', sent)).replace("'", "")
    for sent in data
]

# Preview the first cleaned narrative.
pprint(data[:1])

['When I heard a boy sing Ave Maria perfectly. It was about three years ago. I '
 'was at the church in my cousins baptism. I heard with awe how every note was '
 'sung with feeling, as if an angel was doing it. Later, I stayed still and '
 'was deeply moved by it']


In [104]:
def sent_to_words(sentences):
    """Lazily tokenize each sentence into a list of word tokens.

    Every item of ``sentences`` is converted with ``str`` and handed to
    gensim's ``simple_preprocess``; ``deacc=True`` asks gensim to strip
    accent marks from the tokens.

    Yields one token list per input sentence, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )

# Materialize the generator so the tokenized narratives can be reused by the
# following cells.
data_words = list(sent_to_words(data))

# Preview the tokens of the first narrative.
print(data_words[:1])

[['when', 'heard', 'boy', 'sing', 'ave', 'maria', 'perfectly', 'it', 'was', 'about', 'three', 'years', 'ago', 'was', 'at', 'the', 'church', 'in', 'my', 'cousins', 'baptism', 'heard', 'with', 'awe', 'how', 'every', 'note', 'was', 'sung', 'with', 'feeling', 'as', 'if', 'an', 'angel', 'was', 'doing', 'it', 'later', 'stayed', 'still', 'and', 'was', 'deeply', 'moved', 'by', 'it']]


In [105]:
# Build the bigram and trigram phrase models.
# The bigram model is trained on the tokenized narratives; the trigram model
# is trained on the same narratives after the bigram model has been applied.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Phraser wrappers around the trained models; these are what make_bigrams /
# make_trigrams index into.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Preview: first narrative with bigram and then trigram merging applied.
print(trigram_mod[bigram_mod[data_words[0]]])



['when', 'heard', 'boy', 'sing', 'ave', 'maria', 'perfectly', 'it', 'was', 'about', 'three', 'years_ago', 'was', 'at', 'the', 'church', 'in', 'my', 'cousins', 'baptism', 'heard', 'with', 'awe', 'how', 'every', 'note', 'was', 'sung', 'with', 'feeling', 'as', 'if', 'an', 'angel', 'was', 'doing', 'it', 'later', 'stayed', 'still', 'and', 'was', 'deeply', 'moved', 'by', 'it']


In [106]:
def remove_stopwords(texts):
    """Drop stop words from every document.

    Each document is re-tokenized with ``simple_preprocess(str(doc))`` and
    every token found in the module-level ``stop_words`` is removed.

    Returns a list holding one filtered token list per document, in order.
    """
    # ``stop_words`` is a list, so testing membership against it rescans the
    # whole list for every token. A set built once gives constant-time lookups
    # with the same result.
    excluded = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in excluded]
            for doc in texts]

def make_bigrams(texts):
    """Apply the module-level bigram phraser (``bigram_mod``) to each document.

    Returns a list with one phrased token list per input document.
    """
    phrased = []
    for tokens in texts:
        phrased.append(bigram_mod[tokens])
    return phrased

def make_trigrams(texts):
    """Apply the bigram phraser and then the trigram phraser to each document.

    Both phrasers (``bigram_mod`` and ``trigram_mod``) are module-level
    objects. Returns a list with one phrased token list per input document.
    """
    phrased = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        phrased.append(trigram_mod[with_bigrams])
    return phrased

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document (a list of tokens) is joined with single spaces and run
    through the module-level spaCy pipeline ``nlp``. The lemma of every token
    whose ``pos_`` tag is listed in ``allowed_postags`` is kept.

    Tag reference: https://spacy.io/api/annotation

    Returns a list with one list of lemmas per input document.
    """
    def lemmas_of(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [lemmas_of(tokens) for tokens in texts]

In [107]:
# Remove stop words
data_words_nostops = remove_stopwords(data_words)

# Merge frequent word pairs into single bigram tokens
data_words_bigrams = make_bigrams(data_words_nostops)

# Load the English spaCy pipeline by its full package name. The 'en' shortcut
# link no longer exists in spaCy 3; the package name loads in spaCy 2 as well
# when the model package is installed.
# NOTE(review): assumes 'en' pointed at en_core_web_sm - confirm if another
# model had been linked under that shortcut.
# The parser and the named-entity recognizer are disabled because only the
# lemma and the part-of-speech tag of each token are read.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Keep only the lemmas of nouns, adjectives, verbs and adverbs
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Preview the lemmas of the first narrative
print(data_lemmatized[:1])

[['hear', 'boy', 'sing', 'ave', 'maria', 'perfectly', 'years_ago', 'church', 'cousin', 'baptism', 'hear', 'awe', 'note', 'sing', 'feeling', 'angel', 'later', 'stay', 'still', 'deeply', 'moved']]


In [108]:
# Dictionary built from the lemmatized narratives (token <-> integer id).
id2word = corpora.Dictionary(data_lemmatized)

# Documents to model: the lemmatized narratives.
texts = data_lemmatized

# Bag-of-words corpus: one list of (token id, count) pairs per narrative.
corpus = [id2word.doc2bow(text) for text in texts]

# Preview the bag of words of the first narrative.
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 2), (16, 1), (17, 1), (18, 1)]]


In [109]:
# Train a gensim LDA model with num_of_topics topics on the bag-of-words
# corpus. random_state is fixed so repeated runs use the same seed.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_of_topics, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=30,
                                           alpha='auto',
                                           per_word_topics=False)

In [110]:
# Number of keywords reported for each topic.
num_of_words_per_topic = 15
# (topic id, formatted 'weight*"keyword" + ...' string) pairs of the gensim LDA model.
topics = lda_model.print_topics(num_of_topics, num_of_words_per_topic)
# Result of applying the gensim LDA model to the corpus (not read again in the visible cells).
doc_lda = lda_model[corpus]
pprint(topics)

[(0,
  '0.206*"get" + 0.035*"show" + 0.030*"ago" + 0.023*"pregnant" + 0.022*"front" '
  '+ 0.016*"excited" + 0.015*"call" + 0.015*"result" + 0.013*"celebrate" + '
  '0.012*"issue" + 0.012*"afraid" + 0.011*"month" + 0.011*"couple" + '
  '0.010*"favorite" + 0.009*"offer"'),
 (1,
  '0.068*"moment" + 0.063*"life" + 0.050*"come" + 0.040*"look" + '
  '0.038*"people" + 0.038*"know" + 0.033*"happen" + 0.032*"tell" + '
  '0.030*"happy" + 0.025*"say" + 0.023*"person" + 0.021*"many" + '
  '0.017*"nothing" + 0.016*"amazed" + 0.015*"face"'),
 (2,
  '0.080*"world" + 0.030*"mom" + 0.023*"church" + 0.019*"bring" + '
  '0.019*"strike" + 0.018*"light" + 0.017*"evening" + 0.016*"calm" + '
  '0.013*"section" + 0.012*"sing" + 0.011*"guy" + 0.011*"sublimity" + '
  '0.011*"delivery_room" + 0.010*"cousin" + 0.010*"red"'),
 (3,
  '0.123*"feel" + 0.080*"awe" + 0.053*"want" + 0.052*"experience" + '
  '0.048*"make" + 0.038*"take" + 0.035*"give" + 0.034*"think" + 0.032*"would" '
  '+ 0.021*"way" + 0.020*"something

In [111]:
# Compute the 'c_v' coherence score of the gensim LDA model on the lemmatized narratives.
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.4097896212154234


In [112]:
# Location of the MALLET launcher. It can be overridden with the MALLET_PATH
# environment variable so the notebook does not depend on one machine's
# directory layout; the previously hard-coded location stays the default.
mallet_path = os.environ.get('MALLET_PATH', 'C:/mallet-2.0.8/mallet-2.0.8/bin/mallet')
# Train a MALLET LDA model on the same corpus and dictionary, with the same
# number of topics as the gensim model.
# NOTE(review): no seed is passed here - confirm whether run-to-run
# reproducibility of the MALLET topics is required.
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_of_topics, id2word=id2word)

In [113]:
# Compute the 'c_v' coherence score of the MALLET model on the lemmatized
# narratives, for comparison with the gensim LDA model's score.
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)


Coherence Score:  0.3326544986605814


In [114]:
# From here on `topics` holds the MALLET model's topics; it replaces the
# value computed earlier from the gensim LDA model.
topics = ldamallet.print_topics(num_of_topics, num_of_words_per_topic)
pprint(topics)

[(0,
  '0.060*"years_ago" + 0.051*"visit" + 0.048*"amazing" + 0.043*"mountain" + '
  '0.040*"trip" + 0.028*"city" + 0.027*"travel" + 0.025*"view" + '
  '0.024*"country" + 0.023*"top" + 0.018*"enjoy" + 0.016*"boyfriend" + '
  '0.016*"cold" + 0.015*"climb" + 0.015*"building"'),
 (1,
  '0.174*"experience" + 0.101*"time" + 0.074*"feeling" + 0.031*"sit" + '
  '0.027*"intense" + 0.026*"strong" + 0.021*"event" + 0.020*"church" + '
  '0.020*"emotion" + 0.020*"begin" + 0.018*"tear" + 0.015*"kid" + 0.013*"pray" '
  '+ 0.011*"describe" + 0.011*"write"'),
 (2,
  '0.056*"hear" + 0.046*"man" + 0.040*"news" + 0.029*"jag" + 0.026*"bad" + '
  '0.019*"memory" + 0.017*"future" + 0.015*"meeting" + 0.015*"min" + '
  '0.013*"sad" + 0.012*"och" + 0.010*"listen" + 0.010*"det" + 0.009*"var" + '
  '0.009*"internet"'),
 (3,
  '0.193*"time" + 0.114*"year" + 0.093*"happen" + 0.086*"good" + 0.045*"thing" '
  '+ 0.039*"meet" + 0.020*"partner" + 0.016*"nice" + 0.014*"boy" + '
  '0.013*"high_school" + 0.012*"study" + 

In [115]:
# Prepare the keywords and percentages for future usage.
# allKeywords[i]    -> list of the keywords of topic i
# allPercentages[i] -> list of the matching weights (kept as strings)
# chunks[i]         -> CSV row of topic i: [topic id, weight, keyword, weight, keyword, ...]
chunks = [None] * num_of_topics
allKeywords = [None] * num_of_topics
allPercentages = [None] * num_of_topics

# Every term of a topic string is printed as  <weight>*"<keyword>".
# Capturing both parts with a single pattern keeps each weight attached to its
# own keyword. Scanning the whole string for numbers would also pick up digits
# that occur inside a keyword, and filling a fixed-size row fails when a topic
# has fewer than num_of_words_per_topic terms.
# (number pattern: credit to miku on Stack Overflow; quoted-text pattern:
# credit to jspcal on Stack Overflow)
term_pattern = re.compile(r'([-+]?\d*\.\d+|\d+)\*"([^"]*)"')

# `topics` holds (topic id, formatted string) pairs; the real topic id is used
# as the position so a row can never be labelled with the wrong topic.
for index, topic_text in topics:
    terms = term_pattern.findall(topic_text)
    allPercentages[index] = [weight for weight, _ in terms]
    allKeywords[index] = [keyword for _, keyword in terms]
    row = [str(index)]
    for weight, keyword in terms:
        row += [weight, keyword]
    chunks[index] = row
print(chunks)
    


[['0', '0.060', 'years_ago', '0.051', 'visit', '0.048', 'amazing', '0.043', 'mountain', '0.040', 'trip', '0.028', 'city', '0.027', 'travel', '0.025', 'view', '0.024', 'country', '0.023', 'top', '0.018', 'enjoy', '0.016', 'boyfriend', '0.016', 'cold', '0.015', 'climb', '0.015', 'building'], ['1', '0.174', 'experience', '0.101', 'time', '0.074', 'feeling', '0.031', 'sit', '0.027', 'intense', '0.026', 'strong', '0.021', 'event', '0.020', 'church', '0.020', 'emotion', '0.020', 'begin', '0.018', 'tear', '0.015', 'kid', '0.013', 'pray', '0.011', 'describe', '0.011', 'write'], ['2', '0.056', 'hear', '0.046', 'man', '0.040', 'news', '0.029', 'jag', '0.026', 'bad', '0.019', 'memory', '0.017', 'future', '0.015', 'meeting', '0.015', 'min', '0.013', 'sad', '0.012', 'och', '0.010', 'listen', '0.010', 'det', '0.009', 'var', '0.009', 'internet'], ['3', '0.193', 'time', '0.114', 'year', '0.093', 'happen', '0.086', 'good', '0.045', 'thing', '0.039', 'meet', '0.020', 'partner', '0.016', 'nice', '0.014',

In [116]:
# Preview the parsed keywords and weights of topic 0.
print(allKeywords[0])
print(allPercentages[0])

['years_ago', 'visit', 'amazing', 'mountain', 'trip', 'city', 'travel', 'view', 'country', 'top', 'enjoy', 'boyfriend', 'cold', 'climb', 'building']
['0.060', '0.051', '0.048', '0.043', '0.040', '0.028', '0.027', '0.025', '0.024', '0.023', '0.018', '0.016', '0.016', '0.015', '0.015']


In [117]:
# Column titles: the topic number followed by one (Percentage, Keyword) pair
# for each of the num_of_words_per_topic words of a topic.
header = ['Topic No.'] + ['Percentage', 'Keyword'] * num_of_words_per_topic

# Write the header and one row per topic to <outputPath><dataName>_topics.csv
topicsPath = outputPath + dataName + '_' + 'topics.csv'
with open(topicsPath, 'w', newline='', encoding = 'utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(chunks)

In [118]:
# For generating the most dominant topic for each narrative
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    """Find the dominant topic of every document.

    Parameters
    ----------
    ldamodel : topic model. ``ldamodel[corpus]`` must yield, for each
        document, an iterable of ``(topic_id, proportion)`` pairs, and
        ``ldamodel.show_topic(topic_id)`` must return ``(word, weight)`` pairs.
    corpus : the documents to score, in the form the model accepts.
    texts : the original texts; they are appended as the last column.

    Returns
    -------
    pandas.DataFrame with the columns ``Dominant_Topic``,
    ``Perc_Contribution`` (rounded to 4 decimals), ``Topic_Keywords`` (the
    topic's words joined with ", ") and a last column, named ``0``, holding
    ``texts``. A document whose topic list is empty contributes no row.
    """
    records = []
    keywords_by_topic = {}  # show_topic is called once per topic, not once per document

    for row in ldamodel[corpus]:
        if not row:
            continue
        # First pair with the highest proportion => dominant topic.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        if topic_num not in keywords_by_topic:
            wp = ldamodel.show_topic(topic_num)
            keywords_by_topic[topic_num] = ", ".join([word for word, prop in wp])
        records.append((int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Build the frame in one go: appending one row at a time copies the whole
    # frame on every step, and DataFrame.append no longer exists in pandas 2.
    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])
    # The topic id is stored as a float, as the row-wise construction did,
    # because other cells look groups up with float(topic number).
    sent_topics_df['Dominant_Topic'] = sent_topics_df['Dominant_Topic'].astype(float)

    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)


# Dominant topic of every narrative according to the MALLET model.
df_topic_sents_keywords = format_topics_sentences(ldamodel=ldamallet, corpus=corpus, texts=data)

# Turn the row index into an explicit narrative-number column and give the
# columns their export names.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Narrative No.', 'Dominant_Topic', 'Topic Contribution', 'Keywords', 'Text']

# Export to <outputPath><dataName>_dominant_topic.csv
dominantTopicsPath = outputPath + dataName + '_' + 'dominant_topic.csv'
df_dominant_topic.to_csv(dominantTopicsPath, encoding = 'utf-8', index = False)

In [119]:
# Show the dominant-topic table that was just exported.
print(df_dominant_topic)

      Topic No.  Dominant_Topic  Topic Contribution  \
0             0             2.0              0.1200   
1             1            16.0              0.1137   
2             2            12.0              0.0800   
3             3             6.0              0.1468   
4             4             5.0              0.0926   
5             5             4.0              0.1005   
6             6            10.0              0.0850   
7             7            15.0              0.1241   
8             8             9.0              0.0944   
9             9            10.0              0.0932   
10           10            16.0              0.0954   
11           11             8.0              0.0859   
12           12             3.0              0.0708   
13           13             0.0              0.0744   
14           14             3.0              0.0770   
15           15            16.0              0.1042   
16           16            19.0              0.0773   
17        

[2764 rows x 5 columns]


In [120]:
# For generating the top 'num_of_representatives' most representative narratives
import operator

# repre[t] maps a narrative number (as a string) to the weight of topic t in
# that narrative. Every topic starts with an empty dict, so a topic that no
# narrative mentions yields an empty ranking instead of failing on None.
repre = [dict() for _ in range(num_of_topics)]
for i, row in enumerate(ldamallet[corpus]):
    for topic_num, prop_topic in row:
        repre[topic_num][str(i)] = prop_topic

# sorted_repre[t]: (narrative number, weight) pairs of topic t, strongest
# first; narratives with equal weight keep their original order.
sorted_repre = [
    sorted(weights.items(), key=operator.itemgetter(1), reverse = True)
    for weights in repre
]

# One CSV row per topic: [topic number, keyword list, narrative, narrative, ...]
allTheTopics = [None] * num_of_topics
for index, ranked in enumerate(sorted_repre):
    # Slicing copes with a topic that has fewer than num_of_representatives
    # narratives; indexing 0..num_of_representatives-1 would raise IndexError.
    narratives = [data[int(doc_no)] for doc_no, _ in ranked[:num_of_representatives]]
    allTheTopics[index] = [index, allKeywords[index]] + narratives

representativeNarraPath = outputPath + dataName + '_' + 'representative_narratives.csv'
# Header: topic number, keywords, then the rank 1..num_of_representatives.
header = ['Topic No.', 'Keywords'] + list(range(1, num_of_representatives + 1))
with open(representativeNarraPath, 'w', newline='', encoding = 'utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(allTheTopics)

In [121]:
# Empty frame; a later cell concatenates the top narrative of each topic onto it.
sent_topics_sorteddf_mallet = pd.DataFrame()

# Group the narratives by their dominant topic.
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

# Preview the first rows of every group.
sent_topics_outdf_grpd.head()

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,0
0,2.0,0.1200,"hear, man, news, jag, bad, memory, future, mee...",When I heard a boy sing Ave Maria perfectly. I...
1,16.0,0.1137,"start, suddenly, fall, move, fear, water, run,...",I was at my house walking and suddenly spraine...
2,12.0,0.0800,"make, big, situation, stand, open, girlfriend,...",I was at the field walking with my husband whe...
3,6.0,0.1468,"beautiful, wonderful, world, sea, beauty, sun,...",I was astonished by watching the moon show up ...
4,5.0,0.0926,"day, mother, father, family, end, young, reali...",1 My first son. 2º It was in October 1992. 3º ...
5,4.0,0.1005,"house, night, husband, hour, close, light, wal...","It made me feel I was in danger, it was at nig..."
6,10.0,0.0850,"remember, parent, talk, lot, room, cry, wait, ...",What made me feel that way was the moment I wa...
7,15.0,0.1241,"child, birth, son, daughter, hospital, bear, w...",The birth of my first son. / This happened whe...
8,9.0,0.0944,"give, happy, love, find, doctor, pregnant, lea...",What: finding a person I did not see since a c...
9,10.0,0.0932,"remember, parent, talk, lot, room, cry, wait, ...","Tandil, Indio Solaris gig, the worst was confi..."


In [122]:
# For generating the frequency of each topic being the most dominant topic

# Number of narratives per dominant topic, and the keyword string stored with
# each dominant topic (both indexed by the float topic number).
topic_sizes = sent_topics_outdf_grpd.size()
topic_keywords = sent_topics_outdf_grpd['Topic_Keywords'].first()

# A dictionary. The key is the string of the topic number, the value is the
# number of narratives whose dominant topic it is. Nothing guarantees that
# every topic is dominant for at least one narrative; such a topic has no
# group (get_group would raise KeyError), so it is counted as 0 here.
frequency = dict()
for topicIndex in range(0, num_of_topics):
    frequency[str(topicIndex)] = int(topic_sizes.get(float(topicIndex), 0))

# The number of narratives that have a dominant topic
total_appearance = sum(frequency.values())

# Each row always has 4 elements: Topic Number, Topic Keywords, Topic Appearance, Frequency
frequency_rows = [None] * num_of_topics
for i in range(0, num_of_topics):
    float_index = float(i)  # The float version of the index, as used in the DataFrame
    if float_index in topic_keywords.index:
        keywords = topic_keywords.at[float_index]
    else:
        # Never-dominant topic: rebuild the keyword string from the MALLET
        # model, the model the dominant-topic table was computed with.
        keywords = ", ".join(word for word, _ in ldamallet.show_topic(i))
    appearance = frequency[str(i)]
    # Guard against an empty dataset (no narrative at all).
    share = float(appearance) / float(total_appearance) if total_appearance else 0.0
    frequency_rows[i] = [float_index, keywords, appearance, share]


frequencyPath = outputPath + dataName + '_' + 'frequency_topics.csv' # Handle the output path

# This part handles the header
header = ["Topic No.", "Keywords", "Appearance", "Frequency"]

with open(frequencyPath, 'w', newline='', encoding = 'utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    # The rows are written in one call; the `frequency` dictionary is no
    # longer overwritten by a loop variable of the same name.
    writer.writerows(frequency_rows)

In [123]:
# For every dominant topic keep the single narrative with the highest
# contribution. The frame is built with one concat from a fresh list, so this
# cell does not extend a frame left over from a previous run: re-running it
# gives the same result instead of stacking rows onto already renamed columns.
sent_topics_sorteddf_mallet = pd.concat(
    [grp.sort_values(['Perc_Contribution'], ascending=False).head(1)
     for _, grp in sent_topics_outdf_grpd],
    axis=0,
).reset_index(drop=True)

sent_topics_sorteddf_mallet.columns = ['Topic Index', "Topic Contribution", "Keywords", "Text"]

sent_topics_sorteddf_mallet.head(10)

Unnamed: 0,Topic Index,Topic Contribution,Keywords,Text
0,0.0,0.3058,"years_ago, visit, amazing, mountain, trip, cit...","There is a temple in Maihar city,Satna distric..."
1,1.0,0.2274,"experience, time, feeling, sit, intense, stron...",I was at a religious event i.e. at a church se...
2,2.0,0.5422,"hear, man, news, jag, bad, memory, future, mee...",Jag hade varit i Nepal i en månad med min dåva...
3,3.0,0.1731,"time, year, happen, good, thing, meet, partner...",It happened a few years ago. It was not a good...
4,4.0,0.3157,"house, night, husband, hour, close, light, wal...",I used to live in Co Meath in a rural area clo...
5,5.0,0.408,"day, mother, father, family, end, young, reali...",Seeing my mother hooked up to tubes after an o...
6,6.0,0.252,"beautiful, wonderful, world, sea, beauty, sun,...",Stroll in the forest. Suddenly I had a feeling...
7,7.0,0.2526,"feel, person, world, god, thing, eye, amazed, ...",The time experienced the most awe of my life w...
8,8.0,0.23,"friend, home, watch, family, tv, amazed, girl,...",What can cause me to be amazed is for example ...
9,9.0,0.2463,"give, happy, love, find, doctor, pregnant, lea...","During 2015,overall all the months from Jan to..."


In [124]:
fontpath = 'font/SFCompact/SFCompactDisplay-Light.otf'# Use a local font.
cloud = wordcloud.WordCloud(font_path =fontpath, width = 700, height = 600,
                background_color = None, mode = 'RGBA', relative_scaling = 0.5, 
                            normalize_plurals = False) # The object for generating wordcloud.

# The folder that stores these visualizations.
imgPath = outputPath + 'visualizations'
os.makedirs(imgPath, exist_ok=True)  # no error if the folder already exists
imgPath += '/'
for topic in range(0, num_of_topics):
    # Keyword -> weight mapping for the cloud. Pairing the two lists directly
    # uses however many keywords the topic really has, instead of a fixed 15
    # that breaks as soon as num_of_words_per_topic is set to a smaller value.
    cloudict = {
        keyword: float(weight)
        for keyword, weight in zip(allKeywords[topic], allPercentages[topic])
    }

    img = cloud.generate_from_frequencies(cloudict, max_font_size=None)# Generate the image.
    img.to_file(imgPath + 'Topic' + ' ' + str(topic) + '.png')

In [125]:
# Per-country distribution of the dominant topics; only for the 'Overall' dataset.
# Strings are compared with ==: `is` tests object identity and only appears to
# work when the interpreter happens to share the two string objects.
if dataName == 'Overall':
    # topicsArray[t] maps a country to the number of narratives whose dominant
    # topic is t. Every topic starts with an empty dict, so a topic that is
    # never dominant produces a row of zeros instead of failing on None.
    topicsArray = [dict() for _ in range(num_of_topics)]
    for i in range(len(data)):
        index = int(df_dominant_topic.iat[i, 1])  # column 1 is 'Dominant_Topic'
        # presumably the first column of df is the country - TODO confirm
        country = df.iat[i, 0]
        topicsArray[index][country] = topicsArray[index].get(country, 0) + 1

    header = ['Topic No.'] + countryList
    # One row per topic: the topic number, then the count of each country in
    # countryList order (0 when the country has no narrative for that topic).
    distributionRow = [
        [float(i)] + [counts.get(country, 0) for country in countryList]
        for i, counts in enumerate(topicsArray)
    ]

    distributionPath = outputPath + dataName + '_' + 'distribution.csv'
    with open(distributionPath, 'w', newline='', encoding = 'utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(distributionRow)

In [136]:
# Print the number of narratives of every listed country, then their total.
# The counts are computed once instead of twice per country, and a listed
# country that does not occur in the data counts as 0 instead of raising
# KeyError.
# The total can be smaller than len(df) when some rows carry a country that
# is not in countryList.
country_counts = df.Country.value_counts()
total = 0
for country in countryList:
    count = country_counts.get(country, 0)
    print(country)
    print(count)
    total += count
print(total)

Argentina
105
Australia
105
Austria
105
Brazil
118
Canada
105
Chile
105
China
105
France
105
Germany
104
India
130
Indonesia
105
Ireland
105
Japan
104
Korea
105
Mexico
105
Netherlands
105
Norway
104
Russia
105
Singapore
105
South_Africa
104
Spain
104
Sweden
105
Switzerland
105
Turkey
104
UK
106
USA
105
2763


In [137]:
len(df)

2764