In [190]:
import os
import re

import csv
import pandas as pd

from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import pylab
import nltk
import string

from gensim import corpora, models, similarities #Latent Dirichlet Allocation implementation with Gensim
import pyLDAvis
import pyLDAvis.gensim

In [2]:
# Base folder for the archive directories: two levels above the working directory.
cur_folder = os.getcwd()
path = os.path.split(os.path.split(cur_folder)[0])[0]

In [3]:
# One folder per archive (American, English, Brazilian), each directly under `path`.
# os.path.join uses the separator of the running OS instead of a hard-coded '\\'.
path_americanos = os.path.join(path, 'Arquivos americanos')
path_ingleses = os.path.join(path, 'Arquivos ingleses')
path_brasileiros = os.path.join(path, 'Arquivos brasileiros')

In [4]:
# Metadata workbook of each archive, located inside that archive's folder.
# os.path.join uses the separator of the running OS instead of a hard-coded '\\'.
file_americanos = os.path.join(path_americanos, 'metadata americanos.xlsx')
file_ingleses = os.path.join(path_ingleses, 'metadata ingleses.xlsx')
file_brasileiros = os.path.join(path_brasileiros, 'metadata brasileiros.xlsx')

### create dataframe

In [6]:
def build_df(file):
    """Load one archive's metadata workbook into a date-indexed, date-sorted DataFrame.

    Parameters
    ----------
    file : str
        Path of the Excel workbook; its first sheet is read.

    Returns
    -------
    pandas.DataFrame
        Rows that have a usable ``Date``, ordered chronologically. ``Date`` is
        available both as the index and as a regular column (placed last).
        ``Box`` and ``File`` are removed. When both ``Sender - Updated`` and
        ``Receiver - Updated`` exist, their values replace ``Sender`` /
        ``Receiver`` and the two "- Updated" columns are dropped.

    Raises
    ------
    KeyError
        If the sheet has no ``Date``, ``Box`` or ``File`` column.
    ValueError
        If a remaining ``Date`` value does not match the ``%Y%m%d`` format.
    """
    df = pd.read_excel(file, 0)
    df = df.dropna(subset=['Date'])

    # clear non relevant columns
    df = df.drop(['Box', 'File'], axis=1)

    # Both corrected columns must be present. The previous condition,
    # `'Sender - Updated' and 'Receiver - Updated' in df.columns`, only looked
    # for the receiver column because a non-empty string literal is always truthy.
    updated_columns = ['Sender - Updated', 'Receiver - Updated']
    if all(column in df.columns for column in updated_columns):
        df['Sender'] = df['Sender - Updated']
        df['Receiver'] = df['Receiver - Updated']
        df = df.drop(updated_columns, axis=1)

    # Discard rows whose date cell is free text. A single boolean mask replaces
    # re-filtering the whole frame once per offending cell.
    date_is_text = df['Date'].map(lambda cell: isinstance(cell, str)).astype(bool)
    df = df[~date_is_text].copy()

    df['Date'] = pd.to_datetime(df['Date'], format='%Y%m%d')
    # Sort while 'Date' is still only a column: once it names both the index and
    # a column, recent pandas versions reject sort_values('Date') as ambiguous.
    df = df.sort_values('Date')
    df = df.set_index('Date')
    df['Date'] = df.index  # working with a column is easier than with the index

    return df

In [32]:
# Load the three archives (same order as before: US, UK, BR) keyed by a short
# country label, then expose each frame under its own name as well.
dfs_dict = {
    'us': build_df(file_americanos),
    'uk': build_df(file_ingleses),
    'br': build_df(file_brasileiros),
}
us_df, uk_df, br_df = dfs_dict['us'], dfs_dict['uk'], dfs_dict['br']

### Create list of stopwords

In [95]:
# Extra words to ignore on top of the NLTK lists.
additional_words = ['Portuguese']

# English list first, then Portuguese, then the extras.
stopwords = [
    *nltk.corpus.stopwords.words('english'),
    *nltk.corpus.stopwords.words('portuguese'),
    *additional_words,
]

### create wordcloud

In [97]:
# Images are saved in the parent of the notebook's working directory.
cur_folder = os.getcwd()
path_images = os.path.split(cur_folder)[0]
def save_file(filename):
    """Save the current matplotlib figure as `filename` inside `path_images`.

    Parameters
    ----------
    filename : str
        File name (with extension) of the image to write.
    """
    # os.path.join uses the separator of the running OS instead of a hard-coded '\\'.
    file = os.path.join(path_images, filename)
    pylab.savefig(file)

In [137]:
def create_wordcloud(df_key, df_value):
    """Render a word cloud from the ``Subject`` column and save it as a PNG.

    Parameters
    ----------
    df_key : str
        Short label of the archive; upper-cased into the file name
        ``wordcloud-<KEY>.png``.
    df_value : pandas.DataFrame
        Frame with a ``Subject`` column of text. It is not modified.

    Returns
    -------
    None
    """
    # Fill gaps on the Subject column only: the former whole-frame
    # fillna(..., inplace=True) silently rewrote the caller's DataFrame.
    subjects = df_value['Subject'].fillna('')
    subject_list = [text for text in subjects.tolist() if text != '']
    subjects_all = ' '.join(subject_list)

    wordcloud = WordCloud(background_color="white", max_words=50, relative_scaling=.5,
                          stopwords=stopwords, min_font_size=8)
    wordcloud.generate(subjects_all)

    # Start a fresh figure so consecutive calls do not draw on top of each other.
    plt.figure()
    plt.imshow(wordcloud)
    plt.axis("off")

    filename = "wordcloud-{}.png".format(df_key.upper())
    save_file(filename)

In [138]:
# Produce and save one word cloud per archive in dfs_dict.
for df_key in dfs_dict:
    df_value = dfs_dict[df_key]
    create_wordcloud(df_key, df_value)

### create topic modelling

In [226]:
# Tokenise the subject lines of the US letters into the token lists used for LDA.
df_value = dfs_dict['us']
# Fill missing subjects on the extracted column only, so the frame shared through
# dfs_dict is left untouched (no whole-frame in-place fillna).
subjects_list = df_value['Subject'].fillna('').tolist()

# Built once, outside the loop; sets give constant-time membership tests.
symbols = set(string.punctuation)
stopwords_lookup = set(stopwords)

texts = []
for text in subjects_list:
    # Drop stand-alone punctuation marks, then trim punctuation around each word.
    text = [p.strip(string.punctuation) for p in text.split() if p not in symbols]
    # Keep words longer than one character that are neither numbers nor stopwords.
    text = [p for p in text
            if len(p) > 1 and not p.isdigit() and p not in stopwords_lookup]
    if text:
        texts.append(text)

In [227]:
# Map every distinct token in `texts` to an integer id.
dictionary = corpora.Dictionary(texts)
# Prune the vocabulary by document frequency. Per the gensim docs, no_below=1 keeps
# tokens seen in at least one document and no_above=0.8 drops tokens present in
# more than 80% of the documents.
dictionary.filter_extremes(no_below=1, no_above=0.8)
# NOTE(review): id 0 is removed without checking which word it maps to — confirm this is intended.
dictionary.filter_tokens(bad_ids=[0,]) # remove words by their id
# Convert each tokenised subject into gensim's bag-of-words representation.
corpus = [dictionary.doc2bow(text) for text in texts]

In [228]:
# Number of documents in the corpus (one per subject that kept at least one token).
len(corpus)

102

### 10 topics

In [229]:
# Fit a 10-topic LDA model on the bag-of-words corpus with 10 passes; random_state=0
# fixes the seed. The %time magic reports how long the fit takes.
%time lda10 = models.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=10, eval_every=1, random_state=0)

Wall time: 1.62 s


In [230]:
# List all topics (-1) with the 5 highest-weighted words of each.
lda10.print_topics(-1, num_words=5)

[(0,
  '0.042*"Transmission" + 0.027*"Lisbon" + 0.019*"authorities" + 0.019*"imported" + 0.019*"placement"'),
 (1,
  '0.034*"Minister" + 0.023*"Salazar´s" + 0.023*"Reaction" + 0.023*"recent" + 0.023*"Japanese"'),
 (2,
  '0.027*"Government" + 0.027*"License" + 0.027*"authorities" + 0.027*"correspondence" + 0.027*"Censorship"'),
 (3,
  '0.020*"people" + 0.020*"New" + 0.020*"available" + 0.020*"law" + 0.020*"OWI\'s"'),
 (4,
  '0.099*"Allocation" + 0.092*"Transmission" + 0.087*"Authorities" + 0.087*"Commodities" + 0.087*"Imported"'),
 (5,
  '0.042*"Portugal" + 0.041*"National" + 0.028*"Council" + 0.028*"Anti-Fascist" + 0.028*"Unity"'),
 (6,
  '0.037*"Quarter" + 0.034*"National" + 0.033*"First" + 0.023*"Assembly" + 0.023*"Portugal"'),
 (7,
  '0.037*"Portugal" + 0.028*"May" + 0.028*"Salazar´s" + 0.019*"April" + 0.019*"Speech"'),
 (8,
  '0.033*"Portugal" + 0.023*"Transmission" + 0.022*"lists" + 0.022*"distribution" + 0.022*"quota"'),
 (9,
  '0.101*"Motion" + 0.101*"Picture" + 0.100*"Films" + 

#### Save visualization of the 10 topics

In [231]:
# Enable pyLDAvis rendering inside the notebook.
pyLDAvis.enable_notebook()
# Prepare the visualisation for the 10-topic model; as the cell's last expression it is displayed as the cell output.
pyLDAvis.gensim.prepare(lda10, corpus, dictionary)

In [233]:
# Export the pyLDAvis view of the 10-topic model as an HTML file in path_images.
data_ldavis = pyLDAvis.gensim.prepare(lda10, corpus, dictionary)
# Joining with '' appends the separator of the running OS, replacing the hard-coded '\\'.
outputs = os.path.join(path_images, '')
pyLDAvis.save_html(data_ldavis, os.path.join(outputs, 'topic_modeling-US.html'))