In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

import pandas as pd
from wordcloud import WordCloud
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk import word_tokenize          
from nltk.stem.porter import PorterStemmer
import nltk
import networkx as nx
#import community

# Enlarge every figure and switch image interpolation to 'nearest', in one update
plt.rcParams.update({
    'figure.figsize': (40.0, 30.0),
    'image.interpolation': 'nearest',
})

# 1. Wordclouds
## Loading the data

In [None]:
# Read the e-mail table from Emails.csv into a DataFrame
emails = pd.read_csv("hillary-clinton-emails/Emails.csv")
# Preview the first rows
emails.head()

First we inspect what data we are given. For the wordcloud it would be wise to use only the "ExtractedBodyText" column.

In [None]:
# Array of the column names available in `emails`
emails.columns.values

## Removing emails without body text

In [None]:
# Size of the dataset before filtering. Note: DataFrame.size counts cells
# (rows x columns), not the number of e-mails.
emails.size

In [None]:
# Discard every e-mail whose body text is missing
emails = emails.dropna(subset=['ExtractedBodyText'])
emails.size  # Size of the dataset after the filter

## Generating the wordcloud

Converting the ExtractedBodyText into strings we can use for the wordcloud. Note that we tell WordCloud to not remove any stopwords for this first graph.

In [None]:
# Concatenate the raw body texts into one corpus string. Series.to_string() is
# avoided on purpose: it renders a printable table, so the row index labels and
# the alignment padding would leak into the text (and thus into the word counts).
text = "\n".join(emails.ExtractedBodyText.astype(str))

In [None]:
# First word cloud: stopwords=[''] tells WordCloud not to remove any stop words
wordcloud = WordCloud(stopwords=['']).generate(text)
fig, ax = plt.subplots()
ax.imshow(wordcloud)
ax.axis("off")
plt.show()

## Improving the wordcloud

To improve the wordcloud we will go through the process of removing stopwords. Then we will make sure to have only alphabetical characters and use stemming to convert words into their original base form.

In [None]:
# Porter stemmer shared by the tokenisation helpers
stemmer = PorterStemmer()
# English stop words, stored as a set: in this notebook they are only used for
# membership tests, which are O(1) on a set versus a linear scan on a list.
sw = set(stopwords.words("english"))

In [None]:
def tokenize(text):
    """Turn raw text into a list of stemmed content words.

    The text is split with NLTK's word tokenizer. Tokens that are not purely
    alphabetic, or whose lower-cased form is in the stop word collection
    ``sw``, are dropped; the survivors are stemmed with the module-level
    ``stemmer``.

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    list
        The stemmed tokens, in their original order.
    """
    kept = [
        word
        for word in nltk.word_tokenize(text)
        if word.isalpha() and word.lower() not in sw
    ]
    return stem_tokens(kept, stemmer)

def stem_tokens(tokens, stemmer):
    """Return a new list holding ``stemmer.stem(token)`` for each token, in order."""
    return [stemmer.stem(token) for token in tokens]

In [None]:
# Join the stemmed tokens into one whitespace-separated string for WordCloud.
# (Going through str(list) would leave the list's brackets and commas in the text.)
improvedText = " ".join(tokenize(text))

In [None]:
# Second word cloud, built from the cleaned and stemmed text
wc = WordCloud().generate(improvedText)
fig, ax = plt.subplots()
ax.imshow(wc)
ax.axis("off")
plt.show()

When comparing the two wordclouds, we see that they are very different. In the first cloud we have all the stopwords, which means the wordcloud does not show us any indication of some specific characteristics of Hillary Clinton emails.  In the second cloud some words are shortened to their stems such as 'Novemb' and 'relea'. The stopwords are removed, so we see a better representation of the most common words used in the emails.

# 2. Sentiment analysis

# 3. Topic modeling

Using the models.ldamodel module from the gensim library, run topic modeling over the corpus. Explore different numbers of topics (varying from 5 to 50), and settle for the parameter which returns topics that you consider to be meaningful at first sight.

Start by importing the gensim libraries.

In [None]:
from gensim.models.ldamodel import LdaModel
from gensim import corpora
import pickle

Read the data, keep only the ExtractedBodyText cells and remove null values.

In [None]:
# Reload the data, keeping a one-column frame of the non-null body texts
emails = (
    pd.read_csv('hillary-clinton-emails/Emails.csv')['ExtractedBodyText']
    .dropna()
    .to_frame()
)

Next, use the same functions defined in the first exercise to remove the stopwords, remove numbers and stem the tokens. Note that we also lowercase all tokens.

In [None]:
# tokenize() has already removed stop words and stemmed the tokens. Lower-case
# each stem BEFORE the second stop word check so that the check is
# case-insensitive (a capitalised stem equal to a stop word would otherwise
# slip through and only be lower-cased afterwards).
_stop_words = set(sw)  # set membership is O(1)
emails['ExtractedBodyText'] = emails.ExtractedBodyText.apply(
    lambda body: [stem for stem in (tok.lower() for tok in tokenize(body))
                  if stem not in _stop_words]
)

Transform the tokenized emails into a list of lists of tokens.

In [None]:
# One list of tokens per e-mail
texts = emails.ExtractedBodyText.tolist()
# gensim Dictionary built from the token lists; used below for doc2bow and as id2word
dictionary = corpora.Dictionary(texts)

Transform the list of lists of tokens into a corpus (a list of bags of words). The Latent Dirichlet Allocation model is trained on this corpus in the cells further below.

In [None]:
# Bag-of-words representation of each e-mail, produced with the dictionary's doc2bow
corpus = [dictionary.doc2bow(text) for text in texts]

Now, make a function to print the topics in a concise way, for a better readability.

In [None]:
def format_topics(topics):
    """Print one line per topic: its id followed by its top words.

    Parameters
    ----------
    topics : iterable
        Each item is indexable as ``(topic_id, entries)``, where every entry
        is indexable and holds the word at position 0 (the shape produced by
        the ``show_topics(..., formatted=False)`` calls in this notebook).

    Returns
    -------
    None
        The topics are written to standard output, one per line, as
        ``"Topic <id>:\\t<word>, <word>, ..."``. Only the words are shown,
        to keep the listing concise.
    """
    for topic in topics:
        # join() places the separator between words only, so the line no
        # longer ends with a dangling ", ".
        words = ", ".join("{}".format(entry[0]) for entry in topic[1])
        print("Topic {}:\t{}".format(topic[0], words))

Let's try to find 5 topics in the corpus using the Latent Dirichlet Allocation model. Note that running the algorithm can take a bit of time, due to the number of passes. To avoid having to train the model multiple times, the resulting topics are stored in files (one per number of topics). Storing them also makes the topics easier to discuss, because they are not exactly the same between multiple runs. Increasing the number of passes (default is 1) is good because it increases the accuracy of the topic modeling.

In [None]:
# Re-use the cached topics when possible: training with 50 passes is slow and
# the topics differ between runs. NOTE: pickle.load can execute arbitrary code,
# so only keep cache files you created yourself.
try:
    with open('5_topics.p', 'rb') as cache_file:
        data = pickle.load(cache_file)
except (OSError, EOFError, pickle.UnpicklingError):
    # Cache missing or unreadable: train the model and store its topics.
    model = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=50)
    data = model.show_topics(num_topics=5, num_words=10, formatted=False)
    with open('5_topics.p', 'wb') as cache_file:
        pickle.dump(data, cache_file)

format_topics(data)

As we can see, the topics are generic. They are about Obama, diplomacy and politics in general. For example, topic 0 is about elections ('elect', 'vote') and american politics ('democrat', 'republican', 'parti', 'senat'). We can compare these topics with the one generated by the LDA model when looking for 50 topics.

In [None]:
# Re-use the cached topics when possible: training with 50 passes is slow and
# the topics differ between runs. NOTE: pickle.load can execute arbitrary code,
# so only keep cache files you created yourself.
try:
    with open('50_topics.p', 'rb') as cache_file:
        data = pickle.load(cache_file)
except (OSError, EOFError, pickle.UnpicklingError):
    # Cache missing or unreadable: train the model and store its topics.
    model = LdaModel(corpus, num_topics=50, id2word=dictionary, passes=50)
    data = model.show_topics(num_topics=50, num_words=10, formatted=False)
    with open('50_topics.p', 'wb') as cache_file:
        pickle.dump(data, cache_file)

format_topics(data)

As expected, they are more diverse. Instead of having one topic for diplomacy, we have one topic about Pakistan (27), one about China (24), one about Israel and Palestine (7), etc. There are also topics about recent events: North Korea testing nuclear weapons (27), the attack in Benghazi (26), etc. We could say that 50 topics is too many because some topics seem to be unrelated to anything (2, 49, ...), but this is not necessarily true, since some topics (Russia for example) do not appear at each run of the LDA model. Note that this can also be because the word 'Russia' is not important enough to be in the top ten words of any topic.

In [None]:
# Re-use the cached topics when possible: training with 50 passes is slow and
# the topics differ between runs. NOTE: pickle.load can execute arbitrary code,
# so only keep cache files you created yourself.
try:
    with open('25_topics.p', 'rb') as cache_file:
        data = pickle.load(cache_file)
except (OSError, EOFError, pickle.UnpicklingError):
    # Cache missing or unreadable: train the model and store its topics.
    model = LdaModel(corpus, num_topics=25, id2word=dictionary, passes=50)
    data = model.show_topics(num_topics=25, num_words=10, formatted=False)
    with open('25_topics.p', 'wb') as cache_file:
        pickle.dump(data, cache_file)

format_topics(data)

When limited to 25, the topics are more centered around US politics ('democrat', 'republican', 'vote', 'elect', 'senat', 'bill', 'parti', 'presid', ...), meetings ('call', 'talk', 'tomorrow', 'work', 'offic', 'meet', 'room', ...) and the Middle East ('iran', 'israel', 'border', 'jewish', 'palestinian', 'ahmadinejad', 'peac', ...).

The best number of topics to look for actually depends on how deep a person wants to go. If set to 5, then only generic topics will emerge, with not much information, and it probably won't be enough for most cases. With 50, more detailed topics appear, specialized for the different countries the US is working with. But with these also come less meaningful topics. A good compromise would be to find a value between 5 and 25. For example, with 25, the topics are not too generic, and there isn't too much missing data.

# Bonus - Communication graph

First we are going to load the data and select columns, which are relevant for the task. For the communication graph it does not matter whether there was any text in the email or not, so we don't remove rows with NaN for 'ExtractedBodyText'.

In [None]:
# Keep only the e-mails with a known sender, restricted to the columns used below
emails = (
    pd.read_csv("hillary-clinton-emails/Emails.csv")
    .dropna(subset=['SenderPersonId'])
    [['Id', 'SenderPersonId', 'ExtractedBodyText']]
)
emails.head()

The table 'persons' has multiple IDs for the same person, because one person can have several email addresses. We therefore decided to eliminate the rows where the name is given as an email address. We assume that people who wrote more emails were identified by a name instead of an email address. Thanks to this we got rid of some duplicates in the network graph and reduced the number of nodes from 468 to 324.

In [None]:
persons = pd.read_csv("hillary-clinton-emails/Persons.csv")
# Drop the entries whose "name" is really an e-mail address.
# regex=False: '@' is matched as a literal character.
# na=False: a missing name counts as "no '@'", so the boolean mask never
# contains NaN (which `~` cannot negate).
persons = persons[~persons.Name.str.contains('@', regex=False, na=False)]
persons.head()

In [None]:
# Load the e-mail/receiver links; the first CSV column is used as the index
receivers = pd.read_csv("hillary-clinton-emails/EmailReceivers.csv", index_col =0)
receivers.head()

By merging the 'receivers' with persons, we are able to match the person IDs with the given name of the person. We do the same for the senders of the emails.

In [None]:
# Attach the person's name to every receiver row and call that column 'Receiver'
reWithNames = (
    receivers
    .merge(persons, how='outer', left_on='PersonId', right_on='Id')
    [['EmailId', 'PersonId', 'Name']]
    .rename(columns={'Name': 'Receiver'})
)
reWithNames.head()

In [None]:
# Attach the person's name to every e-mail's sender, call that column 'Sender'
# and drop the duplicated person id column produced by the merge
seWithNames = (
    emails
    .merge(persons, how='outer', left_on='SenderPersonId', right_on='Id')
    .rename(columns={'Name': 'Sender'})
    .drop('Id_y', axis=1)
)
seWithNames.head()

Finally we can combine the senders and receivers into one dataframe by matching the email IDs.

In [None]:
# Pair each sender row with its receiver rows through the e-mail id, then keep
# only the rows where both the sender id and the receiver id are known
newDF = (
    seWithNames
    .merge(reWithNames, how='outer', left_on='Id_x', right_on='EmailId')
    .dropna(subset=['SenderPersonId', 'PersonId'])
)
newDF.head()

Generating the network graph from the dataframe.

In [None]:

# networkx 2.x renamed from_pandas_dataframe to from_pandas_edgelist (and later
# removed the old name); use whichever one this installation provides.
_edges_to_graph = getattr(nx, 'from_pandas_edgelist', None) or nx.from_pandas_dataframe
# Undirected graph with one edge per (Sender, Receiver) pair found in newDF
G = _edges_to_graph(newDF, 'Sender', 'Receiver', create_using=nx.Graph())


In [None]:
pos = nx.spring_layout(G)  # Positions for all nodes
# Hand the computed layout to the drawing call: without pos= the layout above
# would go unused and nx.draw would compute another one of its own.
nx.draw(G, pos=pos, with_labels=True, alpha=0.6, node_color='salmon',
        edge_color='darkgrey', font_size=17)

plt.axis('off')
plt.savefig("communication-graph.png")  # Save as png for closer view
plt.show()

In [None]:
# Number of nodes in the communication graph
G.number_of_nodes()

From the network graph we see that most emails are connected with Hillary Clinton as expected. Nevertheless there are other people who are also forming communities of connection. For example Cheryl Mills, Huma Abedin and Jake Sullivan. When researching about these people we find that they were close advisors of Hillary Clinton.