In [9]:
from mailbox import mbox
import pandas as pd

def store_content(message, body=None):
    if not body:
        body = message.get_payload(decode=True)
    if len(message):
        contents = {
            "subject": message["subject"] or "",
            "body": body,
            "from": message["from"],
            "to": message["to"],
            "date": message["date"],
            "labels": message["X-Gmail-Labels"],
            "epilogue": message.epilogue,
        }
        return df.append(contents, ignore_index=True)

# Create empty DataFrame with relevant columns
df = pd.DataFrame(
    columns=("subject", "body", "from", "to", "date", "labels", "epilogue")
)

# Import downloaded mbox file
box = mbox("Takeout/Mail/AllmailIncludingSpamandTrash.mbox")

fails = []
for message in box:
    try:
        if message.get_content_type() == 'text/plain':
            df = store_content(message)
        elif message.is_multipart():
            # Grab any plaintext from multipart messages
            for part in message.get_payload():
                if part.get_content_type() == 'text/plain':
                    df = store_content(message, part.get_payload(decode=True))
                    break
    except:
        fails.append(message)

In [10]:
from collections import Counter

subject_word_bag = df.subject.apply(lambda t: str(t).lower() + " ").sum()

print(Counter(subject_word_bag.split()).most_common()[:10])

[('to', 4041), ('off', 3303), ('for', 3283), ('on', 3053), ('the', 3046), ('new', 2983), ('your', 2569), ('&', 2560), ('and', 2422), ('-', 2370)]


In [13]:
from nltk.corpus import stopwords

stops = [str(word) for word in stopwords.words('english')] + ['re:', 'fwd:', '_', '&', '-', '+']
subject_words = [word for word in subject_word_bag.split() if str(word).lower() not in stops]
print(Counter(subject_words).most_common()[:10])

[('new', 2983), ('free', 1142), ('save', 1001), ('|', 867), ('sale', 834), ('today', 740), ('20%', 732), ('online', 712), ('extra', 644), ('commented', 639)]


In [15]:
from nltk import collocations

# Analyzing most frequently paired words in email subjects
bigram_measures = collocations.BigramAssocMeasures()
bigram_finder = collocations.BigramCollocationFinder.from_words(subject_words)

# Filter to top 20 results; otherwise this will take a LONG time
bigram_finder.apply_freq_filter(20)
for bigram in bigram_finder.score_ngrams(bigram_measures.raw_freq)[:10]:
    print(bigram)

(('new', 'jobs'), 0.0032635591520096302)
(('free', 'shipping'), 0.002808800909516485)
(('free', 'shipping.'), 0.002287166454891995)
(('bar', 'room'), 0.0022136026215475156)
(('[the', 'bar'), 0.002193539757908112)
(('room', 'heroes]'), 0.002193539757908112)
(('stores', 'online'), 0.002066474954858557)
(('15', 'new'), 0.0016050290911522771)
(('extra', '30%'), 0.0014779642881027218)
(('photo', 'you.'), 0.0014177756971845115)


In [18]:
from nltk import collocations

# Analyzing most frequent groupings of three words in email subjects
trigram_measures = collocations.TrigramAssocMeasures()
trigram_finder = collocations.TrigramCollocationFinder.from_words(subject_words)

# Filter to top 20 results; otherwise this will take a LONG time
trigram_finder.apply_freq_filter(20)
for trigram in trigram_finder.score_ngrams(trigram_measures.raw_freq)[:10]:
    print(trigram)

(('[the', 'bar', 'room'), 0.002193539757908112)
(('bar', 'room', 'heroes]'), 0.002193539757908112)
(('15', 'new', 'jobs'), 0.0015916538487260083)
(('developer', 'wilkes', 'barre,'), 0.0012171470607904768)
(('wilkes', 'barre,', 'pa'), 0.0012171470607904768)
(('jobs', 'developer', 'wilkes'), 0.0012104594395773423)
(('new', 'jobs', 'developer'), 0.0012104594395773423)
(('design', 'free', 'shipping.'), 0.0011703337122985354)
(('everyday', 'design', 'free'), 0.0011703337122985354)
(('commented', 'dwight', "addington's"), 0.001150270848659132)


In [20]:
from nltk import collocations

# Analyzing pairs of words with pointwise mutual information,
# which gives us a word that is most often used with another word.
bigram_measures = collocations.BigramAssocMeasures()
bigram_finder = collocations.BigramCollocationFinder.from_words(subject_words)

# Filter to top 20 results; otherwise this will take a LONG time
bigram_finder.apply_freq_filter(20)
for bigram in bigram_finder.nbest(bigram_measures.pmi, 10):
    print(bigram)

('refurb', 'maestro')
('avengers', 'avengers')
('maestro', 'grinder')
('foundations', 'frontiers]')
('connections,', 'experience,')
('tanya', 'pi')
('[laff:', 'linear')
('biography', 'memoir')
('categories', 'biography')
('leila', "brewster's")


In [21]:
from nltk import collocations

# Analyzes groupings of three words with pointwise
# mutual information
trigram_measures = collocations.TrigramAssocMeasures()
trigram_finder = collocations.TrigramCollocationFinder.from_words(subject_words)

# Filter to top 20 results; otherwise this will take a LONG time
trigram_finder.apply_freq_filter(20)
for trigram in trigram_finder.nbest(trigram_measures.pmi, 10):
    print(trigram)

('refurb', 'maestro', 'grinder')
('avengers', 'avengers', 'avengers')
('categories', 'biography', 'memoir')
('problem', 'refurb', 'maestro')
('algebra', 'foundations', 'frontiers]')
('maestro', 'grinder', 'problem')
('[laff:', 'linear', 'algebra')
('grinder', 'problem', 'refurb')
('"epson', 'connect', 'scan')
('connect', 'scan', 'cloud"')


In [25]:
from textblob import TextBlob

# scores the sentiment of email subjects from 1.0 for positive
# to -1.0 for negative.
df['feels'] = df.subject.apply(
    lambda s: TextBlob(str(s)).sentiment.polarity
)

# Output a few subject lines with the calculated sentiment scores
print(df[['subject', 'feels']])

                                                 subject     feels
0                         Omayeli Arenyeka posted a note  0.000000
1          See Danny's connections, experience, and more  0.500000
2      =?utf-8?Q?Free=20for=2048=20Hours=3A=20New=20R...  0.000000
3      Suggestions based on stephanie yee, + POOL and...  0.000000
4      =?utf-8?Q?The=20best=20=28and=20worst=29=20nut...  0.000000
5                Netted | This Email May Induce Drooling  0.000000
6           See Emma's connections, experience, and more  0.500000
7      8 Surprisingly Easy Ways You Can Wow the Perso...  0.266667
8      [NYEdTech] You are Invited - EdTech Event @ CC...  0.000000
9      =?utf-8?q?All_New_Fonts=2C_Graphics=2C_Templat...  0.000000
10     =?utf-8?q?All_New_Fonts=2C_Graphics=2C_Templat...  0.000000
11                        Docker Weekly: July 29th, 2015  0.000000
12     =?utf-8?Q?Joshua=20=2D=20want=20a=20better=20w...  0.000000
13     The New Denim Style Guide - Find Your Perfect Fit  0.51