In [9]:
from mailbox import mbox
import pandas as pd

def store_content(message, body=None, frame=None):
    """Append one e-mail message as a new row and return the resulting frame.

    Parameters
    ----------
    message : email.message.Message
        Message (or a subclass such as the ones yielded by ``mailbox.mbox``)
        whose headers and epilogue are recorded.
    body : bytes, optional
        Body to store. When ``None`` the message's own decoded payload,
        ``message.get_payload(decode=True)``, is used instead.
    frame : pandas.DataFrame, optional
        Frame to extend. When ``None`` the module-level ``df`` is used, which
        is how the existing callers in this notebook rely on it.

    Returns
    -------
    pandas.DataFrame
        A new frame with one extra row. If ``message`` carries no headers at
        all, ``frame`` is returned unchanged (never ``None``), so callers can
        always rebind the result safely.
    """
    if frame is None:
        frame = df

    # len() of a Message is its header count; a header-less message carries
    # nothing worth recording, so hand the frame back untouched.
    if not len(message):
        return frame

    # Only fall back to the message payload when no body was supplied at all;
    # an explicitly passed empty body is kept as-is.
    if body is None:
        body = message.get_payload(decode=True)

    contents = {
        "subject": message["subject"] or "",
        "body": body,
        "from": message["from"],
        "to": message["to"],
        "date": message["date"],
        "labels": message["X-Gmail-Labels"],
        "epilogue": message.epilogue,
    }
    # DataFrame.append was removed in pandas 2.0; concat is the supported way
    # to add a row and behaves the same on older versions.
    return pd.concat([frame, pd.DataFrame([contents])], ignore_index=True)

# Create empty DataFrame with relevant columns
df = pd.DataFrame(
    columns=("subject", "body", "from", "to", "date", "labels", "epilogue")
)

# Import downloaded mbox file
box = mbox("Takeout/Mail/AllmailIncludingSpamandTrash.mbox")

# Messages that raised while being parsed or stored; kept for inspection.
fails = []
for message in box:
    try:
        updated = None
        if message.get_content_type() == 'text/plain':
            updated = store_content(message)
        elif message.is_multipart():
            # Grab the first top-level plaintext part of a multipart message
            for part in message.get_payload():
                if part.get_content_type() == 'text/plain':
                    updated = store_content(message, part.get_payload(decode=True))
                    break
        # Only rebind when a frame actually came back, so a message that
        # produced no row can never replace df with None.
        if updated is not None:
            df = updated
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still stop the
        # loop instead of being recorded as a failed message.
        fails.append(message)

In [10]:
from collections import Counter

# One long lower-cased string of every subject, each followed by a space.
# str.join builds it in a single pass and yields "" for an empty frame, whereas
# Series.sum() concatenates repeatedly and returns the integer 0 when there
# are no rows (which would then break .split()).
subject_word_bag = "".join(str(subject).lower() + " " for subject in df.subject)

# Ten most frequent whitespace-separated tokens across all subjects.
print(Counter(subject_word_bag.split()).most_common(10))

[('to', 4041), ('off', 3303), ('for', 3283), ('on', 3053), ('the', 3046), ('new', 2983), ('your', 2569), ('&', 2560), ('and', 2422), ('-', 2370)]


In [13]:
from nltk.corpus import stopwords

# NLTK's English stop words plus reply/forward prefixes and stray punctuation
# tokens that show up as standalone "words" in subjects.
stops = [str(word) for word in stopwords.words('english')] + ['re:', 'fwd:', '_', '&', '-', '+']

# Membership is tested once per token, so look it up in a set rather than
# scanning the list each time; `stops` itself stays a list.
stop_set = set(stops)
subject_words = [word for word in subject_word_bag.split() if str(word).lower() not in stop_set]

# Ten most frequent subject tokens once stop words are removed.
print(Counter(subject_words).most_common(10))

[('new', 2983), ('free', 1142), ('save', 1001), ('|', 867), ('sale', 834), ('today', 740), ('20%', 732), ('online', 712), ('extra', 644), ('commented', 639)]


In [15]:
from nltk import collocations
# Build a finder over adjacent word pairs in the filtered subject tokens.
bigram_measures = collocations.BigramAssocMeasures()
bigram_finder = collocations.BigramCollocationFinder.from_words(subject_words)

# Discard every bigram that occurs fewer than 20 times (a minimum-count
# threshold, not a "top 20"); with fewer candidates, scoring stays fast.
bigram_finder.apply_freq_filter(20)

# Score the survivors by raw frequency and show the first ten results.
ranked_bigrams = bigram_finder.score_ngrams(bigram_measures.raw_freq)
for scored_bigram in ranked_bigrams[:10]:
    print(scored_bigram)

(('new', 'jobs'), 0.0032635591520096302)
(('free', 'shipping'), 0.002808800909516485)
(('free', 'shipping.'), 0.002287166454891995)
(('bar', 'room'), 0.0022136026215475156)
(('[the', 'bar'), 0.002193539757908112)
(('room', 'heroes]'), 0.002193539757908112)
(('stores', 'online'), 0.002066474954858557)
(('15', 'new'), 0.0016050290911522771)
(('extra', '30%'), 0.0014779642881027218)
(('photo', 'you.'), 0.0014177756971845115)


In [17]:
from nltk import collocations
# Build a finder over consecutive word triples in the filtered subject tokens.
trigram_measures = collocations.TrigramAssocMeasures()
trigram_finder = collocations.TrigramCollocationFinder.from_words(subject_words)

# Discard every trigram that occurs fewer than 20 times (a minimum-count
# threshold, not a "top 20"); with fewer candidates, scoring stays fast.
trigram_finder.apply_freq_filter(20)

# Score the survivors by raw frequency and show the first ten results.
ranked_trigrams = trigram_finder.score_ngrams(trigram_measures.raw_freq)
for scored_trigram in ranked_trigrams[:10]:
    print(scored_trigram)

(('photo', 'you.'), 0.0014177756971845115)
(('photo', 'you.'), 0.0014177756971845115)
(('photo', 'you.'), 0.0014177756971845115)
(('photo', 'you.'), 0.0014177756971845115)
(('photo', 'you.'), 0.0014177756971845115)
(('photo', 'you.'), 0.0014177756971845115)
(('photo', 'you.'), 0.0014177756971845115)
(('photo', 'you.'), 0.0014177756971845115)
(('photo', 'you.'), 0.0014177756971845115)
(('photo', 'you.'), 0.0014177756971845115)
