In [9]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import collections
import os
import random
import numpy as np
from tqdm import tqdm
import sys, email
import pandas as pd 
import math

# Working directory that the relative "Data//..." and "GloVe//..." paths used by
# later cells are resolved against. The original author's location stays the
# default; set the ENFUSE_CODE_DIR environment variable to run the notebook on
# another machine without editing this cell.
PROJECT_DIR = os.environ.get("ENFUSE_CODE_DIR", "D:/College Material//Sem2//DMML2//ENFUSE//Code")
os.chdir(PROJECT_DIR)

In [10]:
#########################################################
# Read the raw Enron e-mail dump into a DataFrame
#########################################################

# CSV location, resolved against the current working directory.
ENRON_EMAIL_DATASET_PATH = "Data//emails.csv"

# Load the whole file, report its (rows, columns) shape, then preview the first rows.
emails_df = pd.read_csv(filepath_or_buffer=ENRON_EMAIL_DATASET_PATH)
print(emails_df.shape)
emails_df.head(5)

(517401, 2)


Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [11]:
#########################################################
# Sort out required email features: date, subject, content
#########################################################

# source https://www.kaggle.com/zichen/explore-enron
## Helper functions
def get_text_from_email(msg):
    '''Return the concatenated payloads of all text/plain parts of an email.

    msg is a parsed email message object; every part reached by msg.walk()
    whose content type is exactly 'text/plain' contributes its payload, in
    walk order, with no separator. Parts of any other type are ignored.
    '''
    return ''.join(
        section.get_payload()
        for section in msg.walk()
        if section.get_content_type() == 'text/plain'
    )

import email

# Parse every raw message string into an email message object.
messages = [email.message_from_string(raw) for raw in emails_df['message']]

# The raw text is not needed once parsed. Rebind rather than drop in place so
# the statement does not mutate the frame object it was handed.
emails_df = emails_df.drop(columns='message')

# Only these headers are kept below, so extract just them instead of turning
# every header of the first message into a column and discarding most of them.
# Indexing a message by a header it lacks yields None.
for header in ('Date', 'Subject'):
    emails_df[header] = [msg[header] for msg in messages]

# Plain-text body of each message.
emails_df['Content'] = [get_text_from_email(msg) for msg in messages]

# keep only Date, Subject and Content for this exercise
emails_df = emails_df[['Date', 'Subject', 'Content']]

In [12]:
#########################################################
# change word2vec model to work with Enron emails
#########################################################

# point it to our Enron data set
emails_sample_df = emails_df.copy()

import string, re

# Clean the subject line and the body identically: lower-case, turn every
# character outside a-z into a space, then collapse whitespace runs.
# regex=True is passed explicitly: the patterns are regular expressions, and
# pandas versions whose default is regex=False would otherwise search for the
# literal text '[^a-z]' / '\s+' and leave the columns uncleaned.
for text_column in ('Subject', 'Content'):
    emails_sample_df[text_column] = (
        emails_sample_df[text_column]
        .str.lower()
        .str.replace(r'[^a-z]', ' ', regex=True)
        .str.replace(r'\s+', ' ', regex=True)
    )

# create sentence list
# NOTE(review): the ". " separator is added after punctuation was stripped, so
# after the whitespace split below it survives either as a stand-alone "."
# token or glued to the last subject word (e.g. "forecast.") — confirm this is
# intended before relying on the vocabulary counts.
emails_text = (emails_sample_df["Subject"] + ". " + emails_sample_df["Content"]).tolist()

sentences = ' '.join(emails_text)
words = sentences.split()

print('Data size', len(words))

# get unique words and map to glove set
print('Unique word count', len(set(words)))

# drop rare words: size of the vocabulary (including the UNK slot) kept by build_dataset
vocabulary_size = 50000

def build_dataset(words):
    """Encode a token list as integer ids over the most frequent tokens.

    Reads the module-level ``vocabulary_size``: the ``vocabulary_size - 1``
    most common tokens get ids in frequency order, and id 0 ('UNK') stands
    for every token outside that set.

    Returns a 4-tuple ``(data, count, dictionary, reverse_dictionary)``:
      data               -- list of ids, one per input token, in input order
      count              -- [['UNK', n_unknown], (token, frequency), ...]
      dictionary         -- token -> id
      reverse_dictionary -- id -> token
    """
    # Slot 0 is the unknown bucket; its real count is filled in after encoding.
    count = [['UNK', -1]]
    count += collections.Counter(words).most_common(vocabulary_size - 1)

    # Ids are handed out in table order: the next id is the current dict size.
    dictionary = {}
    for token, _ in count:
        dictionary[token] = len(dictionary)

    data = []
    unk_count = 0
    for token in tqdm(words):
        token_id = dictionary.get(token)
        if token_id is None:
            # Out-of-vocabulary token: falls into the 'UNK' slot.
            token_id = 0
            unk_count += 1
        data.append(token_id)

    count[0][1] = unk_count
    reverse_dictionary = {token_id: token for token, token_id in dictionary.items()}
    return data, count, dictionary, reverse_dictionary

# Encode the corpus, then release the raw token list now that it has been encoded.
data, count, dictionary, reverse_dictionary = build_dataset(words)
del words

print('Most common words (+UNK)', count[:5])

# Show the first ten encoded ids next to the tokens they map back to.
sample_ids = data[:10]
print('Sample data', sample_ids, [reverse_dictionary[i] for i in sample_ids])

  emails_sample_df['Subject'] = emails_sample_df['Subject'].str.replace(r'[^a-z]', ' ')
  emails_sample_df['Subject'] = emails_sample_df['Subject'].str.replace(r'\s+', ' ')
  emails_sample_df['Content'] = emails_sample_df['Content'].str.replace(r'[^a-z]', ' ')
  emails_sample_df['Content'] = emails_sample_df['Content'].str.replace(r'\s+', ' ')


Data size 143167645
Unique word count 595620


100%|███████████████████████████████████████████████████████████████| 143167645/143167645 [01:46<00:00, 1341146.64it/s]


Most common words (+UNK) [['UNK', 2653116], ('the', 5679797), ('to', 4059244), ('and', 2589766), ('of', 2388314)]
Sample data [115, 145, 11, 54, 2018, 39, 115, 4033, 2, 25] ['.', 'here', 'is', 'our', 'forecast', 're', '.', 'traveling', 'to', 'have']


In [16]:
####################################################################
# find matches with glove 
####################################################################
GLOVE_DATASET_PATH = 'GloVe//glove.840B.300d.txt'

from tqdm import tqdm
import string

# word -> float32 vector, only for GloVe entries whose word is in `dictionary`
embeddings_index = {}
# number of lines (entries) read from the GloVe file
word_counter = 0

# NOTE(review): cp437 assigns a character to every byte, so decoding never
# fails, but non-ASCII GloVe tokens come out mis-decoded. That looks harmless
# here because the vocabulary was reduced to a-z earlier — confirm whether
# UTF-8 was intended before reusing this loader elsewhere.
# The context manager closes the file even if a line fails to parse.
with open(GLOVE_DATASET_PATH, encoding="cp437") as f:
    for line in tqdm(f):
        values = line.split()
        word = values[0]
        # Convert the coefficients only for words we actually need.
        if word in dictionary:
            embeddings_index[word] = np.asarray(values[1:], dtype='float32')
        word_counter += 1

print('Found %s word vectors matching enron data set.' % len(embeddings_index))
print('Total words in GloVe data set: %s' % word_counter)

2196017it [01:38, 22396.44it/s]

Found 38844 word vectors matching enron data set.
Total words in GloVe data set: 2196017





In [17]:
#########################################################
# Check out some clusters
#########################################################

# create a dataframe using the embedded vectors and attach the key word as row header
import pandas as pd
enrond_dataframe = pd.DataFrame(embeddings_index)
enrond_dataframe = pd.DataFrame.transpose(enrond_dataframe)

# See what it learns and look at clusters to pull out major themes in the data
CLUSTER_SIZE = 300
# Fixed seed so the cluster ids printed below are reproducible across runs.
CLUSTER_RANDOM_STATE = 0
# cluster vector and investigate top groups
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=CLUSTER_SIZE, random_state=CLUSTER_RANDOM_STATE)
cluster_make = kmeans.fit_predict(enrond_dataframe)

# fit_predict already returns the cluster label of every row it was fitted on,
# so a second predict() pass over the same data is unnecessary.
labels = cluster_make
import collections
cluster_frequency = collections.Counter(labels)
print(cluster_frequency)

# cluster label -> words assigned to it, in dataframe row order.
# Pair each row's word with its label directly instead of rebuilding the whole
# index list for every row (which made the loop quadratic).
clusters = {}
for word, label in zip(enrond_dataframe.index, labels):
    clusters.setdefault(label, []).append(word)

# Print the 100 most populated clusters, largest first.
for k, _size in cluster_frequency.most_common(100):
    print('\n\n')
    print('Cluster:', k)
    print (' '.join(clusters[k]))

Counter({69: 500, 248: 484, 199: 451, 14: 438, 28: 402, 123: 397, 112: 373, 205: 354, 169: 331, 294: 330, 38: 325, 108: 318, 285: 308, 168: 308, 130: 285, 257: 285, 16: 270, 227: 258, 70: 256, 64: 245, 289: 245, 166: 240, 92: 237, 50: 236, 13: 233, 131: 230, 299: 227, 52: 222, 150: 216, 277: 215, 19: 213, 8: 211, 144: 210, 291: 206, 65: 206, 71: 203, 275: 195, 2: 194, 5: 188, 287: 188, 114: 188, 54: 187, 136: 184, 149: 183, 78: 183, 41: 181, 171: 181, 46: 181, 234: 181, 34: 181, 210: 180, 243: 178, 138: 177, 17: 175, 21: 174, 72: 174, 94: 172, 196: 171, 260: 169, 23: 168, 160: 167, 151: 165, 238: 165, 235: 164, 10: 164, 223: 164, 146: 160, 93: 159, 186: 159, 207: 158, 173: 156, 4: 156, 24: 156, 137: 155, 122: 155, 158: 155, 81: 155, 82: 153, 55: 153, 214: 153, 185: 152, 87: 151, 106: 150, 20: 149, 76: 148, 80: 147, 278: 147, 182: 147, 143: 146, 282: 145, 281: 144, 67: 144, 111: 143, 252: 142, 60: 142, 217: 141, 193: 140, 79: 140, 135: 138, 283: 138, 279: 137, 116: 137, 15: 137, 37: 137

In [20]:
# Display the list of words stored under cluster label 0.
clusters[0]

['minutes',
 'miles',
 'distance',
 'km',
 'approximately',
 'min',
 'mi',
 'mile',
 'ft',
 'meters',
 'mins',
 'mph',
 'approx',
 'hrs',
 'metres',
 'sq',
 'hr',
 'kilometers',
 'nm',
 'distanced',
 'kilometres',
 'kms',
 'metre',
 'deg',
 'yds',
 'kilometer',
 'mts',
 'aprox',
 'mtr']