In [1]:
import os
import gensim
import random
import collections

import pandas as pd

from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_pickle('enron_mails.p')
train, test = train_test_split(df, test_size=0.4, random_state=2022)


In [3]:
def create_corpus(input_docs, tokens_only=False):
    for i, text in input_docs:
        tokens = gensim.utils.simple_preprocess(text)
        if tokens_only:
            yield tokens
        else:
            # Tags for training data
            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

            
train_corpus = list(create_corpus(train['text'].items()))
test_corpus = list(create_corpus(test['text'].items(), tokens_only=True))

In [4]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=10)

In [5]:
model.build_vocab(train_corpus)

In [6]:
%%time
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

CPU times: user 19min 37s, sys: 3min 8s, total: 22min 45s
Wall time: 10min 22s


%%time
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])
    
counter = collections.Counter(ranks)
print(counter)

In [8]:
doc_id = len(train_corpus) - 1
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Document (310439): «fyi forwarded by steven kean na enron on am kim frumkin ees am to steven kean na enron enron cc subject thank you steve thank you for participating once again in the video for deal central the videos are created housed and maintained in beth tilney area mentioned to beth that corporate may have an interest in sharing utilizing the ees video equipment and expertise robert pearson our in house film expert has masters in video programming taping etc and the good news within the next few weeks the editing equipment will be fully up and running once installed and operational all in house produced videos will have the look and feel of those created by outside vendors please feel free to contact beth ext if you have an interest in exploring this concept and thank you again for your participation best kim»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):



NameError: name 'sims' is not defined

In [10]:
doc_id = random.randint(0, len(train_corpus) - 1)

# Compare and print the second-most-similar document
print('Train Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
# sim_id = second_ranks[doc_id]
#print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(train_corpus[sim_id[0]].words)))


Train Document (56666): «business highlights enron global markets coal and emissions trading the first enron online synfuel trade occurred this past week establishing enrononline as the only online marketplace for synfuel the purpose of bringing synfuel on enrononline is to provide market transparency between the coal synfuel spread the coal cash book which began trading only two months ago has traded over million spot tons the fundamental purpose of this book is to capture short term market discrepancies while also adding liquidity to the spot market to further long term trading and marketing of coal enron energy and operational services ena solid fuel initiative ena started development of coal initiative late last year to support the electricity trading desk and to provide enron with hedge in the event that the cost of natural gas continues to escalate above current long term projections because of the culture shock associated the use of the words enron and coal in the same sentence 

In [None]:
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))


# LDA

In [None]:
from gensim.models import LdaModel

# Set training parameters.
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)