In [1]:
import pickle
import pandas as pd

enron_df = pd.read_pickle('enron_students.pkl')

In [2]:
# data cleaning
enron_df['To'] = enron_df['To'].fillna('')
enron_df['From'] = enron_df['From'].fillna('')
enron_df['X-From'] = enron_df['X-From'].fillna('')
enron_df['X-To'] = enron_df['X-To'].fillna('')
enron_df['X-cc'] = enron_df['X-cc'].fillna('')
enron_df['X-bcc'] = enron_df['X-bcc'].fillna('')


In [4]:
import preprocessing

enron_df['email_text'] = enron_df['Subject'] + ' ' + enron_df['email_body']
preprocessing.data_cleaning(enron_df)

removed attachments
parsed contacts
removed foward text
filled nan values


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  enron_df['email_text'].fillna(' ', inplace=True)


removed email headers
removed small words
lemmatized


In [1]:
# enron_df.to_pickle('enron_students_cleaned.pkl')
import pandas as pd

enron_df = pd.read_pickle(r'enron_students_cleaned.pkl')

In [2]:
from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore

def build_lda_model(df, num_topics=3, threshold=0.3):

    # Extract and tokenize the email body
    df['tokens'] = df['email_text'].apply(lambda x: x.split())

    # Create a dictionary representation of the documents
    dictionary = corpora.Dictionary(df['tokens'])
    print("Number of unique tokens before filtering:", len(dictionary))
    dictionary.filter_extremes(no_below=5, no_above=0.5)
    print("Number of unique tokens after filtering:", len(dictionary))

    # Create bag-of-words representation of the documents
    df['bow'] = df['tokens'].apply(dictionary.doc2bow)
    # Filter out documents that result in no words after processing
    bow_corpus = df[df['bow'].map(len) > 0]

    # Check if there are any valid documents left
    if bow_corpus.empty:
        raise ValueError("No valid documents to process. All documents resulted in empty BOW representations.")

    # Build an LDA model
    lda_model = LdaMulticore(corpus=list(bow_corpus['bow']),
                             id2word=dictionary,
                             num_topics=num_topics,
                             chunksize=1000,
                             passes=8,
                             per_word_topics=True,
                             workers=8)

    return lda_model, bow_corpus, dictionary


def get_topic_assignments(lda_model: LdaMulticore, bow_corpus, threshold=0.3):
    
    # Assign documents to topics
    topic_assignments = []
    for bow in bow_corpus:
        topic_probs = lda_model.get_document_topics(bow, minimum_probability=0)
        # Sort topics by probability
        topic_probs = sorted(topic_probs, key=lambda x: x[1], reverse=True)
        # Assign "None of these topics" if the highest probability is below the threshold
        if topic_probs[0][1] < threshold:
            topic_assignments.append("None of these topics")
        else:
            topic_assignments.append(f"Topic {topic_probs[0][0]}")

    return topic_assignments


In [3]:
# get 10 % of the enron rows
enron_small = enron_df.sample(frac=0.1, random_state=42)

In [4]:
try:
    lda_model, bow_corpus, dictionary = build_lda_model(enron_df, num_topics=30, threshold=0.3)
    print("Model built successfully with {} topics.".format(lda_model.num_topics))
except ValueError as e:
    print(e)

Number of unique tokens before filtering: 271829
Number of unique tokens after filtering: 100000
Model built successfully with 30 topics.


In [12]:
# def run_lda_multiple_topics(df, topic_numbers):
#     lda_model_res = {}
#     dictionary_res = {}
#     corpus_res = {}
#     for num_topics in topic_numbers:
#         try:
#             lda_model, bow_corpus, dictionary = build_lda_model(df, num_topics=num_topics, threshold=0.3)
#             print("Model built successfully with {} topics.".format(lda_model.num_topics))
#             # Optionally, calculate and print model coherence, perplexity, etc. here
#             lda_model_res[num_topics] = lda_model  # Store the model, or just store metrics like coherence
#             dictionary_res[num_topics] = dictionary
#             corpus_res[num_topics] = bow_corpus
#         except ValueError as e:
#             print(f"Failed to build model with {num_topics} topics: {e}")
#     return lda_model_res, dictionary_res, corpus_res

# # Define the range of topics you want to test
# topic_numbers = [20, 25, 30, 40]

# # Assuming enron_df is your DataFrame
# lda_model_res, dictionary_res, corpus_res = run_lda_multiple_topics(enron_df, topic_numbers)

Number of unique tokens before filtering: 271829
Number of unique tokens after filtering: 100000
Model built successfully with 20 topics.
Number of unique tokens before filtering: 271829
Number of unique tokens after filtering: 100000
Model built successfully with 25 topics.
Number of unique tokens before filtering: 271829
Number of unique tokens after filtering: 100000
Model built successfully with 30 topics.
Number of unique tokens before filtering: 271829
Number of unique tokens after filtering: 100000
Model built successfully with 40 topics.


In [13]:
# topic_numbers_large = [50]

# lda_model_res_large, dictionary_res_large, corpus_res_large = run_lda_multiple_topics(enron_df, topic_numbers_large)

Number of unique tokens before filtering: 271829
Number of unique tokens after filtering: 100000
Model built successfully with 50 topics.


In [5]:
def get_most_probable_topic(doc, model, get_destribution=False):
    if not doc:
        return None  # Handle empty documents
    topic_probs = model.get_document_topics(doc, minimum_probability=0)
    if not topic_probs:
        return None  # Handle cases with no significant topic probability
    
    if get_destribution:
        return topic_probs
    
    return max(topic_probs, key=lambda x: x[1])[0]  # Return only the topic index

# Assume lda_model and valid_df are from your previous output
bow_corpus['Topic'] = [get_most_probable_topic(doc, lda_model, True) for doc in bow_corpus['bow']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bow_corpus['Topic'] = [get_most_probable_topic(doc, lda_model, True) for doc in bow_corpus['bow']]


In [18]:
# corpus_res_large[50]['Topic'] = [get_most_probable_topic(doc, lda_model_res_large[50], True) for doc in corpus_res_large[50]['bow']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  corpus_res[40]['Topic'] = [get_most_probable_topic(doc, lda_model_res[40], True) for doc in corpus_res[40]['bow']]


In [None]:
bow_corpus

In [18]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# Visualize the topics
vis = gensimvis.prepare(lda_model, list(bow_corpus['bow']), dictionary)
pyLDAvis.display(vis)


KeyboardInterrupt: 

In [26]:
# pyLDAvis.show(vis, local=True)


OSError: [Errno 22] Invalid argument: 'https://cdn.jsdelivr.net/gh/bmabey/pyLDAvis@3.4.0/pyLDAvis/js/ldavis.v1.0.0.css'

In [17]:
# get the best topic for each label
# label_1_1 = enron_small[enron_small['label']]
enron_small.columns

Index(['Date', 'From', 'To', 'X-To', 'X-From', 'X-cc', 'X-bcc', 'Subject',
       'email_body', 'verdict', 'violated_rules', 'email_text', 'is_list',
       'tokens', 'bow'],
      dtype='object')

In [23]:
# from gensim.models import CoherenceModel

# # Compute Coherence Score
# # Calculate perplexity for LDA model
# def get_score():
#     perplexity = lda_model_res[40].log_perplexity(list(corpus_res[40]['bow']))
#     print("Perplexity: ", perplexity)

#     coherence_model_lda = CoherenceModel(model=lda_model_res[40], texts=corpus_res[40]['tokens'], dictionary=dictionary_res[40], coherence='c_v')
#     coherence_lda = coherence_model_lda.get_coherence()
#     print('Coherence Score: ', coherence_lda)

# print(get_score())

Perplexity:  -14.035978990452321
Coherence Score:  0.599971752458084
None


In [None]:
# # i want to get the most probable topic for each email
# print(corpus_res[40]['Topic'])



In [None]:
# corpus_res[40].to_pickle('enron_students_lda_40_corpus.pkl')
# dictionary_res[40].to_pickle('enron_students_lda_40_dict.pkl')
# lda_model_res[40].to_pickle('enron_students_lda_40_model.pkl')

In [None]:
def get_best_topic_for_label(model, dictionary, corpus, label):
    # get the dataset with labels. the corpus already have the topic column
    