In [65]:
import os
import pandas as pd
import numpy as np


# Detect whether the notebook runs on Google Colab. Only the import is used as
# the detection signal: a genuine Drive mount failure (or a KeyboardInterrupt)
# now surfaces instead of being silently treated as "not on Colab".
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
else:
    drive.mount('/content/drive')
    IN_COLAB = True

In [66]:
# Read the raw Enron pickle: from the mounted Drive on Colab, otherwise from
# the current working directory.
enron_df = pd.read_pickle(
    '/content/drive/MyDrive/Study/Cyber_AI/enron_students.pkl'
    if IN_COLAB
    else 'enron_students.pkl'
)
# Print the column / non-null / dtype summary of the loaded frame.
enron_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517401 entries, 0 to 517400
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Date            517401 non-null  object
 1   From            517401 non-null  object
 2   To              495554 non-null  object
 3   X-To            517372 non-null  object
 4   X-From          517372 non-null  object
 5   X-cc            517372 non-null  object
 6   X-bcc           517372 non-null  object
 7   Subject         517401 non-null  object
 8   email_body      517401 non-null  object
 9   verdict         517401 non-null  object
 10  violated_rules  517401 non-null  object
dtypes: object(11)
memory usage: 43.4+ MB


In [67]:
# Text to analyse = subject line, one space, then the message body.
enron_df['email_text'] = enron_df['Subject'].add(' ').add(enron_df['email_body'])

In [68]:
import re
import json

def remove_attachment_text(text):
    """Return ``text`` truncated just before the first attachment header.

    Everything from the first occurrence of the marker
    ``"Content-Disposition: attachment;"`` onwards is discarded. When the
    marker is absent the text is returned unchanged.

    Args:
        text: Email text. Non-string values (e.g. ``None`` or NaN coming from
            a pandas column) are returned as-is instead of raising.

    Returns:
        The portion of ``text`` preceding the marker, or ``text`` itself.
    """
    if not isinstance(text, str):
        return text
    # partition() stops at the first match and always yields a head element,
    # so no "marker missing" fallback branch is needed.
    return text.partition("Content-Disposition: attachment;")[0]

def parse_contacts(data, min_separators=30):
    """Turn a '~'/'#'-delimited contact dump into a JSON string.

    Text containing fewer than ``min_separators`` '~' characters is treated
    as ordinary email text and returned unchanged. Otherwise the text is
    split into records on '#', each record into fields on '~', and every
    record with more than 21 fields becomes one contact dictionary.

    Args:
        data: Email text. Non-string values (e.g. ``None`` or NaN) are
            returned as-is instead of raising.
        min_separators: Minimum number of '~' characters for the text to be
            parsed as a contact dump. Defaults to 30.

    Returns:
        ``data`` unchanged when it is not a contact dump, a JSON array of
        contacts when at least one record qualified, or a single space
        ``' '`` when the text looked like a dump but no record qualified.
    """
    if not isinstance(data, str):
        return data

    # Few '~' separators: regular email text, leave it alone.
    if data.count('~') < min_separators:
        return data

    # Join lines that were wrapped with a trailing '=' continuation marker.
    data = data.replace("=\n", "")

    contacts = []
    for record in data.split('#'):
        fields = record.split('~')

        # TODO: need to check relevant fields
        # Field 21 is the highest index read below, so shorter records are skipped.
        if len(fields) <= 21:
            continue

        contacts.append({
            'first_name': fields[1].strip(),
            'last_name': fields[3].strip(),
            # Stripped like every other field so stray whitespace/newlines
            # do not end up in the JSON.
            'phone_numbers': [number.strip() for number in fields[11:14]],
            'position': fields[15].strip(),
            'company': fields[18].strip(),
            'email': fields[21].strip(),
        })

    if not contacts:
        return ' '
    return json.dumps(contacts)


# Cut every email at its first attachment header.
enron_df['email_text'] = enron_df['email_text'].apply(remove_attachment_text)
print('removed attachments')

# Convert contact dumps to JSON, then replace missing values with a single
# space. The result is assigned back explicitly: calling
# fillna(..., inplace=True) on a column selection is chained assignment, which
# pandas deprecates and which does not update the frame under Copy-on-Write.
enron_df['email_text'] = enron_df['email_text'].apply(parse_contacts).fillna(' ')
print('parsed contacts')


removed attachments
parsed contacts


In [69]:
# Split the emails into a "small" set (size <= SPLIT_SIZE) and a "large" set
# (size > SPLIT_SIZE).
SPLIT_SIZE = 510

def get_size(text):
    """Return the number of whitespace-separated words in ``text``."""
    return len(text.split())

# NOTE(review): the size used for the split is the CHARACTER count (len), not
# the word count from get_size(), although the messages below speak of tokens.
# Left unchanged so results derived from this split stay comparable — confirm
# which unit is intended.
enron_df['text_size'] = enron_df['email_text'].apply(len)


# .copy() makes both subsets independent frames: they are modified later in
# the notebook, and assigning into a bare slice triggers SettingWithCopyWarning.
enron_df_small = enron_df[enron_df['text_size'] <= SPLIT_SIZE].copy()
enron_df_large = enron_df[enron_df['text_size'] > SPLIT_SIZE].copy()

# The helper column is only needed for the split (the subsets keep their copy).
enron_df.drop(columns=['text_size'], inplace=True)

# print the number of data that will be split
print('Data with length of tokens 510 and less:', len(enron_df_small))
print('Data with length of tokens 511 and more:', len(enron_df_large))


Data with length of tokens 510 and less: 192268
Data with length of tokens 511 and more: 325133


In [71]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from pandas import isnull

# Fetch the NLTK resources used by preprocess_text: the stop-word lists and WordNet.
for _resource in ('stopwords', 'wordnet'):
    nltk.download(_resource)

# NLTK helpers shared by all preprocess_text calls; filled on first use.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Build (once) and return the tokenizer, stop-word set and lemmatizer."""
    if not _PREPROCESS_CACHE:
        built = {
            'tokenizer': RegexpTokenizer(r'\w+'),
            # A frozenset gives constant-time membership tests.
            'stop_words': frozenset(stopwords.words('english')),
            'lemmatizer': WordNetLemmatizer(),
        }
        # Publish everything at once so a failure above leaves the cache empty.
        _PREPROCESS_CACHE.update(built)
    return _PREPROCESS_CACHE


def preprocess_text(text):
    """Normalise one email text for topic modelling.

    Steps: delete hyphens (so "e-mail" becomes "email"), lowercase, tokenize
    on runs of word characters (which drops punctuation), remove English stop
    words, lemmatize each remaining token and join the tokens with spaces.

    The previous version called ``stopwords.words('english')`` once per token
    and rebuilt the tokenizer and lemmatizer on every call; these are now
    created a single time and reused.

    Args:
        text: The raw text, or ``None`` / NaN.

    Returns:
        The cleaned, space-joined tokens; ``''`` when ``text`` is None or NaN.
    """
    if text is None or isnull(text):
        return ''

    resources = _preprocess_resources()
    stop_words = resources['stop_words']
    lemmatizer = resources['lemmatizer']

    # Tokenize and remove punctuation
    text = text.replace('-', '')
    tokens = resources['tokenizer'].tokenize(text.lower())

    # Remove stop words, then lemmatize what is left
    lemmatized_tokens = [
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    ]

    # Join the words back into one string separated by space
    return ' '.join(lemmatized_tokens)



[nltk_data] Downloading package stopwords to C:\Users\Efraim
[nltk_data]     Yosofov\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Efraim
[nltk_data]     Yosofov\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [72]:
def process_in_batches(dataset, batch_size, preprocess_text):
    """Apply ``preprocess_text`` to the 'email_text' column, batch by batch.

    Rows are addressed purely by position. The previous version wrote with
    ``.loc[start:end]`` (label-based, end inclusive) while reading with
    ``[start:end]`` (positional, end exclusive). On a frame whose index is not
    0..n-1 (e.g. a filtered subset) that addressed different rows, and even on
    a plain RangeIndex the row at label ``end`` was overwritten with NaN by
    index alignment.

    Args:
        dataset: DataFrame with an 'email_text' column; modified in place.
        batch_size: Number of rows per batch (must be a positive integer).
        preprocess_text: Callable applied to every 'email_text' value.

    Returns:
        The same ``dataset`` object, with 'email_text' replaced by the
        processed values.
    """
    # Number of rows in the dataset
    total_rows = len(dataset)
    # Positional index of the text column, for use with .iloc
    text_col = dataset.columns.get_loc('email_text')

    # Process each batch
    for start in range(0, total_rows, batch_size):
        end = min(start + batch_size, total_rows)
        processed = dataset.iloc[start:end, text_col].apply(preprocess_text)
        # Assign raw values so no index alignment can take place.
        dataset.iloc[start:end, text_col] = processed.to_numpy()
        print(f'Processed rows {start} to {end}')

    return dataset



In [73]:
import re

# Function to check if the dataset file exists to avoid recomputation
def load_or_process_dataset(file_path):
    """Load the preprocessed dataset from ``file_path``, or build and cache it.

    If the pickle can be read it is returned directly. Otherwise the
    'email_text' and 'violated_rules' columns of ``enron_df_small`` are copied,
    the text is preprocessed in batches of 1000 rows, and the result is
    written to ``file_path`` before being returned.

    Args:
        file_path: Location of the cached pickle.

    Returns:
        DataFrame with the columns 'email_text' and 'violated_rules'.
    """
    try:
        # Attempt to load the dataset from the specified file path.
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        dataset = pd.read_pickle(file_path)
        print("Loaded dataset from:", file_path)
    except (FileNotFoundError, IOError):
        print("File not found. Processing dataset...")
        # Work on an independent copy: the batches are written in place, and
        # writing into a bare column selection of enron_df_small triggers
        # SettingWithCopyWarning.
        dataset = enron_df_small[['email_text', 'violated_rules']].copy()
        dataset = process_in_batches(dataset, 1000, preprocess_text)

        # Save the processed dataset to the specified file path
        dataset.to_pickle(file_path)
        print("Dataset processed and saved to:", file_path)

    return dataset

# Cache file of the preprocessed short-email dataset: on Drive when running on
# Colab, next to the notebook otherwise.
file_path = (
    '/content/drive/MyDrive/Study/Cyber_AI/bert_df_processed.pkl'
    if IN_COLAB
    else './bert_df_processed.pkl'
)
bert_dataset = load_or_process_dataset(file_path)

Loaded dataset from: ./bert_df_processed.pkl


In [74]:
from sklearn.model_selection import train_test_split
# Number of emails violating rules 1.2 and 2.1: 1671
# Number of emails violating rules 1.3 and 2.2: 2094
# Number of emails violating rules 1.3, 2.1 and 2.2: 64
# =====================
# Number of emails violating rules 1.1 and 1.2: 68
# Number of emails violating rules 1.1 and 1.3: 6
# Number of emails violating rules 1.2 and 1.3: 967
# =====================
# Number of emails violating rules 1.1 and 2.3: 4
# Number of emails violating rules 1.2 and 2.3: 1176
# Number of emails violating rules 1.3 and 2.3: 9024

# create labels per rule violation, where -1 is no violation, and multiple violations are combined to new label
def create_label(text):
    """
    Map the violated-rules value of one email to a class id.

    Membership is tested with ``in``, and the first matching line wins:
        0  -> contains '1.1'
        1  -> contains '1.2' and '2.1'
        2  -> contains '1.2' and '2.3'
        3  -> contains '1.2' (any other combination)
        4  -> contains '1.3' and '2.2'
        5  -> contains '1.3' and '2.3'
        6  -> contains '1.3' (any other combination)
        -1 -> none of '1.1', '1.2', '1.3'
    """
    if '1.1' in text:
        return 0

    # (primary rule, ((secondary rule, label), ...), label when no secondary matches)
    branches = (
        ('1.2', (('2.1', 1), ('2.3', 2)), 3),
        ('1.3', (('2.2', 4), ('2.3', 5)), 6),
    )
    for primary, secondaries, fallback in branches:
        if primary not in text:
            continue
        for secondary, label in secondaries:
            if secondary in text:
                return label
        return fallback

    return -1



# Derive a class id for every row from its violated rules.
bert_dataset['label'] = bert_dataset['violated_rules'].apply(create_label)

# Keep only the rows whose text is neither missing nor the empty string.
bert_dataset = bert_dataset[
    ~(bert_dataset['email_text'].isna() | bert_dataset['email_text'].eq(''))
]


# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    bert_dataset['email_text'],
    bert_dataset['label'],
    test_size=0.2,
    random_state=42,
)

In [75]:
# Average character count of the texts in enron_df_large, plus the row count.
large_text_lengths = enron_df_large['email_text'].apply(len)
mean_length = large_text_lengths.mean()
print('Mean length of email_text in enron_df_large:', mean_length)
print('length of df large:', len(enron_df_large))

Mean length of email_text in enron_df_large: 2775.346821147038
length of df large: 325133


In [78]:
import os
from bertopic import BERTopic

def train_bert_model(train_texts, train_labels):
    """Return a BERTopic model, loading a saved one when available.

    If a model already exists at the environment-specific path it is loaded;
    otherwise a new English BERTopic model is fitted on ``train_texts`` with
    ``train_labels`` passed as ``y``, saved without its embedding model, and
    returned.

    NOTE(review): the recorded run of this notebook failed while loading with
    "Attempting to deserialize object on a CUDA device" — presumably the saved
    model was written on a GPU machine and loaded on a CPU-only one; confirm.

    Args:
        train_texts: Documents to fit the model on.
        train_labels: Labels passed to ``BERTopic.fit`` as ``y``.

    Returns:
        The fitted or loaded BERTopic model.
    """
    # Where the model lives: Drive on Colab, the working directory otherwise.
    model_path = (
        '/content/drive/MyDrive/Study/Cyber_AI/my_bertopic_model'
        if IN_COLAB
        else './my_bertopic_model'
    )

    if os.path.exists(model_path):
        # A saved model exists: reuse it. Outside Colab the embedding model is
        # named explicitly because it was not stored with the topic model.
        if IN_COLAB:
            topic_model = BERTopic.load(model_path)
        else:
            topic_model = BERTopic.load(model_path, embedding_model='all-MiniLM-L6-v2')
        print("Model loaded.")
        return topic_model

    # No saved model yet: create, fit and save a new one.
    topic_model = BERTopic(language="english", verbose=True, low_memory=True)
    topic_model.fit(train_texts, y=train_labels)
    topic_model.save(model_path, save_embedding_model=False)
    print("Model trained and saved.")
    return topic_model

# Drop the full raw frame from the namespace if it is still defined
# (presumably to free memory before training).
globals().pop('enron_df', None)


# Fit a new BERTopic model, or load the saved one.
topic_model = train_bert_model(train_texts, train_labels)


RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.

In [None]:

# Give the held-out texts a fresh 0..n-1 index before passing them to the model.
test_texts = test_texts.reset_index(drop=True)

predicted_topics, _ = topic_model.transform(test_texts)

# Evaluate the model: share of test emails whose predicted topic id equals the
# label id. Both sides are compared as plain arrays, position by position.
# NOTE(review): topic ids are not guaranteed to coincide with label ids, so
# read this number together with the topic/label table below.
predicted_topics = np.asarray(predicted_topics)
true_labels = test_labels.to_numpy()
accuracy = (predicted_topics == true_labels).mean()
print(f"Accuracy: {accuracy}")

# Number of topics known to the model
topic_length = len(topic_model.get_topics())

# Topic/label table: one row per predicted topic, one column per label; each
# cell counts the test emails with that topic and that label. This replaces a
# loop that fetched every topic's words without using them, and an unused
# variable that shadowed the builtin `all`.
topic_label_counts = pd.crosstab(
    pd.Series(predicted_topics, name='topic'),
    pd.Series(true_labels, name='label'),
)
print(topic_label_counts)

# Visualize the Topics
topic_model.visualize_topics()

Now we build the LDA model.

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Ensure that necessary NLTK resources are downloaded
# Make sure the NLTK resources needed below are available locally.
for _resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_resource)

# NLTK helpers shared by all preprocess_text_LDA calls; filled on first use.
_LDA_PREPROCESS_CACHE = {}


def _lda_preprocess_resources():
    """Build (once) and return the stop-word set and lemmatizer."""
    if not _LDA_PREPROCESS_CACHE:
        built = {
            'stop_words': frozenset(stopwords.words('english')),
            'lemmatizer': WordNetLemmatizer(),
        }
        # Publish everything at once so a failure above leaves the cache empty.
        _LDA_PREPROCESS_CACHE.update(built)
    return _LDA_PREPROCESS_CACHE


def preprocess_text_LDA(text):
    """Normalise one email text for LDA.

    Steps: lowercase, tokenize with ``word_tokenize``, keep only purely
    alphabetic tokens that are not English stop words, lemmatize them and
    join the result with single spaces.

    Fixes the previous ``return ' '.lemmatized_tokens``, which raised
    AttributeError on every call instead of joining the tokens.

    Args:
        text: The raw text. Non-string values (e.g. ``None`` or NaN) yield
            ``''``, matching ``preprocess_text``.

    Returns:
        The cleaned, space-joined tokens.
    """
    if not isinstance(text, str):
        return ''

    resources = _lda_preprocess_resources()
    stop_words = resources['stop_words']
    lemmatizer = resources['lemmatizer']

    # Convert to lowercase and tokenize
    tokens = word_tokenize(text.lower())

    # Drop stop words and anything that is not purely alphabetic, then lemmatize
    lemmatized_tokens = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words and token.isalpha()
    ]

    return ' '.join(lemmatized_tokens)



In [80]:
def preprocess_wrapper(row):
    """Run ``preprocess_text_LDA`` on one value while reporting progress.

    Reads and increments the module-level ``counter``. A progress line is
    printed before processing whenever the counter is a multiple of 1000
    (including 0).

    Args:
        row: The text value handed over by ``Series.apply``.

    Returns:
        Whatever ``preprocess_text_LDA`` returns for ``row``.
    """
    # The running total lives outside the function so it survives between calls.
    global counter
    if not counter % 1000:
        print(f"Processed {counter} records")
    cleaned = preprocess_text_LDA(row)
    counter += 1
    return cleaned


# Start counting from zero, then clean the text of every long email.
counter = 0
enron_df_large['email_text'] = enron_df_large['email_text'].apply(
    preprocess_wrapper
)

Processed 0 records
Processed 1000 records
Processed 2000 records
Processed 3000 records
Processed 4000 records
Processed 5000 records
Processed 6000 records
Processed 7000 records
Processed 8000 records
Processed 9000 records
Processed 10000 records
Processed 11000 records
Processed 12000 records
Processed 13000 records
Processed 14000 records
Processed 15000 records
Processed 16000 records
Processed 17000 records
Processed 18000 records
Processed 19000 records
Processed 20000 records
Processed 21000 records
Processed 22000 records
Processed 23000 records
Processed 24000 records
Processed 25000 records
Processed 26000 records
Processed 27000 records
Processed 28000 records
Processed 29000 records
Processed 30000 records
Processed 31000 records
Processed 32000 records
Processed 33000 records
Processed 34000 records
Processed 35000 records
Processed 36000 records
Processed 37000 records
Processed 38000 records
Processed 39000 records
Processed 40000 records
Processed 41000 records
Proce

In [82]:
# Persist the preprocessed long-email frame: on Drive when running on Colab,
# next to the notebook otherwise.
enron_df_large.to_pickle(
    '/content/drive/MyDrive/Study/Cyber_AI/enron_df_large_processed.pkl'
    if IN_COLAB
    else 'enron_df_large_processed.pkl'
)


In [None]:
# Build LDA model based on the enron long dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# we will work on this data
# enron_df_large
# Create a CountVectorizer to convert text data into a bag of words
def train_lda_model(data, n_topics=10):
    """Fit a bag-of-words LDA topic model on ``data``.

    The documents are vectorised with a ``CountVectorizer`` configured with
    ``max_df=0.95``, ``min_df=2`` and English stop-word removal, and a
    ``LatentDirichletAllocation`` model with ``n_topics`` components and a
    fixed ``random_state`` of 42 is fitted on the resulting counts.

    Args:
        data: Iterable of text documents.
        n_topics: Number of LDA components. Defaults to 10.

    Returns:
        Tuple ``(lda, count_vectorizer)``: the fitted model and the fitted
        vectorizer.
    """
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=42)

    # Vectorise the documents and fit the topic model on the term counts.
    lda_model.fit(vectorizer.fit_transform(data))

    return lda_model, vectorizer


# Function to display the topics
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in ``model``.

    For each row of ``model.components_`` a header line ``Topic <n>:``
    (numbered from 1) is printed, followed by a line with the
    ``no_top_words`` terms of largest weight, heaviest first, separated by
    spaces.

    Args:
        model: Object exposing ``components_`` (one weight array per topic).
        feature_names: Sequence mapping a column index to its term.
        no_top_words: How many terms to print per topic.
    """
    for topic_number, weights in enumerate(model.components_, start=1):
        # Indices sorted by descending weight, cut to the requested length.
        ranked = weights.argsort()[::-1][:no_top_words]
        print(f"Topic {topic_number}:")
        print(" ".join(feature_names[idx] for idx in ranked))


In [89]:
# save the LDA model
import pickle

# Save the LDA model
# On a top-to-bottom run this cell executes before any cell that defines
# `lda`, which used to raise NameError. Save only when a model is in memory.
if 'lda' in globals():
    with open('lda_model.pkl', 'wb') as file:
        pickle.dump(lda, file)
else:
    print("No LDA model in memory yet; nothing saved.")

In [None]:
# Open the cached LDA model and its vectorizer, or train them if they are
# missing. Both objects are cached: the topic display below needs
# `count_vectorizer`, which was left undefined when only the model was loaded.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
try:
    with open('lda_model.pkl', 'rb') as file:
        lda = pickle.load(file)
    with open('lda_vectorizer.pkl', 'rb') as file:
        count_vectorizer = pickle.load(file)
    print("Loaded LDA model from file.")
except (FileNotFoundError, IOError):
    print("File not found. Training LDA model...")
    # If either file does not exist, train the LDA model
    lda, count_vectorizer = train_lda_model(enron_df_large['email_text'])
    print("LDA model trained.")
    # Store the model together with its matching vectorizer so the next run
    # can take the fast path above.
    with open('lda_model.pkl', 'wb') as file:
        pickle.dump(lda, file)
    with open('lda_vectorizer.pkl', 'wb') as file:
        pickle.dump(count_vectorizer, file)

In [88]:

# Print the ten highest-weighted words of every LDA topic.
no_top_words = 10
lda_feature_names = count_vectorizer.get_feature_names_out()
display_topics(lda, lda_feature_names, no_top_words)


Topic 1:
subject know message sent like time think good pm going
Topic 2:
power enron said energy company state california price utility electricity
Topic 3:
message subject enron sent agreement pm cc thanks attached know
Topic 4:
company business service new market enron management group trading report
Topic 5:
gas price market deal contract power day rate ferc customer
Topic 6:
pm outage scheduled sat london ct fri pt contact data
Topic 7:
td font br http nbsp error tr game database updated
Topic 8:
ect enron cc subject pm forwarded ee enronxgate thanks mark
Topic 9:
http image click email free new information address offer online
Topic 10:
enron meeting houston time texas program way day conference travel
