In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt

df = pd.read_csv("../data/processed/train_emails.csv")
safe_df = df[df['Email Type'] == 'Safe Email']
phishing_df = df[df['Email Type'] == 'Phishing Email']

safe_vectorizer = CountVectorizer()
safe_X = safe_vectorizer.fit_transform(safe_df['Email Text'])

phishing_vectorizer = CountVectorizer()
phishing_X = phishing_vectorizer.fit_transform(phishing_df['Email Text'])

num_topics = 5

safe_lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
safe_lda.fit(safe_X)

phishing_lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
phishing_lda.fit(phishing_X)

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx + 1}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
        print()

no_top_words = 5  # Adjust the number of top words to display per topic
feature_names = safe_vectorizer.get_feature_names_out()
display_topics(safe_lda, feature_names, no_top_words)

# Assign topics to each document
topic_results = safe_lda.transform(safe_X)
safe_df['Topic'] = topic_results.argmax(axis=1) + 1  # Add 1 to make topics 1-indexed
print(safe_df[['Email Text', 'Topic']])

no_top_words = 5  # Adjust the number of top words to display per topic
feature_names = phishing_vectorizer.get_feature_names_out()
display_topics(phishing_lda, feature_names, no_top_words)

# Assign topics to each document
topic_results = phishing_lda.transform(phishing_X)
phishing_df['Topic'] = topic_results.argmax(axis=1) + 1  # Add 1 to make topics 1-indexed
print(phishing_df[['Email Text', 'Topic']])

nltk.download('vader_lexicon')

sid = SentimentIntensityAnalyzer()

safe_df['Sentiment Scores'] = safe_df['Email Text'].apply(lambda text: sid.polarity_scores(text))
safe_df['Sentiment Scores'] = safe_df['Sentiment Scores'].apply(lambda score_dict: score_dict['compound'])

print(safe_df[['Email Text', 'Sentiment Score']])

# Optional: Visualize sentiment scores
plt.figure(figsize=(10, 6))
plt.bar(safe_df.index, safe_df['Sentiment Score'], color='skyblue')
plt.title('Sentiment Analysis of Emails')
plt.xlabel('Email Index')
plt.ylabel('Sentiment Score')
plt.xticks(safe_df.index, safe_df['Email Type'], rotation=45)
plt.tight_layout()
plt.show()


phishing_df['Sentiment Scores'] = phishing_df['Email Text'].apply(lambda text: sid.polarity_scores(text))
phishing_df['Sentiment Scores'] = phishing_df['Sentiment Scores'].apply(lambda score_dict: score_dict['compound'])

print(phishing_df[['Email Text', 'Sentiment Score']])

# Optional: Visualize sentiment scores
plt.figure(figsize=(10, 6))
plt.bar(phishing_df.index, phishing_df['Sentiment Score'], color='skyblue')
plt.title('Sentiment Analysis of Emails')
plt.xlabel('Email Index')
plt.ylabel('Sentiment Score')
plt.xticks(phishing_df.index, phishing_df['Email Type'], rotation=45)
plt.tight_layout()
plt.show()

Topic 1:
enron hou subject 2000 2001

Topic 2:
list email 2002 mailing wrote

Topic 3:
linux list irish iluglinuxie maintainer

Topic 4:
university language linguistics email information

Topic 5:
enron company energy power 2001



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  safe_df['Topic'] = topic_results.argmax(axis=1) + 1  # Add 1 to make topics 1-indexed


                                              Email Text  Topic
1      reduplicative constructions polarity moravscik...      4
4                                      url date supplied      3
6      6 15 00 revision robin volumes forwarded choks...      1
7      enbridge buys koch east texas midstream assets...      5
9      call papers linguistics session mla call paper...      4
...                                                  ...    ...
12658  enron case studies eric number case studies en...      1
12663  whitelistingoriginal message glynn mailtodelta...      2
12664  body backgroundimage url color 331f30 alink co...      1
12665  wip report attached updated wip report week 08...      1
12666  eugene leitl original comments context digital...      2

[7755 rows x 2 columns]
Topic 1:
font 00 size pills td

Topic 2:
account money email bank security

Topic 3:
company statements information securities stock

Topic 4:
email free click money business

Topic 5:
http online website li

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phishing_df['Topic'] = topic_results.argmax(axis=1) + 1  # Add 1 to make topics 1-indexed
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Hartl\AppData\Roaming\nltk_data...


                                              Email Text  Topic
0      settled hey link requested hope needed gotta r...      4
2      complete free service mortgage rates lower cre...      4
3      receiving letter expressed interest receiving ...      4
5      give partner pleasure girlfriend loves results...      4
8                                       cheerries remove      5
...                                                  ...    ...
12656  trouble computer problems computeras part nati...      4
12659  girl happy girl unsatisfied potency wait finds...      5
12660  utf 8 oprah r utf 8 olex replicas real reprodu...      5
12661  copy dvd movies click free softwarecopy dvd mo...      4
12662  top quality prescription hardin curiosity culv...      5

[4912 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  safe_df['Sentiment Scores'] = safe_df['Email Text'].apply(lambda text: sid.polarity_scores(text))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  safe_df['Sentiment Scores'] = safe_df['Sentiment Scores'].apply(lambda score_dict: score_dict['compound'])


KeyError: "['Sentiment Score'] not in index"