# In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.manifold import TSNE
import spacy

# Load the spaCy pipeline used to embed the email texts.
nlp = spacy.load('en_core_web_md')
# Raise spaCy's per-document length limit so very long emails are not rejected.
nlp.max_length = 20_000_000

# Load the raw email dataset.
df = pd.read_csv('../data/raw/phishing_emails.csv')

# Make every 'Email Text' entry a string, with missing values as ''.
# The fill must happen BEFORE the cast: astype(str) turns NaN into the literal
# string 'nan', after which fillna('') has nothing left to fill and the word
# "nan" would be embedded as if it were the email body.
df['Email Text'] = df['Email Text'].fillna('').astype(str)

# Generate a document-level embedding for a piece of text
def get_text_embedding(text):
    """Return the spaCy document vector for ``text``.

    The text is run through the module-level ``nlp`` pipeline and the
    ``vector`` attribute of the resulting ``Doc`` is returned as-is.
    """
    return nlp(text).vector

# Embed every email in a single batched pass. nlp.pipe() produces the same Doc
# objects as calling nlp() on each text individually (which is what
# get_text_embedding does), but processes the texts in batches instead of
# paying for one full pipeline call per DataFrame row.
email_vectors = [doc.vector for doc in nlp.pipe(df['Email Text'].tolist())]
# Wrap in a Series bound to df.index so each vector is stored as one object
# cell on its own row, whatever index df carries.
df['embedding'] = pd.Series(email_vectors, index=df.index, dtype=object)

# Apply t-SNE: stack the per-email vectors into an (n_emails, dim) matrix and
# project it to 2 dimensions. random_state is fixed for reproducible layouts.
embeddings = np.vstack(email_vectors)
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(embeddings)

# Create a DataFrame with t-SNE results
df_tsne = pd.DataFrame(X_tsne, columns=['TSNE1', 'TSNE2'])
# Attach labels by position, not by index: df_tsne has a fresh 0..n-1 index,
# so assigning the Series directly would misalign (or yield NaN labels) if df
# had been filtered or re-indexed before this point.
df_tsne['Email Type'] = df['Email Type'].to_numpy()

# Draw the 2-D t-SNE projection: one point per email, coloured by email type.
plt.figure(figsize=(10, 6))
tsne_axes = sns.scatterplot(
    data=df_tsne,
    x='TSNE1',
    y='TSNE2',
    hue='Email Type',
    palette='bright',
)
# Label through the Axes that seaborn drew on (the current axes of the figure
# created above).
tsne_axes.set_title('t-SNE visualization of Email Texts')
tsne_axes.set_xlabel('t-SNE Feature 1')
tsne_axes.set_ylabel('t-SNE Feature 2')
plt.show()


# [cell output, truncated]   from .autonotebook import tqdm as notebook_tqdm
#
#
# [cell output] KeyboardInterrupt: