# Data Analysis of Email Dataset

In this notebook, we explore the email dataset to identify patterns that distinguish spam from legitimate emails. We use summary counts and visualizations (the class distribution, a word cloud of spam content, and the most frequent subject lines) to better understand the characteristics of spam emails.

In [1]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Set visualization style
# `sns.set` is only an alias of `set_theme`; seaborn documents `set_theme` as
# the preferred interface and notes the alias may be removed in the future.
sns.set_theme(style='whitegrid')

In [2]:
# Location of the raw email CSV, relative to the notebook's working directory
data_path = '../data/emails.csv'

# Parse the CSV into a DataFrame that the following cells read from
emails_df = pd.read_csv(filepath_or_buffer=data_path)

# Preview the top five rows as the cell's rich output
emails_df.head(5)

In [3]:
# Check the distribution of spam and legitimate emails
# assumes `es_spam` is encoded as integer 0/1 (as the axis label states) — TODO confirm
fig, ax = plt.subplots(figsize=(8, 5))

# Pin the category order explicitly: the tick labels below are attached by
# position, so without `order` a dataset containing only one class would have
# its single bar drawn at position 0 and labelled 'Legitimate' regardless of
# which class it actually is.
sns.countplot(x='es_spam', data=emails_df, order=[0, 1], ax=ax)

ax.set_title('Distribution of Spam and Legitimate Emails')
ax.set_xlabel('Is Spam (1 = Yes, 0 = No)')
ax.set_ylabel('Count')

# Position 0 -> class 0 (legitimate), position 1 -> class 1 (spam), guaranteed by `order`
ax.set_xticks([0, 1])
ax.set_xticklabels(['Legitimate', 'Spam'])
plt.show()

In [4]:
# Analyze the most common words in spam emails
from wordcloud import WordCloud

# Keep only the rows labelled as spam (es_spam == 1)
spam_emails = emails_df[emails_df['es_spam'] == 1]

# Drop missing bodies and coerce the rest to str before joining: pandas reads
# empty CSV fields as NaN (a float), and str.join raises TypeError on any
# non-string item.
spam_text = ' '.join(spam_emails['contenido'].dropna().astype(str))

if spam_text.strip():
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(spam_text)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')  # the image has no meaningful axes
    plt.title('Word Cloud of Spam Emails')
    plt.show()
else:
    # WordCloud.generate raises ValueError when it is given no words, so report
    # the empty case instead of failing the whole notebook run.
    print('No spam email content available to build a word cloud.')

In [5]:
# Analyze the relationship between subject lines and spam classification
# The ten most frequent subject lines, most common first
top_subjects = emails_df['asunto'].value_counts().index[:10]

# Translate the class codes into readable names in a copy of the data, so the
# legend text is derived from the values themselves. Passing `labels=[...]` to
# `legend()` without handles pairs labels with artists purely by position,
# which matplotlib discourages and which can mislabel or miscolour the hue
# entries of a seaborn plot.
# assumes `es_spam` is encoded as integer 0/1 — TODO confirm
class_names = {0: 'Legitimate', 1: 'Spam'}
subjects_df = emails_df.assign(spam_label=emails_df['es_spam'].map(class_names))

fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(
    y='asunto',
    hue='spam_label',
    data=subjects_df,
    order=top_subjects,
    hue_order=['Legitimate', 'Spam'],  # fixed order even if one class is absent
    ax=ax,
)
ax.set_title('Top 10 Subject Lines by Spam Classification')
ax.set_xlabel('Count')
ax.set_ylabel('Subject Line')

# Rebuild the legend from the labelled artists, only setting title and position
ax.legend(title='Is Spam', loc='upper right')
plt.show()

In [6]:
# Write the DataFrame to disk for later analysis steps (row index omitted)
# NOTE(review): no cleaning step is visible in this notebook before this cell,
# so the file written here appears to hold the data as loaded — confirm intent.
cleaned_path = '../data/cleaned_emails.csv'
emails_df.to_csv(cleaned_path, index=False)
print('Cleaned dataset saved as cleaned_emails.csv')