# Raw Emails Processing (self-generated)

## CSV storage

The first email is obvious spam,
the second is clearly legitimate (non-spam),
and the third is ambiguous.

In [16]:
import pandas as pd

# Column-oriented data for the three hand-written emails.
# Every list below has exactly three entries, in the same order as "Type":
# index 0 is the spam example, index 1 the regular one, index 2 the ambiguous one.
data = {
    # Label describing each example (matches the description in the markdown above)
    "Type": ["Spam", "Regular", "Ambiguous"],
    # Subject line of each email
    "Subject": [
        "Congratulations! You Have Won a $1,000 Gift Card!",
        "Meeting Reminder - Project Sync-up Tomorrow",
        "Urgent Update Required for Your Account!"
    ],
    # Sender address of each email
    "From": [
        "rewards@amazingoffers.com",
        "jdoe@example.com",
        "update@account-services.com"
    ],
    # Recipient of each email; "[Your Email]" is a literal placeholder string
    "To": [
        "[Your Email]",
        "team@example.com",
        "[Your Email]"
    ],
    # Full message text; paragraphs are separated with "\n" escapes and the
    # links are written in markdown form with "#" as a placeholder target
    "Body": [
        "Dear Valued Customer,\n\nWe are thrilled to inform you that your email has been selected in our exclusive annual lucky draw! 🎉 You have won a $1,000 Gift Card to use at any of our participating stores nationwide.\n\nTo claim your prize, simply click on the link below and enter your details:\n\n[Claim Your Prize Now!](#)\n\nThis offer is valid for the next 24 hours only, so act fast to secure your winnings!\n\nFor verification, please provide the following when claiming your gift:\n- Full Name\n- Address\n- Date of Birth\n- Phone Number\n\nDon’t miss out on this exclusive opportunity to shop for free at your favorite stores.\n\nWarm regards,\n\nThe Rewards Team\nAmazing Offers, Inc.\n1-800-555-0199\nsupport@amazingoffers.com",
        "Hi Team,\n\nJust a quick reminder about our project sync-up meeting scheduled for tomorrow at 10:00 AM. We'll be meeting in Conference Room B. Please bring your laptops as we'll be reviewing the project timeline and discussing next steps.\n\nIf you have any specific topics you'd like to cover, please reply to this email or bring them up during the meeting.\n\nLooking forward to seeing everyone there!\n\nBest,\nJohn Doe\nProject Manager\nExample Company\nPhone: 1-800-555-0102",
        "Dear User,\n\nWe've noticed some unusual activity in your account, and for your security, we need you to update your information promptly. This is essential to ensure you're not experiencing any disruptions with our services.\n\nPlease visit the link below at your earliest convenience to confirm your identity and update your details:\n\n[Update Your Account Information Now!](#)\n\nIf you fail to confirm your details within 48 hours, your account may be temporarily restricted for security reasons.\n\nThank you for your prompt attention to this matter.\n\nBest regards,\nAccount Services"
    ]
}

# Build a DataFrame with one row per email and one column per key of `data`
emails_df = pd.DataFrame(data)

# Write the DataFrame to a CSV file in the current working directory;
# index=False keeps the row index out of the file
csv_file_path = "Raw_emails.csv"
emails_df.to_csv(csv_file_path, index=False)


## Raw emails processing

In [17]:
import numpy as np
from collections import Counter
import re

def count_words_in_text(raw_text, word_list):
    """Count how often each vocabulary word occurs in a single email.

    The text is lower-cased and split into maximal runs of word characters
    (letters, digits, underscore), so punctuation and apostrophes act as
    separators: "we'll" contributes the tokens "we" and "ll". Lookups against
    ``word_list`` are case-insensitive.

    Parameters
    ----------
    raw_text : str or None
        Raw text of one email. ``None`` or a float NaN (the value
        ``pandas.read_csv`` produces for an empty cell) is treated as an
        empty email instead of raising ``AttributeError``.
    word_list : sequence of str
        Vocabulary to count; one count is produced per entry, in this order.

    Returns
    -------
    numpy.ndarray
        1-D integer array of length ``len(word_list)`` where element ``i`` is
        the number of occurrences of ``word_list[i]`` in ``raw_text``.
    """
    # A missing body contains no words; without this guard .lower() would fail.
    if raw_text is None or (isinstance(raw_text, float) and np.isnan(raw_text)):
        raw_text = ""

    # Tokenize the lower-cased text into word-character runs.
    tokens = re.findall(r'\b\w+\b', raw_text.lower())
    token_counts = Counter(tokens)

    # Counter returns 0 for absent keys, so words that never occur need no
    # special handling.
    return np.array([token_counts[word.lower()] for word in word_list], dtype=int)

def process_emails(texts, word_list):
    """Turn a collection of emails into a word-count matrix.

    Parameters
    ----------
    texts : sized iterable of str
        The email texts; ``len(texts)`` determines the number of rows.
    word_list : sequence of str
        Vocabulary; ``len(word_list)`` determines the number of columns.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(texts), len(word_list))`` whose row
        ``i`` is ``count_words_in_text(texts[i], word_list)``.
    """
    n_texts, n_words = len(texts), len(word_list)
    matrix = np.zeros((n_texts, n_words), dtype=int)
    # Iterating a 2-D array yields row views, so writing into `row` fills `matrix`.
    for row, body in zip(matrix, texts):
        row[:] = count_words_in_text(body, word_list)
    return matrix

In [19]:
# Read only the header row of data/emails.csv (nrows=0): its column names,
# minus the first and the last column, are used as the word list.
file_path = 'data/emails.csv'
data = pd.read_csv(file_path, nrows=0)
word_list = data.columns[1:-1].tolist()

# Load the self-generated emails back from disk and keep just their bodies.
df = pd.read_csv('Raw_emails.csv')
email_bodies = df['Body']
email_array = email_bodies.to_numpy()

# One row per email, one column per word in word_list.
external_test_set = process_emails(texts=email_array, word_list=word_list)
print(external_test_set)

[[4 5 0 ... 0 0 0]
 [2 3 0 ... 0 0 0]
 [1 5 0 ... 0 0 0]]


In [21]:
import pickle
# Serialize the word-count matrix to disk with pickle (binary write mode).
external_test_path = 'data/external_test.pkl'
with open(external_test_path, mode='wb') as pkl_file:
    pickle.dump(external_test_set, pkl_file)