# Loading and Modeling Mail Data

# Data Loading

In [20]:
import pandas as pd

# Read the email sample from disk into a DataFrame, then display it
csv_path = 'email50.csv'
email_data = pd.read_csv(csv_path)
email_data

Unnamed: 0,spam,to_multiple,from,cc,sent_email,time,image,attach,dollar,Label,...,viagra,password,num_char,line_breaks,format,Content,exclaim_subj,urgent_subj,exclaim_mess,number
0,0,0,1,0,1,2012-01-04T13:19:16Z,0,0,0,no,...,0,0,21.705,551,1,AAAA,0,0,8,small
1,0,0,1,0,0,2012-02-16T20:10:06Z,0,0,0,no,...,0,0,7.011,183,1,BBBB,0,0,1,big
2,1,0,1,4,0,2012-01-04T15:36:23Z,0,2,0,no,...,0,0,0.631,28,0,AAAA,0,0,2,none
3,0,0,1,0,0,2012-01-04T17:49:52Z,0,0,0,no,...,0,0,2.454,61,0,AAAA,0,0,1,small
4,0,0,1,0,0,2012-01-27T09:34:45Z,0,0,9,no,...,0,1,41.623,1088,1,AAAA,0,0,43,small
5,0,0,1,0,0,2012-01-17T17:31:57Z,0,0,0,no,...,0,0,0.057,5,0,AAAA,0,0,0,small
6,0,0,1,0,0,2012-03-18T04:18:55Z,0,0,0,no,...,0,0,0.809,17,0,AAAA,0,0,0,small
7,0,0,1,0,1,2012-03-31T13:58:56Z,0,0,0,no,...,0,0,5.229,88,1,AAAA,0,0,2,small
8,0,0,1,1,1,2012-01-11T01:57:54Z,0,0,0,no,...,0,0,9.277,242,1,BBBB,1,0,22,small
9,0,0,1,0,0,2012-01-07T19:29:16Z,0,0,23,no,...,0,0,17.17,578,1,BBBB,0,0,3,small


# Data Exploration

In [21]:
# Preview the top five rows (five is also the default for head())
print(email_data.head(n=5))

   spam  to_multiple  from  cc  sent_email                  time  image  \
0     0            0     1   0           1  2012-01-04T13:19:16Z      0   
1     0            0     1   0           0  2012-02-16T20:10:06Z      0   
2     1            0     1   4           0  2012-01-04T15:36:23Z      0   
3     0            0     1   0           0  2012-01-04T17:49:52Z      0   
4     0            0     1   0           0  2012-01-27T09:34:45Z      0   

   attach  dollar Label  ...  viagra  password  num_char  line_breaks  format  \
0       0       0    no  ...       0         0    21.705          551       1   
1       0       0    no  ...       0         0     7.011          183       1   
2       2       0    no  ...       0         0     0.631           28       0   
3       0       0    no  ...       0         0     2.454           61       0   
4       0       9    no  ...       0         1    41.623         1088       1   

   Content exclaim_subj  urgent_subj  exclaim_mess  number  
0

In [22]:
# Count the missing entries in each column
missing_per_column = email_data.isna().sum()
print(missing_per_column)

spam            0
to_multiple     0
from            0
cc              0
sent_email      0
time            0
image           0
attach          0
dollar          0
Label           0
inherit         0
viagra          0
password        0
num_char        0
line_breaks     0
format          0
Content         0
exclaim_subj    0
urgent_subj     0
exclaim_mess    0
number          0
dtype: int64


In [23]:
# Show the dtype pandas assigned to each column
column_types = email_data.dtypes
print(column_types)

spam              int64
to_multiple       int64
from              int64
cc                int64
sent_email        int64
time             object
image             int64
attach            int64
dollar            int64
Label            object
inherit           int64
viagra            int64
password          int64
num_char        float64
line_breaks       int64
format            int64
Content          object
exclaim_subj      int64
urgent_subj       int64
exclaim_mess      int64
number           object
dtype: object


# Data Preprocessing

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

# Drop rows with missing values. The name is rebound to the filtered copy
# instead of using inplace=True, so the frame object loaded and displayed by
# the earlier cells is not mutated behind the reader's back; later cells keep
# using `email_data` as before.
email_data = email_data.dropna()

# Turn the 'Content' text column into a token-count matrix
# (CountVectorizer with its default tokenization settings).
count_vectorizer = CountVectorizer()
email_content = count_vectorizer.fit_transform(email_data['Content'])


In [27]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Hold out 20% of the vectorized emails for evaluation; the fixed
# random_state keeps the split the same on every run
labels = email_data['Label']
X_train, X_test, y_train, y_test = train_test_split(
    email_content,
    labels,
    test_size=0.2,
    random_state=0,
)

# Fit a multinomial Naive Bayes classifier on the training portion
model = MultinomialNB().fit(X_train, y_train)

# Predict labels for the held-out emails
y_pred = model.predict(X_test)

# Report the share of held-out emails that were labelled correctly
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 1.00


# Another Example: Rule-Based Mail Handling

In [28]:
emails = [
    "friend@example.com",
    "family@example.com",
    "news@example.com",
    "spam@example.com",
]

# (keyword, message) pairs checked in order: the first keyword found in the
# address decides the folder, and unmatched addresses go to "Other".
folder_rules = (
    ("friend", "Move to Friends Folder:"),
    ("family", "Move to Family Folder:"),
)

for email in emails:
    message = next(
        (text for keyword, text in folder_rules if keyword in email),
        "Move to Other Folder:",
    )
    print(message, email)

Move to Friends Folder: friend@example.com
Move to Family Folder: family@example.com
Move to Other Folder: news@example.com
Move to Other Folder: spam@example.com


In [29]:
# Phrases that mark a message as spam when they appear anywhere in its text.
spam_keywords = ["win a prize", "get rich quick", "free money"]

# Named `message_text` rather than `email_content` so this demo does not
# overwrite the vectorized feature matrix that the preprocessing cell stores
# under `email_content` (re-running the modeling cell would otherwise fail).
message_text = "Congratulations! You've won a prize and can get rich quick!"

# Compare case-insensitively so variants such as "Get Rich Quick" also match.
normalized_text = message_text.casefold()
is_spam = any(keyword.casefold() in normalized_text for keyword in spam_keywords)

if is_spam:
    print("This is a spam email. Block it!")
else:
    print("This email is safe.")


This is a spam email. Block it!


In [30]:
# Inbox whose size is reported below
emails = [
    "email1@example.com",
    "email2@example.com",
    "email3@example.com",
]

# Count the messages and report the total
total_emails = len(emails)
print("You have", total_emails, "emails.")


You have 3 emails.
