In [1]:
import pandas as pd
import matplotlib as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the raw corpus. The file is read as latin-1 and given explicit column
# names; the three trailing columns (un1..un3) are placeholders that are
# discarded straight away, leaving only the label and the message text.
data = pd.read_csv(
    'spam.csv',
    encoding='latin-1',
    names=['Status', 'Message', 'un1', 'un2', 'un3'],
)

data = data.drop(columns=['un1', 'un2', 'un3'])

# Column dtypes / non-null counts, followed by a preview of the frame.
data.info()

data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Status   5572 non-null   object
 1   Message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


Unnamed: 0,Status,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ï¿½_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
# Per-column summary of the loaded frame (for these text columns: count,
# number of unique values, most frequent value and how often it occurs).
data.describe()

Unnamed: 0,Status,Message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [11]:
# Encode the text labels as integers for the classifier: ham -> 0, spam -> 1.
#
# The mapping is guarded so that re-running this cell is harmless. Series.map
# turns every value missing from the dict into NaN, so a second, unguarded
# run over the already-encoded 0/1 column would wipe out all labels.
if not pd.api.types.is_numeric_dtype(data['Status']):
    data['Status'] = data['Status'].map({'ham': 0, 'spam': 1})

data


Unnamed: 0,Status,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ï¿½_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [12]:
# Pull the raw message texts (features) and their encoded labels (targets)
# out of the frame as arrays.
message = data['Message'].values
status = data['Status'].values

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
message_train, message_test, status_train, status_test = train_test_split(
    message,
    status,
    test_size=0.2,
    random_state=500,
)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Turn each message into a bag-of-words count vector.
#
# The vocabulary is learned from the training messages only. Fitting on the
# full corpus would let words that appear solely in the held-out test set
# shape the feature space, leaking test information into training and making
# the reported accuracy optimistic. Test messages are only transformed, so
# words unseen during training are simply ignored.
vectorizer = CountVectorizer()
x_train = vectorizer.fit_transform(message_train)
x_test = vectorizer.transform(message_test)

In [15]:
# Train a logistic-regression model (default hyper-parameters) on the
# vectorized training messages and their 0/1 labels.
classifier = LogisticRegression()
classifier.fit(X=x_train, y=status_train)

In [18]:
# Mean accuracy of the fitted classifier on the held-out test split.
accuracy = classifier.score(X=x_test, y=status_test)

# Report it as a percentage with four decimal places.
print(f'accuracy: {accuracy*100:.4f} %')

accuracy: 98.3857 %


In [30]:
# Try the trained model on a few hand-written sample emails.
# NOTE(review): the model was fitted on the messages in spam.csv; these samples
# are full emails, so confirm the training data resembles them before trusting
# the predictions printed below.
email_messages = [
    {
        "subject": "Meeting Reminder",
        "sender": "john.doe@company.com",
        "recipient": "team@company.com",
        "body": "Hello team,\n\nThis is a reminder about our quarterly planning meeting tomorrow at 10:00 AM in Conference Room B.\n\nBest regards,\nJohn"
    },
    {
        "subject": "Your Amazon Order #12345",
        "sender": "orders@amazon.com",
        "recipient": "customer@gmail.com",
        "body": "Dear Customer,\n\nYour order #12345 has been shipped and will arrive on Friday, June 10.\n\nTrack your package here: [tracking link]\n\nThank you for shopping with Amazon!"
    },
    {
        "subject": "Weekend Plans?",
        "sender": "sarah.j@gmail.com",
        "recipient": "friends.group@email.com",
        "body": "Hey everyone!\n\nAnyone up for hiking this Saturday? I was thinking we could try the new trail at Pine Ridge.\n\nLet me know if you're interested!\n\n-Sarah"
    },
    {
        "subject": "Your Monthly Newsletter",
        "sender": "newsletter@technews.com",
        "recipient": "subscriber@example.org",
        "body": "This Month in Tech:\n\n1. Apple announces new MacBook Pro\n2. Google releases Android 13 beta\n3. Microsoft acquires gaming studio\n\nRead more on our website!"
    },
    {
        "subject": "Password Reset Request",
        "sender": "security@service.com",
        "recipient": "user.account@mail.com",
        "body": "We received a request to reset your password. If this was you, please click the link below to reset your password:\n\n[reset link]\n\nIf you didn't request this, please ignore this email."
    }
]


for email in email_messages:
    # Only the subject and body are scored; sender/recipient are not used.
    text = f"{email['subject']} {email['body']}"
    # transform() expects an iterable of documents, hence the one-element list.
    x_new = vectorizer.transform([text])
    prediction = classifier.predict(x_new)[0]
    # Outer double quotes: reusing the same quote character inside an f-string
    # replacement field is a SyntaxError on Python versions before 3.12.
    print(f"subject: {email['subject']}")
    print('spam' if prediction == 1 else 'Not spam')
    

subject: Meeting Reminder
Not spam
subject: Your Amazon Order #12345
spam
subject: Weekend Plans?
Not spam
subject: Your Monthly Newsletter
Not spam
subject: Password Reset Request
spam
