In [1]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 23.1.1
[notice] To update, run: C:\Users\heram\AppData\Local\Programs\Python\Python311\python.exe -m pip install --upgrade pip


In [2]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# download necessary NLTK datasets
# 'punkt_tab' is the tokenizer resource that word_tokenize loads in recent
# NLTK releases (it superseded the pickled 'punkt' models); 'punkt' is still
# fetched so the notebook keeps working on older NLTK installs.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# initialize stemmer and vectorizer
# Both objects are module-level and shared by the cells below: the stemmer is
# used inside preprocess(), the vectorizer is fitted on the training data and
# reused to encode new emails.
stemmer = PorterStemmer()
vectorizer = CountVectorizer()

# define spam and ham emails
# Tiny hand-written corpus used as the training set (3 spam, 3 ham).
spam_emails = ["Free Viagra now!!!", "Get rich quick schemes", "Make money fast"]
ham_emails = ["Hi, just wanted to check in", "Looking forward to seeing you soon", "I'll be there in 10 minutes"]

# preprocess and tokenize emails
def preprocess(emails):
    """Normalize raw email strings into space-joined, stemmed tokens.

    Each email is lowercased, every character that is not an ASCII letter is
    replaced by a space, and the result is split with NLTK's
    ``word_tokenize``. English stopwords are dropped and the remaining tokens
    are stemmed with the module-level ``stemmer``.

    Args:
        emails: Iterable of email texts (strings).

    Returns:
        A list with one string per input email, in input order, holding the
        surviving stemmed tokens joined by single spaces.
    """
    # Fetch the stopword list once per call rather than once per token, and
    # keep it in a set so each membership test is a hash lookup instead of a
    # scan over the whole list.
    english_stopwords = set(stopwords.words('english'))

    preprocessed = []
    for email in emails:
        # remove non-alphabetic characters and tokenize
        tokens = word_tokenize(re.sub(r'[^a-zA-Z]', ' ', email.lower()))
        # remove stopwords and stem tokens
        preprocessed.append(
            " ".join(stemmer.stem(word) for word in tokens if word not in english_stopwords)
        )
    return preprocessed

# Build the training corpus: spam messages first, then ham, so the label list
# below lines up with it position by position (1 = spam, 0 = ham).
training_data = preprocess([*spam_emails, *ham_emails])
training_labels = [1 for _ in spam_emails] + [0 for _ in ham_emails]

# Learn the vocabulary from the training corpus and encode every message as a
# vector of token counts.
training_vectors = vectorizer.fit_transform(training_data)

# Fit a multinomial Naive Bayes model on the count vectors; fit() returns the
# estimator itself, so construction and training are chained.
classifier = MultinomialNB().fit(training_vectors, training_labels)

# test with new emails
# New messages go through the same preprocessing and the already-fitted
# vectorizer (transform, not fit_transform) so they are encoded with the
# vocabulary learned from the training data.
new_emails = ["Hello! Long-time members of the Proton community will notice this newsletter is a little different from our previous ones"]
new_emails_preprocessed = preprocess(new_emails)
new_emails_vectors = vectorizer.transform(new_emails_preprocessed)
predictions = classifier.predict(new_emails_vectors)

# print results
# Pair each original email with its predicted label (1 = spam) instead of
# indexing both sequences by position.
for email, label in zip(new_emails, predictions):
    verdict = "is spam" if label == 1 else "is not spam"
    print(f"{email} {verdict}")


Hello! Long-time members of the Proton community will notice this newsletter is a little different from our previous ones is not spam


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\heram\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\heram\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
