In [1]:
# Install required packages (run these once in your terminal or notebook)
# pip install pandas numpy nltk scikit-learn

import string
import numpy as np
import pandas as pd
import nltk

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Download the NLTK English stopword list and keep it as a set so the
# membership checks during preprocessing are constant-time lookups.
nltk.download('stopwords')
stopwords_set = set(stopwords.words('english'))

# Load dataset
df = pd.read_csv('spam_ham_dataset.csv')

# Clean line breaks: use the vectorized string accessor instead of a per-row
# Python lambda. regex=False makes '\r\n' a literal match, exactly like
# str.replace; missing values stay missing rather than raising here.
df['text'] = df['text'].str.replace('\r\n', ' ', regex=False)

# Initialize the stemmer and the punctuation-stripping table once; both are
# loop-invariant, so there is no need to rebuild the table for every email.
stemmer = PorterStemmer()
punct_table = str.maketrans('', '', string.punctuation)

# Preprocess each email in the dataset:
# lowercase -> remove punctuation -> whitespace-tokenize -> drop stopwords -> stem
corpus = []
for raw_text in df['text']:
    tokens = raw_text.lower().translate(punct_table).split()
    stems = [stemmer.stem(token) for token in tokens if token not in stopwords_set]
    corpus.append(' '.join(stems))  # one space-joined string per email

# Bag of Words: learn the vocabulary from the cleaned corpus, then expand the
# document-term counts into a dense array.
# NOTE(review): the dense copy can be large for a big vocabulary — consider
# keeping the sparse matrix if nothing downstream needs an ndarray; confirm.
vectorizer = CountVectorizer()
doc_term_counts = vectorizer.fit_transform(corpus)
X = doc_term_counts.toarray()

# Labels come from the `label_num` column (1 is reported as "Spam" below)
Y = df['label_num']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Fit a random forest on the training portion (fit returns the estimator itself)
clf = RandomForestClassifier(n_jobs=-1, random_state=42).fit(X_train, Y_train)

# Score the fitted model on the held-out rows and report it as a percentage
accuracy = clf.score(X_test, Y_test)
print(f"Model Accuracy: {accuracy:.2%}")

# --- Classify a Single Email ---
# 0-based position of the dataset row to re-classify (10 -> the 11th email).
# Kept in one name so the text lookup and the label lookup cannot drift apart.
sample_idx = 10
email_to_classify = df['text'].iloc[sample_idx]
# NOTE: this row is drawn from the full dataset, so it may have been part of
# the training split; treat the result as a sanity check, not an evaluation.

# Preprocess the email with the same steps used to build the training corpus:
# lowercase -> remove punctuation -> tokenize -> drop stopwords -> stem
email_clean = email_to_classify.lower()
email_clean = email_clean.translate(str.maketrans('', '', string.punctuation))
email_words = email_clean.split()
email_words = [stemmer.stem(word) for word in email_words if word not in stopwords_set]
email_processed = ' '.join(email_words)

# Vectorize with the already-fitted vocabulary and classify
email_vector = vectorizer.transform([email_processed])
prediction = clf.predict(email_vector)[0]

# Output result alongside the ground-truth label of the same row
actual_label = df['label_num'].iloc[sample_idx]
print("\nEmail content:")
print(email_to_classify)
print("\nPrediction:", "Spam" if prediction == 1 else "Ham")
print("Actual:", "Spam" if actual_label == 1 else "Ham")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ESHOP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model Accuracy: 97.78%

Email content:
Subject: vocable % rnd - word asceticism vcsc - brand new stock for your attention vocalscape inc - the stock symbol is : vcsc vcsc will be our top stock pick for the month of april - stock expected to bounce to 12 cents level the stock hit its all time low and will bounce back stock is going to explode in next 5 days - watch it soar watch the stock go crazy this and next week . breaking news - vocalscape inc . announces agreement to resell mix network services current price : $ 0 . 025 we expect projected speculative price in next 5 days : $ 0 . 12 we expect projected speculative price in next 15 days : $ 0 . 15 vocalscape networks inc . is building a company that ' s revolutionizing the telecommunications industry with the most affordable phone systems , hardware , online software , and rates in canada and the us . vocalscape , a company with global reach , is receiving international attention for the development of voice over ip ( voip ) applic

In [7]:
# Hand-written message to classify (leading and trailing newlines are preserved)
real_email = "\nHey how are you doing\n"
def preprocess_email(text):
    """Turn a raw email string into a space-joined string of stemmed tokens.

    Steps, in order: lowercase the text, delete every character listed in
    ``string.punctuation``, split on whitespace, drop tokens found in the
    module-level ``stopwords_set``, and stem the remaining tokens with the
    module-level ``stemmer``.

    Args:
        text: The raw email text.

    Returns:
        The processed tokens joined by single spaces; an empty string when
        no tokens remain.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    tokens = text.lower().translate(strip_punct).split()

    stems = []
    for token in tokens:
        if token in stopwords_set:
            continue  # stopwords are removed before stemming
        stems.append(stemmer.stem(token))
    return ' '.join(stems)

# Run the hand-written message through preprocess_email, vectorize it with
# the already-fitted vocabulary, and classify it with the trained model.
processed_email = preprocess_email(real_email)
email_vector = vectorizer.transform([processed_email])
prediction = clf.predict(email_vector)[0]

# NOTE(review): a very short message presumably leaves few in-vocabulary
# tokens after stopword removal, so this prediction may be unreliable — confirm.
print("Email content", real_email)
print("Prediction:", "Spam" if prediction == 1 else "Ham")



Prediction: Spam
