In [5]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
!unzip smsspamcollection.zip


--2024-12-23 17:11:04--  https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘smsspamcollection.zip’

smsspamcollection.z     [<=>                 ]       0  --.-KB/s               smsspamcollection.z     [ <=>                ] 198.65K  --.-KB/s    in 0.03s   

2024-12-23 17:11:04 (6.40 MB/s) - ‘smsspamcollection.zip’ saved [203415]

Archive:  smsspamcollection.zip
  inflating: SMSSpamCollection       
  inflating: readme                  


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import time

class SpamClassifier:
    def __init__(self):
        # Read the local dataset
        self.data = pd.read_csv('SMSSpamCollection', sep='\t', names=['label', 'text'])
        print(f"Dataset loaded successfully with {len(self.data)} entries")

    def preprocess_text(self, text):
        text = str(text).lower()
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()

        stop_words = set(stopwords.words('english'))
        ps = PorterStemmer()
        words = text.split()
        words = [ps.stem(word) for word in words if word not in stop_words]

        return ' '.join(words)

    def prepare_data(self):
        print("Preprocessing data...")
        self.data['processed_text'] = self.data['text'].apply(self.preprocess_text)

        self.vectorizer = TfidfVectorizer(max_features=5000)
        X = self.vectorizer.fit_transform(self.data['processed_text'])
        y = (self.data['label'] == 'spam').astype(int)

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.3, random_state=42)

    def train_models(self):
        self.models = {
            'Naive Bayes': MultinomialNB(),
            'Decision Tree': DecisionTreeClassifier(random_state=42)
        }

        self.results = {}

        for name, model in self.models.items():
            print(f"\nTraining {name}...")
            start_time = time.time()

            model.fit(self.X_train, self.y_train)
            y_pred = model.predict(self.X_test)

            accuracy = accuracy_score(self.y_test, y_pred)
            conf_matrix = confusion_matrix(self.y_test, y_pred)
            class_report = classification_report(self.y_test, y_pred)
            training_time = time.time() - start_time

            self.results[name] = {
                'accuracy': accuracy,
                'confusion_matrix': conf_matrix,
                'classification_report': class_report,
                'training_time': training_time
            }

    def display_results(self):
        for name, metrics in self.results.items():
            print(f"\n{name} Results:")
            print(f"Accuracy: {metrics['accuracy']:.4f}")
            print(f"Training Time: {metrics['training_time']:.2f} seconds")
            print("\nConfusion Matrix:")
            print(metrics['confusion_matrix'])
            print("\nClassification Report:")
            print(metrics['classification_report'])

    def predict_email(self, email_text):
        processed_email = self.preprocess_text(email_text)
        email_vectorized = self.vectorizer.transform([processed_email])

        predictions = {}
        for name, model in self.models.items():
            prediction = model.predict(email_vectorized)[0]
            predictions[name] = "Spam" if prediction == 1 else "Not Spam"

        return predictions

def main():
    classifier = SpamClassifier()
    classifier.prepare_data()
    classifier.train_models()
    classifier.display_results()

    test_email = "Congratulations! You've won a free iPhone. Click here to claim your prize!"
    predictions = classifier.predict_email(test_email)
    print("\nSample Email Classification:")
    for model, prediction in predictions.items():
        print(f"{model}: {prediction}")

if __name__ == "__main__":
    main()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Dataset loaded successfully with 5572 entries
Preprocessing data...

Training Naive Bayes...

Training Decision Tree...

Naive Bayes Results:
Accuracy: 0.9707
Training Time: 0.02 seconds

Confusion Matrix:
[[1448    0]
 [  49  175]]

Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1448
           1       1.00      0.78      0.88       224

    accuracy                           0.97      1672
   macro avg       0.98      0.89      0.93      1672
weighted avg       0.97      0.97      0.97      1672


Decision Tree Results:
Accuracy: 0.9581
Training Time: 0.31 seconds

Confusion Matrix:
[[1419   29]
 [  41  183]]

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      1448
           1       0.86      0.82      0.84       224

    accuracy                           0.96      1672
   macro avg       0.92      0.90      0.91      1672
weigh