In [None]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import ipywidgets as widgets
from IPython.display import display

# Fetch the NLTK stop-word corpus that the text preprocessing step reads from.
nltk.download('stopwords')

# Read the raw CSV and keep only its first two columns, which are then
# renamed to 'label' and 'text'.
data = pd.read_csv('/content/spam.csv', encoding='latin-1').iloc[:, :2]
data.columns = ['label', 'text']

# Encode the class label as an integer: ham -> 0, spam -> 1.
data['label'] = data['label'].map({
    'ham': 0,
    'spam': 1,
})

# Preprocessing function
# Deletion table covering every character in string.punctuation. Using
# str.translate avoids interpolating the raw punctuation string into a regex
# character class, where the sequence "\]" is parsed as an escaped "]" and the
# backslash itself would never be removed.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Compiled once so the pattern is not looked up again on every call.
_DIGITS_RE = re.compile(r"\d+")

# Default stop-word set; populated on first use by _english_stop_words().
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text, stop_words=None):
    """Normalise a message into a space-separated string of content tokens.

    Steps, in order: lowercase the text, delete punctuation characters,
    delete runs of digits, split on whitespace, and drop stop words.

    Args:
        text: The raw message as a ``str``.
        stop_words: Optional iterable of lowercase words to discard. When
            omitted, the NLTK English stop-word list is used (loaded once and
            reused across calls).

    Returns:
        The remaining tokens joined by single spaces; an empty string when
        nothing is left.
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    elif not isinstance(stop_words, (set, frozenset)):
        # Guarantee O(1) membership tests for arbitrary iterables.
        stop_words = frozenset(stop_words)

    text = text.lower().translate(_PUNCTUATION_TABLE)
    text = _DIGITS_RE.sub("", text)

    # NOTE(review): punctuation is stripped before stop-word filtering, so a
    # contraction such as "don't" becomes "dont" and is only filtered if the
    # stop-word collection contains that exact form.
    return " ".join(word for word in text.split() if word not in stop_words)

# Clean every message before it reaches the vectoriser.
data['text'] = data['text'].apply(preprocess_text)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

# TF-IDF features feeding a multinomial Naive Bayes classifier.
model = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('classifier', MultinomialNB()),
    ]
)

# Fit the vectoriser and the classifier on the training portion only.
model.fit(X_train, y_train)

# Score the held-out portion and report plain accuracy.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Chatbox interface
def classify_email(email):
    """Return "Spam" or "Ham" for a single raw message.

    The message is run through ``preprocess_text`` and then passed to the
    fitted ``model`` as a one-element batch.
    """
    cleaned = preprocess_text(email)
    predicted_label = model.predict([cleaned])[0]
    if predicted_label == 1:
        return "Spam"
    return "Ham"

# Text entry for the message, plus a label that displays the verdict.
input_box = widgets.Text(placeholder='Enter email text...')
output_label = widgets.Label()


def on_submit(change):
    """Classify the text box's new value and show the result in the label.

    Registered below as an observer of ``input_box``'s ``value`` trait; the
    ``'new'`` entry of ``change`` is read as the text to classify.
    """
    verdict = classify_email(change['new'])
    output_label.value = f"Prediction: {verdict}"


# Re-run the classifier whenever the text box value changes, then render both
# widgets.
input_box.observe(on_submit, names='value')
display(input_box, output_label)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Model Accuracy: 0.97


Text(value='', placeholder='Enter email text...')

Label(value='')