In [1]:
# Step 1: Install & Import Libraries
!pip install -q nltk
import pandas as pd
import nltk
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

nltk.download('stopwords')
from nltk.corpus import stopwords

# Step 2: Load Dataset from URL
# The file is parsed as tab-separated text without a header row, so the two
# column names are supplied explicitly.
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
column_names = ['label', 'message']
df = pd.read_csv(
    url,
    sep='\t',
    header=None,
    names=column_names,
)


# Step 3: Convert labels to binary (0 = ham, 1 = spam)
label_map = {'ham': 0, 'spam': 1}
encoded_labels = df['label'].map(label_map)

# Series.map yields NaN for any value missing from the dict. Fail loudly here
# instead of letting NaN labels flow into the train/test split and the model.
unmapped = encoded_labels.isna()
if unmapped.any():
    unexpected = sorted(df.loc[unmapped, 'label'].astype(str).unique())
    raise ValueError(f"Unexpected label value(s) in dataset: {unexpected}")

df['label'] = encoded_labels

# Step 4: Clean the Text
# Translation table that deletes every character in string.punctuation. It is
# built once so each message is stripped in a single str.translate pass.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

# Cache for the NLTK English stopwords; filled on first use by
# _english_stopwords() so the corpus is only queried once.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def clean_text(msg, *, stop_words=None):
    """Normalize a single message before vectorization.

    The message is lower-cased, every character in ``string.punctuation`` is
    removed, the text is split on whitespace, stopwords are dropped, and the
    remaining words are joined with single spaces.

    Args:
        msg: The raw message text (a ``str``).
        stop_words: Optional container of lower-case words to drop. When
            omitted, the NLTK English stopword list is used (loaded once and
            cached as a frozenset).

    Returns:
        The cleaned message as a single space-separated string; an empty
        string if nothing remains.
    """
    if stop_words is None:
        # Previously stopwords.words('english') was called for every word of
        # every message and searched as a list; use the cached set instead.
        stop_words = _english_stopwords()
    stripped = msg.lower().translate(_PUNCT_TABLE)
    return ' '.join(word for word in stripped.split() if word not in stop_words)

# Run every raw message through clean_text and keep the result in a new column.
messages = df['message']
df['cleaned'] = messages.apply(clean_text)

# Step 5: Split and Vectorize
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible between runs.
features = df['cleaned']
targets = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
)

# The vocabulary is learned from the training split only; the test split is
# then encoded with that same fitted vectorizer.
cv = CountVectorizer()
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

# Step 6: Train Model
# Fit a multinomial Naive Bayes classifier on the training count matrix;
# fit() returns the estimator itself, so construction and training are chained.
model = MultinomialNB().fit(X_train_cv, y_train)

# Step 7: Evaluate
# Predict on the held-out split, compute each metric up front, then print them.
y_pred = model.predict(X_test_cv)
accuracy = accuracy_score(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}\n")
print("Confusion Matrix:\n", matrix)
print("\nClassification Report:\n", report)

# Step 8: Test on Custom Input
sample = ["You won a lottery! Call now to claim", "Hey, are we meeting at 6?"]
# The vectorizer was fitted on text that had been passed through clean_text,
# so new messages must get the same preprocessing before being transformed;
# otherwise tokens can differ from the training vocabulary.
sample_cleaned = [clean_text(text) for text in sample]
sample_cv = cv.transform(sample_cleaned)
print("\nSample Prediction:", model.predict(sample_cv))  # 1=Spam, 0=Ham


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Accuracy: 0.9857

Confusion Matrix:
 [[962   4]
 [ 12 137]]

Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       0.97      0.92      0.94       149

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115


Sample Prediction: [1 0]
