In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

df = pd.read_csv("/content/spam.csv", encoding='latin-1')
df = df[['v1', 'v2']]
df.columns = ['label', 'message']



X_train, X_test, y_train, y_test = train_test_split(
    df['message'], df['label'], test_size=0.2, random_state=42
)



vectorizer = TfidfVectorizer()

X_train_tfidf = vectorizer.fit_transform(X_train)

X_test_tfidf = vectorizer.transform(X_test)



vocab = vectorizer.get_feature_names_out()
print("\nTotal Vocabulary Size:", len(vocab))
print("\nSample Vocabulary (first 30 words):", vocab[:30])



tfidf_matrix = pd.DataFrame(
    X_train_tfidf[:5].toarray(),
    columns=vocab
)
print("\nTF-IDF Matrix (first 5 messages, first 20 words):")
print(tfidf_matrix.iloc[:, :20])



model = MultinomialNB()
model.fit(X_train_tfidf, y_train)



y_pred = model.predict(X_test_tfidf)



print("\nModel Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))



Total Vocabulary Size: 7735

Sample Vocabulary (first 30 words): ['00' '000' '000pes' '008704050406' '0089' '0121' '01223585236'
 '0125698789' '02' '0207' '02072069400' '02073162414' '02085076972' '021'
 '03' '04' '0430' '05' '050703' '0578' '06' '07' '07046744435'
 '07090201529' '07090298926' '07099833605' '07123456789' '0721072'
 '07732584351' '07734396839']

TF-IDF Matrix (first 5 messages, first 20 words):
         00  000  000pes  008704050406  0089  0121  01223585236  0125698789  \
0  0.000000  0.0     0.0           0.0   0.0   0.0          0.0         0.0   
1  0.243404  0.0     0.0           0.0   0.0   0.0          0.0         0.0   
2  0.000000  0.0     0.0           0.0   0.0   0.0          0.0         0.0   
3  0.000000  0.0     0.0           0.0   0.0   0.0          0.0         0.0   
4  0.000000  0.0     0.0           0.0   0.0   0.0          0.0         0.0   

    02  0207  02072069400  02073162414  02085076972  021   03       04  0430  \
0  0.0   0.0          0.0     

In [18]:
test_message = ["Congratulations! You have won a free ticket to Bahamas. Text WIN to 12345"]
test_message_counts = vectorizer.transform(test_message)
prediction = model.predict(test_message_counts)
print("test message:", test_message[0])
print("predicted label:", prediction[0])

test message: Congratulations! You have won a free ticket to Bahamas. Text WIN to 12345
predicted label: spam


In [19]:
test_message = ["Hey, are we still meeting for lunch tomorrow at 1 pm?"]
test_message_counts = vectorizer.transform(test_message)
prediction = model.predict(test_message_counts)
print("test message:", test_message[0])
print("predicted label:", prediction[0])

test message: Hey, are we still meeting for lunch tomorrow at 1 pm?
predicted label: ham
