In [None]:
# 1. Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report 
import os
# Show the working directory: the dataset below is opened with a relative
# path, so this is where 'spam.csv' has to be located.
current_dir = os.getcwd()
print(current_dir)

In [None]:
# 2. Load Dataset
# The file is parsed as tab-separated text with no header row, decoded as
# latin-1; the columns are therefore named explicitly right after loading.
df_raw = pd.read_csv(
    'spam.csv',
    sep='\t',
    header=None,
    encoding='latin-1',
)
df_raw.columns = ['label', 'message']

# Quick look at the result: first rows, then the column index.
print(df_raw.head())
print(df_raw.columns)



In [None]:
# Class balance: how many messages carry each label value.
label_counts = df_raw['label'].value_counts()
print(label_counts)

In [None]:
# Separate the inputs from the targets:
#   X -> the 'message' column (raw text)
#   y -> the 'label' column (class of each message)
X, y = df_raw['message'], df_raw['label']



In [None]:
# Hold out 20% of the messages for testing. The fixed random_state makes the
# split repeatable from one run to the next.
from sklearn.model_selection import train_test_split

split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [None]:
# Convert the text messages into numeric token-count features.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

# The vectorizer is fitted on the training messages only ...
X_train_vec = vectorizer.fit_transform(X_train)
# ... and the test messages are only transformed with that fitted vectorizer,
# so nothing is learned from the test set.
X_test_vec = vectorizer.transform(X_test)


In [None]:
# Fit a multinomial Naive Bayes classifier on the vectorized training
# messages and their labels.
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X=X_train_vec, y=y_train)



In [None]:
# Predict a label for every vectorized message in the held-out test set.
# The result is stored in `y_pred` for the evaluation step.
y_pred = model.predict(X_test_vec)


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Evaluate the test-set predictions. `y_pred` was already computed by the
# prediction cell just above, so it is reused here rather than calling
# model.predict a second time on the same data.

# Fraction of test messages whose predicted label matches the true label.
print("Accuracy:", accuracy_score(y_test, y_pred))
# Rows are the true labels, columns the predicted labels.
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
# Per-class precision, recall, F1 score and support.
print("Classification Report:\n", classification_report(y_test, y_pred))


In [None]:
# 9. Save the model and vectorizer
# joblib is not among the imports at the top of the notebook, so it is brought
# into scope here; without it the dump calls below raise NameError.
import joblib

# Persist both fitted objects: the vectorizer is needed together with the
# model to turn new text into the features the model was trained on.
joblib.dump(model, "spam_classifier.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")

In [None]:
# 10. Load model and vectorizer for prediction (example)
# joblib is imported in this cell so that loading works on its own; it is not
# among the imports at the top of the notebook.
import joblib

# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load files from a trusted source (here: the files written by the
# save step of this notebook).
loaded_model = joblib.load("spam_classifier.pkl")
loaded_vectorizer = joblib.load("vectorizer.pkl")

In [None]:
# Example: classify two hand-written messages with the reloaded objects.
new_messages = [
    "Congratulations! You've won a prize!",
    "Hey, are we meeting today?",
]

# The raw text goes through the reloaded vectorizer first, then the
# resulting features are passed to the reloaded model.
new_messages_vec = loaded_vectorizer.transform(new_messages)
predictions = loaded_model.predict(new_messages_vec)
print("Predictions for new messages:", predictions)