In [10]:
import pandas as pd

# Read CSV with correct encoding
df = pd.read_csv('spam.csv', encoding='latin1')

# Display first few rows
df.head()


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [12]:
# Keep only useful columns
df = df[['v1', 'v2']]

# Rename columns to something meaningful
df = df.rename(columns={'v1': 'label', 'v2': 'text'})

# Check cleaned data
df.head()


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
# Convert labels to 0 and 1
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})


In [16]:
X = df['text']         # Input text
y = df['label_num']    # Output label (0 or 1)


In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [22]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train_vec, y_train)


In [24]:
print("Accuracy:", model.score(X_test_vec, y_test))


Accuracy: 0.967713004484305


In [26]:
# Predict on a new message
def predict_message(message):
    message_vec = vectorizer.transform([message])
    prediction = model.predict(message_vec)
    return 'Spam' if prediction[0] == 1 else 'Ham'

# Example
print(predict_message("Congratulations! You've won a $1000 Walmart gift card!"))
print(predict_message("Hey, are we still meeting for lunch today?"))


Ham
Ham


In [28]:
! pip install joblib




In [30]:
import joblib

# Save the model
joblib.dump(model, 'spam_classifier_model.pkl')

# Save the vectorizer
joblib.dump(vectorizer, 'spam_vectorizer.pkl')


['spam_vectorizer.pkl']

In [32]:
# Load the saved model and vectorizer
model = joblib.load('spam_classifier_model.pkl')
vectorizer = joblib.load('spam_vectorizer.pkl')

# Use the loaded model to predict a new message
def predict_message(message):
    message_vec = vectorizer.transform([message])
    prediction = model.predict(message_vec)
    return 'Spam' if prediction[0] == 1 else 'Ham'

# Example usage
print(predict_message("Congratulations, you've won a prize!"))


Ham


In [34]:
import joblib

# 1. Save both vectorizer and model together
joblib.dump((vectorizer, model), 'spam_classifier_pipeline.joblib')

# 2. (Later) Load them back like this:
vec_loaded, model_loaded = joblib.load('spam_classifier_pipeline.joblib')

# 3. Test the loaded pipeline on a new sample
def predict_with_loaded(message):
    msg_vec = vec_loaded.transform([message])
    pred = model_loaded.predict(msg_vec)[0]
    return 'Spam' if pred == 1 else 'Ham'

print(predict_with_loaded("Congratulations! You've won a free cruise!"))
print(predict_with_loaded("Don't forget the meeting at 3pm today."))


Ham
Ham
