In [4]:
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Assume you already have your dataset and have preprocessed it into X and y
df = pd.read_csv("spam.csv")

# Convert 'Category' column to binary labels (0 for ham, 1 for spam)
df['Spam'] = df['Category'].apply(lambda x: 1 if x == 'spam' else 0)

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['Message'], df['Spam'], test_size=0.25)

# Define the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

# Train the model
pipeline.fit(X_train, y_train)

# Evaluate the model
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

# Save the model to a file using pickle
with open('spam_classification_model.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

# Later, you can load the model from the file and use it for classification
# Load the model from the file
with open('spam_classification_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# Example usage:
example_email = ["Congratulations! You've won a free vacation."]
predicted_label = loaded_model.predict(example_email)
print(predicted_label)


Accuracy: 0.9627
Precision: 1.0000
Recall: 0.6941
F1-score: 0.8194
[0]
