In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Location of the labelled message dataset; change this if your file has a different name
csv_path = 'spam.csv'
df = pd.read_csv(csv_path)

# Preview the first rows to confirm the file parsed as expected
print(df.head())

In [None]:
# Count the missing entries in each column
missing_per_column = df.isna().sum()
print(missing_per_column)

# Remove every row that has at least one missing value (no-op when there are none)
df.dropna(inplace=True)

# Show the column labels so they can be compared with the names used in later cells
column_labels = df.columns
print(column_labels)

In [None]:
# Encode the 'Category' labels as integers: 1 for spam, 0 for ham.
# The identity entries (1 -> 1, 0 -> 0) keep this cell safe to re-run: with a
# plain {'spam': 1, 'ham': 0} map, a second execution would turn the already
# encoded labels into NaN and silently wipe out the target column.
label_codes = {'spam': 1, 'ham': 0, 1: 1, 0: 0}
encoded = df['Category'].map(label_codes)

# Fail loudly on any label outside the mapping (typos, different casing,
# missing values) instead of letting NaN targets flow into training.
unknown_labels = df.loc[encoded.isna(), 'Category'].unique()
if len(unknown_labels) > 0:
    raise ValueError(f"Unexpected values in 'Category': {unknown_labels.tolist()}")

# Every value mapped, so the column can be stored as plain integers
df['Category'] = encoded.astype('int64')

In [None]:
# Model input is the raw message text; the target is the encoded 'Category' column
X, y = df['Message'], df['Category']

# Learn the vocabulary and IDF weights from the messages and encode them as a
# TF-IDF feature matrix.
# NOTE(review): the vectorizer is fit on every message in `X`, i.e. before any
# train/test split has been made.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(raw_documents=X)

In [None]:
# Split the raw messages first and only then build TF-IDF features.
# Splitting an already-vectorized matrix means the vocabulary and document
# frequencies were learned from the test messages too, which leaks test
# information into training and inflates the evaluation scores.
# `stratify=y` keeps the spam/ham ratio the same in both splits.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Re-fit the vectorizer on the training text only. This replaces the instance
# that was fit on the whole corpus, so every later `vectorizer.transform(...)`
# call uses a vocabulary learned from training data alone.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_text)
# The test text is only transformed, never used for fitting
X_test = vectorizer.transform(X_test_text)

In [None]:
# Multinomial Naive Bayes classifier with its default settings
model = MultinomialNB()
# Train on the TF-IDF features and labels of the training split
model.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the held-out test split
y_pred = model.predict(X_test)

# Share of test messages whose predicted label matches the true label
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

# Per-class precision, recall, F1-score and support
per_class_report = classification_report(y_test, y_pred)
print(per_class_report)

# Confusion matrix (scikit-learn convention: rows = true labels, columns = predicted labels)
label_confusion = confusion_matrix(y_test, y_pred)
print(label_confusion)

In [None]:
# Demo: classify two hand-written messages with the trained model
new_emails = [
    "Congratulations! You've won a free ticket.",
    "Hi, can we meet tomorrow for lunch?",
]

# Encode with the already-fitted vectorizer (transform only, no re-fitting)
# so the features line up with what the model was trained on
new_emails_tfidf = vectorizer.transform(new_emails)
predictions = model.predict(new_emails_tfidf)
print(predictions)  # 1 = spam, 0 = not spam