In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [8]:
data = pd.read_csv('spam.csv', encoding='latin1')

# Encode the label column 'v1' as integers: 0 for ham, 1 for spam.
# Series.map turns every value that is missing from the dict into NaN, so
# check that the mapping covered all rows rather than letting NaN labels
# travel on to the train/test split and the classifier.
label_codes = data['v1'].map({'ham': 0, 'spam': 1})
if label_codes.isna().any():
    unexpected = sorted(data.loc[label_codes.isna(), 'v1'].astype(str).unique())
    raise ValueError(f"Unexpected values in label column 'v1': {unexpected}")
data['v1'] = label_codes

In [9]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable. 'v2' supplies the inputs and 'v1' the targets.
X_train, X_test, y_train, y_test = train_test_split(
    data['v2'],
    data['v1'],
    test_size=0.2,
    random_state=42,
)

# Fit the TF-IDF vectorizer on the training portion only, then apply that
# same fitted vectorizer (transform, no refit) to the held-out portion.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [10]:
# Search space: three values of C crossed with two kernels (six candidates).
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
}

# Exhaustive search over the grid, scored with 3-fold cross-validation on the
# TF-IDF training matrix.
svm = SVC()
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=3)
grid_search.fit(X_train_tfidf, y_train)

# Report the winning combination.
best_params = grid_search.best_params_
print("Best parameters:", best_params)

Best parameters: {'C': 10, 'kernel': 'linear'}


In [11]:
# Take the estimator chosen by the grid search and score it on the held-out
# TF-IDF matrix.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_tfidf)

# Overall accuracy first, then the per-class precision/recall/F1 table.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

Accuracy: 0.9820627802690582
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.97      0.89      0.93       150

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [13]:
# Hand-written example messages used as a quick sanity check of the model.
random_messages = [
    "Congratulations! You've won a free trip to Hawaii. Click here to claim your prize.",
    "Hi there, how are you doing?",
    "URGENT: Your account has been suspended. Please click the link to verify your information.",
    "Don't forget to pick up groceries on your way home.",
    "You've been selected for a special offer. Reply YES to claim.",
    "Just wanted to say hello. Have a great day!",
]

# Vectorise the examples with the existing vectorizer (transform only), predict,
# and turn the numeric output back into names: 1 -> 'spam', anything else -> 'ham'.
random_messages_tfidf = tfidf_vectorizer.transform(random_messages)
predicted_labels = best_model.predict(random_messages_tfidf)
predicted_labels_str = np.where(predicted_labels == 1, 'spam', 'ham')

# Emit one message/label stanza per example; the empty final argument yields
# the blank separator line between stanzas.
for message, label in zip(random_messages, predicted_labels_str):
    print(f"Message: {message}", f"Predicted Label: {label}", "", sep="\n")


Message: Congratulations! You've won a free trip to Hawaii. Click here to claim your prize.
Predicted Label: spam

Message: Hi there, how are you doing?
Predicted Label: ham

Message: URGENT: Your account has been suspended. Please click the link to verify your information.
Predicted Label: spam

Message: Don't forget to pick up groceries on your way home.
Predicted Label: ham

Message: You've been selected for a special offer. Reply YES to claim.
Predicted Label: spam

Message: Just wanted to say hello. Have a great day!
Predicted Label: ham

