<a href="https://colab.research.google.com/github/KeshikaSathishKumar/CodsoftTask1/blob/main/Codsoft_SMS_SpamDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Train, evaluate and persist an SMS spam classifier (TF-IDF features + linear SVM).
# All imports live at the top of the cell so its dependencies are visible up front
# and a missing package is reported before the upload prompt appears.
import joblib
import pandas as pd
from google.colab import files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

# Upload the dataset
uploaded = files.upload()  # Upload spam.csv here

# Load and preprocess the data: keep only the label (v1) and message (v2) columns
df = pd.read_csv("spam.csv", encoding='ISO-8859-1')[["v1", "v2"]]
df.columns = ["label", "text"]

# Convert label to binary (ham -> 0, spam -> 1).
# Series.map turns every value that is missing from the dict into NaN, so an
# unexpected label would otherwise slip through silently and only fail later
# during training. Fail fast here and name the offending values instead.
label_codes = {"ham": 0, "spam": 1}
encoded_labels = df["label"].map(label_codes)
unmapped = encoded_labels.isna()
if unmapped.any():
    unexpected = sorted(df.loc[unmapped, "label"].astype(str).unique())
    raise ValueError(
        f"spam.csv contains labels other than {sorted(label_codes)}: {unexpected}"
    )
df["label"] = encoded_labels

print("Sample Data:\n", df.head())

# Split the data: 80% train / 20% test, with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["label"], test_size=0.2, random_state=42)

# TF-IDF Vectorization: the vocabulary is fitted on the training split only and
# then reused unchanged to transform the test split
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train the SVM model
model = LinearSVC()
model.fit(X_train_tfidf, y_train)

# Evaluate the model on the held-out test split
y_pred = model.predict(X_test_tfidf)

print("\n Classification Report:\n")
# target_names are listed in label-code order: 0 -> "Ham", 1 -> "Spam"
print(classification_report(y_test, y_pred, target_names=["Ham", "Spam"]))

print(" Confusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))

# Save the model and vectorizer; both are needed later to classify new messages,
# because raw text must go through the same fitted vectorizer before prediction
joblib.dump(model, "spam_classifier_svm.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

print(" Model and vectorizer saved!")


Saving spam.csv to spam.csv
Sample Data:
    label                                               text
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...

 Classification Report:

              precision    recall  f1-score   support

         Ham       0.98      0.99      0.99       965
        Spam       0.96      0.87      0.92       150

    accuracy                           0.98      1115
   macro avg       0.97      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115

 Confusion Matrix:

[[960   5]
 [ 19 131]]
 Model and vectorizer saved!


In [4]:
# Example function to test custom messages
def test_message(msg):
    """Classify one message as spam or legitimate.

    The message is vectorized with the notebook-level ``vectorizer`` and
    scored by the notebook-level ``model``; both must already be defined
    in the kernel before this function is called.

    Args:
        msg: The message text to classify.

    Returns:
        "Spam" when the model predicts label 1, otherwise "Ham (Legit)".
    """
    # transform() expects a collection of documents, hence the one-item list
    features = vectorizer.transform([msg])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "Spam"
    return "Ham (Legit)"

# Try some messages: a prize-style pitch followed by an everyday personal text
sample_messages = [
    "Congratulations! You’ve won a free iPhone. Click here to claim.",
    "Hey, are we still meeting at 6 PM today?",
]
for sample in sample_messages:
    print(test_message(sample))


Spam
Ham (Legit)
