<h1>AI model to detect spam SMS using the Naive Bayes technique</h1>

Importing required libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


Loading the dataset and selecting the needed columns

In [2]:
def load(file_path):
    """Load the SMS spam CSV and return a two-column labelled DataFrame.

    The file is read with Latin-1 encoding, only the ``v1`` and ``v2``
    columns are kept, and they are renamed to ``label`` and ``message``.
    Labels are encoded as integers: ``'ham'`` -> 0, ``'spam'`` -> 1.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pd.read_csv``.

    Returns:
        A new DataFrame with the columns ``['label', 'message']``.

    Raises:
        KeyError: If the file has no ``v1`` or ``v2`` column.
        ValueError: If ``v1`` contains anything other than ``'ham'`` or
            ``'spam'`` (including missing values).
    """
    label_codes = {'ham': 0, 'spam': 1}
    raw = pd.read_csv(file_path, encoding='latin-1')
    # rename() hands back a fresh frame, so nothing below writes into a slice
    # of `raw` (slice-then-assign can trigger SettingWithCopyWarning).
    df = raw[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'message'})
    encoded = df['label'].map(label_codes)
    # Series.map produces NaN for values missing from the dict; fail loudly
    # here instead of handing NaN labels to the downstream steps.
    unknown = df.loc[encoded.isna(), 'label'].unique().tolist()
    if unknown:
        raise ValueError(f"Unexpected label value(s) in column 'v1': {unknown}")
    return df.assign(label=encoded)

Training the model

In [3]:
def train(df, test_size=0.3, random_state=42):
    """Split the labelled messages into training and test subsets.

    Despite its name, this function does not fit a model: it only forwards
    ``df['message']`` and ``df['label']`` to ``train_test_split``.

    Args:
        df: Table-like object exposing ``'message'`` and ``'label'`` columns.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
            Defaults to 0.3, the value that was previously hard-coded.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.
            Defaults to 42, the value that was previously hard-coded.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    messages, labels = df['message'], df['label']
    X_train, X_test, y_train, y_test = train_test_split(
        messages, labels, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

def vectorize_text(X_train, X_test):
    """Convert raw message text into TF-IDF feature matrices.

    A ``TfidfVectorizer`` configured with ``stop_words='english'`` and
    ``max_features=5000`` is fitted on the training messages and then
    applied, without refitting, to the test messages.

    Args:
        X_train: Training messages; used to fit and transform.
        X_test: Test messages; transformed only.

    Returns:
        The tuple ``(train_matrix, test_matrix, fitted_vectorizer)``.
    """
    tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
    train_matrix = tfidf.fit_transform(X_train)
    test_matrix = tfidf.transform(X_test)
    return train_matrix, test_matrix, tfidf

def train_naive_bayes(X_train_tfidf, y_train):
    """Fit a default-configured ``MultinomialNB`` classifier.

    Args:
        X_train_tfidf: Feature matrix for the training messages.
        y_train: Labels aligned with the rows of ``X_train_tfidf``.

    Returns:
        The fitted ``MultinomialNB`` instance.
    """
    # fit() returns the estimator itself, so the fitted model can be
    # returned straight from the call chain.
    return MultinomialNB().fit(X_train_tfidf, y_train)

def evaluate_model(model, X_test_tfidf, y_test):
    """Print the accuracy and classification report for the test set.

    Args:
        model: Object exposing ``predict``; called once on ``X_test_tfidf``.
        X_test_tfidf: Feature matrix for the test messages.
        y_test: True labels to compare the predictions against.

    Returns:
        None. Results are written to standard output only.
    """
    predicted = model.predict(X_test_tfidf)

    accuracy = accuracy_score(y_test, predicted)
    print("Accuracy:", accuracy)

    report = classification_report(y_test, predicted)
    print("\nClassification Report:\n", report)

def predict_new_sms(model, vectorizer, new_sms):
    """Classify one or more SMS messages as ``"Spam"`` or ``"Ham"``.

    Args:
        model: Fitted classifier exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        new_sms: An iterable of message strings, or a single string, which
            is treated as one message.

    Returns:
        A list with one entry per message: ``"Spam"`` where the model
        predicts 1, otherwise ``"Ham"``. Empty input yields ``[]``.
    """
    # A bare string is itself an iterable of characters; wrap it so it is
    # vectorized as a single document.
    if isinstance(new_sms, str):
        messages = [new_sms]
    else:
        messages = list(new_sms)

    # Nothing to classify: skip the vectorizer and model, which may reject
    # zero-sample input.
    if not messages:
        return []

    new_sms_tfidf = vectorizer.transform(messages)
    predictions = model.predict(new_sms_tfidf)
    return ["Spam" if pred == 1 else "Ham" for pred in predictions]


Main function 

In [4]:
if __name__ == "__main__":
    import os

    # An absolute local path only resolves on the original author's machine.
    # Allow the dataset location to be overridden through the SPAM_CSV_PATH
    # environment variable, keeping the original path as the default.
    file_path = os.environ.get('SPAM_CSV_PATH', 'F:\\codesoft\\task4\\spam.csv')

    # Pipeline: load -> split -> vectorize -> fit -> evaluate.
    df = load(file_path)
    X_train, X_test, y_train, y_test = train(df)
    X_train_tfidf, X_test_tfidf, vectorizer = vectorize_text(X_train, X_test)
    model = train_naive_bayes(X_train_tfidf, y_train)
    evaluate_model(model, X_test_tfidf, y_test)

Accuracy: 0.9694976076555024

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98      1453
           1       1.00      0.77      0.87       219

    accuracy                           0.97      1672
   macro avg       0.98      0.88      0.93      1672
weighted avg       0.97      0.97      0.97      1672



Testing the model with a new SMS message

In [7]:
# Classify a hard-coded sample message using the `model` and `vectorizer`
# created by the pipeline cell, and echo each message with its prediction.
print("\nTesting with new SMS...")

new_sms = [
    "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18'",
]
predictions = predict_new_sms(model, vectorizer, new_sms)

for sms, pred in zip(new_sms, predictions):
    print(f"Message: {sms}\nPrediction: {pred}\n")


Testing with new SMS...
Message: Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18'
Prediction: Spam

