In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('/kaggle/input/ham-vs-spam-sms-classification-dataset/SMS_spam_collection.csv')
df

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


# Cleaning

In [3]:
print(df.duplicated(subset='Message').sum())
print(df.isnull().sum())

403
Label      0
Message    0
dtype: int64


In [4]:
df.drop_duplicates(subset='Message', inplace=True)
df.reset_index(drop=True, inplace=True)
print(df.duplicated(subset='Message').sum())

0


In [5]:
import re

def clean_email(text):
    text = str(text).lower()  # lowercase
    text = re.sub(r"http\S+|www.\S+", "", text)  # remove links
    text = re.sub(r"\S+@\S+", "", text)  # remove email addresses
    text = re.sub(r"[^a-z\s]", "", text)  # keep only alphabets
    text = re.sub(r"\s+", " ", text).strip()  # remove extra spaces
    return text

df["text"] = df["Message"].apply(clean_email)
df = df[['text', 'Label']]
df

Unnamed: 0,text,Label
0,go until jurong point crazy available only in ...,ham
1,ok lar joking wif u oni,ham
2,free entry in a wkly comp to win fa cup final ...,spam
3,u dun say so early hor u c already then say,ham
4,nah i dont think he goes to usf he lives aroun...,ham
...,...,...
5164,this is the nd time we have tried contact u u ...,spam
5165,will b going to esplanade fr home,ham
5166,pity was in mood for that soany other suggestions,ham
5167,the guy did some bitching but i acted like id ...,ham


In [6]:
from sklearn.model_selection import train_test_split

# Split first
X_train, X_test, y_train, y_test = train_test_split(df["text"], df["Label"], test_size=0.2, random_state=42)


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)
y_train = y_train.map({'ham': 0, 'spam': 1})
y_test = y_test.map({'ham': 0, 'spam': 1})


# Training

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

model = LogisticRegression(max_iter=1000)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Result

In [9]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9584139264990329

Confusion Matrix:
 [[891   3]
 [ 40 100]]

Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       894
           1       0.97      0.71      0.82       140

    accuracy                           0.96      1034
   macro avg       0.96      0.86      0.90      1034
weighted avg       0.96      0.96      0.96      1034

