# NLP Spam Classification
This notebook classifies SMS messages as Spam or Ham using NLP techniques.

In [10]:
import pandas as pd
import numpy as np
import re
import nltk

# Fetch the NLTK resources used below; quiet=True suppresses the download log.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource, quiet=True)

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# 1. Data Loading
# NOTE(review): the header in the saved output starts with 'ï»¿', which is what a
# UTF-8 BOM looks like when read as latin-1, so the file may really be UTF-8
# with a BOM; encoding='utf-8-sig' would then be cleaner — TODO confirm against the file.
df = pd.read_csv('Spam_SMS.csv', encoding='latin-1')
# Remove BOM. latin-1 decoding can only produce code points up to U+00FF, so the
# BOM arrives as U+00EF U+00BB U+00BF ('ï»¿') and never as '\ufeff'; strip both
# spellings so the first column is plain 'Class' whichever encoding is used.
df.columns = df.columns.str.replace('\ufeff|\u00ef\u00bb\u00bf', '', regex=True)
display(df.head())

Unnamed: 0,ï»¿Class,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
# 2. Text Preprocessing
# Shared lemmatizer instance, reused for every token.
lemmatizer = WordNetLemmatizer()
# English stop words held in a set for fast membership tests.
stop_words = {*stopwords.words('english')}

def preprocess_text(text):
    """Normalize one SMS message into a space-separated string of tokens.

    The text is lower-cased, every character that is not a lowercase letter,
    digit or whitespace is deleted, the result is split on whitespace, tokens
    found in the module-level ``stop_words`` set are dropped, and the rest are
    lemmatized with the module-level ``lemmatizer``.

    Parameters
    ----------
    text : str or missing value
        Raw message. Missing values (None / NaN / pd.NA) return ``''`` instead
        of raising AttributeError on ``.lower()``; any other non-string scalar
        is converted with ``str()`` first.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces; ``''`` when nothing survives.
    """
    if not isinstance(text, str):
        # Empty CSV cells are read by pandas as NaN (a float), not as ''.
        if pd.isna(text):
            return ''
        text = str(text)
    text = text.lower()
    # Punctuation is deleted, not replaced by a space, so "don't" becomes "dont".
    text = re.sub(r'[^a-z0-9\s]', '', text)
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Clean every message and store the result next to the raw text.
df['Clean_Message'] = [preprocess_text(message) for message in df['Message']]
# Preview raw vs. cleaned text side by side.
df.loc[:, ['Message', 'Clean_Message']].head()

Unnamed: 0,Message,Clean_Message
0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,"Nah I don't think he goes to usf, he lives aro...",nah dont think go usf life around though


In [13]:
# 3. Feature Extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Ensure column names are clean and consistent.
# The CSV is decoded as latin-1, so a UTF-8 BOM shows up in the header as
# U+00EF U+00BB U+00BF ('ï»¿') and never as '\ufeff'; strip both spellings.
df.columns = (
    df.columns.str.strip()
    .str.replace('\ufeff|\u00ef\u00bb\u00bf', '', regex=True)
    .str.strip()
)
# Positional rename retained from the original cell: first column -> Class,
# second column -> Message. Rebinding avoids inplace mutation.
df = df.rename(columns={df.columns[0]: "Class", df.columns[1]: "Message"})

# Fail fast with a readable message if an expected column is absent.
missing_columns = sorted({'Class', 'Clean_Message'} - set(df.columns))
assert not missing_columns, f"missing expected columns: {missing_columns}"

# TF-IDF features built from the cleaned text.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Clean_Message'])

# Encode the string labels as integers; le.classes_ maps them back.
le = LabelEncoder()
y = le.fit_transform(df['Class'])


In [None]:
# 4. Model Training
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Split the cleaned text *before* vectorizing so the TF-IDF vocabulary and IDF
# weights are learned from the training rows only; fitting the vectorizer on
# every row leaks test-set statistics into the features. test_size and
# random_state are unchanged, so the same rows land in each split as before.
text_train, text_test, y_train, y_test = train_test_split(
    df['Clean_Message'], y, test_size=0.2, random_state=42
)

# `vectorizer` is rebound to the train-only fit so that it stays consistent
# with `model`. The earlier full-data matrix `X` uses a different vocabulary
# and must not be passed to `model`.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

model = MultinomialNB()
model.fit(X_train, y_train)

In [None]:
# 5. Prediction & Evaluation
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Predict labels for the held-out messages.
y_pred = model.predict(X_test)

# Print each metric under its heading, in the original order.
for heading, metric in (
    ('\nConfusion Matrix:\n', confusion_matrix),
    ('\nClassification Report:\n', classification_report),
    ('\nAccuracy:', accuracy_score),
):
    print(heading, metric(y_test, y_pred))