In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the SMS dataset from the working directory. The file is decoded as
# latin-1 (presumably because it is not valid UTF-8 — confirm against the data source).
df=pd.read_csv("spam.csv",encoding="latin-1")

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# (rows, columns) of the raw frame.
df.shape

**1. Data Cleaning**

In [None]:
# Column dtypes and non-null counts, to see which columns need cleaning.
df.info()

In [None]:
# Drop the three unnamed columns that come with the CSV. Reassigning instead
# of using inplace=True, together with errors="ignore", makes this cell safe to
# re-run: a second execution no longer raises KeyError for columns that are
# already gone.
df = df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")

In [None]:
# Give the remaining columns descriptive names: v1 -> target, v2 -> text.
df = df.rename(columns={'v1': 'target', 'v2': 'text'})

In [None]:
# Preview the frame after dropping and renaming columns.
df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encoder used to turn the string class labels into integer codes.
encoder=LabelEncoder()

In [None]:
# Encode the string labels as integers. LabelEncoder numbers classes in sorted
# order, so this yields ham -> 0 and spam -> 1 (assuming those are the only two
# labels in the column — TODO confirm).
df['target']= encoder.fit_transform(df['target'])

In [None]:
# Preview the frame with the numeric target column.
df.head(5)

In [None]:
# Count of missing values per column.
df.isnull().sum()

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Remove repeated rows, keeping the first occurrence of each. The original
# index labels are kept, so the index has gaps after this step.
df=df.drop_duplicates(keep='first')

In [None]:
# (rows, columns) after de-duplication.
df.shape

**2. Exploratory Data Analysis (EDA)**

In [None]:
# Class balance: number of messages per target value (0 = ham, 1 = spam).
df.target.value_counts()

In [None]:
# Horizontal bar chart of the class counts.
df.target.value_counts().plot.barh()

In [None]:
import nltk

In [None]:
# Download the NLTK 'punkt_tab' resource; presumably this is the tokenizer data
# that nltk.word_tokenize / nltk.sent_tokenize below rely on (depends on the
# installed NLTK version — confirm).
nltk.download('punkt_tab')

In [None]:
# Message length in characters.
df['num_char']=df['text'].apply(len)

In [None]:
# Number of tokens per message as produced by nltk.word_tokenize.
df['num_words']=df['text'].apply(lambda x:len(nltk.word_tokenize(x)))

In [None]:
# Number of sentences per message as produced by nltk.sent_tokenize.
df['num_sent']=df['text'].apply(lambda x:len(nltk.sent_tokenize(x)))

In [None]:
# Preview the frame with the three new length features.
df.head()

In [None]:
# Descriptive statistics of the three length features for ham messages only (target == 0).
df.loc[df['target'] == 0, ['num_char', 'num_words', 'num_sent']].describe()

In [None]:
# Descriptive statistics of the three length features for spam messages only (target == 1).
df.loc[df['target'] == 1, ['num_char', 'num_words', 'num_sent']].describe()

In [None]:
import seaborn as sns

In [None]:
# Overlay the character-count distributions of the two classes. Each histogram
# is labelled and a legend is drawn so the default-coloured (ham) and red (spam)
# bars can be told apart without reading the code.
sns.histplot(df[df['target']==0]['num_char'], label='ham')
sns.histplot(df[df['target']==1]['num_char'], color='red', label='spam')
plt.legend()
plt.show()

In [None]:
# Overlay the word-count distributions of the two classes. Each histogram is
# labelled and a legend is drawn so the default-coloured (ham) and red (spam)
# bars can be told apart without reading the code.
sns.histplot(df[df['target']==0]['num_words'], label='ham')
sns.histplot(df[df['target']==1]['num_words'], color='red', label='spam')
plt.legend()
plt.show()

In [None]:
# Overlay the sentence-count distributions of the two classes. Each histogram
# is labelled and a legend is drawn so the default-coloured (ham) and red (spam)
# bars can be told apart without reading the code.
sns.histplot(df[df['target']==0]['num_sent'], label='ham')
sns.histplot(df[df['target']==1]['num_sent'], color='red', label='spam')
plt.legend()
plt.show()

In [None]:
# Correlation matrix between the target and the three length features,
# drawn as an annotated heatmap.
sns.heatmap(df[['target','num_char','num_words','num_sent']].corr(),annot=True,cmap='coolwarm')

**3. Text Preprocessing**

In [None]:
# Download the NLTK stopword corpus that text_transform reads via
# stopwords.words('english').
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import string
import re
# Cache for the stopword set, so the NLTK corpus is read at most once per
# kernel session instead of once per token.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them on first use only."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def text_transform(text):
    """Normalise a raw message into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lowercase the text;
      2. tokenise it with ``nltk.word_tokenize``;
      3. delete every character that is not an ASCII letter, digit or
         whitespace from each token, discarding tokens left empty;
      4. drop NLTK English stopwords;
      5. reduce each remaining token to its Porter stem.

    Args:
        text: the message to clean (a ``str``).

    Returns:
        The surviving stems joined by single spaces; an empty string when no
        token survives.
    """
    # The previous version evaluated stopwords.words('english') for every
    # token and searched the resulting list; a cached frozenset gives the same
    # membership answers in constant time.
    stop_words = _english_stopwords()
    stemmer = PorterStemmer()

    # Strip special characters token by token. Tokens made only of such
    # characters become '' and are filtered out below.
    cleaned = (
        re.sub(r'[^a-zA-Z0-9\s]', '', token)
        for token in nltk.word_tokenize(text.lower())
    )

    # No separate punctuation filter is needed: after the regex above a
    # non-empty token contains no punctuation characters, so the former
    # `word not in string.punctuation` substring test was always true.
    return " ".join(
        stemmer.stem(token)
        for token in cleaned
        if token and token not in stop_words
    )
    

In [None]:
# Smoke-test the cleaner on a short sentence before applying it to the whole column.
text="I am dancing Today, I am loving it"
text_transform(text)

In [None]:
# Apply the cleaner to every message; the result is stored in a new column.
df['transformed_text']=df['text'].apply(text_transform)

In [None]:
# Preview the frame with the cleaned text column.
df.head()

In [None]:
from wordcloud import WordCloud
# Word-cloud renderer: 1000x500 canvas, white background, minimum font size 10.
wc=WordCloud(width=1000,height=500,min_font_size=10,background_color='white')

In [None]:
# Build the word cloud for spam messages (target == 1) from all of their
# cleaned text joined into one string. NOTE: generate() returns the WordCloud
# object it was called on, so spam_wc refers to the same object as wc.
spam_wc=wc.generate(df[df['target']==1]['transformed_text'].str.cat(sep=" "))

In [None]:
# Render the spam word cloud.
plt.imshow(spam_wc)

In [None]:
# Build the ham (target == 0) cloud on its own WordCloud instance. Calling
# wc.generate() again would redraw the shared `wc` object that spam_wc also
# refers to, so spam_wc would silently turn into the ham cloud as well.
ham_wc = WordCloud(
    width=1000, height=500, min_font_size=10, background_color='white'
).generate(df[df['target']==0]['transformed_text'].str.cat(sep=" "))
plt.imshow(ham_wc)

**4. Feature Extraction and Modeling**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Hold out 20% of the messages for testing. The split is stratified on the
# target so both parts keep the same ham/spam ratio, and seeded for
# reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    df['transformed_text'], 
    df['target'], 
    test_size=0.2, 
    random_state=42, 
    stratify=df['target']
)

In [None]:
# TF-IDF vectorizer limited to 3000 features. It is fitted on the training
# split only and then applied to the test split, so no test-set statistics
# leak into the features. .toarray() converts the sparse results to dense arrays.
tfidf = TfidfVectorizer(max_features=3000)
X_train_tfidf = tfidf.fit_transform(X_train).toarray()
X_test_tfidf = tfidf.transform(X_test).toarray()

**Support Vector Machine**

In [None]:
# Sigmoid-kernel SVM. class_weight='balanced' weights each class inversely to
# its frequency in y_train. Fit on the training features, predict the test set.
svm_model = SVC(kernel='sigmoid', gamma=1.0, class_weight='balanced')
svm_model.fit(X_train_tfidf, y_train)
y_pred_svm = svm_model.predict(X_test_tfidf)

In [None]:
# Report the SVM's test-set metrics, then draw its confusion matrix as a heatmap.
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_svm))

cm = confusion_matrix(y_test, y_pred_svm)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_title('SVM Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
# Heatmap cells are one unit wide, so their centres sit at 0.5 and 1.5.
ax.set_xticks([0.5, 1.5])
ax.set_xticklabels(['Ham', 'Spam'])
ax.set_yticks([0.5, 1.5])
ax.set_yticklabels(['Ham', 'Spam'])
plt.show()

**Multinomial Naive Bayes**

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with default settings, trained on the same TF-IDF
# features as the SVM and evaluated on the same test split.
mnb=MultinomialNB()
mnb.fit(X_train_tfidf, y_train)
y_pred_mnb=mnb.predict(X_test_tfidf)

In [None]:
# Report the Naive Bayes test-set metrics, then draw its confusion matrix as a heatmap.
print("Accuracy:", accuracy_score(y_test, y_pred_mnb))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_mnb))

cm = confusion_matrix(y_test, y_pred_mnb)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_title('Naive Bayes Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
# Heatmap cells are one unit wide, so their centres sit at 0.5 and 1.5.
ax.set_xticks([0.5, 1.5])
ax.set_xticklabels(['Ham', 'Spam'])
ax.set_yticks([0.5, 1.5])
ax.set_yticklabels(['Ham', 'Spam'])
plt.show()