# Spam or Ham Classifier Using NLP

In [83]:
import pandas as pd
import numpy as np
import matplotlib as plt
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [84]:
df=pd.read_csv(r"D:\ML\DataSets\spam.csv",encoding='latin-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [85]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [86]:
df.drop(columns=["Unnamed: 2","Unnamed: 3","Unnamed: 4"],inplace=True)

In [87]:
counts=df['v1'].value_counts()
counts

v1
ham     4825
spam     747
Name: count, dtype: int64

In [88]:
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
df['v1']=encoder.fit_transform(df['v1'])

In [89]:
df

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [90]:
df.isnull().sum()

v1    0
v2    0
dtype: int64

In [91]:
df.duplicated().sum()

np.int64(403)

In [92]:
df=df.drop_duplicates(keep='first')

In [93]:
df.shape

(5169, 2)

In [94]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [95]:
stopwords=set(stopwords.words('english'))
lemmatizer=WordNetLemmatizer()

In [96]:
def preprocess_text(text):
    #Lower Case
    text = text.lower()
    #Remove special characters and digits
    cleaned_text = ''.join([char if char.isalpha() or char.isspace() else ' ' for char in text])
    # Tokenize
    tokens = cleaned_text.split()
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords]
    # Lemmatize
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Combine
    return ' '.join(lemmatized_tokens)

df['v2'] = df['v2'].apply(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['v2'] = df['v2'].apply(preprocess_text)


In [97]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer=TfidfVectorizer()

In [98]:
x=vectorizer.fit_transform(df['v2']).toarray()
y=df['v1']

In [99]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=45)

In [100]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [101]:
models = {
    'LogisticRegression': LogisticRegression(),
    'DecisionTreeClassifier':DecisionTreeClassifier(),
    'RandomForestClassifier':RandomForestClassifier(),
    'SVC':SVC(),
    'GaussianNB': GaussianNB(),
    'KNN':KNeighborsClassifier()   
}


for name, model in models.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    print(f"Model: {name}: Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1: {f1:.2f}")
  


Model: LogisticRegression: Accuracy: 0.96, Precision: 0.96, Recall: 0.96, F1: 0.95
Model: DecisionTreeClassifier: Accuracy: 0.96, Precision: 0.96, Recall: 0.96, F1: 0.96
Model: RandomForestClassifier: Accuracy: 0.97, Precision: 0.97, Recall: 0.97, F1: 0.97
Model: SVC: Accuracy: 0.97, Precision: 0.97, Recall: 0.97, F1: 0.97
Model: GaussianNB: Accuracy: 0.89, Precision: 0.92, Recall: 0.89, F1: 0.90
Model: KNN: Accuracy: 0.92, Precision: 0.92, Recall: 0.92, F1: 0.89


In [103]:
from sklearn.model_selection import cross_val_score

# Perform k-fold cross-validation to validate model performance
for name, model in models.items():
    scores = cross_val_score(model, x, y, cv=10)
    print("Cross-validation score of {}:".format(name), scores) 

Cross-validation score of LogisticRegression: [0.95938104 0.96131528 0.94970986 0.95744681 0.96518375 0.95744681
 0.9516441  0.96131528 0.95551257 0.96511628]
Cross-validation score of DecisionTreeClassifier: [0.96711799 0.96905222 0.96131528 0.96711799 0.96324952 0.96711799
 0.95938104 0.95551257 0.96324952 0.96705426]
Cross-validation score of RandomForestClassifier: [0.98646035 0.9787234  0.97678917 0.9729207  0.97485493 0.97485493
 0.9729207  0.9729207  0.96711799 0.98062016]
Cross-validation score of SVC: [0.98646035 0.97485493 0.97098646 0.9729207  0.97678917 0.97098646
 0.96905222 0.97678917 0.97098646 0.9748062 ]
Cross-validation score of GaussianNB: [0.8762089  0.88781431 0.86073501 0.8762089  0.87814313 0.88394584
 0.87234043 0.88007737 0.90522244 0.87984496]
Cross-validation score of KNN: [0.91876209 0.91682785 0.91682785 0.91489362 0.91102515 0.91295938
 0.91489362 0.91295938 0.91295938 0.9244186 ]
