In [72]:
# import the libraries

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


In [65]:
data = pd.read_csv('/content/spam detection dataset.csv')

data.sample(12)

Unnamed: 0,Category,Message
4757,ham,Don't make life too stressfull.. Always find t...
1341,ham,Might ax well im there.
5136,ham,There are some nice pubs near here or there is...
2909,ham,Why must we sit around and wait for summer day...
5126,ham,"To the wonderful Okors, have a great month. We..."
4136,ham,No need to say anything to me. I know i am an ...
5313,ham,My sister going to earn more than me da.
4665,ham,Mum not going robinson already.
5237,spam,Someonone you know is trying to contact you vi...
4887,ham,Or just do that 6times


In [66]:
data.rename(columns = {"Category":"Target", "Message":"Text"}, inplace = True)
data.head()

Unnamed: 0,Target,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [67]:
data.isna().sum()

Target    0
Text      0
dtype: int64

In [41]:
data.shape

(5572, 2)

In [54]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
 2   Spam      5572 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 130.7+ KB


In [68]:
# Defining a function to clean up the text
def Clean(Text):
    sms = re.sub('[^a-zA-Z]', ' ', Text) #Replacing all non-alphabetic characters with a space
    sms = sms.lower() #converting to lowecase
    sms = sms.split()
    sms = ' '.join(sms)
    return sms

data["Clean_Text"] = data["Text"].apply(Clean)
#Lets have a look at a sample of texts after cleaning
print("\033[1m\u001b[45;1m The First 5 Texts after cleaning:\033[0m",*data["Clean_Text"][:5], sep = "\n")

[1m[45;1m The First 5 Texts after cleaning:[0m
go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat
ok lar joking wif u oni
free entry in a wkly comp to win fa cup final tkts st may text fa to to receive entry question std txt rate t c s apply over s
u dun say so early hor u c already then say
nah i don t think he goes to usf he lives around here though


In [74]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [75]:
#Tokenization
data["Tokenize_Text"]=data.apply(lambda row: nltk.word_tokenize(row["Clean_Text"]), axis=1)

print("\033[1m\u001b[45;1m The First 5 Texts after Tokenizing:\033[0m",*data["Tokenize_Text"][:5], sep = "\n")

[1m[45;1m The First 5 Texts after Tokenizing:[0m
['go', 'until', 'jurong', 'point', 'crazy', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'there', 'got', 'amore', 'wat']
['ok', 'lar', 'joking', 'wif', 'u', 'oni']
['free', 'entry', 'in', 'a', 'wkly', 'comp', 'to', 'win', 'fa', 'cup', 'final', 'tkts', 'st', 'may', 'text', 'fa', 'to', 'to', 'receive', 'entry', 'question', 'std', 'txt', 'rate', 't', 'c', 's', 'apply', 'over', 's']
['u', 'dun', 'say', 'so', 'early', 'hor', 'u', 'c', 'already', 'then', 'say']
['nah', 'i', 'don', 't', 'think', 'he', 'goes', 'to', 'usf', 'he', 'lives', 'around', 'here', 'though']


In [77]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [78]:
#remove stopwords
def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    filtered_text = [word for word in text if word not in stop_words]
    return filtered_text

data["Nostopword_Text"] = data["Tokenize_Text"].apply(remove_stopwords)

print("\033[1m\u001b[45;1m The First 5 Texts after removing the stopwords:\033[0m",*data["Nostopword_Text"][:5], sep = "\n")

[1m[45;1m The First 5 Texts after removing the stopwords:[0m
['go', 'jurong', 'point', 'crazy', 'available', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amore', 'wat']
['ok', 'lar', 'joking', 'wif', 'u', 'oni']
['free', 'entry', 'wkly', 'comp', 'win', 'fa', 'cup', 'final', 'tkts', 'st', 'may', 'text', 'fa', 'receive', 'entry', 'question', 'std', 'txt', 'rate', 'c', 'apply']
['u', 'dun', 'say', 'early', 'hor', 'u', 'c', 'already', 'say']
['nah', 'think', 'goes', 'usf', 'lives', 'around', 'though']


In [80]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [81]:
lemmatizer = WordNetLemmatizer()
# lemmatize string
def lemmatize_word(text):
    #word_tokens = word_tokenize(text)
    # provide context i.e. part-of-speech
    lemmas = [lemmatizer.lemmatize(word, pos ='v') for word in text]
    return lemmas

data["Lemmatized_Text"] = data["Nostopword_Text"].apply(lemmatize_word)
print("\033[1m\u001b[45;1m The First 5 Texts after lemitization:\033[0m",*data["Lemmatized_Text"][:5], sep = "\n")

[1m[45;1m The First 5 Texts after lemitization:[0m
['go', 'jurong', 'point', 'crazy', 'available', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'get', 'amore', 'wat']
['ok', 'lar', 'joke', 'wif', 'u', 'oni']
['free', 'entry', 'wkly', 'comp', 'win', 'fa', 'cup', 'final', 'tkts', 'st', 'may', 'text', 'fa', 'receive', 'entry', 'question', 'std', 'txt', 'rate', 'c', 'apply']
['u', 'dun', 'say', 'early', 'hor', 'u', 'c', 'already', 'say']
['nah', 'think', 'go', 'usf', 'live', 'around', 'though']


In [109]:
data.head()

Unnamed: 0,Target,Text,Clean_Text,Tokenize_Text,Nostopword_Text,Lemmatized_Text
0,0,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...,"[go, until, jurong, point, crazy, available, o...","[go, jurong, point, crazy, available, bugis, n...","[go, jurong, point, crazy, available, bugis, n..."
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]"
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup final ...,"[free, entry, in, a, wkly, comp, to, win, fa, ...","[free, entry, wkly, comp, win, fa, cup, final,...","[free, entry, wkly, comp, win, fa, cup, final,..."
3,0,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, u, c, already, t...","[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, early, hor, u, c, already, say]"
4,0,"Nah I don't think he goes to usf, he lives aro...",nah i don t think he goes to usf he lives arou...,"[nah, i, don, t, think, he, goes, to, usf, he,...","[nah, think, goes, usf, lives, around, though]","[nah, think, go, usf, live, around, though]"


In [82]:
#Creating a corpus of text feature to encode further into vectorized form
corpus= []
for i in data["Lemmatized_Text"]:
    msg = ' '.join([row for row in i])
    corpus.append(msg)

corpus[:5]
print("\033[1m\u001b[45;1m The First 5 lines in corpus :\033[0m",*corpus[:5], sep = "\n")

[1m[45;1m The First 5 lines in corpus :[0m
go jurong point crazy available bugis n great world la e buffet cine get amore wat
ok lar joke wif u oni
free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply
u dun say early hor u c already say
nah think go usf live around though


In [83]:
#Changing text data in to numbers.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(corpus).toarray()
#Let's have a look at our feature
X.dtype

dtype('float64')

In [84]:
label_encoder = LabelEncoder()
data["Target"] = label_encoder.fit_transform(data["Target"])

In [88]:
y=data['Target']

In [89]:
from sklearn.model_selection import train_test_split

X_train, X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [27]:
! pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [99]:
# train and test on the Catboost model

from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

# cat_features = data['Category']

cat = CatBoostClassifier()

cat.fit(X_train,y_train)

y_pred = cat.predict(X_test)

accuracy_score_cat = accuracy_score(y_test,y_pred)

print(accuracy_score_cat)

Learning rate set to 0.019502
0:	learn: 0.6710468	total: 62.7ms	remaining: 1m 2s
1:	learn: 0.6494332	total: 112ms	remaining: 55.8s
2:	learn: 0.6293224	total: 160ms	remaining: 53.3s
3:	learn: 0.6107736	total: 204ms	remaining: 50.9s
4:	learn: 0.5922843	total: 251ms	remaining: 49.9s
5:	learn: 0.5735314	total: 296ms	remaining: 49s
6:	learn: 0.5595834	total: 345ms	remaining: 48.9s
7:	learn: 0.5449895	total: 391ms	remaining: 48.5s
8:	learn: 0.5306050	total: 449ms	remaining: 49.5s
9:	learn: 0.5158818	total: 498ms	remaining: 49.3s
10:	learn: 0.5025049	total: 546ms	remaining: 49.1s
11:	learn: 0.4897461	total: 595ms	remaining: 49s
12:	learn: 0.4758001	total: 641ms	remaining: 48.7s
13:	learn: 0.4642820	total: 688ms	remaining: 48.5s
14:	learn: 0.4503942	total: 733ms	remaining: 48.1s
15:	learn: 0.4399405	total: 781ms	remaining: 48s
16:	learn: 0.4289597	total: 826ms	remaining: 47.8s
17:	learn: 0.4190014	total: 872ms	remaining: 47.6s
18:	learn: 0.4086461	total: 924ms	remaining: 47.7s
19:	learn: 0.399

In [103]:
# check the real time prediction by the Catboost model on a sample email

new_email = "Free money! Click here to claim your prize!"
new_email_vectorized = tfidf.transform([new_email])

prediction = cat.predict(new_email_vectorized)

# Print the prediction
if prediction[0] == 1:
    print("The email is spam.")
else:
    print("The email is not spam.")


The email is spam.


In [95]:
#train and test on the Random Forest model
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()

rfc.fit(X_train,y_train)

y_pred = rfc.predict(X_test)

accuracy_score_rfc = accuracy_score(y_test,y_pred)

print(accuracy_score_rfc)

0.9766816143497757


In [107]:
# check the real time prediction by the Random forest model on a sample email
new_email_2 = 'Hi Alan, are you coming to the football game tonight??'
new_email_vectorized_2 = tfidf.transform([new_email_2])

prediction = rfc.predict(new_email_vectorized_2)

# Print the prediction
if prediction[0] == 1:
    print("The email is spam.")
else:
    print("The email is not spam.")

The email is not spam.
