In [1]:
import torch

print(torch.__version__)

2.5.1+cpu


In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('./data_source/spam.csv', encoding='latin-1')

df = df[['v1', 'v2']]

df.head(20)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


# rename columns from v1, v2 ...to label and text

In [4]:
df = df.rename(columns={'v1': 'label', 'v2':'text'})

df['text'] = df['text'].apply(str.lower)

df.head(20)

Unnamed: 0,label,text
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."
5,spam,freemsg hey there darling it's been 3 week's n...
6,ham,even my brother is not like to speak with me. ...
7,ham,as per your request 'melle melle (oru minnamin...
8,spam,winner!! as a valued network customer you have...
9,spam,had your mobile 11 months or more? u r entitle...


# remove punctuations and special characters

In [23]:
import regex as re

In [27]:
df['text'] = df['text'].astype(str).apply(lambda x : re.sub(r'[^a-zA-Z0-9\s]','', x))

df.head(20)

Unnamed: 0,label,text
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he goes to usf he lives aroun...
5,spam,freemsg hey there darling its been 3 weeks now...
6,ham,even my brother is not like to speak with me t...
7,ham,as per your request melle melle oru minnaminun...
8,spam,winner as a valued network customer you have b...
9,spam,had your mobile 11 months or more u r entitled...


# remove stopwords

In [12]:
from nltk.corpus import stopwords

In [31]:
stop_words = set(stopwords.words('english'))

df['text'] = df['text'].astype(str).apply(lambda sentence: ' '.join([word for word in sentence.split() if word.lower() not in stop_words]))

df.head(20)

Unnamed: 0,label,text
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor u c already say
4,ham,nah dont think goes usf lives around though
5,spam,freemsg hey darling 3 weeks word back id like ...
6,ham,even brother like speak treat like aids patent
7,ham,per request melle melle oru minnaminunginte nu...
8,spam,winner valued network customer selected receiv...
9,spam,mobile 11 months u r entitled update latest co...


# carry out stemming

In [18]:
from nltk.stem import WordNetLemmatizer

In [34]:
wnl = WordNetLemmatizer()

df['text'] = df['text'].astype(str).apply(lambda sentence: ' '.join([wnl.lemmatize(word) for word in sentence.split()]))

df.head(20)

Unnamed: 0,label,text
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor u c already say
4,ham,nah dont think go usf life around though
5,spam,freemsg hey darling 3 week word back id like f...
6,ham,even brother like speak treat like aid patent
7,ham,per request melle melle oru minnaminunginte nu...
8,spam,winner valued network customer selected receiv...
9,spam,mobile 11 month u r entitled update latest col...


In [38]:
preprocessed_df = df.copy()

preprocessed_df['label'] = preprocessed_df['label'].map({'ham': 0, 'spam': 1})

preprocessed_df.head(20)

Unnamed: 0,label,text
0,0,go jurong point crazy available bugis n great ...
1,0,ok lar joking wif u oni
2,1,free entry 2 wkly comp win fa cup final tkts 2...
3,0,u dun say early hor u c already say
4,0,nah dont think go usf life around though
5,1,freemsg hey darling 3 week word back id like f...
6,0,even brother like speak treat like aid patent
7,0,per request melle melle oru minnaminunginte nu...
8,1,winner valued network customer selected receiv...
9,1,mobile 11 month u r entitled update latest col...


# convert them words to vectors

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [40]:
x = preprocessed_df['text']
y = preprocessed_df['label']

In [47]:
tfidf = TfidfVectorizer(max_features=5000)

x_tfidf = tfidf.fit_transform(x)

# split into train and test data

In [48]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_tfidf, y, test_size=0.2, random_state=42)

In [49]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

In [50]:
lr.fit(x_train, y_train)

In [54]:
y_pred = lr.predict(x_test)



# check accuracy

In [55]:
from sklearn.metrics import accuracy_score

In [57]:
accScore = accuracy_score(y_test, y_pred)


print(f"the accuracy score {accScore}")

the accuracy score 0.8654708520179372


# save models

In [58]:
import joblib

joblib.dump(lr, 'spam_classifier.pkl')
joblib.dump(tfidf, 'tfidf.pkl')

['tfidf.pkl']