In [None]:
! pip install nltk scikit-learn spacy

In [None]:
! pip install pandas

## Step 1 : Loading the dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('spam_or_ham_dataset/spam.csv', encoding="latin1")
print(data.head())

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [4]:
data = data.drop(labels=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [6]:
data['v1'] = data['v1'].map({'spam':1, 'ham':0})

In [7]:
data.head()

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


#### finished loading up the data

## Step 2 : Cleaning up the data and some NLP functions
#### basically lemmatize and lowercase all the text  so that it becomes more easy to classify

In [8]:
import nltk
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /home/malachy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))
print(stop_words)

{'no', "you'll", 'an', 'few', 'its', 'their', 'it', "they'd", 'under', 'myself', 'all', 'off', "shan't", "he'd", 'doing', 'don', "it's", 'between', 'further', 'yourself', 'y', 'those', 'did', 'themselves', 'once', 'to', 'own', 'needn', 'how', 'that', 'while', 're', "she'd", "don't", 'hadn', 'but', 'couldn', 'here', 'o', 'is', 'whom', 'after', "won't", 'now', 'same', "you're", 'theirs', 'these', 'and', 'too', "that'll", 'so', "he'll", 'with', 'where', 'me', 'into', 'or', 'ourselves', 'down', 'not', 'when', 'been', 'd', 'll', 'below', "they'll", 'yourselves', 'very', "we've", "we'd", "wasn't", 'who', 'weren', 'you', 'his', 'has', 'a', 'have', 'are', 'this', "hasn't", 'haven', "isn't", "i've", 'won', 'above', "they're", 'ma', 's', 'any', 'ain', 'shouldn', 'was', 'more', 'other', 'both', 'in', 'isn', "i'm", 'should', 'we', 'your', 'again', 'having', 'wasn', "mightn't", 'shan', 'each', 'at', 'herself', 'about', "it'd", 'aren', 'there', 'will', "couldn't", "they've", "haven't", 'mightn', "ne

In [10]:
data.head()

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


#### tokenize the whole text, extract only text and lowercase everything

In [11]:
data['v2'] = data['v2'].map(lambda x: [word.lower() for word in word_tokenize(x) if word.isalpha()])
data.head()

Unnamed: 0,v1,v2
0,0,"[go, until, jurong, point, crazy, available, o..."
1,0,"[ok, lar, joking, wif, u, oni]"
2,1,"[free, entry, in, a, wkly, comp, to, win, fa, ..."
3,0,"[u, dun, say, so, early, hor, u, c, already, t..."
4,0,"[nah, i, do, think, he, goes, to, usf, he, liv..."


#### remove all stopwords

In [12]:
data['v2'] = data['v2'].map(lambda x: [word for word in x if word not in stop_words])
data.head()

Unnamed: 0,v1,v2
0,0,"[go, jurong, point, crazy, available, bugis, n..."
1,0,"[ok, lar, joking, wif, u, oni]"
2,1,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,0,"[u, dun, say, early, hor, u, c, already, say]"
4,0,"[nah, think, goes, usf, lives, around, though]"


#### lemmatize everything so that it becomes easier to process 

In [13]:
data['v2'] = data['v2'].map(lambda x: [lemmatizer.lemmatize(word) for word in x])
data.head()

Unnamed: 0,v1,v2
0,0,"[go, jurong, point, crazy, available, bugis, n..."
1,0,"[ok, lar, joking, wif, u, oni]"
2,1,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,0,"[u, dun, say, early, hor, u, c, already, say]"
4,0,"[nah, think, go, usf, life, around, though]"


## Step 3 : Vectorization

#### for vectorizing a text we must first convert it into a single string rather than a list of string tokens
#### vectorization convert the text into numerical feature like how many times the data appears across the dataset and how important the word is etc

In [14]:
#### making a single string at each row
data['v2'] = data['v2'].map(lambda x: " ".join(x))
data.head()

Unnamed: 0,v1,v2
0,0,go jurong point crazy available bugis n great ...
1,0,ok lar joking wif u oni
2,1,free entry wkly comp win fa cup final tkts may...
3,0,u dun say early hor u c already say
4,0,nah think go usf life around though


#### we are using a TF-IDF model to classify spam or ham because not only does it calculate text-frequency
#### it also helps in inverse document frequency that makes it a much easier task to calculate words that are actually important and unimportant and thus assign the required wieghts accordingly

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

In [16]:
X = vectorizer.fit_transform(data['v2'])

In [17]:
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 42409 stored elements and shape (5572, 6632)>
  Coords	Values
  (0, 2340)	0.1553832682565631
  (0, 3022)	0.34964066951813666
  (0, 4298)	0.23874204132409838
  (0, 1275)	0.27347317349394423
  (0, 407)	0.263636136194868
  (0, 783)	0.29537785050245857
  (0, 2407)	0.19350246002539456
  (0, 6482)	0.23438840859128224
  (0, 3126)	0.29537785050245857
  (0, 781)	0.3337698123643414
  (0, 1038)	0.29537785050245857
  (0, 2376)	0.1624104261917568
  (0, 197)	0.34964066951813666
  (0, 6305)	0.19535789504858608
  (1, 3962)	0.2750266187756885
  (1, 3155)	0.4055206444105666
  (1, 2995)	0.5305219804799712
  (1, 6403)	0.4286642680974141
  (1, 3986)	0.5428689891730492
  (2, 2179)	0.12989252432309256
  (2, 1819)	0.40412020569016743
  (2, 6447)	0.21390274128667586
  (2, 1133)	0.2208026416977681
  (2, 6413)	0.1649725073560683
  (2, 1939)	0.5273374766482737
  :	:
  (5567, 1194)	0.3142223590478919
  (5567, 3585)	0.3336073354454144
  (5567, 1715)	0.36

In [18]:
Y = data['v1']
Y

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: v1, Length: 5572, dtype: int64

## Step 4 : Classification and Machine Learning part

In [19]:
from sklearn.model_selection import train_test_split

x_tr, x_te, y_tr, y_te = train_test_split(X, Y, test_size=0.2, random_state=42)

In [33]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score, accuracy_score, classification_report, confusion_matrix

In [23]:
model = MultinomialNB()
model.fit(x_tr, y_tr)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [24]:
y_pred = model.predict(x_te)

In [25]:
print(f"accuracy : {accuracy_score(y_te, y_pred)}")

accuracy : 0.9605381165919282


In [29]:
print(f"precision : {precision_score(y_te, y_pred)}")

precision : 1.0


In [30]:
print(classification_report(y_te, y_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.71      0.83       150

    accuracy                           0.96      1115
   macro avg       0.98      0.85      0.90      1115
weighted avg       0.96      0.96      0.96      1115



#### not really the best as many false positives have been predicted... as most of the data was ham and thus had very little spam to learn from
#### > solution would be to use a more balanced dataset, or to make it balanced artificially by using weights on the classes that are spam 
#### > better quick fix --> use a slightly lower classification treshold

In [31]:
import joblib

In [32]:
joblib.dump((vectorizer, model), "spam_NLPClassifier.pkl")

['spam_NLPClassifier.pkl']

## using a slightly lower classification threshold

In [38]:
y_prob = model.predict_proba(x_te)[:, 1]
y_prob

array([0.22387999, 0.06089892, 0.28192179, ..., 0.03560859, 0.05548958,
       0.74361321], shape=(1115,))

In [41]:
custom_treshold = 0.4
y_pred_custom = (y_prob >= custom_treshold).astype(int)
y_pred_custom

array([0, 0, 0, ..., 0, 0, 1], shape=(1115,))

In [44]:
confusion_matrix(y_te, y_pred_custom)

array([[961,   4],
       [ 33, 117]])

In [46]:
print(classification_report(y_te, y_pred_custom))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98       965
           1       0.97      0.78      0.86       150

    accuracy                           0.97      1115
   macro avg       0.97      0.89      0.92      1115
weighted avg       0.97      0.97      0.97      1115



#### we need to decrese it further as it has only changes form 0.71 to 0.78 as false negatives are worse and has to be minimized as much as possible

In [51]:
custom_treshold = 0.3
y_pred_custom = (y_prob >= custom_treshold).astype(int)
confusion_matrix(y_te, y_pred_custom)
print(classification_report(y_te, y_pred_custom))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98       965
           1       0.93      0.85      0.89       150

    accuracy                           0.97      1115
   macro avg       0.96      0.92      0.94      1115
weighted avg       0.97      0.97      0.97      1115



## Step 5 : Conclusion

# The results indicate that the model performs very well at identifying spam messages, with a strong balance between precision and recall. Although a few legitimate messages may be mistakenly flagged as spam (false positives), the improvement in spam detection (reducing false negatives) makes this trade-off acceptable for practical use.