In [71]:
# Import the necessary libraries
import numpy as np
import pandas as pd

# Install NLTK if not installed
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

nltk.download ('punkt_tab')
nltk.download ('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [72]:
# Taking a paragraph from Gates notes and using it as a corpus: Thursday, May8, 2025:
# Note: Every row in a corpus is called as a document and every word in a document is called as a term.

corpus = ["That is why I have decided to give my money back to society much faster",
          "than I had originally planned",
          "I will give away virtually all my wealth through the Gates Foundation over the next 20 years",
          "to the cause of saving and improving lives around the world",
          "And on December 31, 2045, the foundation will close its doors permanently"]


### Using Bag Of Words approach to create a sparse matrix

In [73]:
# The above approach converts a collection of text documents into a bag-of-words representation, 
# where each unique word becomes a feature (column), and its value is the count of times that word appears in each document.
# CountVectoriser helps us in this operation by:
                                                # Tokenizing the text (splits it into words).
                                                # Rows = documents
                                                # Columns = words in the vocabulary
                                                # Cell values = count of each word in that document, represented by cell(i,j) where
# j is the number of times the term appearrs in a documnet 'i'

from sklearn.feature_extraction.text import CountVectorizer

In [74]:
cv = CountVectorizer()

In [75]:
vectorized_data = cv.fit_transform(corpus)

In [76]:
vectorized_data
# In the vectorized_data sparse matrix only 54 non-zero entries exist. Remaining 176 out of 230 are zeros. 
# sparse matriz ensures computation is faster and memory usage is less

<5x46 sparse matrix of type '<class 'numpy.int64'>'
	with 54 stored elements in Compressed Sparse Row format>

### Getting number of columns or unique words in the matrix

In [77]:
cv.get_feature_names_out()

array(['20', '2045', '31', 'all', 'and', 'around', 'away', 'back',
       'cause', 'close', 'december', 'decided', 'doors', 'faster',
       'foundation', 'gates', 'give', 'had', 'have', 'improving', 'is',
       'its', 'lives', 'money', 'much', 'my', 'next', 'of', 'on',
       'originally', 'over', 'permanently', 'planned', 'saving',
       'society', 'than', 'that', 'the', 'through', 'to', 'virtually',
       'wealth', 'why', 'will', 'world', 'years'], dtype=object)

### Creating a dataframe 

In [78]:
data = pd.DataFrame(vectorized_data.toarray(), columns = cv.get_feature_names_out())
data

#NOte: # Pandas DataFrame can't directly display or manipulate sparse matrices. Hence, converting it into array is essential.
# toarray() converts the sparse matrix into a dense NumPy array to creates a full matrix where all zeroes and non-zeroes 
# are shown explicitly. This dense array can then be passed cleanly into pd.DataFrame(...).

Unnamed: 0,20,2045,31,all,and,around,away,back,cause,close,...,that,the,through,to,virtually,wealth,why,will,world,years
0,0,0,0,0,0,0,0,1,0,0,...,1,0,0,2,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,1,0,0,1,0,0,0,...,0,2,1,0,1,1,0,1,0,1
3,0,0,0,0,1,1,0,0,1,0,...,0,2,0,1,0,0,0,0,1,0
4,0,1,1,0,1,0,0,0,0,1,...,0,1,0,0,0,0,0,1,0,0


### A brief overview on scaling techniques in NLP
We can use Stemming, Lemmatization or Regular Expression - based removals(later in the part).

STEMMING
Reduces words to their root form by chopping off suffixes (e.g., "playing" → "play", "better" → "better").
It’s fast but can be crude (e.g., "studies" → "studi").

LEMMATIZATION 
Also reduces words to their base form, but uses vocabulary and grammar (e.g., "was" → "be", "better" → "good").
More accurate than stemming but slower.Useful when linguistic correctness matters.

REGEX-BASED REMOVALS
Uses regular expressions to remove or extract patterns like: 
                                                           # Punctuation 
                                                           # Numbers
                                                           # HTML tags
                                                           # URLs
                                                           # Emojis
                                                           # Extra spaces
Essential for cleaning noisy or unstructured text.


In [79]:
ps = PorterStemmer()

In [80]:
print(ps.stem('amazement'))
print(ps.stem('amazing'))
print(ps.stem('amazingly'))

amaz
amaz
amazingli


In [81]:
wl = WordNetLemmatizer()

In [82]:
print(wl.lemmatize('choke', pos = 'n')) # Noun
print(wl.lemmatize('choking', pos = 'r')) # Adjective
print(wl.lemmatize('chokingly', pos = 's')) # Adverb

choke
choking
chokingly


In [83]:
# Using tagging for a sentence
pos_tag(word_tokenize('I will give away virtually all my wealth through the Gates Foundation over the next 20 years to the cause of saving and improving lives around the world'))

[('I', 'PRP'),
 ('will', 'MD'),
 ('give', 'VB'),
 ('away', 'RP'),
 ('virtually', 'RB'),
 ('all', 'DT'),
 ('my', 'PRP$'),
 ('wealth', 'NN'),
 ('through', 'IN'),
 ('the', 'DT'),
 ('Gates', 'NNP'),
 ('Foundation', 'NNP'),
 ('over', 'IN'),
 ('the', 'DT'),
 ('next', 'JJ'),
 ('20', 'CD'),
 ('years', 'NNS'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('cause', 'NN'),
 ('of', 'IN'),
 ('saving', 'VBG'),
 ('and', 'CC'),
 ('improving', 'VBG'),
 ('lives', 'NNS'),
 ('around', 'IN'),
 ('the', 'DT'),
 ('world', 'NN')]

### Spam classification using NLP technique

In [84]:
dataset = pd.read_csv('spam.csv', encoding = 'latin')
dataset

Unnamed: 0,Target,Text,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [85]:
# Let's use only 2 columns for our dataset: 'Target' and 'Text'
dataset = dataset[['Target', 'Text']]
dataset

Unnamed: 0,Target,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


### Creating a sparse Document Term matrix of the above dataset

In [86]:
CV = CountVectorizer(min_df = 10, stop_words = 'english')

In [87]:
vectorized_dataset = CV.fit_transform(dataset['Text'])

In [88]:
vectorized_dataset

<5572x833 sparse matrix of type '<class 'numpy.int64'>'
	with 27978 stored elements in Compressed Sparse Row format>

In [89]:
CV.get_feature_names_out()

array(['00', '000', '03', '04', '0800', '08000839402', '08000930705',
       '10', '100', '1000', '10p', '11', '12', '12hrs', '150', '150p',
       '150ppm', '16', '18', '1st', '20', '200', '2000', '2003', '250',
       '2lands', '2nd', '2nite', '30', '350', '50', '500', '5000', '750',
       '800', '8007', '86688', '87066', '900', 'abiola', 'able', 'abt',
       'ac', 'account', 'actually', 'address', 'admirer', 'aft',
       'afternoon', 'age', 'ago', 'ah', 'aight', 'alright', 'amp',
       'angry', 'ans', 'answer', 'anytime', 'apply', 'ard', 'area',
       'asap', 'ask', 'askd', 'asked', 'asking', 'ass', 'attempt',
       'auction', 'available', 'await', 'award', 'awarded', 'away',
       'awesome', 'b4', 'babe', 'baby', 'bad', 'balance', 'bank', 'bath',
       'bathe', 'bcoz', 'beautiful', 'bed', 'believe', 'best', 'better',
       'big', 'birthday', 'bit', 'blue', 'bonus', 'book', 'booked',
       'bored', 'bout', 'box', 'boy', 'boytoy', 'break', 'bring',
       'brother', 'bslvyl

### Creating a dataframe

In [90]:
X = pd.DataFrame(vectorized_dataset.toarray(), columns = CV.get_feature_names_out())
dataset

Unnamed: 0,Target,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [91]:
y = dataset['Target']

In [92]:
# Extracting special characters and digits using Regular Expression-based removals
import re

In [93]:
def special_characters(input_sent):
    pattern = '[^a-zA-z0-9\s]'
    return len(re.findall(pattern, input_sent))/len(input_sent)

# Calculates the ratio of special characters to the total message length. becauses,
# spam messages often contain more special symbols (for attention-grabbing or obfuscation)

#### pattern = '[^a-zA-Z0-9\s]' means this pattern matches any character that is NOT: 
                                   # 1. a lowercase or uppercase letter (a-zA-Z)
                                   # 2. a digit (0-9)
                                   # 3. whitespace (\s)
                                   # The ^ inside the square brackets means "not".
                                   # Purpose: To capture special characters (like @, #, !, ', &, (, etc.)

In [94]:
def digit_extraction(input_sent):
    pattern = '[0-9]'
    return len(re.findall(pattern, input_sent))/len(input_sent)

    # Calculates the ratio of digits to the total message length

In [95]:
dataset['Text'].apply(lambda x:special_characters(x))
# dataset['Text']: Accesses the 'Text' column in the dataset DataFrame, which contains text data
# .apply(lambda x: ...): Applies a function to each value x (i.e., each message) in the 'Text' column.
# special_characters(x): Calls the custom function that calculates the proportion of special characters in the text x.

0       0.081081
1       0.206897
2       0.038710
3       0.122449
4       0.032787
          ...   
5567    0.062112
5568    0.054054
5569    0.122807
5570    0.008000
5571    0.038462
Name: Text, Length: 5572, dtype: float64

In [96]:
dataset['Text'].apply(lambda x:digit_extraction(x))
# dataset['Text']: Accesses the 'Text' column in the dataset DataFrame, which contains text data
# .apply(lambda x: ...): Applies a function to each value x (i.e., each message) in the 'Text' column.
# digit_extraction(x): Calls the custom function that calculates the proportion of special characters in the text x.

0       0.000000
1       0.000000
2       0.161290
3       0.000000
4       0.000000
          ...   
5567    0.130435
5568    0.000000
5569    0.000000
5570    0.000000
5571    0.000000
Name: Text, Length: 5572, dtype: float64

In [98]:
X['special_characters'] = dataset['Text'].apply(lambda x:special_characters(x))

In [99]:
X['digit_extraction'] = dataset['Text'].apply(lambda x:digit_extraction(x))

# Both lines above apply a function to the 'Text' column of the dataset DataFrame and 
# save the results in new columns in the X DataFrame.

### Model building

In [101]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *

In [102]:
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size = 0.30, random_state = 42)

In [103]:
rf = RandomForestClassifier()

In [104]:
rf.fit(x_train, y_train)

In [105]:
y_pred = rf.predict(x_test)
y_pred

array(['ham', 'ham', 'spam', ..., 'ham', 'ham', 'ham'], dtype=object)

In [106]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1453
        spam       0.99      0.90      0.94       219

    accuracy                           0.99      1672
   macro avg       0.99      0.95      0.97      1672
weighted avg       0.99      0.99      0.99      1672

