#### Import all neccesary libraries

In [1]:
import pandas as pd 
from spam_words import spam_words
import re
import string
import nltk
from nltk.corpus import stopwords
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

#### set stopwords, intialise lemmatiser and set display columns width for df

In [2]:
stop_words = stopwords.words("english")
wn = nltk.WordNetLemmatizer() # collection of verbs, nouns, adjectives and adverbs - lemmitzer runs off this corpus
pd.set_option("display.max_colwidth",200) # set the widith of what appears

#### Load raw data into a df and set column names

In [3]:
file_path = r"SMSSpamCollection.tsv"
df = pd.read_csv(file_path,header=None,sep="\t")
df.columns = ["label","text"]

#### All functions

In [4]:
def process_text(text:str)->list:
    '''Performs punctuation removal, tokenization, stop word removal, and lemmatization on the input text.'''
    text_no_punc = "".join([char.lower() for char in text if char not in string.punctuation]) # remove punc and lowercase the text
    tokens = re.split("\W+", text_no_punc) # tokenize the text
    tokens_no_stop = [word for word in tokens if word not in stop_words] # remove stop words
    lemmatized_tokens = [wn.lemmatize(word) for word in tokens_no_stop] # leematize the tokens
    return lemmatized_tokens

def message_length(text:str)->int:
    '''Returns the length of a string excludings whitespace as a int'''
    return len(text.replace(" ",""))

def percent_punc(text:str)->float:
    '''Return the percentage punc in the text as a float'''
    total_char = message_length(text)
    total_punc = [x for x in text if x in string.punctuation]
    return (len(total_punc)/total_char)*100

def count_spam_words(text:str)->int:
    '''Returns the count of spam words as a int'''
    word_list = text.split(" ")
    return len([x for x in word_list if x in spam_words])

In [5]:
tfid_vect = TfidfVectorizer(analyzer=process_text)
X_tfidf = tfid_vect.fit_transform(df["text"])
print(X_tfidf.shape)
print(tfid_vect.get_feature_names_out()[:30])

(5568, 8914)
['' '0' '008704050406' '0089my' '0121' '01223585236' '01223585334'
 '0125698789' '02' '020603' '0207' '02070836089' '02072069400'
 '02073162414' '02085076972' '020903' '021' '050703' '0578' '06' '060505'
 '061104' '07008009200' '07046744435' '07090201529' '07090298926'
 '07099833605' '071104' '07123456789' '0721072']
