In [8]:
import re
import os
import gc
import datetime
import string
import itertools
from collections import Counter

import pandas as pd
import numpy as np

import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

import spacy

from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

import gensim
from gensim.models import word2vec
from gensim.models import KeyedVectors # implements word vectors
from gensim.test.utils import datapath, get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec

from tqdm.auto import tqdm
tqdm.pandas()

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import seaborn as sns

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# define path to the dataset and to the output directory
input_dir_path = '/kaggle/input/'
dataset_path=f"{input_dir_path}email-spam-detection-dataset-classification/spam.csv"
output_dir_path="/kaggle/working/"

In [10]:
# download dataset
df = pd.read_csv(dataset_path, encoding='latin-1')
df.sample(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
2061,ham,Hey ! I want you ! I crave you ! I miss you ! ...,,,
4737,ham,I bought the test yesterday. Its something tha...,,,
5238,ham,Yeah I can still give you a ride,,,
2082,ham,I'm done oredi...,,,
149,ham,Sindu got job in birla soft ..,,,


# **** Performing EDA and text normalization ****

In [11]:
# get basic info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [12]:
# drop redandant columns and remane useful ones
df=df[['v1', 'v2']]
df=df.rename(columns={
    'v1': 'target',
    'v2': 'message'
})
df.sample(5)

Unnamed: 0,target,message
1977,spam,Reply to win å£100 weekly! Where will the 2006...
4593,ham,I had a good time too. Its nice to do somethin...
2351,spam,Download as many ringtones as u like no restri...
86,ham,For real when u getting on yo? I only need 2 m...
3032,ham,"Aight, lemme know what's up"


In [13]:
# check target value distribution
df['target'].value_counts()

target
ham     4825
spam     747
Name: count, dtype: int64

We see that we have disbalance of classes and we have to take it into account when choosing the estimation metrics.

In [14]:
# convert target values to numeric ones
df['target']=df['target'].apply(lambda x: 1 if x=="ham" else 0)
df.sample(5)

Unnamed: 0,target,message
5017,1,Babe ! What are you doing ? Where are you ? Wh...
4171,1,"Sorry, I'll call later"
4194,0,Double mins and txts 4 6months FREE Bluetooth ...
2925,1,Im done. Just studyn in library
1243,1,No shoot me. I'm in the docs waiting room. :/


In [15]:
# check for duplicates
df.duplicated().sum()


403

In [16]:
# drop duplicates
df = df.drop_duplicates().reset_index(drop=True)
df.shape

(5169, 2)

In [17]:
# prepare complementary list of words for text normalisation
contractions = {
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}



In [18]:
# define stop words
stop_words = set(stopwords.words('english')).union({'also','would','much','many'})

In [19]:
# define stemmer
porter_stemmer = PorterStemmer()

In [20]:
# define legitimazer from spacy pipeline 
nlp = spacy.load("en_core_web_sm", disable = ['parser', 'ner'])

In [21]:
# create function for text normalization
def normalize_text(raw_review, stemmer=None, pipeline=None):
  # remove html tags
  text = re.sub("<[^>]*>", " ", raw_review) # match <> and everything in between. [^>] - match everything except >

  # remove emails
  text = re.sub("\\S*@\\S*[\\s]+", " ", text) # match non-whitespace characters, @ and a whitespaces in the end

  # remove links
  text = re.sub("https?:\\/\\/.*?[\\s]+", " ", text) # match http, s - zero or once, //, # any char 0-unlimited, whitespaces in the end

  # convert to lower case, split into individual words
  text = text.lower().split()

  # replace contractions with their full versions
  text = [contractions.get(word) if word in contractions else word for word in text]

  # re-splitting for the correct stop-words extraction
  text = " ".join(text).split()

  # remove stop words
  text = [word for word in text if not word in stop_words]

  text = " ".join(text)

  # remove non-letters
  text = re.sub("[^a-zA-Z' ]", "", text) # match everything except letters and '

  # stem words
  if stemmer:
    text = [stemmer.stem(word) for word in text.split()]
    text = " ".join(text)

  # lemmatize words
  if pipeline:
    docs = pipeline(text)
    text = " ".join([token.lemma_ for token in docs if len(token.lemma_)>1])

  # remove excesive whitespaces
  text = re.sub("[\\s]+", " ", text)

  # Join the words back into one string separated by space, and return the result.
  return text

In [22]:
df.head()

Unnamed: 0,target,message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [23]:
text=df.iloc[0]['message']
text

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [24]:
# test our function on some example
text=df.iloc[2]['message']

norm_text=normalize_text(text,porter_stemmer, nlp)
print('Original text', text, '#'*30, sep='\n\n')
print('Normalized text', norm_text, sep='\n\n')

Original text

Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's

##############################
Normalized text

free entri wkli comp win fa cup final tkt st may text fa receiv entri questionstd txt ratetc appli over


In [25]:
# apply normalization for our dataset
df['message_normalized']=df['message'].progress_apply(normalize_text, args=(porter_stemmer, nlp))
df.head()

  0%|          | 0/5169 [00:00<?, ?it/s]

Unnamed: 0,target,message,message_normalized
0,1,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi great world l...
1,1,Ok lar... Joking wif u oni...,ok lar joke wif oni
2,0,Free entry in 2 a wkly comp to win FA Cup fina...,free entri wkli comp win fa cup final tkt st m...
3,1,U dun say so early hor... U c already then say...,dun say earli hor alreadi say
4,1,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [26]:
# split on train and test datasets
# X_train, X_test, y_train, y_test=train_test_split(df['message_normalized'],df['target'],test_size=0.2, random_state=42)

In [27]:
# define function for model train and ROC-AUC presentation
def get_preds(text_column, algorithm, ngrams=(1,1)):
    # split on train and test datasets
    train_idxs = df.sample(frac=0.7, random_state=42).index
    test_idxs = [idx for idx in df.index if idx not in train_idxs]
    
    X_train = df.loc[train_idxs, text_column]
    X_test = df.loc[test_idxs, text_column]

    y_train = df.loc[train_idxs, 'target']
    y_test = df.loc[test_idxs, 'target']

    if algorithm == 'cv':
        vect = CountVectorizer(ngram_range=ngrams).fit(X_train)
    elif algorithm == 'tfidf':
        vect = TfidfVectorizer(ngram_range=ngrams).fit(X_train)
    else:
        raise ValueError('Select correct algorithm: `cv` or `tfidf`')

    print('Vocabulary length: ', len(vect.vocabulary_))

    # transform the documents in the training data to a document-term matrix

    X_train_vectorized = vect.transform(X_train)
    print('Document-term matrix shape:', X_train_vectorized.shape)

    model = LogisticRegression(random_state=42)
    model.fit(X_train_vectorized, y_train)

    predictions = model.predict(vect.transform(X_test))

    print('AUC: ', roc_auc_score(y_test, predictions))

**Training models and getting ROC-AUC for different algorithms and methods ('Bag of Words', 'TF-IDF', 'N-Grams')**

In [28]:
get_preds('message_normalized', 'cv')

Vocabulary length:  5618
Document-term matrix shape: (3618, 5618)
AUC:  0.9038511298947731


In [29]:
get_preds('message', 'cv')

Vocabulary length:  7046
Document-term matrix shape: (3618, 7046)
AUC:  0.9206270484733483


In [30]:
get_preds('message_normalized', 'tfidf')

Vocabulary length:  5618
Document-term matrix shape: (3618, 5618)
AUC:  0.8086941521476626


In [31]:
get_preds('message', 'tfidf')

Vocabulary length:  7046
Document-term matrix shape: (3618, 7046)
AUC:  0.841879420389857


In [32]:
get_preds('message_normalized', 'cv', ngrams=(1,2))

Vocabulary length:  27049
Document-term matrix shape: (3618, 27049)
AUC:  0.891948421597378


In [33]:
get_preds('message_normalized', 'tfidf', ngrams=(1,2))

Vocabulary length:  27049
Document-term matrix shape: (3618, 27049)
AUC:  0.7896110056925996


In [34]:
get_preds('message', 'cv', ngrams=(1,2))

Vocabulary length:  38027
Document-term matrix shape: (3618, 38027)
AUC:  0.908724340175953


In [35]:
get_preds('message_normalized', 'cv', (2,2))

Vocabulary length:  21431
Document-term matrix shape: (3618, 21431)
AUC:  0.7486631016042781


In [36]:
get_preds('message', 'cv', ngrams=(2,2))

Vocabulary length:  30981
Document-term matrix shape: (3618, 30981)
AUC:  0.7700534759358288


We achieved the highest ROC-AUC score of 0.92 using the 'Bag of Words' method without N-grams. This result aligns with the nature of our specific task of spam classification. Typically, spam messages consist of a set of common words, and their sequence or context within the document is less important. For this reason, methods like "TF-IDF" and the use of "N-Grams" are less effective in this case. Additionally, the fact that the best results were obtained with non-normalized text can be attributed to the presence of symbols, numbers, emojis, etc., which are often removed during normalization but are actually significant indicators of spam.

**Pretrained models usage**

In [37]:
# define a class for work with pretrained vector embeddings models
class WordEmbedding: 

    def __init__(self):
        self.model = {}
        
    def convert(self, source, ipnut_file_path, output_file_path):
        '''
        Converts word embeddings from GloVe format to Word2Vec format
        '''
        if source == 'glove':
            input_file = datapath(ipnut_file_path)
            output_file = get_tmpfile(output_file_path)
            glove2word2vec(input_file, output_file)
        elif source in ['word2vec', 'fasttext', 'from_scratch']:
            pass
        else:
            raise ValueError('Possible value of source are glove, word2vec, fasttext, or from_scratch')
        
    def load(self, source, file_path):
        '''
        Loads a specified word embedding model from a file
        '''
        print(datetime.datetime.now(), 'start: loading', source)
        if source in ['glove', 'fasttext']:
            self.model[source] = gensim.models.KeyedVectors.load_word2vec_format(file_path)
        elif source in ['word2vec', 'from_scratch']:
            self.model[source] = gensim.models.KeyedVectors.load_word2vec_format(file_path, binary=True)
        else:
            raise ValueError('Possible value of source are glove, word2vec, fasttext, or from_scratch')
            
        print(datetime.datetime.now(), 'end: loading', source)
            
        return self
    
    def get_model(self, source):
        '''
        Retrieves the loaded word embedding model
        '''
        if source not in ['glove', 'word2vec', 'fasttext', 'from_scratch']:
            raise ValueError('Possible value of source are glove, word2vec, fasttext, or from_scratch')
            
        return self.model[source]
    
    def get_words(self, source, size=None):
        '''
        Retrieves a list of words from the model
        '''
        if source not in ['glove', 'word2vec', 'fasttext', 'from_scratch']:
            raise ValueError('Possible value of source are glove, word2vec, fasttext, or from_scratch')

        if size is None:
            return [w for w in self.get_model(source=source).key_to_index]
        else:
            results = []
            for i, word in enumerate(self.get_model(source=source).key_to_index):
                if i >= size:
                    break
                results.append(word)
            return results
        
        return Exception('Unexpected flow')
    
    def get_dimension(self, source):
        '''
        Retrieves the dimension of word vectors in the model
        '''
        if source not in ['glove', 'word2vec', 'fasttext', 'from_scratch']:
            raise ValueError('Possible value of source are glove, word2vec, fasttext, or from_scratch')
        
        return self.get_model(source=source).vectors[0].shape[0]
    
    def get_vectors(self, source, words=None):
        '''
        Retrieves vectors for specified words or for all words in the model
        '''
        if source not in ['glove', 'word2vec', 'fasttext', 'from_scratch']:
            raise ValueError('Possible value of source are glove, word2vec, fasttext, or from_scratch')
        
        if words is None:
            words = self.get_words(source=source)
            
        embedding = np.empty((len(words), self.get_dimension(source=source)), dtype=np.float32)
        for i, word in enumerate(words):
            embedding[i] = self.get_vector(source=source, word=word)
                
        return embedding
        
    
    def get_vector(self, source, word):
        '''
        Retrieves the vector representation of a single word
        '''
        if source not in ['glove', 'word2vec', 'fasttext', 'from_scratch']:
            raise ValueError('Possible value of source are glove, word2vec, fasttext, or from_scratch')
            
        if source not in self.model:
            raise ValueError('Did not load %s model yet' % source)
        
        try:
            return self.model[source][word]
        except KeyError as e:
            dims = self.model[source][0].shape
            vect = np.empty(dims)
            vect[:] = np.nan
            return vect

In [68]:
# define paths to pretrained word vectors
word2vec_file_path = f'{input_dir_path}googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin'
fasttext_file_path = f'{input_dir_path}fasttext-wikinews/wiki-news-300d-1M.vec'


In [69]:
# let's create a function that will convert token list to their vector representations
def tok2vec(word_emb, tokens, source, avg):
    vects = word_emb.get_vectors(source=source, words=tokens)
    
    if avg == "mean":
        return np.nanmean(vects, axis=0)
    elif avg == "sum":
        return np.nansum(vects, axis=0)
    else:
        raise ValueError("Select correct averaging method: 'sum' or 'mean'")

In [70]:
# create function for model train, test and ROC-AUC metrics display
def get_pred_for_pretrained(source, word_emb):
    # split on test and train datasets
    train_idxs = df.sample(frac=0.8, random_state=42).index
    test_idxs = [idx for idx in df.index if idx not in train_idxs]

    # apply word_tokenize method of nltk package for text tokenization and then get the vector representations
    X_train = df.loc[train_idxs, 'message_normalized'].apply(
        word_tokenize).apply(lambda x: tok2vec(word_emb, x, source, 'sum')).to_numpy()

    X_test = df.loc[test_idxs, 'message_normalized'].apply(
        word_tokenize).apply(lambda x: tok2vec(word_emb, x, source, 'sum')).to_numpy()

    X_train = np.stack(X_train, axis=0)
    X_test = np.stack(X_test, axis=0)

    y_train = df.loc[train_idxs, 'target']
    y_test = df.loc[test_idxs, 'target']
    
    # build a LogisticRegression model and train it
    model = LogisticRegression(random_state=42, max_iter=1000)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    print(f'ROC-AUC {source}: ', roc_auc_score(y_test, predictions))

In [71]:
# create object WordEmbeddings and load pretrained Word2Vec model
word_embeddings_word2vec = WordEmbedding()
source_w2v='word2vec'
word_embeddings_word2vec.load(source=source_w2v, file_path=word2vec_file_path)

get_pred_for_pretrained(source_w2v, word_embeddings_word2vec)

2024-09-28 12:41:36.382069 start: loading word2vec
2024-09-28 12:42:31.625901 end: loading word2vec
ROC-AUC word2vec:  0.917079104102005


In [72]:
# create object WordEmbeddings and load pretrained FastText model
word_embeddings_fast_text = WordEmbedding()
source_ft='fasttext'
word_embeddings_fast_text.load(source=source_ft, file_path=fasttext_file_path)

get_pred_for_pretrained(source_ft, word_embeddings_fast_text)

2024-09-28 12:42:53.163487 start: loading fasttext
2024-09-28 12:48:25.122506 end: loading fasttext
ROC-AUC fasttext:  0.9106245747467122


Both pretrained models 'Word2Vec' and 'FastText' yield strong ROC-AUC results in the range of 0.91-0.92, demonstrating their effectiveness in spam detection tasks. However, the "Bag of Words" method offers similar ROC-AUC values while being simpler and faster, making it a highly efficient alternative as well.