In [1]:
# General libraries
import re
import time

# Data manipulation and analysis
import pandas as pd
import numpy as np

# NLP and text processing
import spacy
import gensim
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

# Machine learning tools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

# Progress bar for loops
from tqdm.auto import tqdm

# Download NLTK resources
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('punkt_tab')
! unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package wordnet to /Users/julia/nltk_data...


unzip:  cannot find or open /usr/share/nltk_data/corpora/wordnet.zip, /usr/share/nltk_data/corpora/wordnet.zip.zip or /usr/share/nltk_data/corpora/wordnet.zip.ZIP.


[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/julia/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/julia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/julia/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
# Read the raw dataset; the file is decoded as ISO-8859-1 rather than UTF-8.
raw_csv_path = 'spam.csv'
raw_csv_encoding = "ISO-8859-1"
data = pd.read_csv(raw_csv_path, encoding=raw_csv_encoding)

# Print the column summary, then show the first rows as the cell result.
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
# The message body lives in 'v2'; the 'Unnamed' columns (mostly empty according
# to data.info()) are appended to it so no text is lost.
columns_to_concat = ['v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
text_parts = data[columns_to_concat].fillna('').astype(str)
data['text'] = text_parts.agg(' '.join, axis=1)

# Binary target: 1 when the label is 'spam', 0 otherwise.
is_spam = data['v1'] == 'spam'
data['spam_ham'] = np.where(is_spam, 1, 0)

# Keep only the merged text and the numeric label.
data = data.drop(columns=['v1'] + columns_to_concat)
print(data.head(15))

                                                 text  spam_ham
0   Go until jurong point, crazy.. Available only ...         0
1                    Ok lar... Joking wif u oni...            0
2   Free entry in 2 a wkly comp to win FA Cup fina...         1
3   U dun say so early hor... U c already then say...         0
4   Nah I don't think he goes to usf, he lives aro...         0
5   FreeMsg Hey there darling it's been 3 week's n...         1
6   Even my brother is not like to speak with me. ...         0
7   As per your request 'Melle Melle (Oru Minnamin...         0
8   WINNER!! As a valued network customer you have...         1
9   Had your mobile 11 months or more? U R entitle...         1
10  I'm gonna be home soon and i don't want to tal...         0
11  SIX chances to win CASH! From 100 to 20,000 po...         1
12  URGENT! You have won a 1 week FREE membership ...         1
13  I've been searching for the right words to tha...         0
14             I HAVE A DATE ON SUNDAY W

In [4]:
# spaCy pipeline used for lemmatization in preprocess_text.
nlp = spacy.load("en_core_web_sm")

# NLTK English stop words, stored as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# (contraction, expansion) pairs; all keys are lowercase.
_contraction_pairs = (
    ("can't", "cannot"),
    ("won't", "will not"),
    ("it's", "it is"),
    ("i'm", "i am"),
    ("he's", "he is"),
    ("she's", "she is"),
    ("they're", "they are"),
    ("we're", "we are"),
    ("you've", "you have"),
    ("i've", "i have"),
    ("don't", "do not"),
    ("didn't", "did not"),
    ("isn't", "is not"),
    ("aren't", "are not"),
    ("wasn't", "was not"),
    ("weren't", "were not"),
    ("there's", "there is"),
    ("that's", "that is"),
)
contractions = dict(_contraction_pairs)

def preprocess_text(text):
    """Normalize a raw message into a space-separated string of lemmas.

    Steps, in order:
      1. Replace HTML-like tags, e-mail-like tokens and http(s) URLs with a space.
      2. Lowercase the text and expand the contractions listed in ``contractions``
         (matched on whitespace-separated words).
      3. Tokenize with NLTK and drop tokens found in ``stop_words``.
      4. Remove every character that is not a letter, an apostrophe or a space.
      5. Lemmatize with the spaCy pipeline ``nlp`` and keep lemmas longer than
         one character.

    Args:
        text: The raw message as a string.

    Returns:
        The cleaned text with single spaces between lemmas (may be empty).
    """
    text = re.sub(r"<[^>]*>", " ", text)
    # These patterns must not require trailing whitespace: an address or URL
    # that ends the message has none and would otherwise survive the cleanup.
    text = re.sub(r"\S*@\S*", " ", text)
    # Runs before lowercasing, so match the scheme case-insensitively.
    text = re.sub(r"https?://\S*", " ", text, flags=re.IGNORECASE)
    text = text.lower()
    text = " ".join([contractions.get(word, word) for word in text.split()])

    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]

    text = " ".join(tokens)
    # Digits and punctuation are deleted outright (not replaced by a space).
    text = re.sub(r"[^a-zA-Z' ]", "", text)

    doc = nlp(text)
    text = " ".join([token.lemma_ for token in doc if len(token.lemma_) > 1])
    text = re.sub(r"\s+", " ", text).strip()

    return text


In [5]:
# Clean every message, then keep only the cleaned text and the label.
data['cleaned_text'] = data['text'].map(preprocess_text)
data = data.drop(columns=['text'])

X, y = data['cleaned_text'], data['spam_ham']

# 80/20 split, stratified on the label, with a fixed seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")

Training set size: 4457
Validation set size: 1115


In [6]:
# Bag-of-words counts over unigrams and bigrams, limited to 1000 features.
bow_options = dict(max_features=1000, ngram_range=(1, 2), stop_words='english')
vectorizer_bow = CountVectorizer(**bow_options)

# The vocabulary is learned on the training split only; the validation split
# is merely transformed with it.
X_train_bow = vectorizer_bow.fit_transform(X_train)
X_val_bow = vectorizer_bow.transform(X_val)

for caption, matrix in (
    ("BoW матриця для навчального набору:", X_train_bow),
    ("BoW матриця для валідаційного набору:", X_val_bow),
):
    print(caption, matrix.shape)

BoW матриця для навчального набору: (4457, 1000)
BoW матриця для валідаційного набору: (1115, 1000)


In [7]:
# TF-IDF weights over unigrams and bigrams, limited to 1000 features.
tfidf_options = dict(max_features=1000, ngram_range=(1, 2), stop_words='english')
vectorizer_tfidf = TfidfVectorizer(**tfidf_options)

# Fit on the training split only; the validation split reuses that vocabulary
# and those IDF weights.
X_train_tfidf = vectorizer_tfidf.fit_transform(X_train)
X_val_tfidf = vectorizer_tfidf.transform(X_val)

for caption, matrix in (
    ("TF-IDF матриця для навчального набору:", X_train_tfidf),
    ("TF-IDF матриця для валідаційного набору:", X_val_tfidf),
):
    print(caption, matrix.shape)

TF-IDF матриця для навчального набору: (4457, 1000)
TF-IDF матриця для валідаційного набору: (1115, 1000)


In [8]:
# Locations of the pre-trained embedding files, relative to the notebook.
# NOTE(review): `wiki.simple.vec` is the file name of fastText's Simple English
# Wikipedia vectors and `GoogleNews-vectors-negative300.bin` is the file name of
# Google's word2vec vectors, so the first two variable names look swapped
# relative to the files they point to — confirm before reporting results.
word2vec_file_path = 'embeddings/wiki.simple.vec'
fasttext_file_path = 'embeddings/GoogleNews-vectors-negative300.bin'
glove_file_path = 'embeddings/glove.6B.300d.txt'

In [9]:
def load_word2vec_model(file_path, binary=False):
    """Load word vectors stored in word2vec format through gensim.

    Args:
        file_path: Path of the vectors file.
        binary: Forwarded to gensim; True for the binary word2vec format,
            False for the text format.

    Returns:
        The object returned by ``KeyedVectors.load_word2vec_format``.
    """
    print(f"Loading model from: {file_path}")
    keyed_vectors = gensim.models.KeyedVectors.load_word2vec_format(
        file_path,
        binary=binary,
    )
    return keyed_vectors

def load_glove_model(file_path):
    """Load GloVe text embeddings into a ``{token: vector}`` dictionary.

    Each non-blank line of the file is expected to hold a token followed by its
    vector components, separated by single spaces.

    Args:
        file_path: Path of the UTF-8 encoded GloVe text file.

    Returns:
        dict mapping each token (str) to a 1-D ``float32`` numpy array.
    """
    print(f"Loading GloVe embeddings from: {file_path}")
    glove_model = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Split on the literal space separator only: a bare split() would
            # swallow tokens that are themselves Unicode whitespace and shift
            # the first vector component into the token position.
            values = line.rstrip().split(' ')
            if len(values) < 2:
                # Blank or vector-less line (e.g. a trailing newline): skip it.
                continue
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            glove_model[word] = vector
    print(f"Loaded {len(glove_model)} words from GloVe embeddings.")
    return glove_model

def get_embeddings(texts, model, vector_size, model_type="word2vec"):
    """Turn each text into the mean of its in-vocabulary word vectors.

    Args:
        texts: Iterable of whitespace-tokenizable strings.
        model: Word-vector lookup supporting ``model[word]``. For
            ``model_type="glove"`` membership is tested on ``model`` itself
            (a dict); for any other value it is tested on ``model.key_to_index``.
        vector_size: Dimensionality of the vectors; used for the zero vector of
            texts with no known word and for the shape of an empty result.
        model_type: ``"glove"`` selects the dict lookup; every other value
            selects the ``key_to_index`` lookup.

    Returns:
        numpy array of shape ``(number of texts, vector_size)``. Texts without
        any in-vocabulary word map to an all-zero row.
    """
    # The vocabulary container does not change between texts, so pick it once.
    vocabulary = model if model_type == "glove" else model.key_to_index

    embeddings = []
    for text in texts:
        word_vectors = [model[word] for word in text.split() if word in vocabulary]
        if word_vectors:
            embeddings.append(np.mean(word_vectors, axis=0))
        else:
            embeddings.append(np.zeros(vector_size))

    if not embeddings:
        # np.array([]) would have shape (0,); keep the result 2-D so downstream
        # estimators still see (n_samples, n_features).
        return np.zeros((0, vector_size))
    return np.array(embeddings)

In [10]:
# Bind each model name to the embedding family its file actually contains.
# `wiki.simple.vec` (text format) holds fastText vectors and
# `GoogleNews-vectors-negative300.bin` (binary format) holds word2vec vectors;
# the path variable names are mislabeled, so they are cross-assigned here to
# keep every downstream "Word2Vec" / "FastText" result correctly labelled.
# Load order is unchanged.
fasttext_model = load_word2vec_model(word2vec_file_path, binary=False)
glove_model = load_glove_model(glove_file_path)
word2vec_model = load_word2vec_model(fasttext_file_path, binary=True)

Loading model from: embeddings/wiki.simple.vec
Loading GloVe embeddings from: embeddings/glove.6B.300d.txt
Loaded 400000 words from GloVe embeddings.
Loading model from: embeddings/GoogleNews-vectors-negative300.bin


In [11]:
# Build mean-pooled document embeddings for both splits with each model.
print("Generating Word2Vec embeddings...")
X_train_embed_Word2Vec, X_val_embed_Word2Vec = (
    get_embeddings(split, word2vec_model, vector_size=300, model_type="word2vec")
    for split in (X_train, X_val)
)

print("Generating GloVe embeddings...")
X_train_embed_GloVe, X_val_embed_GloVe = (
    get_embeddings(split, glove_model, vector_size=300, model_type="glove")
    for split in (X_train, X_val)
)

print("Generating FastText embeddings...")
X_train_embed_FastText, X_val_embed_FastText = (
    get_embeddings(split, fasttext_model, vector_size=300, model_type="fasttext")
    for split in (X_train, X_val)
)

# Report the shape of every embedding matrix.
for caption, matrix in (
    ("Word2Vec training embeddings shape:", X_train_embed_Word2Vec),
    ("Word2Vec validation embeddings shape:", X_val_embed_Word2Vec),
    ("GloVe training embeddings shape:", X_train_embed_GloVe),
    ("GloVe validation embeddings shape:", X_val_embed_GloVe),
    ("FastText training embeddings shape:", X_train_embed_FastText),
    ("FastText validation embeddings shape:", X_val_embed_FastText),
):
    print(caption, matrix.shape)

Generating Word2Vec embeddings...
Generating GloVe embeddings...
Generating FastText embeddings...
Word2Vec training embeddings shape: (4457, 300)
Word2Vec validation embeddings shape: (1115, 300)
GloVe training embeddings shape: (4457, 300)
GloVe validation embeddings shape: (1115, 300)
FastText training embeddings shape: (4457, 300)
FastText validation embeddings shape: (1115, 300)
