In [8]:
import pandas as pd
import numpy as np
import re
from sklearn.utils import resample
import random
from collections import Counter

Load and Preprocess Data

In [9]:
# Read the raw corpus, drop rows that have no e-mail body, and force the
# remaining bodies to plain strings so the regex-based cleaning can run on them.
data = (
    pd.read_csv("spam_or_not_spam.csv")
    .dropna(subset=['email'])
    .assign(email=lambda frame: frame['email'].astype(str))
)

# Partition by label: 1 = spam, 0 = not spam.
spamData = data.loc[data['label'].eq(1)]
notSpamData = data.loc[data['label'].eq(0)]

# Balance the classes by drawing spam rows with replacement until there are as
# many of them as non-spam rows (fixed seed keeps the draw repeatable).
unsampledSpam = resample(
    spamData,
    replace=True,
    n_samples=notSpamData.shape[0],
    random_state=42,
)

# Stack both classes, shuffle every row with a fixed seed, and renumber the index.
newData = (
    pd.concat([unsampledSpam, notSpamData])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Function to preprocess text
def preProcessData(text):
    """
    Normalise a raw e-mail string for tokenisation.

    Steps, in order:
      1. delete every run of digits;
      2. delete every character that is neither a word character nor
         whitespace (underscores count as word characters and are kept);
      3. lowercase the result.

    Whitespace is left as-is, so gaps left behind by removed characters are
    not collapsed.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned, lowercased text.
    """
    withoutDigits = re.sub(r'\d+', '', text)
    withoutSymbols = re.sub(r'[^\w\s]', '', withoutDigits)
    return withoutSymbols.lower()

# Clean every e-mail body, then break the cleaned text into whitespace-separated
# tokens (one list of words per row).
newData['cleaned_email'] = newData['email'].map(preProcessData)
newData['tokens'] = newData['cleaned_email'].map(str.split)

print("Data preprocessing completed.")

Data preprocessing completed.


Build Vocabulary and Initialize Embeddings

In [10]:
# Build the vocabulary: one entry per distinct token, numbered in order of
# first appearance in the corpus.
corpus = newData['tokens'].tolist()
wordCount = Counter(word for sentence in corpus for word in sentence)
vocab = list(wordCount.keys())
wordToIndex = {word: i for i, word in enumerate(vocab)}
indexToWord = dict(enumerate(vocab))
vocabSize = len(vocab)

print(f"Vocabulary size: {vocabSize}")

# Initialize word and context embeddings uniformly in [-1, 1).
# A seeded generator is used so that re-running the notebook starts training
# from the same weights (the resampling step above is seeded with 42 as well).
embeddingsDIM = 10
rng = np.random.default_rng(42)
wordEmbeddings = rng.uniform(-1, 1, (vocabSize, embeddingsDIM))
contextEmbeddings = rng.uniform(-1, 1, (vocabSize, embeddingsDIM))

Vocabulary size: 34212


Define Helper Functions

In [11]:
def sigmoid(x):
    """
    Numerically stable logistic function 1 / (1 + exp(-x)).

    The naive formula evaluates exp(-x), which overflows for large negative
    x. Here only exp(-|x|) is evaluated, which is always in (0, 1], and the
    algebraically equivalent form is chosen per sign of x.

    Parameters
    ----------
    x : float or array-like
        Input value(s).

    Returns
    -------
    numpy scalar or numpy.ndarray
        Element-wise sigmoid; a scalar for scalar input, otherwise an array
        of the same shape as ``x``.
    """
    values = np.asarray(x, dtype=float)
    decay = np.exp(-np.abs(values))  # never overflows
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x)  (same value, safe form)
    result = np.where(values >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d array back into a scalar and leaves
    # higher-dimensional arrays unchanged.
    return result[()]

def generateTrainingData(corpus, windowSize):
    """
    Build (center, context) word pairs for skip-gram training.

    For every position in every sentence, the word at that position is paired
    with each other word lying at most ``windowSize`` positions to its left or
    right (the window is truncated at sentence boundaries).

    Parameters
    ----------
    corpus : iterable of sequences
        Tokenised sentences.
    windowSize : int
        Maximum distance between a center word and its context word.

    Returns
    -------
    list of tuple
        ``(centerWord, contextWord)`` pairs, in sentence order, then center
        position order, then context position order.
    """
    pairs = []
    for tokens in corpus:
        length = len(tokens)
        for position, center in enumerate(tokens):
            first = max(0, position - windowSize)
            stop = min(length, position + windowSize + 1)
            pairs.extend(
                (center, tokens[neighbour])
                for neighbour in range(first, stop)
                if neighbour != position
            )
    return pairs

def getNegativeSamples(targetIndex, vocabSize, numberOfSamples):
    """
    Draw vocabulary indices uniformly at random, excluding ``targetIndex``.

    Sampling is with replacement, so the same index may appear more than once.

    Parameters
    ----------
    targetIndex : int
        Index that must never be returned.
    vocabSize : int
        Number of words in the vocabulary; samples lie in [0, vocabSize - 1].
    numberOfSamples : int
        How many indices to draw. Zero or a negative value yields ``[]``.

    Returns
    -------
    list of int
        ``numberOfSamples`` indices, none equal to ``targetIndex``.

    Raises
    ------
    ValueError
        If samples are requested but no index other than ``targetIndex``
        exists (empty vocabulary, or a one-word vocabulary whose only index
        is the target). Without this check the rejection loop below would
        never terminate in the one-word case.
    """
    if numberOfSamples <= 0:
        return []
    if vocabSize < 1 or (vocabSize == 1 and targetIndex == 0):
        raise ValueError(
            "Cannot draw negative samples: the vocabulary has no index other than the target."
        )

    samples = []
    while len(samples) < numberOfSamples:
        sampleIndex = random.randint(0, vocabSize - 1)
        # Rejection sampling: discard draws that hit the excluded index.
        if sampleIndex != targetIndex:
            samples.append(sampleIndex)
    return samples

Generate Training Pairs

In [12]:
# Skip-gram hyperparameters: context radius around each center word, and the
# number of negative draws made per positive pair during training.
windowSize, negativeSamples = 2, 5

# Expand the tokenised corpus into (center, context) pairs.
trainingPairs = generateTrainingData(corpus=corpus, windowSize=windowSize)
print(f"Total training pairs: {len(trainingPairs)}")

Total training pairs: 5224154


Train Word2Vec Model

In [None]:
# Training parameters
learningRate = 0.01
epochs = 5

# Seed Python's RNG so the per-epoch shuffle and the negative sampling below
# are repeatable from one run of this cell to the next.
random.seed(42)

# Training loop: skip-gram with negative sampling, one SGD step per
# (center, context) pair and one further step per negative sample.
for epoch in range(epochs):
    totalLoss = 0.0
    random.shuffle(trainingPairs)  # Shuffle training data each epoch (in place)

    for centerWord, contextWord in trainingPairs:
        centerIndex = wordToIndex[centerWord]
        contextIndex = wordToIndex[contextWord]

        # Positive sample
        z = np.dot(wordEmbeddings[centerIndex], contextEmbeddings[contextIndex])
        p_pos = sigmoid(z)
        # -log(sigmoid(z)) == log(1 + exp(-z)). logaddexp evaluates this
        # without returning inf when p_pos underflows to 0.
        loss = np.logaddexp(0.0, -z)

        # Gradients for positive sample (both taken before either update)
        gradientCenter = (p_pos - 1) * contextEmbeddings[contextIndex]
        gradientContext = (p_pos - 1) * wordEmbeddings[centerIndex]

        # Update embeddings for positive sample
        wordEmbeddings[centerIndex] -= learningRate * gradientCenter
        contextEmbeddings[contextIndex] -= learningRate * gradientContext

        # Negative sampling: exclude the true context word, otherwise the
        # positive target itself could be drawn as a negative and pushed away
        # right after being pulled closer.
        negativeSampleIndices = getNegativeSamples(contextIndex, vocabSize, negativeSamples)
        for negativeIndex in negativeSampleIndices:
            z_neg = np.dot(wordEmbeddings[centerIndex], contextEmbeddings[negativeIndex])
            p_neg = sigmoid(z_neg)
            # -log(1 - sigmoid(z)) == log(1 + exp(z)), again in stable form.
            loss += np.logaddexp(0.0, z_neg)

            # Gradients for negative sample
            gradientCenter_Negative = p_neg * contextEmbeddings[negativeIndex]
            gradientContext_Negative = p_neg * wordEmbeddings[centerIndex]

            # Update embeddings for negative sample
            wordEmbeddings[centerIndex] -= learningRate * gradientCenter_Negative
            contextEmbeddings[negativeIndex] -= learningRate * gradientContext_Negative

        totalLoss += loss

    # max(..., 1) avoids a ZeroDivisionError when there are no training pairs.
    averageLoss = totalLoss / max(len(trainingPairs), 1)
    print(f"Epoch {epoch + 1}/{epochs}, Average Loss: {averageLoss:.4f}")

print("Word2Vec training completed.")

Save the Trained Embeddings

In [None]:
# Write the center-word embedding matrix to disk in NumPy's .npy format.
# Only wordEmbeddings is stored; contextEmbeddings is not saved here.
np.save(file="wordEmbeddings.npy", arr=wordEmbeddings)
print("Trained embeddings saved as 'wordEmbeddings.npy'")

Trained embeddings saved as 'wordEmbeddings.npy'
