In [None]:
from zipfile import ZipFile
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import re 
from collections import Counter
from symspellpy import SymSpell, Verbosity

In [78]:
def extract_subject(email_lines):
    """Split each e-mail text into a leading subject tag and the remaining content.

    A text counts as tagged when, after stripping surrounding whitespace, it
    starts with ``Subject:`` followed by a parenthesis-free tag, a colon and at
    least one further character. Texts that do not match get an empty tag and
    keep their whole stripped text as content.

    Args:
        email_lines: Iterable of strings, one per e-mail.

    Returns:
        A ``(subjects, contents)`` pair of lists with one entry per input text.
    """
    tag_pattern = re.compile(r"Subject:\s*([^\(\)]+?)\s*:\s*(.+)")
    subjects, contents = [], []

    for raw in email_lines:
        stripped = raw.strip()
        found = tag_pattern.match(stripped)
        if found is None:
            # No recognisable tag: keep the full text as content.
            tag, body = "", stripped
        else:
            tag, body = found.group(1).strip(), found.group(2).strip()
        subjects.append(tag)
        contents.append(body)

    return subjects, contents


In [79]:
# Read the e-mail CSV straight out of the zip archive.
with ZipFile('archive.zip') as zipObject, zipObject.open('emails.csv') as csvObject:
    df = pd.read_csv(csvObject)

# Keep the raw text column and the "spam" column as separate series.
X = df["text"]
y = df["spam"]

# Separate the subject tag from the rest of each text and tally the tags.
X_subject, X_content = extract_subject(X)
count = Counter(X_subject)

In [58]:
def data_preprocessing(X):
    """Lower-case, strip non-letters from, and tokenize a collection of texts.

    Args:
        X: Iterable of strings.

    Returns:
        A 4-tuple ``(count, X_lower, X_nonspecial, X_tokenize)`` where
        ``count`` is a list of ``(token, frequency)`` pairs for the 2000 most
        common tokens, ``X_lower`` holds the lower-cased texts,
        ``X_nonspecial`` the texts with every character other than ASCII
        letters and whitespace removed, and ``X_tokenize`` the
        whitespace-split token lists.
    """
    lowered = [text.lower() for text in X]
    letters_only = [re.sub(r'[^a-zA-Z\s]', '', text) for text in lowered]
    tokenized = [text.split() for text in letters_only]

    # Tokens are fed to the counter in document order, so equally frequent
    # tokens keep their first-seen order in the ranking.
    frequencies = Counter(token for tokens in tokenized for token in tokens)
    return frequencies.most_common(2000), lowered, letters_only, tokenized


In [None]:
# Pre-process the e-mail contents and keep every intermediate representation.
token_frequency, X_lower, X_nonspecial, X_tokenize = data_preprocessing(X_content)

# Map each token to its 1-based position in the frequency table.
dictionary = {token: rank for rank, (token, _) in enumerate(token_frequency, start=1)}
# Bare expression in the middle of the cell: evaluated, but not displayed.
X_lower
def split_sentences(X, min_sentence_lenght=8):
    """Break texts into cleaned sentence fragments and keep the long ones.

    Each text is split on runs of ``.``, ``?`` or ``,``. Within a fragment,
    every character that is not an ASCII letter or whitespace becomes a space,
    stand-alone single letters are dropped, and whitespace runs are collapsed.

    Args:
        X: Iterable of strings.
        min_sentence_lenght: A fragment is kept only when it has strictly more
            than this many space-separated words.

    Returns:
        List of cleaned, stripped fragments in input order.
    """
    kept = []
    for text in X:
        for fragment in re.split(r'[.?,]+\s*', text):
            letters = re.sub(r'[^a-zA-Z\s]', ' ', fragment)
            without_singles = re.sub(r'\b[a-zA-Z]\b', '', letters)
            normalized = re.sub(r'\s+', ' ', without_singles).strip()
            if len(normalized.split(' ')) > min_sentence_lenght:
                kept.append(normalized)
    return kept
# Split the lower-cased e-mail contents into cleaned sentence fragments.
sentences = split_sentences(X_lower)
# Show a short preview only; displaying the full list floods the cell output.
sentences[:10]

In [None]:
def spell_checker(sentences, dictionary_path="frequency_dictionary_en_82_765.txt"):
    """Replace each word of each sentence with its closest SymSpell suggestion.

    Args:
        sentences: Iterable of strings; each is split on whitespace.
        dictionary_path: Path of the frequency dictionary handed to
            ``SymSpell.load_dictionary`` (term in column 0, count in column 1).

    Returns:
        List of sentences, each rebuilt by joining the corrected words with a
        single space. Words without any suggestion are kept unchanged.

    Raises:
        FileNotFoundError: If the dictionary could not be loaded.
    """
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    # load_dictionary signals a missing file through its return value rather
    # than an exception; without this check every word would silently pass
    # through uncorrected.
    if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
        raise FileNotFoundError(f"SymSpell dictionary not found: {dictionary_path}")

    corrected_sentences = []
    for sentence in sentences:
        corrected_words = []
        for word in sentence.split():
            suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
            # Take the first suggestion when there is one, else keep the word.
            corrected_words.append(suggestions[0].term if suggestions else word)
        corrected_sentences.append(' '.join(corrected_words))
    return corrected_sentences

# Quick check of the spell checker on a deliberately misspelled sentence.
# A dedicated name keeps the `sentences` list computed from the e-mails intact.
demo_sentences = ["hi i studi in schooo for eleven hourse"]
corrected = spell_checker(demo_sentences)
print(corrected)


In [None]:
# Read the sentence corpus straight out of its zip archive.
with ZipFile('cv-unique-has-end-punct-sentences.csv.zip') as zip_object, \
        zip_object.open('cv-unique-has-end-punct-sentences.csv') as file_object:
    df = pd.read_csv(file_object)

sentences = df['sentence']
lower = [sentence.lower() for sentence in sentences]

# Break the corpus into cleaned fragments that pass the length filter.
cleaned_sentences = split_sentences(lower)

# Only the frequency table is needed here; the other outputs are discarded.
frequency_table, _, _, _ = data_preprocessing(cleaned_sentences)
# Map each token to its 0-based position in the frequency table.
frequency_dict = {word: position for position, (word, _count) in enumerate(frequency_table)}
vocab_size = len(frequency_dict)

def make_pairs_to_train(cleaned_sentences, frequency_dict, max_length):
    """Collect consecutive in-vocabulary word pairs, up to a maximum count.

    Args:
        cleaned_sentences: Iterable of strings, split on whitespace.
        frequency_dict: Container of known words; a pair is kept only when
            both of its words are in it.
        max_length: Maximum number of pairs to return. A value of zero or
            less yields an empty list.

    Returns:
        List of ``(word, next_word)`` tuples in corpus order, with at most
        ``max_length`` entries.
    """
    pairs = []
    # Without this guard a non-positive limit could still let one pair through,
    # because the limit is only checked after a pair has been appended.
    if max_length <= 0:
        return pairs

    for sentence in cleaned_sentences:
        words = sentence.split()
        for w1, w2 in zip(words, words[1:]):
            if w1 in frequency_dict and w2 in frequency_dict:
                pairs.append((w1, w2))
                # The list only grows here, so this is the only place the
                # limit can be reached.
                if len(pairs) >= max_length:
                    return pairs
    return pairs

# Collect at most 30000 consecutive in-vocabulary word pairs from the corpus.
pairs = make_pairs_to_train(cleaned_sentences, frequency_dict, max_length=30000)

def one_hot(index, size):
    """Return a zero vector of length ``size`` with a 1 at position ``index``.

    Args:
        index: Position to set to 1.
        size: Length of the vector.

    Returns:
        NumPy array created by ``np.zeros(size)`` with the chosen entry set to 1.
    """
    encoded = np.zeros(size)
    encoded[index] = 1
    return encoded

# Turn every word pair into an (input, target) pair of one-hot vectors.
# Each entry stores two dense vectors of length vocab_size.
dataset = []
for w1, w2 in pairs:
    idx1 = frequency_dict[w1]
    idx2 = frequency_dict[w2]
    input_vector = one_hot(idx1, vocab_size)
    target_vector = one_hot(idx2, vocab_size)
    dataset.append((input_vector, target_vector))
# Show a short preview only; displaying every pair floods the cell output.
pairs[:10]

    

In [None]:
class CBOWTrainer:
    """Two-layer softmax network trained on (input, target) vector pairs.

    An input vector is projected through ``weights_input`` to a hidden vector,
    then through ``weights_output`` to scores that are turned into
    probabilities with a softmax. Training uses per-sample gradient descent on
    the cross-entropy loss.

    Attributes:
        weights_input: Array of shape ``(vocab_size, hidden_size)``.
        weights_output: Array of shape ``(hidden_size, vocab_size)``.
        loss_history: Summed loss of each epoch of the most recent ``train()``
            call (empty before training).
    """

    def __init__(self, data, epochs, vocab_size, hidden_size, learning_rate, seed=None):
        """Store the hyper-parameters and initialise both weight matrices.

        Args:
            data: Iterable of ``(input_vector, target_vector)`` pairs; it is
                iterated once per epoch.
            epochs: Number of passes over ``data``.
            vocab_size: Length of the input and target vectors.
            hidden_size: Length of the hidden vector.
            learning_rate: Step size of the gradient updates.
            seed: Optional seed for the weight initialisation. When omitted the
                global NumPy random state is used.
        """
        self.data = data
        self.epochs = epochs
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.learning_rate = learning_rate
        # A dedicated generator makes the initial weights reproducible.
        self._rng = np.random.default_rng(seed) if seed is not None else None
        self.loss_history = []
        self.weights_input = self.initialize_weights(vocab_size, hidden_size)
        self.weights_output = self.initialize_weights(hidden_size, vocab_size)

    def initialize_weights(self, input_size, output_size):
        """Draw an ``(input_size, output_size)`` matrix uniformly from ``[-limit, limit)``.

        ``limit`` is ``sqrt(6 / (input_size + output_size))``.
        """
        limit = np.sqrt(6 / (input_size + output_size))
        rng = getattr(self, "_rng", None)
        uniform = np.random.uniform if rng is None else rng.uniform
        return uniform(-limit, limit, (input_size, output_size))

    def softmax(self, x):
        """Return the softmax of ``x``, normalised over all of its elements."""
        # Subtracting the maximum keeps the exponentials from overflowing.
        x = x - np.max(x)
        exp_x = np.exp(x)
        return exp_x / np.sum(exp_x)

    def cross_entropy_loss(self, y_pred, y_true):
        """Return the cross-entropy between predicted and true distributions."""
        # The small constant avoids log(0).
        return -np.sum(y_true * np.log(y_pred + 1e-9))

    def forward(self, input_vector, target_vector):
        """Run one forward pass.

        Returns:
            Tuple ``(y_pred, loss, hidden_activation)``.
        """
        hidden_activation = np.dot(input_vector, self.weights_input)
        output_scores = np.dot(hidden_activation, self.weights_output)
        y_pred = self.softmax(output_scores)
        loss = self.cross_entropy_loss(y_pred, target_vector)
        return y_pred, loss, hidden_activation

    def train(self):
        """Train for ``self.epochs`` epochs, updating the weights after every sample.

        Prints the summed loss of each epoch and records it in
        ``self.loss_history``.
        """
        self.loss_history = []
        for epoch in range(self.epochs):
            total_loss = 0
            for input_vector, target_vector in self.data:
                y_pred, loss, hidden_activation = self.forward(input_vector, target_vector)

                # Gradient of the loss with respect to the output scores.
                error = y_pred - target_vector
                # Both gradients are computed before either matrix is updated.
                dW2 = np.outer(hidden_activation, error)
                dW1 = np.outer(input_vector, np.dot(self.weights_output, error))
                self.weights_input -= self.learning_rate * dW1
                self.weights_output -= self.learning_rate * dW2

                total_loss += loss

            self.loss_history.append(total_loss)
            print(f"Epoch {epoch+1}/{self.epochs}, Loss: {total_loss:.4f}")

    def convert_(self, data):
        """Project an input vector through ``weights_input``.

        Prints a notice when the vector sums to zero; the projection is
        returned either way.
        """
        if np.sum(data) == 0:
            print("This word does not exist in the vocab.")
        return np.dot(data, self.weights_input)


In [None]:
# The trainer's vocabulary size must equal the one-hot vector length; a
# hard-coded 2000 fails with a shape error when the corpus has fewer
# distinct tokens than that.
wordtovec = CBOWTrainer(dataset, 20, vocab_size, 10, 0.05)
wordtovec.train()

In [None]:

# One-hot encode the two probe words; a word missing from frequency_dict
# raises KeyError here.
attempt_2 = one_hot(frequency_dict['good'], vocab_size)
attempt_3 = one_hot(frequency_dict['bad'], vocab_size)

# Project each one-hot vector through the trained input weights.
alpha = wordtovec.convert_(attempt_2)
beta = wordtovec.convert_(attempt_3)
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    The result is signed: 1 for vectors pointing the same way, -1 for opposite
    directions, 0 for orthogonal ones.

    Args:
        a: 1-D array-like.
        b: 1-D array-like of the same length.

    Returns:
        The cosine similarity, or 0.0 when either vector has zero norm (the
        angle is undefined in that case).
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # Avoid a division by zero for all-zero vectors.
        return 0.0
    # No absolute value: the sign separates similar from opposite directions.
    return np.dot(a, b) / norm_product
# Print the similarity score of the two vectors returned by convert_.
print("the cosine similarity is " , cosine_similarity(alpha,beta)) 
# The trailing tuple is the last expression of the cell, so both vectors are displayed.
alpha , beta