In [94]:
import torch as t
import pandas as pd 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from torch.nn.utils.rnn import pad_sequence

In [None]:
df = pd.read_csv("spam.csv" , encoding="latin1")
to_drop = ["Unnamed: 2"	,"Unnamed: 3"	,"Unnamed: 4"]
df.drop(columns=to_drop , inplace=True)
stop_words = stopwords.words("english")

words =[]
for sentence in df["v2"]:
    tokenWord = word_tokenize(sentence , "english")
    words.append(tokenWord)
    
all_words = [w.lower() for sentence in words for w in sentence if w.lower() not in stop_words]

vocab = {word: idx for idx, word in enumerate(set(all_words))}
print("vocab size:", len(vocab))
vocab_size = len(vocab)

df['ids'] = [[vocab[w.lower()] for w in sentence if w.lower() not in stop_words] for sentence in words]


len(df["ids"][1])
len(df["ids"][0])


seqs = [t.tensor(ids) for ids in df['ids']]


seqs_padded = pad_sequence(seqs, batch_first=True, padding_value=0)
print(seqs_padded.shape)  # (num_samples, seq_len)


seqs_padded[20].shape

vocab size: 9310
torch.Size([5572, 207])


torch.Size([207])

In [None]:
class EmbeddingModel(t.nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super().__init__()
        self.embedding = t.nn.Embedding(vocab_size, embed_dim)

  
        self.network = t.nn.Sequential(
            t.nn.Linear(embed_dim, 128),  
            t.nn.ReLU(),
            t.nn.Linear(128, num_classes)
        )

    def forward(self, x):
        x = self.embedding(x)        # (batch_size, seq_len, embed_dim)
        x = x.mean(dim=1)            # (batch_size, embed_dim)
        x = self.network(x)          # (batch_size, num_classes)
        return x





In [97]:
from torch.utils.data import TensorDataset , DataLoader
from sklearn.model_selection import train_test_split


labels = df['v1'].map({'ham':0, 'spam':1}).to_numpy()
X_train , X_test , Y_train , Y_test = train_test_split(seqs_padded , labels , random_state=42)

import torch

train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.long),
                              torch.tensor(Y_train, dtype=torch.long))

test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.long),
                             torch.tensor(Y_test, dtype=torch.long))

batch_size = 32

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

  train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.long),
  test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.long),


In [98]:

model = EmbeddingModel(vocab_size, 207, 2)  # 207 = embed_dim
epochs = 15
criterion = t.nn.CrossEntropyLoss()
optimizer = t.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(epochs):
    for batch in train_loader:
        batch_x, batch_y = batch  # فرقنا الـ input عن الـ labels
        
        optimizer.zero_grad()      # مهم جداً قبل كل backward
        y_pred = model(batch_x)    # forward pass
        
        loss = criterion(y_pred, batch_y)  # حساب الخسارة
        loss.backward()            # backward pass
        optimizer.step()           # تحديث الأوزان

    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")
    
    
    

Epoch 1, Loss: 0.4237
Epoch 2, Loss: 0.5322
Epoch 3, Loss: 0.4152
Epoch 4, Loss: 0.2813
Epoch 5, Loss: 0.2080
Epoch 6, Loss: 0.2595
Epoch 7, Loss: 0.0203
Epoch 8, Loss: 0.0452
Epoch 9, Loss: 0.0167
Epoch 10, Loss: 0.0462
Epoch 11, Loss: 0.0483
Epoch 12, Loss: 0.0283
Epoch 13, Loss: 0.3265
Epoch 14, Loss: 0.0511
Epoch 15, Loss: 0.0024


In [None]:

model.eval()  


sentence = "hey man ! are you coming to the school today ?"

# tokenize + lowercase + convert to IDs
tokens = word_tokenize(sentence.lower())
ids = [vocab[w] for w in tokens if w in vocab] 

x_input = torch.tensor(ids).unsqueeze(0)  


seq_len = 50 
if x_input.shape[1] < seq_len:
    pad_len = seq_len - x_input.shape[1]
    x_input = torch.cat([x_input, torch.zeros(1, pad_len, dtype=torch.long)], dim=1)
elif x_input.shape[1] > seq_len:
    x_input = x_input[:, :seq_len]  


with torch.no_grad():
    y_pred = model(x_input) 
    predicted_class = torch.argmax(y_pred, dim=1).item()

classes = {0: "ham", 1: "spam"}
print(f"Predicted class: {classes[predicted_class]}")


with torch.no_grad():
    sentence_embedding = model.embedding(x_input).mean(dim=1) 
print("Sentence embedding shape:", sentence_embedding.shape)


# from torch.functional import F
# word1, word2 = 'free', 'win'
# id1, id2 = vocab[word1], vocab[word2]
# vec1, vec2 = model.embedding.weight[id1], model.embedding.weight[id2]
# similarity = F.cosine_similarity(vec1.unsqueeze(0), vec2.unsqueeze(0))
# print(f"Cosine similarity between '{word1}' and '{word2}': {similarity.item():.4f}")

Predicted class: ham
Sentence embedding shape: torch.Size([1, 207])
