In [4]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import nltk

In [6]:
import pandas as pd
df=pd.read_csv("spam1.csv",encoding='ISO-8859-1')
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [9]:
df.columns = ['label', 'message']

In [10]:
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [14]:
le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = le.fit_transform(df['label'])


In [15]:
stop_words = set(stopwords.words('english'))


In [16]:
df

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [17]:
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

In [18]:
df['message'] = df['message'].apply(preprocess_text)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['message'] = df['message'].apply(preprocess_text)


In [19]:
vocab = Counter()
for message in df['message']:
    vocab.update(message)

In [20]:
vocab = {word: i+1 for i, (word, _) in enumerate(vocab.items())}
vocab_size = len(vocab) + 1

In [21]:
def encode_message(message):
    return [vocab[word] for word in message]


In [22]:
df['message'] = df['message'].apply(encode_message)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['message'] = df['message'].apply(encode_message)


In [23]:
max_len = max(df['message'].apply(len))
df['message'] = df['message'].apply(lambda x: x + [0]*(max_len - len(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['message'] = df['message'].apply(lambda x: x + [0]*(max_len - len(x)))


In [24]:
X = torch.tensor(df['message'].tolist())
y = torch.tensor(df['label'].values)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [25]:
class SpamDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [26]:
train_data = SpamDataset(X_train, y_train)
test_data = SpamDataset(X_test, y_test)

train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)


In [27]:
class SpamRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, output_size, n_layers=1):
        super(SpamRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(embed_size, hidden_size, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded)
        out = self.fc(output[:, -1, :])
        return out

In [28]:
embed_size = 128
hidden_size = 64
output_size = 1
n_layers = 1

model = SpamRNN(vocab_size, embed_size, hidden_size, output_size, n_layers)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training the model
n_epochs = 5


In [29]:
for epoch in range(n_epochs):
    model.train()
    running_loss = 0.0
    
    for messages, labels in train_loader:
        labels = labels.float().unsqueeze(1)
        
        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(messages)
        
        # Loss calculation
        loss = criterion(outputs, labels)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    print(f'Epoch [{epoch+1}/{n_epochs}], Loss: {running_loss/len(train_loader):.4f}')


Epoch [1/5], Loss: 0.4010
Epoch [2/5], Loss: 0.3950
Epoch [3/5], Loss: 0.3945
Epoch [4/5], Loss: 0.3955
Epoch [5/5], Loss: 0.3964


In [30]:
model.eval()
correct = 0
total = 0

In [31]:
with torch.no_grad():
    for messages, labels in test_loader:
        labels = labels.float().unsqueeze(1)
        outputs = model(messages)
        predicted = (torch.sigmoid(outputs) > 0.5).int()
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.8655


In [34]:
def predict_message(model, message, vocab, max_len):
    model.eval()
    
    # Preprocess the message
    tokens = preprocess_text(message)
    encoded_message = [vocab.get(word, 0) for word in tokens]
    padded_message = encoded_message + [0] * (max_len - len(encoded_message))
    
    # Convert to tensor
    input_tensor = torch.tensor(padded_message).unsqueeze(0)
    
    # Make prediction
    with torch.no_grad():
        output = model(input_tensor)
        prediction = torch.sigmoid(output).item()
        
    return "spam" if prediction > 0.5 else "ham"

# Example predictions
new_messages = [
    "Congratulations! You've won a free ticket to Bahamas. Reply with YES to claim.",
    "hello, you are good?",
]

for message in new_messages:
    prediction = predict_message(model, message, vocab, max_len)
    print(f'Message: "{message}"\nPrediction: {prediction}\n')

Message: "Congratulations! You've won a free ticket to Bahamas. Reply with YES to claim."
Prediction: ham

Message: "hello, you are good?"
Prediction: ham

