In [1]:
import re
import nltk
from nltk.stem import WordNetLemmatizer
# Directory that holds the NLTK corpora used by this notebook.
# NOTE(review): this is a machine-specific Windows path; adjust it when running elsewhere.
NLTK_DATA_DIR = 'D:/nltk_data'

# NLTK only searches the directories listed in nltk.data.path. Register the custom
# download directory explicitly so WordNet is found even on systems where this
# location is not one of NLTK's default search paths.
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

# Fetch the corpora (a no-op when they are already up to date).
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name, download_dir=NLTK_DATA_DIR)

# Shared lemmatizer instance used by the text-cleaning steps below.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to D:/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to D:/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
import pandas as pd
# Load the tab-separated SMS corpus; the file has no header row, so the two
# column names are supplied explicitly.
df = pd.read_csv(
    'SMSSpamCollection.txt',
    sep='\t',
    names=['label', 'message'],
)

In [3]:
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
def _normalize_text(raw_text):
    """Return ``raw_text`` reduced to lower-case, lemmatized, space-separated words.

    Every character that is not an ASCII letter is replaced by a space before
    the text is lower-cased, split on whitespace and lemmatized word by word.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', raw_text)
    tokens = letters_only.lower().split()
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)


# Overwrite the raw messages with their cleaned form (the raw text is not kept).
df['message'] = df['message'].map(_normalize_text)

In [5]:
df

Unnamed: 0,label,message
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in a wkly comp to win fa cup final ...
3,ham,u dun say so early hor u c already then say
4,ham,nah i don t think he go to usf he life around ...
...,...,...
5567,spam,this is the nd time we have tried contact u u ...
5568,ham,will b going to esplanade fr home
5569,ham,pity wa in mood for that so any other suggestion
5570,ham,the guy did some bitching but i acted like i d...


In [6]:
import plotly.express as px

# Bar-style histogram of how many messages carry each label.
label_axis_names = {'label': 'Label', 'count': 'Count'}
fig_label_dist = px.histogram(
    df,
    x='label',
    labels=label_axis_names,
    title='Distribution of Labels (Ham vs Spam)',
)
fig_label_dist.show()

In [7]:
from collections import Counter

def _word_frequencies(messages):
    """Tokenize ``messages`` on whitespace and count every word.

    Returns a ``(tokens, counts, table)`` triple: the flat token list, the
    ``Counter`` built from it, and a ``Word``/``Frequency`` DataFrame sorted
    from most to least frequent.
    """
    tokens = ' '.join(messages).split()
    counts = Counter(tokens)
    table = pd.DataFrame(counts.items(), columns=['Word', 'Frequency'])
    return tokens, counts, table.sort_values(by='Frequency', ascending=False)


def _top_words_figure(table, title, **bar_options):
    """Build a bar chart of the 20 most frequent words in ``table``."""
    return px.bar(table.head(20), x='Word', y='Frequency', title=title, **bar_options)


# Word statistics for the whole corpus and for each class separately
all_words, word_freq, word_freq_df = _word_frequencies(df['message'])
spam_words, spam_word_freq, spam_word_freq_df = _word_frequencies(
    df[df['label'] == 'spam']['message']
)
ham_words, ham_word_freq, ham_word_freq_df = _word_frequencies(
    df[df['label'] == 'ham']['message']
)

# One chart per word table: overall, spam (red) and ham (green)
fig_word_freq = _top_words_figure(word_freq_df, 'Top 20 Most Frequent Words')
fig_word_freq.show()

fig_spam_word_freq = _top_words_figure(
    spam_word_freq_df,
    'Top 20 Most Frequent Words in Spam',
    color_discrete_sequence=['red'],
)
fig_spam_word_freq.show()

fig_ham_word_freq = _top_words_figure(
    ham_word_freq_df,
    'Top 20 Most Frequent Words in Ham',
    color_discrete_sequence=['green'],
)
fig_ham_word_freq.show()


In [8]:
# Number of characters in each message, stored as a new column.
# Note: this measures whatever text df['message'] holds when the cell runs.
df['message_length'] = df['message'].map(len)

# Histogram of the character counts
length_axis_title = 'Message Length (characters)'
fig_length = px.histogram(
    df,
    x='message_length',
    title='Distribution of Message Lengths',
    labels={'message_length': length_axis_title},
)
fig_length.update_layout(xaxis_title=length_axis_title, yaxis_title='Count')
fig_length.show()

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=10,
)

print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")

Training set size: 4457
Validation set size: 1115


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Upper bound on the vocabulary size kept by the TF-IDF vectorizer
max_features = 5000

vectorizer = TfidfVectorizer(max_features=max_features)

# Learn the vocabulary / IDF weights on the training messages only, then reuse
# them unchanged for the validation messages. Both sparse results are densified.
train_matrix = vectorizer.fit_transform(train_df['message'])
val_matrix = vectorizer.transform(val_df['message'])
X_train = train_matrix.toarray()
X_val = val_matrix.toarray()

# Encode the string labels as integers: ham -> 0, spam -> 1
label_to_id = {'ham': 0, 'spam': 1}
y_train = train_df['label'].map(label_to_id).values
y_val = val_df['label'].map(label_to_id).values

In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

def _as_float_tensor(array):
    """Copy ``array`` into a new float32 PyTorch tensor."""
    return torch.tensor(array, dtype=torch.float32)


# Feature matrices as float32 tensors
X_train_tensor = _as_float_tensor(X_train)
X_val_tensor = _as_float_tensor(X_val)

# BCELoss expects float targets (0.0 / 1.0) with the same shape as the model
# output, so the labels are cast to float32 and given a trailing axis of size 1.
y_train_tensor = _as_float_tensor(y_train).unsqueeze(1)
y_val_tensor = _as_float_tensor(y_val).unsqueeze(1)

In [12]:
# Pair each feature tensor with its label tensor in a TensorDataset
train_dataset, val_dataset = (
    TensorDataset(feature_tensor, label_tensor)
    for feature_tensor, label_tensor in (
        (X_train_tensor, y_train_tensor),
        (X_val_tensor, y_val_tensor),
    )
)

In [13]:
train_dataset.tensors

(tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 tensor([[0.],
         [0.],
         [0.],
         ...,
         [0.],
         [0.],
         [0.]]))

In [14]:
# Mini-batch loaders: training batches are reshuffled every epoch, validation
# batches keep the dataset order.
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [15]:
class SpamClassifier(nn.Module):
    """Three-layer fully connected binary classifier.

    Layer sizes are ``input_size -> 128 -> 64 -> 1`` with ReLU after the two
    hidden layers and a sigmoid on the single output unit, so ``forward``
    returns values in the range 0..1 (suitable for ``nn.BCELoss``).
    """

    def __init__(self, input_size):
        """Create the three linear layers; ``input_size`` is the feature count."""
        super().__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, x):
        """Map a batch of feature rows to one sigmoid-activated output per row."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden).sigmoid()

In [16]:
# Seed PyTorch's global RNG so weight initialization (and the shuffling done by
# DataLoaders that draw from it) is repeatable across Restart & Run All.
torch.manual_seed(10)

# Size the input layer from the actual TF-IDF matrix: `max_features` is only an
# upper bound on the vocabulary, so the real column count can be smaller.
model = SpamClassifier(input_size=X_train.shape[1])

# Binary cross-entropy on probabilities (the model's forward already applies a sigmoid).
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [17]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10):
    """Train ``model`` and report validation metrics after every epoch.

    Each epoch runs one optimisation pass over ``train_loader`` followed by a
    gradient-free pass over ``val_loader``. One line is printed per epoch with
    the mean per-batch training loss, the mean per-batch validation loss and
    the validation accuracy (model outputs above 0.5 count as class 1).

    Args:
        model: network called as ``model(inputs)``; switched between train and
            eval mode by this function.
        train_loader: iterable of ``(inputs, labels)`` batches supporting ``len()``.
        val_loader: iterable of ``(inputs, labels)`` batches supporting ``len()``.
        criterion: loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: optimizer that updates the parameters of ``model``.
        num_epochs: number of passes over ``train_loader``.

    Returns:
        None. Results are only printed.
    """
    for epoch in range(num_epochs):
        # ---- optimisation pass ----
        model.train()
        running_loss = 0.0
        for features, targets in train_loader:
            optimizer.zero_grad()
            batch_loss = criterion(model(features), targets)
            batch_loss.backward()
            optimizer.step()
            running_loss += batch_loss.item()

        # ---- validation pass (no gradient tracking) ----
        model.eval()
        val_loss, correct, total = 0.0, 0, 0
        with torch.no_grad():
            for features, targets in val_loader:
                probabilities = model(features)
                val_loss += criterion(probabilities, targets).item()
                # Threshold the outputs at 0.5 to obtain hard 0/1 decisions
                decisions = (probabilities > 0.5).float()
                total += targets.size(0)
                correct += (decisions == targets).sum().item()

        print(f"Epoch {epoch+1}/{num_epochs}, "
              f"Train Loss: {running_loss/len(train_loader):.4f}, "
              f"Validation Loss: {val_loss/len(val_loader):.4f}, "
              f"Validation Accuracy: {100 * correct / total:.2f}%")

# Train the network for 10 epochs; train_model prints one metrics line per epoch.
train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10)


Epoch 1/10, Train Loss: 0.3050, Validation Loss: 0.1147, Validation Accuracy: 97.76%
Epoch 2/10, Train Loss: 0.0470, Validation Loss: 0.0632, Validation Accuracy: 98.21%
Epoch 3/10, Train Loss: 0.0120, Validation Loss: 0.0746, Validation Accuracy: 98.12%
Epoch 4/10, Train Loss: 0.0040, Validation Loss: 0.0780, Validation Accuracy: 98.12%
Epoch 5/10, Train Loss: 0.0020, Validation Loss: 0.0934, Validation Accuracy: 98.03%
Epoch 6/10, Train Loss: 0.0009, Validation Loss: 0.1024, Validation Accuracy: 97.94%
Epoch 7/10, Train Loss: 0.0004, Validation Loss: 0.1061, Validation Accuracy: 97.94%
Epoch 8/10, Train Loss: 0.0003, Validation Loss: 0.1096, Validation Accuracy: 97.94%
Epoch 9/10, Train Loss: 0.0002, Validation Loss: 0.1131, Validation Accuracy: 97.94%
Epoch 10/10, Train Loss: 0.0001, Validation Loss: 0.1165, Validation Accuracy: 97.94%


In [18]:
# Turn one raw message into the tensor format the network consumes
def preprocess_message(message, vectorizer):
    """Clean ``message`` and convert it into a float32 feature tensor.

    The text goes through these steps: non-letters become spaces, the text is
    lower-cased, split on whitespace, lemmatized word by word with the
    module-level ``lemmatizer`` and re-joined with single spaces.

    Args:
        message: raw message text.
        vectorizer: fitted vectorizer whose ``transform`` result offers ``toarray()``.

    Returns:
        A ``torch.float32`` tensor built from the vectorizer output for this
        single message.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', message)
    lemmas = (lemmatizer.lemmatize(token) for token in letters_only.lower().split())
    cleaned_text = ' '.join(lemmas)

    # The vectorizer works on a collection of documents, hence the one-item list.
    dense_features = vectorizer.transform([cleaned_text]).toarray()
    return torch.tensor(dense_features, dtype=torch.float32)

# Function to predict the class of a message
def predict_message(message, model, vectorizer, threshold=0.5):
    """Classify a single raw text message as ``"spam"`` or ``"ham"``.

    The message is cleaned and vectorized by ``preprocess_message`` and passed
    through ``model`` in evaluation mode without gradient tracking.

    ``model`` is expected to return a probability: ``SpamClassifier.forward``
    already ends with a sigmoid. Applying a second sigmoid here would squash
    every output into the range (0.5, 0.731) and label every message as spam,
    so the model output is compared with ``threshold`` directly.

    Args:
        message: raw message text.
        model: trained network returning one spam probability for the message.
        vectorizer: fitted vectorizer forwarded to ``preprocess_message``.
        threshold: probability above which the message is reported as spam.

    Returns:
        ``"spam"`` if the predicted probability exceeds ``threshold``,
        otherwise ``"ham"``.
    """
    # Preprocess and vectorize the message
    tensor_message = preprocess_message(message, vectorizer)

    # Make sure the model is in evaluation mode
    model.eval()

    # No need to track gradients during inference
    with torch.no_grad():
        # The model output is already a probability; do not re-apply sigmoid.
        prob = model(tensor_message)

    # Map the probability to a label
    return "spam" if prob.item() > threshold else "ham"

# Try the trained network on one hand-written message
sample_text = "Congratulations! You've won a free ticket to the Bahamas!"
prediction = predict_message(sample_text, model, vectorizer)
print(f"The message is classified as: {prediction}")

The message is classified as: spam


In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
# Baseline for comparison with the neural network: a logistic regression with
# default hyperparameters, fitted on the dense TF-IDF training matrix and the
# 0/1 (ham/spam) labels.
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [23]:
pred1 = lr.predict(X_val)
pred1

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [24]:
accuracy_score(y_val, pred1)

0.9641255605381166

In [25]:
confusion_matrix(y_val, pred1)

array([[972,   0],
       [ 40, 103]], dtype=int64)

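972 ham messages were correctly classified as ham (true negatives).

0 ham messages were incorrectly classified as spam (false positives), i.e. no legitimate message was flagged.

40 spam messages were incorrectly classified as ham (false negatives).

103 spam messages were correctly classified as spam (true positives), so the model catches 103 of the 143 spam messages in the validation set (about 72%).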

In [26]:
from sklearn.ensemble import RandomForestClassifier
# Second baseline: a random forest on the same dense TF-IDF features.
# The forest is built with randomness (bootstrap samples, feature subsets), so
# a fixed random_state is needed for the reported metrics to be reproducible.
rfc = RandomForestClassifier(random_state=10)
rfc.fit(X_train, y_train)

In [27]:
pred2 = rfc.predict(X_val)
pred2

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [28]:
accuracy_score(y_val, pred2)

0.9668161434977578

In [29]:
confusion_matrix(y_val, pred2)

array([[972,   0],
       [ 37, 106]], dtype=int64)

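972 ham messages were correctly classified as ham (true negatives).

0 ham messages were incorrectly classified as spam (false positives), i.e. no legitimate message was flagged.

37 spam messages were incorrectly classified as ham (false negatives).

106 spam messages were correctly classified as spam (true positives), so the model catches 106 of the 143 spam messages in the validation set (about 74%).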