In [61]:
import torch

In [62]:
device = torch.device("cuda:0")

This model will use:

https://www.kaggle.com/uciml/sms-spam-collection-dataset

In [63]:
import pandas as pd
df = pd.read_csv('spam.csv', encoding = "ISO-8859-1")
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [64]:
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)
df.columns =['is_spam', 'message']
df.head()

Unnamed: 0,is_spam,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [65]:
df['is_spam']=df['is_spam'].apply(lambda is_spam: 1 if is_spam == 'spam' else 0)
df.head()

Unnamed: 0,is_spam,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [66]:
df['is_spam'].value_counts()

Unnamed: 0_level_0,count
is_spam,Unnamed: 1_level_1
0,4825
1,747


Why is imbalanced data an issue?

Imagine you have 100 animal images:
*   95 dog images.
*   5 cat images.

And we want to create a
Model that accurately determines dog images versus cat images.

You could create a
Model that **always** guesses dog images,
which would give you a misleading 95% accuracy on the 100 animal images.

When the reality is your Model
has a 0% accuracy for cat images.

In [67]:
spam_df = df[df['is_spam'] == 1]
not_spam_df = df[df['is_spam'] == 0].sample(n=len(spam_df))

# Combine and shuffle.
balanced_df = pd.concat([spam_df, not_spam_df]).sample(frac = 1)

In [68]:
balanced_df['is_spam'].value_counts()

Unnamed: 0_level_0,count
is_spam,Unnamed: 1_level_1
0,747
1,747


In [69]:
balanced_df.head(10)

Unnamed: 0,is_spam,message
2841,0,aathi..where are you dear..
129,0,K..k:)how much does it cost?
5050,0,With my sis lor... We juz watched italian job.
2175,0,I'm at work. Please call
2005,0,Can't take any major roles in community outrea...
5010,1,You have WON a guaranteed å£1000 cash or a å£2...
4078,0,O i played smash bros &lt;#&gt; religiously.
3076,0,There is no sense in my foot and penis.
1090,1,WIN URGENT! Your mobile number has been awarde...
3826,1,Congratulations U can claim 2 VIP row A Ticket...


In [70]:
from sklearn.model_selection import train_test_split

train_messages, test_messages, train_labels, test_labels = train_test_split(
    balanced_df['message'].values,
    balanced_df['is_spam'].values,
    test_size=0.2
)

In [71]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [72]:
print("Tokenized data: ", tokenizer("Kevin is great!", truncation=True, padding=True))

Tokenized data:  {'input_ids': [101, 4901, 2003, 2307, 999, 102], 'attention_mask': [1, 1, 1, 1, 1, 1]}


In [73]:
train_tokenized_data = tokenizer(list(train_messages), truncation=True, padding=True)
test_tokenized_data = tokenizer(list(test_messages), truncation=True, padding=True)

In [74]:
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, tokenized_data, labels):
        self.tokenized_data = {token_type: torch.tensor(token_values) for token_type, token_values in tokenized_data.items()}
        self.labels = torch.tensor(labels)

    def __getitem__(self, index):
        return {
            'input_ids': self.tokenized_data['input_ids'][index],
            'attention_mask': self.tokenized_data['attention_mask'][index],
            'is_spam': self.labels[index]
        }

    def __len__(self):
        return len(self.labels)


train_dataset = CustomDataset(train_tokenized_data, train_labels)
test_dataset = CustomDataset(test_tokenized_data, test_labels)

In [75]:
# Mini-batch Stochastic Gradient Descent.
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=64)

In [76]:
from transformers import DistilBertForSequenceClassification
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased').to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [77]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.00001)

In [78]:
epochs = 5

train_losses_average_per_epoch = []
train_accuracies_average_per_epoch = []

for current_epoch in range(epochs):

    cumulative_accuracy_current_epoch, cumulative_loss_current_epoch = 0, 0

    for train_batch in train_dataloader:
        train_input_ids_batch = train_batch['input_ids'].to(device)
        train_attention_mask_batch = train_batch['attention_mask'].to(device)
        train_labels_batch = train_batch['is_spam'].to(device)

        # Forward pass.
        outputs = model(train_input_ids_batch, attention_mask=train_attention_mask_batch, labels=train_labels_batch)
        loss, logits = outputs.loss, outputs.logits

        # Backpropagation.
        loss.backward() # Compute gradients.
        optimizer.step() # Update weights.
        optimizer.zero_grad() # Update gradients to 0 for start of next batch.

        # Batch Loss.
        cumulative_loss_current_epoch += loss.item()

        # Batch Accuracy.
        preds = torch.argmax(logits, 1)
        num_correct_preds = (preds == train_labels_batch).sum().item()
        batch_accuracy = num_correct_preds / len(train_labels_batch)
        cumulative_accuracy_current_epoch += batch_accuracy

    train_loss_average_current_epoch = cumulative_loss_current_epoch / len(train_dataloader)
    train_accuracy_average_current_epoch = cumulative_accuracy_current_epoch / len(train_dataloader)

    train_losses_average_per_epoch.append(train_loss_average_current_epoch)
    train_accuracies_average_per_epoch.append(train_accuracy_average_current_epoch)

    cumulative_accuracy_current_epoch, cumulative_loss_current_epoch = 0, 0

    print(
        f'Epoch {current_epoch + 1}  '
        f'Train Loss: {train_losses_average_per_epoch[current_epoch]:.2f} '
        f'Train Accuracy: {train_accuracies_average_per_epoch[current_epoch]:.2f} '
    )

Epoch 1  Train Loss: 0.58 Train Accuracy: 0.90 
Epoch 2  Train Loss: 0.24 Train Accuracy: 0.97 
Epoch 3  Train Loss: 0.10 Train Accuracy: 0.98 
Epoch 4  Train Loss: 0.06 Train Accuracy: 0.99 
Epoch 5  Train Loss: 0.04 Train Accuracy: 0.99 


In [79]:
cumulative_accuracy_current_epoch = 0

with torch.no_grad():
    for test_batch in test_dataloader:
        test_input_ids_batch = test_batch['input_ids'].to(device)
        test_attention_mask_batch = test_batch['attention_mask'].to(device)
        test_labels_batch = test_batch['is_spam'].to(device)

        outputs = model(test_input_ids_batch, attention_mask=test_attention_mask_batch)
        loss, logits = outputs.loss, outputs.logits

        # Batch Accuracy.
        preds = torch.argmax(logits, 1)
        num_correct_preds = (preds == test_labels_batch).sum().item()
        batch_accuracy = num_correct_preds / len(test_labels_batch)
        cumulative_accuracy_current_epoch += batch_accuracy

print(f'Final test accuracy: {cumulative_accuracy_current_epoch / len(test_dataloader):.2%}')

Final test accuracy: 98.75%


In [80]:
import torch.nn.functional as F

def predict_message(message):
    with torch.no_grad():
        tokenized_data = tokenizer(message, truncation=True, padding=True, return_tensors='pt')
        input_ids = tokenized_data['input_ids']
        attention_mask = tokenized_data['attention_mask']

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        probabilities = F.softmax(logits, dim=1)
        return probabilities

model.to("cpu")

messages = [
    'Hey lets grab lunch sometimes, it has been a while',
    'When will you arrive here',
    "You are approved for a credit card. Apply now",
    "Congratulations. You just won a gift card. Click here to redeem"
]

for message in messages:
    print(f"{message}:\n{predict_message(message)}\n")

Hey lets grab lunch sometimes, it has been a while:
tensor([[0.9849, 0.0151]])

When will you arrive here:
tensor([[0.9779, 0.0221]])

You are approved for a credit card. Apply now:
tensor([[0.1233, 0.8767]])

Congratulations. You just won a gift card. Click here to redeem:
tensor([[0.0737, 0.9263]])

