# Building a Recurrent Neural Network from Scratch

We will be using the Ham Spam Dataset to train a neural network which can classify messages as spam or ham. 

The Gameplan:
1. Dataset 
2. DataLoader
3. Building RNN
4. Training Loop
5. Testing Loop

In [13]:
# import statements
import torch
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import numpy as np
import pandas as pd
import re

## Dataset

In [2]:
df = pd.read_csv("hamspam.csv", index_col = ["Unnamed: 0"])

In [3]:
df.head()

Unnamed: 0,v1,v2,split
0,ham,"Go until jurong point, crazy.. Available only ...",train
1,ham,Ok lar... Joking wif u oni...,train
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,train
3,ham,U dun say so early hor... U c already then say...,train
4,ham,"Nah I don't think he goes to usf, he lives aro...",train


In [4]:
df.columns = ["target", "message", "split"]

In [7]:
conversion_dict = {"ham": 0, "spam": 1}
def conversion_fn(target_val):
    return conversion_dict[target_val]

In [11]:
df["target"] = df["target"].map(lambda x: conversion_fn(x))

In [12]:
df.head()

Unnamed: 0,target,message,split
0,0,"Go until jurong point, crazy.. Available only ...",train
1,0,Ok lar... Joking wif u oni...,train
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,train
3,0,U dun say so early hor... U c already then say...,train
4,0,"Nah I don't think he goes to usf, he lives aro...",train


In [35]:
class Vocabulary:
    def __init__(self, messages):
        self.token_to_idx = {}
        self.idx_to_token = {}
        self.messages = messages
        self.add_token("<UNK>")
        self.special_char = re.compile(r'[;\\/,!.:*?\"<>|&\']')
        for message in messages:
            for word in message.split(" "):
                word = re.sub(self.special_char, " ", word)
                word = word.lower()
                self.add_token(word)
        
    def add_token(self,token):
        if token in self.token_to_idx:
            index = self.token_to_idx[token]
        else:
            index = len(self.token_to_idx)
            self.token_to_idx[token] = index
            self.idx_to_token[index] = token
            
    def vectorize(self, message):
        one_hot = torch.zeros(len(self.token_to_idx))
        for num, word in enumerate(message.split(" ")):
            word = re.sub(self.special_char, " ", word)
            word = word.lower()
            if word in self.token_to_idx:
                one_hot[self.token_to_idx[word]] = 1
            else:
                word = "<UNK>"
                one_hot[self.token_to_idx[word]] = 1
        return one_hot
    

In [36]:
class SpamDataset(Dataset):
    def __init__(self, df, messages_col, target_col, transform = None):
        self.df = df
        self.transform = transform
        
        self.messages = self.df[messages_col]
        self.target = self.df[target_col]
        
        self.vocab = Vocabulary(self.messages)
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        message = self.messages[index]
        target = self.target[index]
        
        if self.transform is not None:
            message = self.transform(message)
            
        vectorized_message = torch.tensor(self.vocab.vectorize(message))
        vectorized_target = torch.tensor(target)
        
        return vectorized_message, vectorized_target

In [37]:
train_dataset = SpamDataset(df, "message", "target")

In [38]:
train_dataset[1]

  vectorized_message = torch.tensor(self.vocab.vectorize(message))


(tensor([0., 0., 0.,  ..., 0., 0., 0.]), tensor(0))

In [39]:
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [40]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.h2o = nn.Linear(hidden_size ,output_size)
        self.softmax = nn.LogSoftmax(dim = 1)
        
    def forward(self, input_message, hidden):
        combined = torch.cat((input_message, hidden), 1)
        hidden = self.i2h(combined)
        output = self.h2o(hidden)
        output = self.softmax(output)
        
        return output, hidden
    
    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

In [41]:
device = "mps"

In [43]:
input_size = len(token_to_idx)
hidden_size = len(token_to_idx)
output_size = 2
model = RNN(input_size, hidden_size, output_size).to(device)
print(model)

NameError: name 'token_to_idx' is not defined