In [1]:
# Importing neccessary libraries
import torch
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import string
from argparse import Namespace
from collections import Counter
import re

In [2]:
# Setting training device
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(device)

mps


In [3]:
special_char = re.compile(r'[;\\/,!.:*?\"<>|&\']')

In [4]:
resp = re.sub(special_char, " ", "Hey,,,,Holla,.!")
print(resp)

Hey    Holla   


In [5]:
import pandas as pd
import numpy as np

In [6]:
df = pd.read_csv("hamspam.csv", index_col = ["Unnamed: 0"])
df.head()

Unnamed: 0,v1,v2,split
0,ham,"Go until jurong point, crazy.. Available only ...",train
1,ham,Ok lar... Joking wif u oni...,train
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,train
3,ham,U dun say so early hor... U c already then say...,train
4,ham,"Nah I don't think he goes to usf, he lives aro...",train


In [7]:
df = df.rename({"v2" : "message"}, axis = "columns")

In [8]:
token_to_idx = {}
idx_to_token = {}

In [9]:
def OneHotEncoder(df, df_column):
    uniq_values = df[df_column].unique()
    number_of_features = len(uniq_values)
    main_dict = {}
    for num, val in enumerate(uniq_values):
        # If you have main_dict[val] = zero_list, then you invariably create a reference and this would mess up the One Hot Encoding
        main_dict[val] = list(zero_list)
    for num, record in enumerate(df[df_column]):
        main_dict[str(record)][num] = 1.0
    df_one_hot = pd.DataFrame.from_dict(main_dict)
    frames = [df, df_one_hot]
    df_result = pd.concat(frames,axis=1, join='inner')
    return df_result

In [10]:
df = OneHotEncoder(df, "v1")
print(df.head())

     v1                                            message  split  ham  spam
0   ham  Go until jurong point, crazy.. Available only ...  train  1.0   0.0
1   ham                      Ok lar... Joking wif u oni...  train  1.0   0.0
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...  train  0.0   1.0
3   ham  U dun say so early hor... U c already then say...  train  1.0   0.0
4   ham  Nah I don't think he goes to usf, he lives aro...  train  1.0   0.0


In [11]:
df.drop(["v1"], axis = 1, inplace = True)

In [12]:
train_df = df[df["split"] == "train"]
val_df = df[df["split"] == "val"]
test_df = df[df["split"] == "test"]

In [13]:
train_df.drop(["split"], axis = 1 ,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.drop(["split"], axis = 1 ,inplace=True)


In [14]:
train_df.head()

Unnamed: 0,message,ham,spam
0,"Go until jurong point, crazy.. Available only ...",1.0,0.0
1,Ok lar... Joking wif u oni...,1.0,0.0
2,Free entry in 2 a wkly comp to win FA Cup fina...,0.0,1.0
3,U dun say so early hor... U c already then say...,1.0,0.0
4,"Nah I don't think he goes to usf, he lives aro...",1.0,0.0


In [54]:
target = torch.tensor(np.stack(train_df[["ham", "spam"]].values.tolist()))

In [93]:
a_target = []

for index, rows in train_df.iterrows():
    if rows["ham"] == 1.0:
        a_target.append([0])
    else:
        a_target.append([1])

In [96]:
a_target = torch.Tensor(a_target).type(torch.LongTensor)
a_target

tensor([[0],
        [0],
        [1],
        ...,
        [0],
        [0],
        [0]])

In [83]:
target.shape

torch.Size([3900, 2])

In [15]:
def add_token(token):
    if token in token_to_idx:
        index = token_to_idx[token]
    else:
        index = len(token_to_idx)
        token_to_idx[token] = index
        idx_to_token[index] = token

In [16]:
for sentences in df.message:
    for word in sentences.split(" "):
        word = re.sub(special_char, " ", word)
        word = word.lower()
        add_token(word)


In [17]:
add_token("<UNK>")

In [20]:
def CustomVectorizer(message):
    one_hot = torch.zeros(len(message), 1, len(token_to_idx))
    for num, word in enumerate(message.split(" ")):
        word = re.sub(special_char, " ", word)
        word = word.lower()
        if word in token_to_idx:
            one_hot[num][0][token_to_idx[word]] = 1
        else:
            word = "<UNK>"
            one_hot[num][0][token_to_idx[word]] = 1
    
    return one_hot

In [22]:
for sentence in df["message"]:
    print(CustomVectorizer(sentence).size())
    break

torch.Size([111, 1, 12461])


In [112]:
token_to_idx["<UNK>"]

12460

In [33]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.h2o = nn.Linear(hidden_size ,output_size)
        self.softmax = nn.LogSoftmax(dim = 1)
        
    def forward(self, input_message, hidden):
        combined = torch.cat((input_message, hidden), 1)
        hidden = self.i2h(combined)
        output = self.h2o(hidden)
        output = self.softmax(output)
        
        return output, hidden
    
    def initHidden(self):
        return torch.zeros(1, self.hidden_size)
        

In [68]:
input_size = len(token_to_idx)
hidden_size = len(token_to_idx)
output_size = 2

In [35]:
input_text = train_df["message"][0]
print(len(input_text))

111


In [69]:
rnn = RNN(input_size, hidden_size, output_size)

In [70]:
input_text = CustomVectorizer(train_df["message"][0])
hidden = torch.zeros(1, hidden_size)
output, next_hidden = rnn(input_text[0], hidden)
print(output)

tensor([[-0.6852, -0.7011]], grad_fn=<LogSoftmaxBackward0>)


In [39]:
print(input_text.shape)

torch.Size([111, 1, 12461])


In [44]:
def classifyFromOutput(output):
    top_n, top_i = output.topk(1)
    class_i = top_i[0].item()
    if class_i == 1:
        return "Spam"
    else:
        return "Ham"

In [45]:
classifyFromOutput(output)

'Spam'

In [46]:
criterion = nn.NLLLoss()

In [61]:
learning_rate = 0.005
# Should send vectorized output as input to this function
def train(vectorized_input, target_value):
    hidden = rnn.initHidden()
    
    rnn.zero_grad()
    
    for i in range(vectorized_input.size()[0]):
        output, hidden = rnn(vectorized_input[i],hidden)
    
    loss = criterion(output, target_value)
    loss.backward()
    
    for p in rnn.parameters():
        p.data.add_(p.grad.data, alpha=-learning_rate)
        
    return output, loss.item()

In [98]:
import time
import math

n_iters = 10

current_loss = 0
all_losses = []

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

start = time.time()

In [99]:
for iter in range(1, n_iters + 1):
    print("Itertion Number: ", iter)
    current_loss = 0
    for num, input_text in enumerate(train_df["message"]):
        print("Message Number: ", num)
        target_value = a_target[num]
        # This will work -> target_value = torch.tensor([1])
        vectorized_input = CustomVectorizer(input_text)
        output, loss = train(vectorized_input, target_value)
        current_loss += loss
    print("Loss ", current_loss)

Itertion Number:  1
Message Number:  0
tensor([[-0.6859, -0.7005]], grad_fn=<LogSoftmaxBackward0>)
tensor([0])
Message Number:  1
tensor([[-0.6818, -0.7047]], grad_fn=<LogSoftmaxBackward0>)
tensor([0])
Message Number:  2
tensor([[-0.6777, -0.7089]], grad_fn=<LogSoftmaxBackward0>)
tensor([1])
Message Number:  3
tensor([[-0.6819, -0.7045]], grad_fn=<LogSoftmaxBackward0>)
tensor([0])
Message Number:  4
tensor([[-0.6778, -0.7087]], grad_fn=<LogSoftmaxBackward0>)
tensor([0])
Message Number:  5
tensor([[-0.6737, -0.7129]], grad_fn=<LogSoftmaxBackward0>)
tensor([1])
Message Number:  6
tensor([[-0.6779, -0.7086]], grad_fn=<LogSoftmaxBackward0>)
tensor([0])
Message Number:  7
tensor([[-0.6739, -0.7128]], grad_fn=<LogSoftmaxBackward0>)
tensor([0])
Message Number:  8
tensor([[-0.6699, -0.7170]], grad_fn=<LogSoftmaxBackward0>)
tensor([1])


KeyboardInterrupt: 