In [None]:
# Imports

from IPython.display import clear_output
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import spacy
import re
import string
from collections import Counter
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import tqdm
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Downloading the Spam SMS Dataset
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
!unzip /content/smsspamcollection.zip
!rm /content/readme
!rm !rm /content/smsspamcollection.zip
clear_output()

In [None]:
### Downloading the GloVe embeddings database
!wget https://nlp.stanford.edu/data/glove.6B.zip
!unzip /content/glove.6B.zip
!rm -rf /content/glove.6B.zip
!rm /content/glove.6B.100d.txt
!rm /content/glove.6B.200d.txt
!rm /content/glove.6B.300d.txt


--2023-05-16 01:46:15--  https://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-05-16 01:46:15--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2023-05-16 01:48:54 (5.19 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]

Archive:  /content/glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [None]:
# Parse the corpus file: every non-blank line is "<tag>\t<message>".
# `text` collects the messages and `label` the matching 0/1 targets.
text = []
label = []
with open("/content/SMSSpamCollection", encoding="utf-8") as f:
    for line in f:
        # Remove only the line terminator. Slicing off a fixed two characters
        # also cut the last real character from every message, because text
        # mode delivers a single "\n" per line.
        line = line.rstrip("\r\n")
        if not line:
            continue
        # Split on the first tab only, so a tab inside the message is kept
        tag, _, message = line.partition("\t")
        # Labelling spam as 1 and everything else as 0
        label.append(1 if tag == "spam" else 0)
        text.append(message)


In [None]:
# Assemble the parsed corpus into a DataFrame, one row per message
sms = pd.DataFrame({"Text": text, "Label": label})

# Character count of each message, measured before lower-casing
sms = sms.assign(Text_Length=sms["Text"].str.len())

# Normalise every message to lower case
sms["Text"] = sms["Text"].str.lower()

In [None]:
# Shared spaCy pipeline used by tokenize() below
spacy_tokenizer = spacy.load('en_core_web_sm')

def tokenize(text):
  """Clean a message with three regex passes, then run spaCy over it.

  The passes are applied in this order: every character that is neither a
  word character nor whitespace becomes a space, every non-ASCII character
  becomes a space, and each run of spaces collapses to a single space.

  Args:
    text: The message string to clean and tokenize.

  Returns:
    The object produced by calling `spacy_tokenizer` on the cleaned text.
  """
  substitutions = (
      (r'[^\w\s]', ' '),       # punctuation
      (r'[^\x00-\x7F]', ' '),  # non-ASCII characters
      (' +', ' '),             # repeated spaces
  )
  cleaned = text
  for pattern, replacement in substitutions:
    cleaned = re.sub(pattern, replacement, cleaned)
  return spacy_tokenizer(cleaned)

In [None]:
# Run the cleaning + spaCy tokenizer over every message
sms["Tokenized_Text"] = sms['Text'].map(tokenize)

In [None]:
def load_GloVe_embeddings(glove_file):
  """Read a GloVe-style text file into a ``{word: vector}`` dictionary.

  Every non-blank line is split on whitespace; the first field is the word
  and the remaining fields are parsed as its vector components.

  Args:
    glove_file: Path of the embeddings text file.

  Returns:
    dict mapping each word (str) to a 1-D ``np.float32`` array.

  Raises:
    ValueError: If a vector component cannot be parsed as a float.
  """
  dic = {}
  # Decode explicitly as UTF-8 instead of the platform default so that
  # non-ASCII words load the same way on every machine.
  with open(glove_file, encoding="utf-8") as f:
    for line in f:
      w_lines = line.split()
      # A blank line has no word field; skip it instead of raising IndexError
      if not w_lines:
        continue
      dic[w_lines[0]] = np.array(w_lines[1:], dtype=np.float32)
  return dic

In [None]:
# Build the word -> vector lookup from the 50-dimensional GloVe file
glove_path = "/content/glove.6B.50d.txt"
word_embeds = load_GloVe_embeddings(glove_path)

In [None]:
def embed_text(tokenized_text, word_embeddings, max_text_length=20, embedding_size = 50):
  """Turn a token sequence into a fixed-size matrix of word vectors.

  Tokens whose ``.text`` is a key of ``word_embeddings`` are written to
  consecutive rows in order of appearance; unknown tokens are skipped, so
  the known words are packed at the top and the unused rows stay zero.

  Args:
    tokenized_text: Iterable of tokens exposing a ``.text`` attribute.
    word_embeddings: Mapping from word to a 1-D vector with at least
      ``embedding_size`` components.
    max_text_length: Number of rows in the output; later known words are
      dropped once this many have been written.
    embedding_size: Number of leading vector components kept per word, and
      the number of columns in the output.

  Returns:
    ``np.ndarray`` of shape ``(max_text_length, embedding_size)``.
  """
  # The width follows embedding_size; it used to be fixed at 50, which made
  # any other embedding_size fail on the row assignment below.
  embeds = np.zeros((max_text_length, embedding_size))
  wordsfound = 0
  for token in tokenized_text:
    # Check capacity before writing so max_text_length=0 is handled too
    if wordsfound >= max_text_length:
      break
    word = token.text
    if word in word_embeddings:
      embeds[wordsfound] = word_embeddings[word][:embedding_size]
      wordsfound += 1
  return embeds


In [None]:
# Convert each tokenized message into its fixed-size matrix of GloVe vectors
sms["Embedded_Text"] = sms["Tokenized_Text"].map(lambda doc: embed_text(doc, word_embeds))

In [None]:
#Creating the Dataset class consumed by the DataLoaders
class load_dataset(Dataset):
    """Map-style dataset that pairs each feature item with its label.

    Args:
        X: Indexable collection of feature items (e.g. a list or pandas Series).
        Y: Indexable collection of labels with the same length as ``X``.
    """

    def __init__(self, X, Y):
        self.x = X
        self.y = Y

    def __len__(self):
        """Number of samples, taken from the label collection."""
        return len(self.y)

    @staticmethod
    def _at(container, idx):
        """Return the element at position ``idx``.

        A pandas Series resolves ``series[idx]`` by index label, which gives
        the wrong row (or a KeyError) unless the index happens to be 0..n-1.
        When the container exposes ``.iloc``, use it so the lookup is always
        positional; plain sequences are indexed directly.
        """
        positional = getattr(container, "iloc", None)
        return container[idx] if positional is None else positional[idx]

    def __getitem__(self, idx):
        """Return the ``(item, label)`` pair stored at position ``idx``."""
        return self._at(self.x, idx), self._at(self.y, idx)


In [None]:
class RNN(nn.Module):
    """Stacked ``nn.RNN`` followed by a linear layer producing two class scores.

    Args:
        vocab_size: Passed to ``nn.RNN`` as its input size, i.e. the number of
            features per time step (despite the name, not a vocabulary count).
        num_layers: Number of stacked recurrent layers.
        hidden_size: Width of each recurrent layer's hidden state.
    """

    def __init__(self, vocab_size, num_layers, hidden_size=256):
        # Setting up the model structure
        super().__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(vocab_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 2)

    def forward(self, x):
        """Score a batch of sequences.

        Args:
            x: Batch-first input; the last dimension must equal the input
                size given to the constructor.

        Returns:
            Tensor with one row of two scores per batch element, computed
            from the recurrent output at the final time step.
        """
        # No explicit initial state: nn.RNN starts from zeros that match the
        # input's device and dtype. The hand-built CPU/float32 zeros tensor
        # used before broke as soon as the input lived elsewhere.
        out, _ = self.rnn(x)
        return self.fc(out[:, -1])

In [None]:
#Defining a function to check accuracy

def acc_check(model,loader):
  """Print the classification accuracy of ``model`` over ``loader``.

  The model is put in eval mode for the pass and gradients are disabled;
  afterwards it is returned to whichever mode it was in on entry.

  Args:
    model: Module mapping a float32 batch to per-class scores along dim 1.
    loader: Iterable of ``(inputs, labels)`` batches.

  Returns:
    None. The result is reported with ``print`` only.
  """
  cor = 0
  samp = 0
  # Remember the current mode so evaluating an eval-mode model does not
  # silently switch it back to training.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      for val, lab in loader:
        val = val.to(torch.float32)
        score = model(val)
        _, pred = score.max(1)
        # .item() keeps the running count a plain int rather than a tensor
        cor += (pred == lab).sum().item()
        samp += pred.size(0)
  finally:
    model.train(was_training)
  # An empty loader reports 0.00 instead of dividing by zero
  accuracy = float(cor)/float(samp)*100 if samp else 0.0
  print(f'Got {cor} / {samp} with accuracy {accuracy:.2f}')
  return



In [None]:
def train_model(num_epochs, train_loader, model, criterion, optimizer):
    """Train ``model`` for ``num_epochs`` passes over ``train_loader``.

    For every batch the inputs are cast to float32, scored by the model and
    compared with the labels through ``criterion``; gradients are then
    cleared, back-propagated and applied with ``optimizer``. After each epoch
    the accuracy on ``train_loader`` is printed via ``acc_check``.

    Args:
        num_epochs: Number of full passes over the loader.
        train_loader: Iterable of ``(inputs, labels)`` batches.
        model: Module being trained.
        criterion: Loss callable taking ``(scores, labels)``.
        optimizer: Optimizer holding the model's parameters.
    """
    for _ in range(num_epochs):
        for batch, target in train_loader:
            # Forward pass on float32 inputs
            prediction = model(batch.to(torch.float32))

            # Loss of this batch
            batch_loss = criterion(prediction, target)

            # Clear old gradients, back-propagate, then take the update step
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        # Report training-set accuracy once per epoch
        acc_check(model, train_loader)


In [None]:

#Test-Train Split
# Hold out 20% of the messages; random_state fixes which rows go where.
train, test = train_test_split(sms, test_size=0.2, random_state=42, shuffle=True)
# Renumber the rows 0..n-1 (the previous index is kept as an "index" column)
# so the integer lookups issued by the DataLoader resolve.
train = train.reset_index()
test = test.reset_index()
train_ = load_dataset(train["Embedded_Text"],train["Label"])
test_ = load_dataset(test["Embedded_Text"],test["Label"])
# Batches of 5 samples, served in dataset order
train_loader = DataLoader(train_, batch_size=5)
test_loader = DataLoader(test_, batch_size=5)

#Model
# Seed PyTorch first: the weights are initialised randomly, so without a seed
# every run of the notebook trains a different model and reports different numbers.
torch.manual_seed(42)
# 50 input features per token, 2 recurrent layers, hidden width 256
model = RNN(50,2,256)

#Setting up Hyper Parameters
alpha = 0.0001   # learning rate
epochs = 3       # passes over the training set
crit = nn.CrossEntropyLoss()
opti  = torch.optim.Adam(model.parameters(),lr=alpha)




In [None]:
# Show the layer structure of the model
print(model)

RNN(
  (rnn): RNN(50, 256, num_layers=2, batch_first=True)
  (fc): Linear(in_features=256, out_features=2, bias=True)
)


In [None]:
# Train the network; the training-set accuracy is printed after every epoch
train_model(
    num_epochs=epochs,
    train_loader=train_loader,
    model=model,
    criterion=crit,
    optimizer=opti,
)

Got 4309 / 4459 with accuracy 96.64
Got 4338 / 4459 with accuracy 97.29
Got 4347 / 4459 with accuracy 97.49


In [None]:
# Accuracy on the held-out test split
acc_check(model=model, loader=test_loader)

Got 1072 / 1115 with accuracy 96.14


In [None]:
# Switch to eval mode (eval() returns the module itself) and pickle the whole model to disk
torch.save(model.eval(), "/content/model.pth")

In [None]:
# The checkpoint is a whole pickled module, not a state_dict, so it has to be
# loaded with weights_only=False: PyTorch releases whose torch.load defaults to
# weights_only=True refuse to unpickle it.
# NOTE: unpickling runs arbitrary code - only load checkpoints you created yourself.
model1 = torch.load("/content/model.pth", weights_only=False)

In [None]:
# Show the structure of the object loaded from disk
print(model1)