In [1]:
import torchtext
import torch
import torch.nn as nn
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import string
from string import punctuation
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

In [2]:
from google.colab import drive
drive.mount('/content/drive')
PATH = "drive/MyDrive/"

Mounted at /content/drive


In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
torch.cuda.get_device_name(0)

cuda


'Tesla P100-PCIE-16GB'

In [4]:
tokenizer = torchtext.data.utils.get_tokenizer("basic_english")
vectorizer = torchtext.vocab.GloVe(name='840B', dim=300)

.vector_cache/glove.840B.300d.zip: 2.18GB [06:50, 5.30MB/s]                           
100%|█████████▉| 2194988/2196017 [03:40<00:00, 10493.09it/s]

In [5]:
train_dataset = pd.read_csv(PATH + 'train_dataset.csv')
val_dataset = pd.read_csv(PATH + 'val_dataset.csv')

In [6]:
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    text = text.strip()
    return text

In [7]:
class TextSimilarityDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.x1 = self.data['question1'].values
        self.x2 = self.data['question2'].values
        self.y = self.data['is_duplicate'].values

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return text_to_wordlist(str(self.x1[index])), text_to_wordlist(str(self.x2[index])), self.y[index]

In [8]:
def generate_dataset_collate_fn(batch):
    score = []
    q1_tok = []
    q2_tok = []
    q1_vect = []
    q2_vect = []
    max_q1 = 0
    max_q2 = 0
    
    # Tokenize and find max sequence length
    for i in range(len(batch)):
        q1_tokens = tokenizer(batch[i][0])
        q2_tokens = tokenizer(batch[i][1])
        q1_tok.append(q1_tokens)
        q2_tok.append(q2_tokens)
        if max_q1 < len(q1_tokens):
            max_q1 = len(q1_tokens)
        if max_q2 < len(q2_tokens):
            max_q2 = len(q2_tokens)
        score.append(torch.tensor(batch[i][2]))

    # Pad the sequence the length q1
    for q in q1_tok:
        if q != max_q1:
            diff = max_q1 - len(q)
            for add in range(0, diff):
                q.insert(0, '<pad>')

    # Pad the sequence the length q2
    for q in q2_tok:
        if q != max_q2:
            diff = max_q2 - len(q)
            for add in range(0, diff):
                q.insert(0, '<pad>')

    # Vectorize the sequence the length q1
    for q in q1_tok:
        q1_vect.append(vectorizer.get_vecs_by_tokens(q))

    # Vectorize the sequence the length q2
    for q in q2_tok:
        q2_vect.append(vectorizer.get_vecs_by_tokens(q))

    return torch.stack(q1_vect), torch.stack(q2_vect), torch.stack(score)

In [9]:
train_dataset = TextSimilarityDataset(train_dataset)
val_dataset = TextSimilarityDataset(val_dataset)

In [15]:
class NlpSiamese(nn.Module):
    def __init__(self, embed_size, hidden_size, num_layers):
        super(NlpSiamese, self).__init__()
        self.lstm = nn.LSTM(input_size=embed_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True, dropout=0.5)
        self.linear = nn.Linear(in_features=(hidden_size*2), out_features=2)
        self.tanh = nn.Tanh()
        self.layers = num_layers
        self.hidden = hidden_size

    def forward(self, x1, x2):
        h0 = torch.zeros(self.layers, x1.shape[0], self.hidden).to(device)
        c0 = torch.zeros(self.layers, x1.shape[0], self.hidden).to(device)

        out1, _ = self.lstm(x1, (h0, c0))
        out2, _ = self.lstm(x2, (h0, c0))

        # Taking the last output
        out1 = out1[:, -1, :]
        out2 = out2[:, -1, :]

        out = torch.cat([out1, out2], dim=1)
        
        # linear layer
        out = self.linear(out)

        return out

In [13]:
# Hyper-Parameters
embed_size = 300 #From Glove
hidden_size = 512
num_layers = 2
learning_rate = 0.001
num_epochs = 40
batch_size = 32

In [16]:
train_iterator = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, collate_fn=generate_dataset_collate_fn, num_workers=0)
val_iterator = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=True, collate_fn=generate_dataset_collate_fn, num_workers=0)

In [17]:
model = NlpSiamese(embed_size, hidden_size, num_layers).to(device)

In [18]:
positive = 119433
negative = 204047
class_weight = torch.tensor([positive/(positive+negative), negative/(positive+negative)]).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weight, reduction='mean').to(device)
optim = torch.optim.Adam(model.parameters(), lr = learning_rate)

In [19]:
FILE = "siamese-new.pth"

In [20]:
model.load_state_dict(torch.load(PATH + FILE))

RuntimeError: ignored

In [35]:
train_loss = 0
train_count = 0
val_loss = 0
val_count = 0
for epoch in range(num_epochs):
    # Train Loop
    print(f'Epoch {epoch+1} Started training')
    train_progress = tqdm(total=train_dataset.__len__())
    model.train(True)
    for _, batch in enumerate(train_iterator):
        q1, q2, score = batch[0].to(device), batch[1].to(device), batch[2].to(device)
        output = model.forward(q1, q2)
        loss = criterion(output, score)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1)
        optim.zero_grad()
        loss.backward()
        optim.step()
        train_count += 1
        train_loss += loss
        train_progress.update(batch_size)
    print(f'Completed {epoch+1} epoch of {num_epochs} epochs - Current Training loss is {train_loss/train_count}')
    train_progress.close()

    # Val loop
    print(f'Epoch {epoch+1} Started validation')
    val_progress = tqdm(total=val_dataset.__len__())
    model.train(False)
    with torch.no_grad():
        for _, batch in enumerate(val_iterator):
            q1, q2, score = batch[0].to(device), batch[1].to(device), batch[2].to(device)
            output = model.forward(q1, q2)
            loss = criterion(output, score)
            val_loss += loss
            val_count += 1
            val_progress.update(batch_size)
    print(f'Completed {epoch+1} epoch of {num_epochs} epochs - Current Validation loss is {train_loss/train_count}')
    val_progress.close()
    torch.save(model.state_dict(), PATH + FILE)

Epoch 1 Started training


HBox(children=(FloatProgress(value=0.0, max=323480.0), HTML(value='')))

Completed 1 epoch of 40 epochs - Current Training loss is 0.18497306108474731

Epoch 1 Started validation


HBox(children=(FloatProgress(value=0.0, max=80871.0), HTML(value='')))

Completed 1 epoch of 40 epochs - Current Validation loss is 0.18497306108474731

Epoch 2 Started training


HBox(children=(FloatProgress(value=0.0, max=323480.0), HTML(value='')))

Completed 2 epoch of 40 epochs - Current Training loss is 0.18287013471126556

Epoch 2 Started validation


HBox(children=(FloatProgress(value=0.0, max=80871.0), HTML(value='')))

Completed 2 epoch of 40 epochs - Current Validation loss is 0.18287013471126556

Epoch 3 Started training


HBox(children=(FloatProgress(value=0.0, max=323480.0), HTML(value='')))

Completed 3 epoch of 40 epochs - Current Training loss is 0.18132749199867249

Epoch 3 Started validation


HBox(children=(FloatProgress(value=0.0, max=80871.0), HTML(value='')))

Completed 3 epoch of 40 epochs - Current Validation loss is 0.18132749199867249

Epoch 4 Started training


HBox(children=(FloatProgress(value=0.0, max=323480.0), HTML(value='')))

Completed 4 epoch of 40 epochs - Current Training loss is 0.1803884357213974

Epoch 4 Started validation


HBox(children=(FloatProgress(value=0.0, max=80871.0), HTML(value='')))

Completed 4 epoch of 40 epochs - Current Validation loss is 0.1803884357213974

Epoch 5 Started training


HBox(children=(FloatProgress(value=0.0, max=323480.0), HTML(value='')))

Completed 5 epoch of 40 epochs - Current Training loss is 0.17912805080413818

Epoch 5 Started validation


HBox(children=(FloatProgress(value=0.0, max=80871.0), HTML(value='')))

Completed 5 epoch of 40 epochs - Current Validation loss is 0.17912805080413818

Epoch 6 Started training


HBox(children=(FloatProgress(value=0.0, max=323480.0), HTML(value='')))

Completed 6 epoch of 40 epochs - Current Training loss is 0.17797525227069855

Epoch 6 Started validation


HBox(children=(FloatProgress(value=0.0, max=80871.0), HTML(value='')))

Completed 6 epoch of 40 epochs - Current Validation loss is 0.17797525227069855

Epoch 7 Started training


HBox(children=(FloatProgress(value=0.0, max=323480.0), HTML(value='')))

Completed 7 epoch of 40 epochs - Current Training loss is 0.17697365581989288

Epoch 7 Started validation


HBox(children=(FloatProgress(value=0.0, max=80871.0), HTML(value='')))

Completed 7 epoch of 40 epochs - Current Validation loss is 0.17697365581989288

Epoch 8 Started training


HBox(children=(FloatProgress(value=0.0, max=323480.0), HTML(value='')))

Completed 8 epoch of 40 epochs - Current Training loss is 0.17605093121528625

Epoch 8 Started validation


HBox(children=(FloatProgress(value=0.0, max=80871.0), HTML(value='')))

Completed 8 epoch of 40 epochs - Current Validation loss is 0.17605093121528625

Epoch 9 Started training


HBox(children=(FloatProgress(value=0.0, max=323480.0), HTML(value='')))

Completed 9 epoch of 40 epochs - Current Training loss is 0.1753523349761963

Epoch 9 Started validation


HBox(children=(FloatProgress(value=0.0, max=80871.0), HTML(value='')))

Completed 9 epoch of 40 epochs - Current Validation loss is 0.1753523349761963

Epoch 10 Started training


HBox(children=(FloatProgress(value=0.0, max=323480.0), HTML(value='')))

Completed 10 epoch of 40 epochs - Current Training loss is 0.1747191846370697

Epoch 10 Started validation


HBox(children=(FloatProgress(value=0.0, max=80871.0), HTML(value='')))

Completed 10 epoch of 40 epochs - Current Validation loss is 0.1747191846370697

Epoch 11 Started training


HBox(children=(FloatProgress(value=0.0, max=323480.0), HTML(value='')))

Completed 11 epoch of 40 epochs - Current Training loss is 0.17414601147174835

Epoch 11 Started validation


HBox(children=(FloatProgress(value=0.0, max=80871.0), HTML(value='')))

Completed 11 epoch of 40 epochs - Current Validation loss is 0.17414601147174835

Epoch 12 Started training


HBox(children=(FloatProgress(value=0.0, max=323480.0), HTML(value='')))

Completed 12 epoch of 40 epochs - Current Training loss is 0.17365477979183197

Epoch 12 Started validation


HBox(children=(FloatProgress(value=0.0, max=80871.0), HTML(value='')))

Completed 12 epoch of 40 epochs - Current Validation loss is 0.17365477979183197

Epoch 13 Started training


HBox(children=(FloatProgress(value=0.0, max=323480.0), HTML(value='')))

Completed 13 epoch of 40 epochs - Current Training loss is 0.17325328290462494

Epoch 13 Started validation


HBox(children=(FloatProgress(value=0.0, max=80871.0), HTML(value='')))

Completed 13 epoch of 40 epochs - Current Validation loss is 0.17325328290462494

Epoch 14 Started training


HBox(children=(FloatProgress(value=0.0, max=323480.0), HTML(value='')))

Completed 14 epoch of 40 epochs - Current Training loss is 0.17312879860401154

Epoch 14 Started validation


HBox(children=(FloatProgress(value=0.0, max=80871.0), HTML(value='')))

Completed 14 epoch of 40 epochs - Current Validation loss is 0.17312879860401154

Epoch 15 Started training


HBox(children=(FloatProgress(value=0.0, max=323480.0), HTML(value='')))

Completed 15 epoch of 40 epochs - Current Training loss is 0.17296521365642548

Epoch 15 Started validation


HBox(children=(FloatProgress(value=0.0, max=80871.0), HTML(value='')))

Completed 15 epoch of 40 epochs - Current Validation loss is 0.17296521365642548

Epoch 16 Started training


HBox(children=(FloatProgress(value=0.0, max=323480.0), HTML(value='')))

Completed 16 epoch of 40 epochs - Current Training loss is 0.17282751202583313

Epoch 16 Started validation


HBox(children=(FloatProgress(value=0.0, max=80871.0), HTML(value='')))

Completed 16 epoch of 40 epochs - Current Validation loss is 0.17282751202583313

Epoch 17 Started training


HBox(children=(FloatProgress(value=0.0, max=323480.0), HTML(value='')))

KeyboardInterrupt: ignored

In [74]:
def predict(model, x1, x2):
    x1_tokens = tokenizer(text_to_wordlist(str(x1)))
    x2_tokens = tokenizer(text_to_wordlist(str(x2)))
    x1_vect = vectorizer.get_vecs_by_tokens(x1_tokens)
    x2_vect = vectorizer.get_vecs_by_tokens(x1_tokens)
    x1_vect = torch.stack([x1_vect]).to(device)
    x2_vect = torch.stack([x2_vect]).to(device)
    activation = nn.Softmax(dim=1)
    model.train(False)
    with torch.no_grad():
      out = model.forward(x1_vect, x2_vect)
      out = activation(out)
      out = torch.squeeze(out)
      if out[0] < out[1]:
          return True
      else:
          return False

In [80]:
predict(model, "Light mode", "Dark mode")

True