In [1]:
import torch
import torch.nn as nn
import torchtext
import pandas as pd
from torch.utils.data import Dataset, DataLoader

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
torch.cuda.get_device_name(0)

cuda


'GeForce GTX 1050 Ti'

In [3]:
vectorizer = torchtext.vocab.GloVe(name='6B', dim=50)
tokenizer = torchtext.data.get_tokenizer("basic_english")

In [4]:
temp_text = "Hello this is harish"
temp_text = tokenizer(temp_text)
temp_text = vectorizer.get_vecs_by_tokens(temp_text, lower_case_backup=True)
temp_text.shape

torch.Size([4, 50])

In [5]:
class TextSimilarityDataset(Dataset):
    def __init__(self, tokenizer, vectorizer):
        self.data = pd.read_csv('questions.csv')
        self.vectorizer = vectorizer
        self.tokenizer = tokenizer
        self.y = torch.from_numpy(self.data['is_duplicate'].values)
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        tx1 = self.tokenizer(str(self.data['question1'].values[index]))
        tx2 = self.tokenizer(str(self.data['question2'].values[index]))
        x1 = vectorizer.get_vecs_by_tokens(tx1, lower_case_backup=True)
        x2 = vectorizer.get_vecs_by_tokens(tx2, lower_case_backup=True)
        return x1, x2, self.y[index]

In [6]:
dataset = TextSimilarityDataset(tokenizer, vectorizer)

In [7]:
def gen_dataset(batch):
    score = []
    q1 = []
    q2 = []
    max_pad_q1 = 0
    max_pad_q2 = 0
    
    
    for i in range(len(batch)):
        if max_pad_q1 < batch[i][0].shape[0]:
            max_pad_q1 = batch[i][0].shape[0]

        if max_pad_q2 < batch[i][1].shape[0]:
            max_pad_q2 = batch[i][1].shape[0]


    for i in range(len(batch)):
        
        if batch[i][0].shape[0] == max_pad_q1:
            q1.append(batch[i][0])
        else:
            q1_temp = torch.zeros(max_pad_q1, 50)
            q1_temp[:batch[i][0].shape[0], :] = batch[i][0]
            q1.append(q1_temp)
        
        if batch[i][1].shape[0] == max_pad_q2:
            q2.append(batch[i][1])
        else:
            q2_temp = torch.zeros(max_pad_q2, 50)
            q2_temp[:batch[i][1].shape[0], :] = batch[i][1]
            q2.append(q2_temp)
        
        score.append(batch[i][2])
        
    return torch.stack(q1), torch.stack(q2), torch.stack(score)

In [8]:
train_iterator = DataLoader(dataset=dataset, batch_size=32, shuffle=True, collate_fn=gen_dataset, num_workers=0)

In [9]:
class SiameseLSTM(nn.Module):
    def __init__(self, input_dim, embedding_size, hidden_size, num_layers, dropout):
        super(SiameseLSTM, self).__init__()
        self.dropout = nn.Dropout(dropout)
        self.embedding = nn.Embedding(input_dim, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=dropout)
        self.sigmoid = nn.Sigmoid()
        
    def forward_one(self, x):
        x = self.dropout(self.embedding(x))
        out, _ = self.rnn(x)
        return out
        
    def calc_dist(self, x1, x2):
        x1 = x1[-1:,-1:]
        x2 = x2[-1:,-1:]
        return torch.exp(-(torch.norm(x1 - x2))).to(device)
    
    def forward(self, x1, x2):
        x1 = self.forward_one(x1)
        x2 = self.forward_one(x2)
        out = torch.zeros(x1.shape[0]).to(device)
        for i in range(x1.shape[0]):
            out[i] = self.sigmoid(self.calc_dist(x1[i], x2[i]))
        return out

In [10]:
# Training Hyper parameters
num_epochs = 20
learning_rate = 0.001
batch_size = 32
hidden = 1024
layers = 2
dropout = 0.5
inpt_size = 50
embed_size = 300

In [11]:
model = SiameseLSTM(inpt_size, embed_size, hidden, layers, dropout).to(device)

In [None]:
torch.utils.tensorboard.Summa

In [12]:
err = nn.BCEWithLogitsLoss()
optim = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [13]:
FILE = "siamese-new.pth"

In [102]:
for epoch in range(num_epochs):
    count = 0
    for _, batch in enumerate(train_iterator):
        q1, q2, score = batch[0].to(device), batch[1].to(device), batch[2].to(device)
        output = model.forward(q1, q2)
        score = score.type(torch.cuda.FloatTensor)
        loss = err(output, score)
        optim.zero_grad()
        loss.backward()
        optim.step()
        count += 1
        if count % 50 == 0:
            print(f'{epoch+1} epoch of {num_epochs} epochs - {count+1} batch - Current loss is {loss}')
    torch.save(model.state_dict(), FILE)

RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.cuda.FloatTensor instead (while checking arguments for embedding)

In [52]:
def predict(model, x1, x2):
    x1 = tokenizer(x1)
    x1 = vectorizer.get_vecs_by_tokens(x1, lower_case_backup=True)
    x2 = tokenizer(x2)
    x2 = vectorizer.get_vecs_by_tokens(x2, lower_case_backup=True)
    x1 = torch.stack([x1]).to(device)
    x2 = torch.stack([x2]).to(device)
    out = model.forward(x1, x2)
    print(out[0])
    if out[0] > 0.5:
        return True
    else:
        return False

In [87]:
out = predict(model, "Lion looks for its den", "Kite flies over the sky")
print(out)

tensor(0.7311, device='cuda:0', grad_fn=<SelectBackward>)
True
