In [93]:
import torch
import torch.nn as nn
import pickle
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
import torch.nn.functional as F
import pandas as pd
import numpy as np



In [94]:
#%%
#Analogue of the nn.RNN module
class MyRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1, bias=True, nonlinearity='tanh'):
        super(MyRNN, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Initialize parameters
        self.weight_ih = nn.Parameter(torch.Tensor(num_layers, hidden_size, input_size))
        self.weight_hh = nn.Parameter(torch.Tensor(num_layers, hidden_size, hidden_size))
        if bias:
            self.bias_ih = nn.Parameter(torch.Tensor(num_layers, hidden_size))
            self.bias_hh = nn.Parameter(torch.Tensor(num_layers, hidden_size))
        else:
            self.register_parameter('bias_ih', None)
            self.register_parameter('bias_hh', None)

        self.nonlinearity = nonlinearity

        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1.0 / (self.hidden_size ** 0.5)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, stdv)

    def forward(self, input, hx=None):
        '''
        This function defines a forward RNN pass  

        Input: tensor of shape (batch_size, sequence_length, input_size)'
        Output: (output, hx) where output is a list of tensors oh  cell
        predictions, shape (num_layers, batch_size, hidden_size)
        '''
        # Initializes the hidden state if not provided
        if hx is None:
            hx = torch.zeros(self.num_layers, input.size(0), self.hidden_size, dtype=input.dtype, device=input.device)

        outputs = []

        #iterate over each time step
        for i in range(input.size(1)):
            hx = self.rnn_cell(input[:, i, :], hx)
            outputs.append(hx.unsqueeze(1))

        output = torch.cat(outputs, dim=1)
        return output, hx

    def rnn_cell(self, input, hx):
        '''
        Defines a run of one RNN batch for one time step

        Inputs: 
            input tensor of hape (batch_size, 1, input_size)
            hx tensor of shape (num_layers, batch_size, hidden_size)
        Output:
            tensor of shape (num_layers, batch_size, hidden_size)

        '''
        # Apply RNN cell computation  --> tensor (batch_size, hidden_size)
        gates = torch.matmul(input, self.weight_ih.transpose(0, 1)) + torch.matmul(hx, self.weight_hh.transpose(0, 1))
        if self.bias_ih is not None:
            gates += self.bias_ih.unsqueeze(0)
            gates += self.bias_hh.unsqueeze(0)
        if self.nonlinearity == 'tanh':
            return torch.tanh(gates)
        elif self.nonlinearity == 'relu':
            return torch.relu(gates)
        else:
            raise ValueError("Unsupported nonlinearity. Choose from 'tanh' or 'relu'.")



In [143]:

# %%
class fullRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(fullRNN, self).__init__()
        self.hidden_size=hidden_size
        self.rnn_cell=nn.RNN(input_size, hidden_size, batch_first=True)
        self.output_layer=nn.Linear(hidden_size, output_size)

    def forward(self, input):
        '''
        This functions defines forward prop through our RNN network.
        The input is a tensor of shape (seq_length, batch_size, input_size)
        The seq_length is number of examples
        '''
        print(input.size())
        
        #Initiates the hidden layer for the whole text
        hidden = torch.zeros(1, input.size(0), self.hidden_size)
        print(hidden.size())
        # hidden=torch.zeros (input.size(1), self.hidden_size)
        rnn_output, hidden = self.rnn_cell(input, hidden)
        output=self.output_layer(rnn_output[:, -1, :])
        return output

input_size = 128
hidden_size = 100
output_size = 100

# Step 1 - Create RNN for Query Tower + for Doc tower 
queryRNN = fullRNN(input_size, hidden_size, output_size)

# Step 2 - Load input data - pickle files have been tokenised by sentence piece and embedded by
# Data in format - [tesnor([tensor(query), tensor(rel_docs), tensor(irr_docs)]), .... ]
# For query
testData = []
trainingData = [] 
# To prep the data
validationData = []

with open('test_tokenised_triplets.pkl', 'rb') as file:
    testData = pickle.load(file)
with open('training_tokenised_triplets.pkl', 'rb') as file:
    trainingData = pickle.load(file)
with open('validation_tokenised_triplets.pkl', 'rb') as file:
    validationData = pickle.load(file)

    




In [131]:
# Step 1 - Get the dataset from MS Marco and put into data frame

training_query_dataset = pd.read_parquet("./v1.1-data/train.parquet") 



In [144]:

# Take the query out of the triplet 
query_list = []
relevant_doc_list = []
irrelevant_doc_list = []
for (query, relevant_doc, irrelevant_doc) in trainingData:
# This gives tensor([w1, w2, w3,...wn]) for each individual query
# Take the query out of a tensor form and keep as a list
# Then iterate over all of the triplets and pull them all out
    query_list.append(query)
    relevant_doc_list.append(relevant_doc)  ## TODO: don't tensorise in the first place
    irrelevant_doc_list.append(irrelevant_doc) ## TODO: don't tensorise in the first place
# Put them all in one list 
# Make this list a tensor

def pad(list_of_arrays):
    # Find the length of the longest list
    max_length = 0
    for lst in list_of_arrays:
        if len(lst) > max_length:
            max_length = len(lst)
            
    # Pad all lists to the same length
    padded_arrays = []
    for lst in list_of_arrays:
        arr = np.array(lst)
        padded_arr = np.pad(arr, ((max_length - len(arr), 0), (0, 0)), mode='constant', constant_values=0)
        padded_arrays.append(padded_arr)
    return padded_arrays

padded_queries = pad(query_list)

for i in range(len(query_list)):
    print(np.array(query_list[i]).shape)
    
for i in range(len(padded_queries)):
    print(padded_queries[i].shape)

query_list = torch.tensor(padded_queries).to(torch.float32)
relevant_doc_list = torch.tensor(pad(relevant_doc_list)).to(torch.float32)
irrelevant_doc_list = torch.tensor(pad(irrelevant_doc_list)).to(torch.float32)


(4, 128)
(4, 128)
(4, 128)
(4, 128)
(4, 128)
(4, 128)
(4, 128)
(4, 128)
(4, 128)
(4, 128)
(7, 128)
(7, 128)
(7, 128)
(7, 128)
(7, 128)
(7, 128)
(7, 128)
(12, 128)
(12, 128)
(12, 128)
(12, 128)
(12, 128)
(12, 128)
(12, 128)
(12, 128)
(12, 128)
(12, 128)
(6, 128)
(6, 128)
(6, 128)
(6, 128)
(6, 128)
(6, 128)
(6, 128)
(6, 128)
(6, 128)
(5, 128)
(5, 128)
(5, 128)
(5, 128)
(5, 128)
(5, 128)
(5, 128)
(5, 128)
(8, 128)
(8, 128)
(8, 128)
(8, 128)
(8, 128)
(8, 128)
(8, 128)
(8, 128)
(8, 128)
(8, 128)
(5, 128)
(5, 128)
(5, 128)
(5, 128)
(5, 128)
(5, 128)
(5, 128)
(5, 128)
(9, 128)
(9, 128)
(9, 128)
(9, 128)
(9, 128)
(9, 128)
(9, 128)
(9, 128)
(9, 128)
(6, 128)
(6, 128)
(6, 128)
(6, 128)
(6, 128)
(6, 128)
(6, 128)
(6, 128)
(6, 128)
(4, 128)
(4, 128)
(4, 128)
(4, 128)
(4, 128)
(4, 128)
(4, 128)
(4, 128)
(4, 128)
(15, 128)
(15, 128)
(15, 128)
(15, 128)
(15, 128)
(15, 128)
(15, 128)
(15, 128)
(15, 128)
(14, 128)
(14, 128)
(14, 128)
(14, 128)
(14, 128)
(14, 128)
(14, 128)
(14, 128)
(14, 128)
(14, 128)

In [145]:
# Step 3 - Pass data into model 
criterion = nn.NLLLoss()
learning_rate = 0.005 # param, play around with to learn
optimizer = torch.optim.SGD(queryRNN.parameters(), lr=learning_rate) #stochastic gradient descent

query_output = queryRNN(query_list)
# output.forward(trainintensor_query_listData[0][0])


torch.Size([814, 18, 128])
torch.Size([1, 814, 100])


In [146]:
# Step 4 - Create Document RNN
documentRNN = fullRNN(input_size, hidden_size, output_size)

# Step 5 - Run Document RNN
relevant_output = documentRNN(relevant_doc_list)
irrelevant_output = documentRNN(irrelevant_doc_list)

torch.Size([814, 12, 128])
torch.Size([1, 814, 100])
torch.Size([814, 12, 128])
torch.Size([1, 814, 100])


In [155]:

# Step 6 - Create Loss function
margin = 0.3

def difference_function(vector1, vector2):
    return F.cosine_similarity(vector1, vector2)

def triplet_loss_function(query_output, relevant_output, irrelevant_output, margin):
    relevant_similarity = difference_function(query_output, relevant_output)
    irrelevant_similarity = difference_function(query_output, irrelevant_output)
    triplet_loss = torch.max(torch.tensor(0), margin + relevant_similarity - irrelevant_similarity)
    print(triplet_loss)
    return triplet_loss.mean()

loss = triplet_loss_function(query_output, relevant_output, irrelevant_output, margin)

print(loss)

# Step 7 - Backpropogate





tensor([0.3494, 0.1325, 0.1891, 0.2993, 0.2451, 0.2466, 0.3652, 0.3316, 0.2988,
        0.3367, 0.3389, 0.1969, 0.2195, 0.3157, 0.3536, 0.3891, 0.3443, 0.3149,
        0.2158, 0.3080, 0.2606, 0.3190, 0.3805, 0.3449, 0.2939, 0.3048, 0.3307,
        0.3810, 0.4337, 0.3028, 0.2897, 0.2556, 0.3266, 0.2894, 0.2654, 0.4436,
        0.1875, 0.2710, 0.2749, 0.3171, 0.2741, 0.2737, 0.3266, 0.3362, 0.2154,
        0.2947, 0.2567, 0.3445, 0.2424, 0.2333, 0.2042, 0.2537, 0.2771, 0.2482,
        0.3204, 0.3564, 0.3668, 0.2740, 0.3294, 0.3325, 0.2824, 0.3329, 0.3166,
        0.4447, 0.2594, 0.1318, 0.3396, 0.1419, 0.2189, 0.3439, 0.2802, 0.3477,
        0.3473, 0.3435, 0.2719, 0.3980, 0.3305, 0.3446, 0.3274, 0.3281, 0.2704,
        0.3337, 0.2915, 0.2785, 0.3658, 0.2768, 0.2789, 0.2420, 0.3531, 0.2899,
        0.2900, 0.2192, 0.3437, 0.3382, 0.3467, 0.3188, 0.3092, 0.3458, 0.3537,
        0.3244, 0.2682, 0.2880, 0.2583, 0.2410, 0.3659, 0.2735, 0.2891, 0.3662,
        0.1883, 0.2406, 0.2943, 0.2863, 