# Helpers

In [None]:
import random
import numpy as np
from tqdm import tqdm
import torch

from transformers import RobertaConfig
from transformers import AdamW
from transformers import RobertaForMaskedLM
from tqdm.auto import tqdm

In [9]:


def read_data(path):
    '''read sequences from '''
    with open(path, 'r') as f:
        seqs= f.read().split('\n')
    return seqs

def filter_sequence(seqs, min_length=3):
    '''filter a sequnce on minimum length'''
    filtered_seqs=[]
        
    for val in seqs:
        temp= [value for value in val.split(" ") if value != '']
        if len(temp)>= min_length:
            filtered_seqs.append(val)
    return filtered_seqs


def train_test_split(data, train_ratio=0.25):
    '''shuffle and split data into train test split'''
    random.shuffle(data)
    train_range= int(len(data)*train_ratio)
    return data[:train_range], data[train_range:]

def merge_seq_arr(arr):
    '''Merge array of items to space seperated sequence'''
    seq=""
    for item in arr: 
         seq+=item + " "
    return seq
    
def split_fixed_length_sequence(data, max_length=9):
    '''split sequences on fixed lengths to increse data points'''
    for seq in data: 
        if len(seq)> max_length:
            pass

        
def chunks(lst, n=8):
    """return successive n-sized chunks from lst."""
    slices=[]
    lst = lst.split(" ")
    for i in range(n, len(lst)+n, n):
        slices.append(merge_seq_arr(lst[i-n:i]))

    return slices

def extend_data_points(data, chunk_size=9):
    extended_data=[]
    for seqs in data:
        extended_data+=chunks(seqs, chunk_size)
    return extended_data

#masking function
def mlm(tensor):
    rand= torch.rand(tensor.shape) #[0,1]
    #masking tokens that are not seq, /seq, pad
    mask_arr = (rand < 0.30) * (tensor > 2)
    for i in range(tensor.shape[0]):
        selection = torch.flatten(mask_arr[i].nonzero()).tolist() #[[2,3,: non zero indicies]]
        tensor[i, selection]= 4 # assigning mask token to the selected token
    return tensor 


def mfm(tensor):
    counter=0
    for i in range(tensor.shape[0]):
        idx=tensor[i].detach().clone().tolist().index(2)
        tensor[i][idx-1]=4
        counter+=1

    return tensor 




In [10]:
#initialize model input data lists
input_ids =[]
mask =[]
labels= []
path = "vocab.txt"

    
base_seq= read_data(path)
fil_seq= filter_sequence(base_seq)
test_seq, train_seq= train_test_split(fil_seq)

updated_train= extend_data_points(train_seq)
updated_test= extend_data_points(test_seq)

updated_train= filter_sequence(updated_train)
updated_test= filter_sequence(updated_test)





In [11]:
token_mapping={"<s>":0,"<pad>":1,"</s>":2,"<unk>":3,"<mask>":4,}
iter_val=5

for seqs in base_seq:
    tokens=[value for value in seqs.split(" ") if value != '']
    unique_token=list(set(tokens))
    
    for token in unique_token:
        if token not in token_mapping:
            token_mapping[token] = iter_val
            iter_val+=1

import json 

with open("vocab_.json", "w") as outfile:
    json.dump(token_mapping, outfile)
    

In [12]:

def tokenizer(token_mapping, seqs, max_length=11 ):
    
    attention_masks=[]
    input_ids=[]
    
    for seq in tqdm(seqs):
        encoded_sequence=[]
        mask_sequence=[]
        
        #starting token 
        encoded_sequence.append(token_mapping['<s>'])
        
        #breaking sequence in tokens
        seq=[value for value in seq.split(" ")[0:5] if value != '']
        
        #appending encoded items to sequence
        for token in seq:
            encoded_sequence.append(token_mapping[token])
            
        #appending ending token
        encoded_sequence.append(token_mapping['</s>'])
        
        mask_sequence=[1]* len(encoded_sequence)
        
        #padding to max length
        if len(encoded_sequence)< max_length:
            for i in range(max_length-len(encoded_sequence)):
                encoded_sequence.append(token_mapping['<pad>'])
                mask_sequence.append(0)
                

        attention_masks.append(mask_sequence)
        input_ids.append(encoded_sequence)

    return torch.tensor(attention_masks), torch.tensor(input_ids)
    

In [13]:
#initialize model input data lists
input_ids =[]
mask =[]
labels= []


mask, input_ids= tokenizer(token_mapping, updated_train, max_length=11)
vocab_size= len(token_mapping.keys())

    
# sample = tokenizer(updated_train, max_length= 11, padding= 'max_length', truncation=True, return_tensors='pt')
labels=(input_ids)
# mask=(sample.attention_mask)
input_ids=(mfm(input_ids.detach().clone()))


100%|█████████████████████████████████| 54958/54958 [00:00<00:00, 330691.12it/s]


In [15]:
encodings ={'input_ids': input_ids, 'attention_mask': mask , 'labels':labels}

In [16]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        # store encodings internally
        self.encodings = encodings

    def __len__(self):
        # return the number of samples
        return self.encodings['input_ids'].shape[0]

    def __getitem__(self, i):
        # return dictionary of input_ids, attention_mask, and labels for index i
        return {key: tensor[i] for key, tensor in self.encodings.items()}

In [17]:
dataset = Dataset(encodings)
loader = torch.utils.data.DataLoader(dataset, batch_size= 32, shuffle =True) 

In [84]:

config = RobertaConfig(
    vocab_size=vocab_size,  # we align this to the tokenizer vocab_size
    max_position_embeddings=11,
    hidden_size=282,
    num_attention_heads=6,
    num_hidden_layers=4,
    type_vocab_size=1
)

In [85]:



model = RobertaForMaskedLM(config)
print(model.num_parameters())
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
print (device)
# activate training mode
model.train()
# initialize optimizer
optim = AdamW(model.parameters(), lr=1e-4)

8525220
cuda


In [86]:
epochs = 5

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        labels=labels)

        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  0%|          | 0/859 [00:00<?, ?it/s]

  0%|          | 0/859 [00:00<?, ?it/s]

  0%|          | 0/859 [00:00<?, ?it/s]

  0%|          | 0/859 [00:00<?, ?it/s]

  0%|          | 0/859 [00:00<?, ?it/s]

In [87]:
model.save_pretrained('eth_items')  

In [88]:
dataset = Dataset(encodings)
loader = torch.utils.data.DataLoader(dataset, batch_size= 64, shuffle =True) 

In [89]:
a={1:'a',2:'b'}

inv_map = {v: k for k, v in a.items()}

print(inv_map)

{'a': 1, 'b': 2}


In [90]:
'''Custom Inference Code'''

import heapq
import numpy as np
import pdb
import torch


def decode_sequence(token_mapping, seqs):
    
    out=[]
    reverse_map= {v: k for k, v in token_mapping.items()}
    
    for token in seqs:
        out.append(reverse_map[token])

    print("encoding",seqs, out)
    return out
    

def decode_inference(output,labels, token_mapping, topN=10):
    decoded_out=[]
    


    for i in tqdm(range(len(output))):

        mask_list=labels[i].tolist()
        
        idx= mask_list.index(2) - 1 # label for mask
        
        import pdb; pdb.set_trace()
        pred= np.array(output[i][idx].detach().tolist())
        pred=heapq.nlargest(topN, range(len(pred)), pred.take)

        out_decoded= decode_sequence(token_mapping, pred)
            

        decoded_out.append(out_decoded)
    return decoded_out


def compute_acc(output,labels, token_mapping, topN=10):
    decoded_out=[]
    
    correct=0

    for i in tqdm(range(len(output))):

        mask_list=labels[i].tolist()
        
        idx= mask_list.index(2) - 1 # label for mask
        
#         import pdb; pdb.set_trace()
        pred= np.array(output[i][idx].detach().tolist())
        pred=heapq.nlargest(topN, range(len(pred)), pred.take)
        
        if labels[i][idx] in pred:
#             print("Correct")
            correct+=1
            


    return (correct/len(labels))*100

def infer(model, token_mapping ,seqs, topN=9):
    

    mask, input_ids= tokenizer(token_mapping, seqs, max_length=11)
    vocab_size= len(token_mapping.keys())


    # sample = tokenizer(updated_train, max_length= 11, padding= 'max_length', truncation=True, return_tensors='pt')
    labels=(input_ids)
    # mask=(sample.attention_mask)
    input_ids=(mfm(input_ids.detach().clone()))

    encodings ={'input_ids': input_ids, 'attention_mask': mask , 'labels':labels}
    
    dataset = Dataset(encodings)
    loader = torch.utils.data.DataLoader(dataset, batch_size= 64, shuffle =False)
    loader = tqdm(loader, leave=True)
    
    batch_out=[]
    batch_label=[]
    
    
    for batch in loader:

        input_ids_ = batch['input_ids']
        attention_mask = batch['attention_mask']
        
        inference= model(input_ids_.cuda(), attention_mask=attention_mask.cuda())
        
        batch_out+=(inference['logits'].tolist())
        batch_label+=(batch['labels'].tolist())
        
        
    print(inference['logits'].shape)
    
    
    
    output= compute_acc(torch.tensor(batch_out), torch.tensor(batch_label)  ,token_mapping, topN)
    
    return output


def compute_accuracy(predicted, actual, tok):
    correct=0
    
    print(predicted[0], actual[0])
    print(len(predicted), len(actual))
    for i in range(len(predicted)):
        last_item= [value for value in actual[i].split(" ") if value != ''][-1]
#         predicted_items= [value for value in predicted[i].split(" ") if value != '']
        predicted_items=predicted[i]
        
        import pdb; pdb.set_trace()
        print("last Item", last_item, predicted_items)
        if last_item in predicted_items:
            
            correct+=1
            print("true")
            
    print(correct)
    return (correct/len(actual)) *100
            
        
    
# print(infer['logits'][0][0])
# test_list=infer['logits'][0][10].tolist()


# test_list = np.array(test_list)
# idx=heapq.nlargest(10, range(len(test_list)), test_list.take)

# print(idx)
# tokenizer.decode(idx)

In [91]:
top_n=10

train_results= infer(model, token_mapping, updated_train, top_n)
print("##################################")
print(train_results)




# test_results= infer(model, token_mapping, updated_test, top_n)
# print("computing accuracy")
# test_acc= compute_accuracy(test_results, updated_test)
# print(test_acc)

  0%|          | 0/54958 [00:00<?, ?it/s]

  0%|          | 0/859 [00:00<?, ?it/s]

torch.Size([46, 11, 762])


  0%|          | 0/54958 [00:00<?, ?it/s]

##################################
37.270279122238804


In [92]:
test_results= infer(model, token_mapping, updated_test, top_n)
print("##################################")
print(test_results)



  0%|          | 0/16293 [00:00<?, ?it/s]

  0%|          | 0/255 [00:00<?, ?it/s]

torch.Size([37, 11, 762])


  0%|          | 0/16293 [00:00<?, ?it/s]

##################################
27.75425029153624
