# Data preprocessing

In [1]:
# read meta data
import json

# Function to read a .jsonl file
def read_jsonl(file_path):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path to a text file holding one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # text encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Skip blank lines (e.g. stray empty lines at the end of the
            # file); json.loads would raise on them.
            if not line.strip():
                continue
            data.append(json.loads(line))
    return data

# Load every record of the Movies & TV metadata file into a list.
meta_jsonl_path = '../data/meta_Movies_and_TV.jsonl'
movies_meta_data = read_jsonl(meta_jsonl_path)

In [2]:
# Map each parent ASIN to a one-element list holding its title. Records whose
# title is None are left out; if a parent ASIN repeats, the last record wins.
asin_dict = {
    record['parent_asin']: [record['title']]
    for record in movies_meta_data
    if record['title'] is not None
}

In [3]:
# Write asin_dict to disk as a single JSON document.
json_file_asins = '../data/meta_asins.json'
with open(json_file_asins, "w") as out_file:
    out_file.write(json.dumps(asin_dict))

# Miscellaneous experiments

In [None]:
import torch

# Use the GPU when one is present, otherwise fall back to the CPU so this
# indexing demo also runs on machines without CUDA.
demo_device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

test_tensor = torch.tensor(
    [[1, 2, 3, 4, 5],
     [6, 7, 8, 9, 10],
     [11, 12, 13, 14, 15]],
    device=demo_device,
)

# Both expressions pick the last column and keep it as a trailing dimension of
# size 1 (shape (3, 1)); `:` and `...` are interchangeable here because the
# tensor is 2-D.
test_tensor[:, -1, None], test_tensor[..., -1, None]

In [9]:
from transformers import TextStreamer, GenerationConfig
from unsloth import FastLanguageModel

# Loading options.
load_in_4bit = True    # 4-bit quantization to reduce memory usage; can be set to False.
dtype = None           # None auto-detects; float16 for Tesla T4/V100, bfloat16 for Ampere+.
max_seq_length = 4096  # 8192 is an alternative; per the original note, RoPE scaling is handled internally.

# Load the checkpoint that was used for training, together with its tokenizer.
pretrained_kwargs = dict(
    model_name="../outputs/model_04242024_090830/",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**pretrained_kwargs)

# Put the model into Unsloth's inference mode (noted originally as
# "native 2x faster inference").
FastLanguageModel.for_inference(model)

==((====))==  Unsloth: Fast Gemma patching release 2024.4
   \\   /|    GPU: Tesla T4. Max memory: 14.581 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.2. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


In [10]:
# Decode a short id sequence and the single id 3 to inspect the text they map to.
(
    tokenizer.decode([2, 58423, 8920, 235292]),
    tokenizer.decode(3),
)

('<bos>Justified:', '<unk>')

In [5]:
import json

# Load asins_small.json, the mapping used for metadata matching.
# Note: this rebinds asin_dict to the contents of that file.
json_file_asins = '../data/asins_small.json'
with open(json_file_asins, "r") as in_file:
    asin_dict = json.loads(in_file.read())

In [6]:
# Trie definition
class TrieNode:
    """One node of the token trie."""

    def __init__(self, id):
        """Create a node for token id `id` with no children."""
        # Child nodes keyed by token id.
        self.children = {}
        # Token id this node stands for.
        self.node_value = id


class Trie:
    """Prefix tree over token-id sequences.

    The root stands for the beginning-of-sequence token; every inserted
    sequence becomes a path of child nodes below the root.
    """

    def __init__(self, bos_token_id, pad_token_id=None):
        """Create an empty trie.

        Args:
            bos_token_id: Token id stored in the root node.
            pad_token_id: Token id stored in the self-looping pad node. When
                omitted, it is read from the module-level `tokenizer`, which
                is what the original implementation always did.
        """
        self.root = TrieNode(bos_token_id)  # start-of-sentence id is the root
        if pad_token_id is None:
            # Backward-compatible fallback to the notebook's global tokenizer.
            pad_token_id = tokenizer.pad_token_id
        self.pad_token_id = pad_token_id
        # The pad node points to itself under the pad id. Neither insert nor
        # search_children consults it.
        self.pad_node = TrieNode(self.pad_token_id)
        self.pad_node.children[self.pad_token_id] = self.pad_node

    def insert(self, input_ids_list):
        """Add a token-id sequence as a path below the root.

        The first element is skipped without being checked; it is expected to
        be the start-of-sentence id that the root already represents.
        """
        node = self.root
        for token_id in input_ids_list[1:]:
            if token_id not in node.children:
                node.children[token_id] = TrieNode(token_id)
            node = node.children[token_id]

    def search_children(self, input_ids_list):
        """Follow `input_ids_list` through the trie and return the last node.

        Args:
            input_ids_list: Path to follow. The first element is skipped
                (expected to be the start-of-sentence id), so a list of
                length 0 or 1 yields the root.

        Returns:
            The TrieNode reached after consuming every id; its `children`
            mapping holds the ids that may follow the given prefix.

        Raises:
            ValueError: If some id along the path has no matching child.
        """
        node = self.root
        # An empty slice simply leaves `node` at the root, so no special case
        # is needed for single-element input.
        for traverse_depth, token_id in enumerate(input_ids_list[1:]):
            child = node.children.get(token_id)
            if child is None:
                # A single formatted message; passing several positional
                # arguments to ValueError would render as a tuple.
                raise ValueError(
                    "input_ids_list not found in the trie. "
                    f"Traverse failed at node: {node.node_value} "
                    f"at depth: {traverse_depth}"
                )
            node = child
        return node

In [7]:
# Build a trie with one path per movie title in asin_dict.
trie = Trie(tokenizer.bos_token_id)
for title_entry in asin_dict.values():
    # Each value is a list whose first element is the title string.
    movie_title = title_entry[0]

    # Tokenize straight to a Python list of ids. Asking for PyTorch tensors
    # and immediately calling .tolist() yields the same ids but allocates a
    # tensor per title for nothing.
    title_ids = list(tokenizer(movie_title)["input_ids"])

    # Terminate every title with the end-of-sentence id.
    title_ids.append(tokenizer.eos_token_id)

    # Trie.insert skips the first id; this assumes the tokenizer prepends the
    # BOS token that the trie root represents -- TODO confirm for this tokenizer.
    trie.insert(title_ids)

In [9]:
# Walk the trie along a known id prefix and show which ids may come next.
trie.search_children(
    [2, 58423, 8920, 235292, 13316, 235248, 235274, 892, 41330, 235290, 1040, 235307]
).children

{1: <__main__.TrieNode at 0x7f3fd3f59d50>}

In [1]:
import torch
next_token_logits = torch.tensor([[-29.2969, -13.4375, -20.1094, -23.3125, -22.7500, -28.5625],
        [-30.3125, -12.4453, -30.0625, -31.4688, -30.4219, -29.7500],
        [-27.1094, -12.6562, -18.6250, -16.5156, -19.5000, -26.5469]],
       device='cuda:0', dtype=torch.float16)
next_token_scores = torch.nn.functional.log_softmax(next_token_logits, dim=-1)
next_token_scores

tensor([[-1.5859e+01, -1.4076e-03, -6.6719e+00, -9.8750e+00, -9.3125e+00,
         -1.5125e+01],
        [-1.7875e+01,  0.0000e+00, -1.7625e+01, -1.9031e+01, -1.7969e+01,
         -1.7312e+01],
        [-1.4477e+01, -2.4399e-02, -5.9922e+00, -3.8828e+00, -6.8672e+00,
         -1.3914e+01]], device='cuda:0', dtype=torch.float16)

In [10]:
import torch

# Scores where the last three entries are masked out with -inf.
test_token_scores = torch.tensor([1.3, 2.1, 3.4, -torch.inf, -torch.inf, -torch.inf])
# softmax turns the -inf entries into probability 0, so they cannot be drawn.
test_token_scores = torch.nn.functional.softmax(test_token_scores, dim=-1)

# Sample from a dedicated, seeded generator so the displayed draw is
# reproducible across runs without altering the global RNG state.
SAMPLE_SEED = 0
sample_generator = torch.Generator().manual_seed(SAMPLE_SEED)
sampled_ids = torch.multinomial(
    test_token_scores, num_samples=6, replacement=True, generator=sample_generator
)
sampled_ids

tensor([2, 2, 2, 2, 2, 1])