In [1]:
"""#default_exp faster"""

### Try to store locally initialized object for future usage so that it runs faster overall
### aka, those default settings, e.g., tokenizer(), EmoModel()

In [2]:
"""
a dictionary that store locally initialized objects
that are mostly likely to be reused in the future
"""
ret = {}

In [3]:
"""
1. improve the modularity: one function -> one function xD
2. make this a decorator so that it's more elegant to use for every function
"""
def get_key(obj):
    key = str(obj.__repr__)
    return key

def check_in_ret(obj):
    if obj is None:
        return False
    key = get_key(obj)
    return key in ret

def add_to_ret(obj):
    # add called instance to ret: the dict()
    # see how fast.ai impl. Callback class
    if obj is None:
        return False
    key = get_key(obj)
    if key not in ret:
        ret[key] = obj
        return True
    return False

In [4]:
from EMO_AI.data_process import get_tokenizer

In [5]:
g = get_tokenizer()
g

Tokenizer(vocabulary_size=50265, model=ByteLevelBPE, add_prefix_space=False, lowercase=False, dropout=None, unicode_normalizer=None, continuing_subword_prefix=None, end_of_word_suffix=None, trim_offsets=False)

In [6]:
add_to_ret(g)

True

In [7]:
ret

{'<bound method BaseTokenizer.__repr__ of Tokenizer(vocabulary_size=50265, model=ByteLevelBPE, add_prefix_space=False, lowercase=False, dropout=None, unicode_normalizer=None, continuing_subword_prefix=None, end_of_word_suffix=None, trim_offsets=False)>': Tokenizer(vocabulary_size=50265, model=ByteLevelBPE, add_prefix_space=False, lowercase=False, dropout=None, unicode_normalizer=None, continuing_subword_prefix=None, end_of_word_suffix=None, trim_offsets=False)}

In [9]:
check_in_ret(g)

True

### Should we just use @lru_cache for the functions that will initialized default objects?

In [1]:
import torch
text = "I was so dump that i did not realize"

In [2]:
from functools import lru_cache

In [3]:
from EMO_AI.data_process import get_tokenizer
@lru_cache(maxsize=8)
def convert_text_to_tensor(text, tokenizer=None):
    if tokenizer is None:
        tokenizer = get_tokenizer()
    print("id: ", id(tokenizer))
    enc = tokenizer.encode(text)
    X = torch.tensor(enc.ids).unsqueeze(0)
    Attn = torch.tensor(enc.attention_mask).unsqueeze(0)
    return (X, Attn)

In [4]:
%%time
_ = convert_text_to_tensor(text)

id:  2614168374024
Wall time: 316 ms


In [5]:
%%time
_ = convert_text_to_tensor(text)

Wall time: 0 ns


In [6]:
_

(tensor([[    0,   100,    21,    98, 12371,    14,   939,   222,    45,  4883,
              2,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,   