In [16]:
# pip install transformers

In [17]:
import itertools
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

In [18]:
# get sparse vector from dense vectors with shape batch_size * seq_len * vocab_size
def get_sparse_vector(feature, output):
    """Collapse per-position vocabulary scores into one sparse vector per sample.

    Args:
        feature: mapping that provides an "attention_mask" tensor; the mask is
            given a trailing singleton axis and multiplied into ``output``.
        output: dense score tensor (batch_size * seq_len * vocab_size, per the
            comment above).

    Returns:
        ``log(1 + relu(max over dim 1))`` of the masked scores, with the
        columns of the special tokens set to 0.

    The special-token ids are taken from ``get_sparse_vector.special_token_ids``
    (the same function-attribute convention ``transform_sparse_vector_to_dict``
    uses). If that attribute was never assigned, the module-level
    ``special_token_ids`` is used so existing callers keep working.
    """
    # Masked positions are multiplied to 0, which is also the floor relu
    # applies below, so they can never contribute a positive weight.
    masked_scores = output * feature["attention_mask"].unsqueeze(-1)
    values, _ = torch.max(masked_scores, dim=1)
    values = torch.log(1 + torch.relu(values))

    ids_to_zero = getattr(get_sparse_vector, "special_token_ids", None)
    if ids_to_zero is None:
        # Legacy path: fall back to the notebook-level global.
        ids_to_zero = special_token_ids
    values[:, ids_to_zero] = 0
    return values
    
# transform the sparse vector to a dict of (token, weight)
def transform_sparse_vector_to_dict(sparse_vector):
    """Convert a batch of sparse vectors into one ``{token: weight}`` dict per row.

    Token strings are looked up in
    ``transform_sparse_vector_to_dict.id_to_token``, which must be assigned
    before the first call and be indexable by column index.

    Args:
        sparse_vector: 2-D tensor; each row is one sample and each column
            index is used as a token id.

    Returns:
        A list with exactly ``sparse_vector.shape[0]`` dicts, in row order.
        Each dict maps token string -> non-zero weight (Python float). A row
        with no non-zero entry yields an empty dict.
    """
    sample_indices, token_indices = torch.nonzero(sparse_vector, as_tuple=True)
    non_zero_values = sparse_vector[(sample_indices, token_indices)].tolist()

    # minlength keeps one count per row even when trailing rows (or every
    # row) are all zeros; without it those rows vanish from the result and
    # the output no longer lines up with the input batch.
    number_of_tokens_for_each_sample = torch.bincount(
        sample_indices, minlength=sparse_vector.shape[0]
    ).cpu().tolist()

    id_to_token = transform_sparse_vector_to_dict.id_to_token
    tokens = [id_to_token[_id] for _id in token_indices.tolist()]

    # The slicing below relies on torch.nonzero listing entries row by row,
    # so cumulative counts give each row's [start, end) range.
    boundaries = list(itertools.accumulate([0] + number_of_tokens_for_each_sample))
    output = []
    for start, end in zip(boundaries, boundaries[1:]):
        output.append(dict(zip(tokens[start:end], non_zero_values[start:end])))
    return output
    

In [19]:
# load the model
# The model and the tokenizer are loaded from the same checkpoint id; keeping
# it in one constant prevents the two from being changed independently.
MODEL_ID = "opensearch-project/opensearch-neural-sparse-encoding-v1"
model = AutoModelForMaskedLM.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

In [20]:
# set the special tokens and id_to_token transform for post-process
# all_special_ids resolves every special token to its id, including entries
# of special_tokens_map that are lists rather than single strings (a list
# cannot be used as a vocab key).
special_token_ids = tokenizer.all_special_ids
get_sparse_vector.special_token_ids = special_token_ids

# Size the lookup table from the largest id actually present so that tokens
# added on top of the base vocabulary cannot index past the end; never make
# it smaller than tokenizer.vocab_size, which was the previous size.
token_to_id = tokenizer.get_vocab()
id_table_size = max(tokenizer.vocab_size, max(token_to_id.values(), default=-1) + 1)
id_to_token = [""] * id_table_size
for token, _id in token_to_id.items():
    id_to_token[_id] = token
transform_sparse_vector_to_dict.id_to_token = id_to_token

In [21]:
query = "What's the weather in ny now?"
document = "Currently New York is rainy."

# Tokenize both texts as one padded batch: row 0 is the query, row 1 the document.
feature = tokenizer(
    [query, document],
    return_tensors='pt',
    padding=True,
    truncation=True,
    return_token_type_ids=False,
)
output = model(**feature)[0]
sparse_vector = get_sparse_vector(feature, output)

# Similarity score: product of the query row with the document row.
sim_score = torch.matmul(sparse_vector[0], sparse_vector[1])
print(sim_score)   # e.g. tensor(22.3298, grad_fn=<DotBackward0>)


# Walk the query tokens from highest to lowest weight and report the ones
# that also received a weight in the document.
query_token_weight, document_query_token_weight = transform_sparse_vector_to_dict(sparse_vector)
for token in sorted(query_token_weight, key=query_token_weight.get, reverse=True):
    if token not in document_query_token_weight:
        continue
    print(
        "score in query: %.4f, score in document: %.4f, token: %s"
        % (query_token_weight[token], document_query_token_weight[token], token)
    )

tensor(22.3298, grad_fn=<DotBackward0>)
score in query: 2.9262, score in document: 2.1335, token: ny
score in query: 2.5206, score in document: 1.5277, token: weather
score in query: 2.0373, score in document: 2.3489, token: york
score in query: 1.5786, score in document: 0.8752, token: cool
score in query: 1.4636, score in document: 1.5132, token: current
score in query: 0.7761, score in document: 0.8860, token: season
score in query: 0.7560, score in document: 0.6726, token: 2020
score in query: 0.7222, score in document: 0.6292, token: summer
score in query: 0.6888, score in document: 0.6419, token: nina
score in query: 0.6451, score in document: 0.8200, token: storm
score in query: 0.4698, score in document: 0.7635, token: brooklyn
score in query: 0.4562, score in document: 0.1208, token: julian
score in query: 0.3484, score in document: 0.3903, token: wow
score in query: 0.3439, score in document: 0.4160, token: usa
score in query: 0.2751, score in document: 0.8260, token: manhattan

In [24]:
# Display the full {token: weight} dict produced for the query above.
query_token_weight

{'now': 2.2065000534057617,
 'season': 0.7760677337646484,
 'here': 0.026180144399404526,
 'air': 0.14392752945423126,
 'april': 0.03711361065506935,
 'york': 2.0373146533966064,
 'already': 0.03775929659605026,
 'summer': 0.7222039699554443,
 'today': 0.25874048471450806,
 'change': 0.09803194552659988,
 'news': 0.3427507281303406,
 'current': 1.4635943174362183,
 'future': 0.001597201102413237,
 'sun': 0.14983253180980682,
 'martin': 0.04427485167980194,
 'nature': 0.11914681643247604,
 'travel': 0.28682243824005127,
 'tonight': 0.15597288310527802,
 'usa': 0.34385496377944946,
 'storm': 0.64512699842453,
 '##ius': 0.05066607519984245,
 'weather': 2.5205931663513184,
 'cool': 1.5786479711532593,
 'climate': 0.16530534625053406,
 'temperature': 0.06651061773300171,
 'ny': 2.926173210144043,
 'julian': 0.4562440812587738,
 'brooklyn': 0.46979716420173645,
 'mood': 0.19892123341560364,
 'milan': 0.06909207999706268,
 'manhattan': 0.27505552768707275,
 'nina': 0.6887561678886414,
 'fog':

In [27]:
query_1 = "summary of the pages"

# Run the encode -> model -> sparsify steps on a batch holding only query_1.
encode_query_1 = tokenizer(
    query_1,
    padding=True,
    truncation=True,
    return_tensors='pt',
)
output_1 = model(**encode_query_1)[0]
sparse_vector_1 = get_sparse_vector(encode_query_1, output_1)

# transform_sparse_vector_to_dict returns a list, so this name is bound to a
# list of {token: weight} dicts rather than to a single dict.
document_query_token_weight_1 = transform_sparse_vector_to_dict(sparse_vector_1)

In [28]:
# Display the result for query_1: a list of {token: weight} dicts.
document_query_token_weight_1

[{'.': 0.09795466810464859,
  'the': 0.9895220994949341,
  'of': 0.5942500233650208,
  'these': 0.0709637925028801,
  'book': 0.45201361179351807,
  'main': 0.16776703298091888,
  'story': 0.5015198588371277,
  'written': 0.09777490049600601,
  'hall': 0.10336891561746597,
  'various': 0.15453986823558807,
  'total': 0.4604087173938751,
  'information': 0.11041329056024551,
  'seven': 0.2504394054412842,
  'table': 0.5144628286361694,
  'list': 0.5308113098144531,
  'section': 0.29919686913490295,
  'novel': 0.21108883619308472,
  'chapter': 0.46096885204315186,
  'report': 0.16256111860275269,
  'view': 0.02961583063006401,
  'structure': 0.159208282828331,
  'review': 0.32896003127098083,
  'chart': 0.6405640244483948,
  'article': 0.32595589756965637,
  'sides': 0.5633036494255066,
  'literature': 0.10077744722366333,
  'page': 2.215363025665283,
  'analysis': 0.23568813502788544,
  'content': 0.07434964179992676,
  'eric': 0.2624856233596802,
  'circle': 0.1595962643623352,
  'brie