In [1]:
# !pip install transformers sentencepiece -q

import torch
from transformers import AutoTokenizer
from transformers.models.m2m_100.modeling_m2m_100 import M2M100Encoder

# Checkpoint id passed to both `from_pretrained` calls below.
model_name = "cointegrated/SONAR_200_text_encoder"
# Load the checkpoint into the M2M100 encoder class imported above.
encoder = M2M100Encoder.from_pretrained(model_name)
# Tokenizer is loaded from the same checkpoint id as the encoder.
# NOTE(review): later cells rebind the notebook-global `tokenizer`, so calls to
# `encode_mean_pool(..., tokenizer, encoder, ...)` must run before those cells.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def encode_mean_pool(texts, tokenizer, encoder, lang='eng_Latn', norm=False):
    """Embed `texts` by averaging the encoder's token states over attended positions.

    Args:
        texts: Input handed straight to `tokenizer` (the notebook passes a list of strings).
        tokenizer: Callable tokenizer; its `src_lang` attribute is overwritten with `lang`.
        encoder: Callable whose result exposes `last_hidden_state`.
        lang: Language code assigned to `tokenizer.src_lang` before tokenizing.
        norm: When true, pass the pooled vectors through `torch.nn.functional.normalize`.

    Returns:
        A tensor holding one pooled vector per input row, computed under
        `torch.inference_mode()`.
    """
    tokenizer.src_lang = lang
    with torch.inference_mode():
        encoded = tokenizer(texts, return_tensors='pt', padding=True)
        token_states = encoder(**encoded).last_hidden_state
        # Trailing singleton axis lets the mask broadcast across the hidden dimension.
        weights = encoded.attention_mask.unsqueeze(-1)
        summed = (token_states * weights).sum(1)
        counts = weights.sum(1)
        pooled = summed / counts
        if norm:
            pooled = torch.nn.functional.normalize(pooled)
    return pooled

# Smoke test: embed two English sentences and inspect the pooled vectors.
sentences = [
    'My name is SONAR.',
    'I can embed the sentences into vectorial space.',
]
embs = encode_mean_pool(sentences, tokenizer, encoder, lang="eng_Latn")
print(embs.shape)  # expected: torch.Size([2, 1024])
print(embs)
# expected:
# tensor([[-0.0053,  0.0020, -0.0006,  ...,  0.0094, -0.0009,  0.0070],
#         [-0.0003, -0.0071,  0.0076,  ...,  0.0055,  0.0022, -0.0083]])


config.json:   0%|          | 0.00/812 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.9k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/4.23k [00:00<?, ?B/s]

torch.Size([2, 1024])
tensor([[-0.0053,  0.0020, -0.0006,  ...,  0.0094, -0.0009,  0.0070],
        [-0.0003, -0.0071,  0.0076,  ...,  0.0055,  0.0022, -0.0083]])


In [3]:
# Second probe: two short sentences that both mention a cat and an animal.
sentences = [
    'Cat is a animal',
    'I have a cat i have a animal',
]
embs = encode_mean_pool(sentences, tokenizer, encoder, lang="eng_Latn")
print(embs.shape)  # expected: torch.Size([2, 1024])
print(embs)

torch.Size([2, 1024])
tensor([[ 3.2370e-04,  2.3489e-03,  9.9993e-05,  ..., -8.4636e-03,
          7.5012e-03,  3.8039e-03],
        [ 3.3168e-03,  2.4580e-03, -2.8700e-03,  ..., -1.0706e-02,
          3.2984e-03, -6.1285e-04]])


In [None]:
# NOTE(review): bare `pip` with no arguments and no execution count — looks like an
# unfinished install cell. TODO remove it or replace it with a pinned `%pip install ...`.
pip

In [4]:
# Load model directly
from transformers import AutoTokenizer, BertModel

# NOTE(review): this rebinds the notebook-global `tokenizer` created by the SONAR cell;
# any `encode_mean_pool(..., tokenizer, encoder, ...)` call run after this cell would
# receive this tokenizer instead.
tokenizer = AutoTokenizer.from_pretrained("sadakmed/dpr-passage_encoder-spanish")
# Loaded as a plain BertModel; its forward output exposes `last_hidden_state` and
# `pooler_output` (see the `model_output.keys()` cell).
model = BertModel.from_pretrained("sadakmed/dpr-passage_encoder-spanish")

tokenizer_config.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/489 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/242k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

In [6]:
query = "Disculpe buenas tardes el  Diplomado en Banca a cuanto esta?"
context = '''programa : diplomado en banca y transformación digital 
certificado academico : diplomado en banca y transformación digital
unidad de ejecucion : Centro de Estudios de Posgrado e Investigación
Facultad de Contaduría Pública y Ciencias Financieras
sede :  SUCRE: 
CEPI: Calle Aniceto Arce No. 46, zona central.
Telf. y Fax: (591) 4 6440887
Facultad de Contaduría Pública y Ciencias Financieras: Calle Grau Nº 149  .
TLF: (591) 4 6434025 - (591) 4 6452320
costo total : Bs 	3.600, 00
modalidad de ejecucion : virtual
'''
# Encode `query` and `context` together as one sequence pair, returned as a PyTorch tensor.
# NOTE(review): no truncation is requested — presumably fine for this short context, but
# confirm against the model's maximum input length before reusing with longer passages.
token = tokenizer.encode(query,context,return_tensors="pt")

In [9]:
# Forward pass on the encoded pair; the result exposes `last_hidden_state` and `pooler_output`.
# NOTE(review): called outside `torch.no_grad()` / `torch.inference_mode()`, so autograd
# bookkeeping is presumably active — confirm whether that is intended for pure inference.
model_output = model(token)

In [14]:
model_output.pooler_output.size()

torch.Size([1, 768])

In [18]:
model_output.pooler_output.shape

torch.Size([1, 768])

In [19]:
model_output.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [5]:
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer, DPRContextEncoder, DPRContextEncoderTokenizer

# NOTE(review): the output recorded for this cell warns that a `bert`-type checkpoint is
# being used to instantiate a `dpr` model and that many `bert_model.encoder.*` weights
# "were not initialized from the model checkpoint ... and are newly initialized". If that
# is accurate, embeddings from this model come from untrained weights — TODO confirm the
# checkpoint/class pairing before trusting any similarity score computed from it.
model_str = "IIC/dpr-spanish-question_encoder-allqa-base"
# NOTE(review): the next two lines rebind the notebook-global `tokenizer` and `model`
# that earlier cells also assign; `model_query_output` reads whichever binding is current.
tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(model_str)
model = DPRQuestionEncoder.from_pretrained(model_str)




You are using a model of type bert to instantiate a model of type dpr. This is not supported for all configurations of models and can yield errors.
Some weights of DPRQuestionEncoder were not initialized from the model checkpoint at IIC/dpr-spanish-question_encoder-allqa-base and are newly initialized: ['bert_model.encoder.layer.5.output.LayerNorm.weight', 'bert_model.encoder.layer.11.output.LayerNorm.bias', 'bert_model.encoder.layer.5.attention.output.dense.weight', 'bert_model.encoder.layer.7.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.6.attention.self.query.weight', 'bert_model.encoder.layer.7.intermediate.dense.bias', 'bert_model.encoder.layer.4.output.LayerNorm.weight', 'bert_model.encoder.layer.10.attention.self.query.weight', 'bert_model.encoder.layer.9.intermediate.dense.bias', 'bert_model.encoder.layer.9.attention.self.key.bias', 'bert_model.encoder.layer.0.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.11.attention.self.query.bias', 'bert_model.

In [47]:
def model_query_output(word):
    """Embed `word` with the notebook-global `tokenizer` and `model`.

    Args:
        word: Text forwarded to `tokenizer` with `return_tensors="pt"`.

    Returns:
        The model's `pooler_output`, detached and converted with `.numpy()`.
    """
    encoded = tokenizer(word, return_tensors="pt")
    # Only the token ids are forwarded to the model.
    pooled = model(encoded["input_ids"]).pooler_output
    return pooled.detach().numpy()

In [48]:
model_query_output("el perro es un animal")

array([[-6.72719419e-01, -4.45536263e-02,  8.14662218e-01,
        -1.79185390e-01, -1.64071977e-01, -2.39909863e+00,
        -5.92900991e-01, -9.53954220e-01,  2.48369742e-02,
        -1.29709363e+00,  1.90237403e+00,  5.54110467e-01,
        -9.99662220e-01, -3.71938199e-01,  5.75284474e-02,
        -4.67096835e-01,  7.82199681e-01, -4.91334617e-01,
        -9.97006357e-01, -1.29229021e+00, -1.62421358e+00,
         1.14575577e+00, -6.72953069e-01, -1.06598401e+00,
         1.95739603e+00, -3.22984546e-01, -6.57687187e-01,
         6.29309788e-02, -1.06082189e+00,  2.07091093e-01,
         9.94658828e-01, -2.13912189e-01,  1.62228525e-01,
        -7.77383029e-01,  3.53976071e-01,  1.27271414e+00,
         3.21519494e-01,  3.72279793e-01, -3.14089745e-01,
         1.51762173e-01, -4.12794828e-01, -1.12807512e+00,
        -1.59266818e+00,  1.94258541e-01, -2.37690777e-01,
         1.33744824e+00,  3.23186576e-01,  3.46380174e-01,
         5.83737791e-01,  3.65897059e-01,  9.61071968e-0

In [16]:
# NOTE(review): the output recorded for this cell warns that a `bert`-type checkpoint is
# being used to instantiate a `dpr` model and that many `bert_model.encoder.*` weights
# "were not initialized from the model checkpoint ... and are newly initialized". If that
# is accurate, context embeddings come from untrained weights — TODO confirm the
# checkpoint/class pairing before trusting any similarity score computed from it.
context_encoder_model = DPRContextEncoder.from_pretrained("IIC/dpr-spanish-passage_encoder-allqa-base")
# Tokenizer is loaded from the same checkpoint id as the context encoder.
context_encoder_tokenizer = DPRContextEncoderTokenizer.from_pretrained("IIC/dpr-spanish-passage_encoder-allqa-base")

You are using a model of type bert to instantiate a model of type dpr. This is not supported for all configurations of models and can yield errors.
Some weights of DPRContextEncoder were not initialized from the model checkpoint at IIC/dpr-spanish-passage_encoder-allqa-base and are newly initialized: ['bert_model.encoder.layer.5.output.LayerNorm.weight', 'bert_model.encoder.layer.11.output.LayerNorm.bias', 'bert_model.encoder.layer.5.attention.output.dense.weight', 'bert_model.encoder.layer.7.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.6.attention.self.query.weight', 'bert_model.encoder.layer.7.intermediate.dense.bias', 'bert_model.encoder.layer.4.output.LayerNorm.weight', 'bert_model.encoder.layer.10.attention.self.query.weight', 'bert_model.encoder.layer.9.intermediate.dense.bias', 'bert_model.encoder.layer.9.attention.self.key.bias', 'bert_model.encoder.layer.0.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.11.attention.self.query.bias', 'bert_model.en

In [20]:
input_context = '''programa : diplomado en banca y transformación digital 
certificado academico : diplomado en banca y transformación digital
unidad de ejecucion : Centro de Estudios de Posgrado e Investigación
Facultad de Contaduría Pública y Ciencias Financieras
sede :  SUCRE: 
CEPI: Calle Aniceto Arce No. 46, zona central.
Telf. y Fax: (591) 4 6440887
Facultad de Contaduría Pública y Ciencias Financieras: Calle Grau Nº 149  .
TLF: (591) 4 6434025 - (591) 4 6452320
costo total : Bs 	3.600, 00
modalidad de ejecucion : virtual
'''



In [53]:
def model_ctxt_output(text):
    """Embed `text` with the context encoder and return the pooled vector as a NumPy array.

    Args:
        text: Passage to encode; forwarded to `context_encoder_tokenizer` with
            `return_tensors="pt"`.

    Returns:
        `pooler_output` of `context_encoder_model`, detached and converted with `.numpy()`.
    """
    # Tokenize the argument (not the notebook-global `input_context`) so that each call
    # embeds the text it was actually given.
    context_token = context_encoder_tokenizer(text, return_tensors="pt")["input_ids"]
    output_context = context_encoder_model(context_token).pooler_output
    return output_context.detach().numpy()

In [54]:
model_ctxt_output(input_context)


array([[-1.14829220e-01, -3.91656518e-01,  1.09574795e+00,
         2.09859443e+00, -9.66568351e-01, -3.78206670e-01,
         4.35817614e-02, -1.21820249e-01,  5.17445445e-01,
        -8.74035358e-01,  3.73031870e-02, -4.24682796e-01,
         1.36622381e+00,  7.19356418e-01, -1.05592012e-01,
         7.49281943e-01, -2.57143408e-01,  5.38518548e-01,
        -1.11609781e+00, -5.50528109e-01, -8.79833937e-01,
         7.92579114e-01, -1.88262618e+00, -9.85792518e-01,
         1.03639710e+00, -2.60540456e-01, -1.02406323e+00,
        -1.64425516e+00,  5.53443611e-01, -1.01424766e+00,
        -1.81939662e-01, -1.57043350e+00,  6.11576617e-01,
         9.41590816e-02, -1.08460784e+00, -9.87856388e-01,
         2.87408428e-03,  1.33704567e+00,  2.22832227e+00,
        -5.50818816e-02,  4.79457140e-01, -1.09226358e+00,
         2.87283152e-01, -6.20542645e-01, -1.58212200e-01,
        -1.17285833e-01, -3.54650199e-01,  1.27832025e-01,
        -3.63534361e-01, -3.47788893e-02,  3.88755918e-0

In [27]:
from sklearn.metrics.pairwise import cosine_similarity

In [68]:
# Embed the price question with `model_query_output` and the program description with
# `model_ctxt_output`, ready for the cosine-similarity cells.
# NOTE(review): `query` is a plain string in an earlier cell; here it is rebound to the
# NumPy array returned by `model_query_output`.
query = model_query_output("a cuanto cuesta el curso de diplomado en banca y transformacion digital")
ctxt = model_ctxt_output(input_context)

In [56]:
# NOTE(review): presumably the score recorded for this cell was computed when `query`
# embedded "el perro es un animal" ("the dog is an animal") — confirm, since `query`
# is reassigned in another cell.
cosine_similarity(query, ctxt)

array([[0.02152175]], dtype=float32)

In [65]:
cosine_similarity(query, ctxt)

array([[0.02406216]], dtype=float32)

In [69]:
cosine_similarity(query, ctxt)

array([[0.02652429]], dtype=float32)