In [57]:
import torch
from transformers import AutoModel, AutoTokenizer
from scipy.spatial.distance import cosine

In [58]:
# Get our models - The package will take care of downloading the models automatically
# For best performance: Muennighoff/SGPT-5.8B-weightedmean-msmarco-specb-bitfit
tokenizer = AutoTokenizer.from_pretrained("Muennighoff/SGPT-125M-weightedmean-msmarco-specb-bitfit")
model = AutoModel.from_pretrained("Muennighoff/SGPT-125M-weightedmean-msmarco-specb-bitfit")
# Deactivate Dropout (There is no dropout in the above models so it makes no difference here but other SGPT models may have dropout)
# model.load_state_dict(torch.load('models/sgpt-bi/model_1.pt'))
model.eval()

GPTNeoModel(
  (wte): Embedding(50259, 768)
  (wpe): Embedding(2048, 768)
  (drop): Dropout(p=0.0, inplace=False)
  (h): ModuleList(
    (0): GPTNeoBlock(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPTNeoAttention(
        (attention): GPTNeoSelfAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (k_proj): Linear(in_features=768, out_features=768, bias=False)
          (v_proj): Linear(in_features=768, out_features=768, bias=False)
          (q_proj): Linear(in_features=768, out_features=768, bias=False)
          (out_proj): Linear(in_features=768, out_features=768, bias=True)
        )
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPTNeoMLP(
        (c_fc): Linear(in_features=768, out_features=3072, bias=True)
        (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        (act): NewGELUActivation()
       

In [59]:

queries = [
    "software engineer",
]

docs = [
    "software engineer c, c++, java, uml, xamp, agile. defence/communications our client is a successful and expanding company developing air defence systems, information systems, targeting systems, communication systems. they now require an additional software engineer to support the design evolution from requirements into software code. identification of test requirements and the development of component test harnesses. supporting design and code review activities to derisk the design evolution qualifications for the software engineer. degree in software engineering, computer science, maths, physics or equivalent good degree pref 1st or ****:1, meng or equiv. essential requirements: software engineer experience in one or more of the following: object oriented analysis and design (preferably  uml)  unix /linux or windows, c, c++, java. html, php candidates from a defence background experience of agile development an advantage. experience of xamp technologies useful. desirable requirements:  software engineer  experience in one or more of the following: realtime design. mathematical or algorithm coding  network protocols  oo case tools  structured design. methods  data and voice networks  computer telephony integration (cti) an appreciation of the following: testing techniques and strategies  configuration management personal attributes flexibility, adaptability, team player, good communication skills, discipline the company provide excellent career prospects and career development recent software graduates will also be considered for these roles. keywords software engineer, c, c++, java, realtime, embedded, uml, linux, unix, oo, agile, xamp. defence, communication systems. salary ****k****k location fleet, hampshire",
    "user experience developer our client requires a user experience developer to help develop and maintain new and existing projects. the successful candidate will report directly to the technical lead. the post will be based in the farnham, surrey area. applicants applicants will have 1**** years flex development experience and should be able to provide a portfolio of their work. successful applicants will be those seeking a challenging opportunity with an exciting and expanding organisation and who are enthusiastic, flexible and personable, able to work well on both an individual basis and as part of a development team. the positions require good communication skills, both oral and written. specific technical skills/knowledge actionscript 3 flex adobe air flashbuilder basic technical skills/knowledge oo programming and/or mvc framework (highly desirable) software design using uml (desirable) test driven development (highly desirable) version control (desirable) other skills/knowledge of interest javascript, html & css, php/.net/python ios/android process user experience developer location: farnham, surrey salary up to ****",
    "a leading ecommerce agency is looking to hire a web developer to join their team of ecommerce developers. the role will involve working on different types of client and a candidate should have the ability to create and implement unique, engaging, and intuitive work to meet client requirements. requirements: 34 years of experience in web development **** or more years on ecommerce projects in an agency setting development skills in the following technologies: o asp.net and asp o vb and vb.net o sql server / tsql o com/dcom o xml, html o javascript knowledge of microsoft commerce server, any other ecommerce solutions if this sounds like a challenge that you are ready to take up, send in your cv now",
    "a successful software organisation based in cheshire is on the lookout for a skilled and experienced software developer to join their technical team. you will be involved in the full software development lifecycle from design to implementation and maintaining current applications and websites. qualifications: minimum of 2 years commercial development experience using; vb.net, c.net, ms sql experience with the following technologies is a must: o visual studio (**** and 2010) o sql management studio o visual sourcesafe/team foundation server should have a working experience or knowledge of the following: o asp.net o ajax, javascript, css, html, xml o web services, tsql, sql scripting / stored procedures this is a great opportunity for you to be a part of a well known leader in financial software. if you think you are able to take on a new challenge, send your cv now"
]



In [60]:
SPECB_QUE_BOS = tokenizer.encode("[", add_special_tokens=False)[0]
SPECB_QUE_EOS = tokenizer.encode("]", add_special_tokens=False)[0]

SPECB_DOC_BOS = tokenizer.encode("{", add_special_tokens=False)[0]
SPECB_DOC_EOS = tokenizer.encode("}", add_special_tokens=False)[0]


def tokenize_with_specb(texts, is_query):
    # Tokenize without padding
    batch_tokens = tokenizer(texts, padding=False, truncation=True)   
    # Add special brackets & pay attention to them
    for seq, att in zip(batch_tokens["input_ids"], batch_tokens["attention_mask"]):
        if is_query:
            seq.insert(0, SPECB_QUE_BOS)
            seq.append(SPECB_QUE_EOS)
        else:
            seq.insert(0, SPECB_DOC_BOS)
            seq.append(SPECB_DOC_EOS)
        att.insert(0, 1)
        att.append(1)
    # Add padding
    batch_tokens = tokenizer.pad(batch_tokens, padding=True, return_tensors="pt")
    return batch_tokens

def get_weightedmean_embedding(batch_tokens, model):
    # Get the embeddings
    with torch.no_grad():
        # Get hidden state of shape [bs, seq_len, hid_dim]
        last_hidden_state = model(**batch_tokens, output_hidden_states=True, return_dict=True).last_hidden_state

    # Get weights of shape [bs, seq_len, hid_dim]
    weights = (
        torch.arange(start=1, end=last_hidden_state.shape[1] + 1)
        .unsqueeze(0)
        .unsqueeze(-1)
        .expand(last_hidden_state.size())
        .float().to(last_hidden_state.device)
    )

    # Get attn mask of shape [bs, seq_len, hid_dim]
    input_mask_expanded = (
        batch_tokens["attention_mask"]
        .unsqueeze(-1)
        .expand(last_hidden_state.size())
        .float()
    )

    # Perform weighted mean pooling across seq_len: bs, seq_len, hidden_dim -> bs, hidden_dim
    sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded * weights, dim=1)
    sum_mask = torch.sum(input_mask_expanded * weights, dim=1)

    embeddings = sum_embeddings / sum_mask

    return embeddings


In [63]:
query_embeddings = get_weightedmean_embedding(tokenize_with_specb(queries, is_query=True), model)
doc_embeddings = get_weightedmean_embedding(tokenize_with_specb(docs, is_query=False), model)

In [64]:
# Calculate cosine similarities
# Cosine similarities are in [-1, 1]. Higher means more similar
cosine_sim_0_1 = 1 - cosine(query_embeddings[0], doc_embeddings[0])
cosine_sim_0_2 = 1 - cosine(query_embeddings[0], doc_embeddings[1])
cosine_sim_0_3 = 1 - cosine(query_embeddings[0], doc_embeddings[3])

print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (queries[0], docs[0][:20] + "...", cosine_sim_0_1))
print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (queries[0], docs[1][:20] + "...", cosine_sim_0_2))
print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (queries[0], docs[2][:20] + "...", cosine_sim_0_3))

Cosine similarity between "software engineer" and "software engineer c,..." is: 0.684
Cosine similarity between "software engineer" and "user experience deve..." is: 0.386
Cosine similarity between "software engineer" and "a leading ecommerce ..." is: 0.589
