In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
tokenizer = AutoTokenizer.from_pretrained("bigcode/starencoder")
model = AutoModel.from_pretrained("bigcode/starencoder")

Downloading (…)okenizer_config.json: 100%|██████████| 973/973 [00:00<00:00, 375kB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 777k/777k [00:00<00:00, 1.43MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 442k/442k [00:00<00:00, 1.56MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 2.06M/2.06M [00:00<00:00, 3.97MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 532/532 [00:00<00:00, 1.64MB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 667/667 [00:00<00:00, 663kB/s]
Downloading pytorch_model.bin: 100%|██████████| 499M/499M [02:12<00:00, 3.77MB/s] 


In [9]:
def get_similarity(text1, text2):
    # Tokenize the texts
    encoded_text1 = tokenizer.encode(text1, add_special_tokens=True)
    encoded_text2 = tokenizer.encode(text2, add_special_tokens=True)

    # Define the maximum length for padding
    max_length = max(len(encoded_text1), len(encoded_text2))

    # Pad the encoded texts manually
    padded_text1 = encoded_text1 + [0] * (max_length - len(encoded_text1))
    padded_text2 = encoded_text2 + [0] * (max_length - len(encoded_text2))

    # Convert the padded texts to tensors
    padded_text1_tensor = torch.tensor(padded_text1).unsqueeze(0)
    padded_text2_tensor = torch.tensor(padded_text2).unsqueeze(0)

    # Generate the embeddings for the padded texts using the model
    with torch.no_grad():
        embeddings_text1 = model(padded_text1_tensor).last_hidden_state.mean(dim=1)
        embeddings_text2 = model(padded_text2_tensor).last_hidden_state.mean(dim=1)

    # Calculate the cosine similarity between the embeddings
    similarity_score = cosine_similarity(embeddings_text1, embeddings_text2)[0][0]
    return similarity_score

In [10]:
# Define your texts
text1 = "I like Wombats for a reason."
text2 = "A very different one speaking about animals having square poop"

similarity_score = get_similarity(text1=text1, text2=text2)

# Print the similarity score
print("Similarity score:", similarity_score)


Similarity score: 0.92662966


In [19]:
crawler1_path = "testing_embeding/crawler1.py"
crawler2_path = "testing_embeding/crawler2.py"
snake_path = "testing_embeding/snake.py"

code_examples = []

with open(crawler1_path, 'r') as f:
    code_examples.append(('crawler1', f.read()))
with open(crawler2_path, 'r') as f:
    code_examples.append(('crawler2', f.read()))
with open(snake_path, 'r') as f:
    code_examples.append(('snake', f.read()))

code_examples.extend([('snake questing', 'how do i write a snake game in python'),
                      ('crawler question 1', 'how do i crawl the web with python'),
                      ('crawler question 2', 'how to scrape the net with python')])

In [20]:
from itertools import permutations

# Get all permutations of pairs
pair_permutations = list(permutations(code_examples, 2))

# Print the pair permutations
for pair in pair_permutations:
    print(f'similarity between {pair[0][0]} and {pair[1][0]}: {get_similarity(text1=pair[0][1], text2=pair[1][1])}')


similarity between crawler1 and crawler2: 0.9679924249649048
similarity between crawler1 and snake: 0.5613067746162415
similarity between crawler1 and snake questing: 0.507878839969635
similarity between crawler1 and crawler question 1: 0.5067800283432007
similarity between crawler1 and crawler question 2: 0.5064317584037781
similarity between crawler2 and crawler1: 0.9679924249649048
similarity between crawler2 and snake: 0.6073487401008606
similarity between crawler2 and snake questing: 0.4860556125640869
similarity between crawler2 and crawler question 1: 0.48266953229904175
similarity between crawler2 and crawler question 2: 0.48531633615493774
similarity between snake and crawler1: 0.5613067746162415
similarity between snake and crawler2: 0.6073487401008606
similarity between snake and snake questing: 0.4142509698867798
similarity between snake and crawler question 1: 0.4103277027606964
similarity between snake and crawler question 2: 0.41300565004348755
similarity between snake q