In [2]:
# Use the %pip magic rather than a "!pip" shell escape: %pip installs into the
# environment of the running kernel, whereas "!pip" runs whichever pip is first
# on the shell PATH. Consider pinning a version for reproducible re-runs.
%pip install transformers



In [3]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM

# The tokenizer and the masked-LM weights are loaded from the same checkpoint,
# so the checkpoint name is written once and shared by both loaders.
checkpoint = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForMaskedLM.from_pretrained(checkpoint)

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

import json

# JSONL export of the issues: one JSON object per line.
ISSUES_PATH = "issues_hyperledger.jsonl"

# Parallel lists: titles[i] and descriptions[i] belong to the same issue.
titles = []
descriptions = []

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open(ISSUES_PATH, "r", encoding="utf-8") as file:
    for line in file:
        # Tolerate blank lines (e.g. a trailing newline at the end of the file),
        # which json.loads would otherwise reject.
        if not line.strip():
            continue
        record = json.loads(line)
        # .get() lets records that lack a field be skipped instead of raising
        # KeyError; empty or null values are skipped as well.
        title = record.get("title")
        description = record.get("description")
        if title and description:
            titles.append(title)
            descriptions.append(description)

# Preview the first few extracted titles and descriptions.
print("Titles:", titles[:5])
print("Descriptions:", descriptions[:5])


Titles: ['Revocation / indy-sdk proof verification fails after issuing a second or more credentials', '2019-03 license scanning results', 'Python Wrapper must be moved to hyperledger', 'Link URSA contributing guide once it exists.', 'No contributing guide.']
Descriptions: ["We are facing some issues with proofing credentials issued on cred defs with revocation support.\r\nWhen there's only one credential issued for the corresponding revoc reg everything is fine.\r\nBut if we issue another credential, every following proof is verified as false.\r\nAttached you can find some code, using mainly the indy sdk and also some functions from the Aries Framework for .NET.\r\n\r\nThis has already been discussed in the aries and ursa chatrooms and was beeing considered to be related to ursa.", 'Issues to address from the latest software license scan. The attached .txt file has the notes, the attached .xls has the details, the attached .spdx file is for scripts to chew on.', 'Currently the code liv

In [5]:
# Pair every title with the description at the same index.
data = [*zip(titles, descriptions)]

In [6]:
# Sanity check: print a one-element slice holding the first (title, description) pair.
print(data[:1])

[('Revocation / indy-sdk proof verification fails after issuing a second or more credentials', "We are facing some issues with proofing credentials issued on cred defs with revocation support.\r\nWhen there's only one credential issued for the corresponding revoc reg everything is fine.\r\nBut if we issue another credential, every following proof is verified as false.\r\nAttached you can find some code, using mainly the indy sdk and also some functions from the Aries Framework for .NET.\r\n\r\nThis has already been discussed in the aries and ursa chatrooms and was beeing considered to be related to ursa.")]


In [7]:
# Tokenize the titles first, then the descriptions, with identical options:
# padding and truncation enabled, PyTorch tensors ("pt") as the return type.
title_tokens, description_tokens = (
    tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    for texts in (titles, descriptions)
)

In [8]:
# Use the %pip magic so torch is installed into the running kernel's
# environment rather than whichever pip is first on the shell PATH.
# NOTE(review): the tokenization cell above already requests PyTorch tensors
# (return_tensors="pt"), so on a fresh kernel this install presumably needs to
# run before that cell -- confirm and move it to the top if so.
%pip install torch



In [None]:
import torch

def _first_token_embeddings(tokens, batch_size=32):
    """Return the final-layer hidden state at sequence position 0 for every row.

    `model` was loaded with AutoModelForMaskedLM, whose output object carries
    `logits` (and optionally `hidden_states`) but has no `last_hidden_state`
    attribute. The hidden states are therefore requested explicitly and the
    last entry of the returned tuple (the final layer's output) is used.

    Rows are pushed through the model in slices of `batch_size` so that memory
    use is bounded by the batch rather than by the whole corpus.

    Args:
        tokens: tokenizer output holding equally sized tensors
            (it must contain "input_ids").
        batch_size: number of rows per forward pass.

    Returns:
        A NumPy array with one embedding row per input row.
    """
    n_rows = tokens["input_ids"].shape[0]
    chunks = []
    # Inference only: no gradients are needed.
    with torch.no_grad():
        for start in range(0, n_rows, batch_size):
            batch = {
                name: tensor[start:start + batch_size]
                for name, tensor in tokens.items()
            }
            outputs = model(**batch, output_hidden_states=True)
            # Position 0 is the first token of each sequence (presumably the
            # "<s>" start token for this RoBERTa-style tokenizer -- confirm).
            chunks.append(outputs.hidden_states[-1][:, 0, :])
    return torch.cat(chunks).numpy()


# Generate embeddings for titles and descriptions using the model
title_embeddings = _first_token_embeddings(title_tokens)
description_embeddings = _first_token_embeddings(description_tokens)

# Cosine similarity between every title and every description:
# similarities[i, j] compares title i with description j, so the diagonal holds
# each title against its own description.
similarities = cosine_similarity(title_embeddings, description_embeddings)