# openai

In [7]:
import os
from openai import OpenAI

client = OpenAI(
    api_key=OPENAI_API_KEY
)

In [8]:
def get_embedding(text, model="text-embedding-ada-002"):
   return client.embeddings.create(input=[text], model=model).data[0].embedding

In [14]:
item1 = "Pneumothorax is An accumulation of air or gas in the PLEURAL CAVITY, which may occur spontaneously or as a result of trauma or a pathological process. The gas may also be introduced deliberately during PNEUMOTHORAX, ARTIFICIAL. (MSH)"
item2 = "Lungs is Either of the pair of organs occupying the cavity of the thorax that effect the aeration of the blood. (MSH)"
item3 = "Pleural is Of or pertaining to the pleura. (NCI)"
item4 = "Thyroidectomy is Surgical removal of the thyroid gland." 
items = [item1, item2, item3, item4]

mention = 'Pneumothorax'

item_embeddings = [get_embedding(item) for item in items]

mention = 'Pneumothorax'
mention_embedding = get_embedding(mention)

import torch

item_embeddings = torch.tensor(item_embeddings)
mention_embedding = torch.tensor(mention_embedding)

import torch.nn.functional as F

for i, item in enumerate(items):
    similarity = F.cosine_similarity(mention_embedding.unsqueeze(0), item_embeddings[i].unsqueeze(0))
    print(f"Similarity between {mention} and {item} is {similarity.item()}")

Similarity between Pneumothorax and Pneumothorax is An accumulation of air or gas in the PLEURAL CAVITY, which may occur spontaneously or as a result of trauma or a pathological process. The gas may also be introduced deliberately during PNEUMOTHORAX, ARTIFICIAL. (MSH) is 0.9123316407203674
Similarity between Pneumothorax and Lungs is Either of the pair of organs occupying the cavity of the thorax that effect the aeration of the blood. (MSH) is 0.835702657699585
Similarity between Pneumothorax and Pleural is Of or pertaining to the pleura. (NCI) is 0.8355947136878967
Similarity between Pneumothorax and Thyroidectomy is Surgical removal of the thyroid gland. is 0.7639389634132385


In [1]:
print(item_embeddings)

NameError: name 'item_embeddings' is not defined

# Llama 3

In [9]:
from transformers import pipeline

item = 'test'

pipeline = pipeline('feature-extraction', model="meta-llama/Meta-Llama-3-8B")
data = pipeline(item)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [11]:
print(len(data[0][0]))

4096


In [16]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import euclidean
from transformers import pipeline
import torch

# 定义使用Llama 3的get_word_centric_embedding函数
def get_word_centric_embedding(text, target_word, model_name="meta-llama/Meta-Llama-3-8B"):
    # 初始化pipeline
    feature_extractor = pipeline('feature-extraction', model=model_name)
    
    # 获取整个文本的embedding
    full_embedding = feature_extractor(text)
    
    # 将embedding转换为numpy数组
    full_embedding = np.array(full_embedding[0])
    
    # 获取目标词在文本中的位置
    words = text.split()
    target_word_positions = [i for i, word in enumerate(words) if word.lower() == target_word.lower()]
    
    if not target_word_positions:
        raise ValueError(f"Target word '{target_word}' not found in the text.")
    
    # 提取目标词的embedding
    target_embeddings = full_embedding[target_word_positions]
    
    # 计算平均embedding
    word_centric_embedding = np.mean(target_embeddings, axis=0)
    
    return word_centric_embedding


# 定义项目
items = [
    "Pneumothorax is An accumulation of air or gas in the PLEURAL CAVITY, which may occur spontaneously or as a result of trauma or a pathological process. The gas may also be introduced deliberately during PNEUMOTHORAX, ARTIFICIAL. (MSH)",
    "Lungs is Either of the pair of organs occupying the cavity of the thorax that effect the aeration of the blood. (MSH)",
    "Pleural is Of or pertaining to the pleura. (NCI)",
    "Thyroidectomy is Surgical removal of the thyroid gland."
]

mention = "Pneumothorax"

# 获取每个项目的target word和embedding
item_embeddings = []
for item in items:
    target_word = item.split(' is')[0]
    embedding = get_word_centric_embedding(item, target_word)
    item_embeddings.append(embedding)

# 获取mention的embedding
mention_embedding = get_word_centric_embedding(mention, mention)

# 定义相似性计算函数
def cosine_sim(a, b):
    return cosine_similarity([a], [b])[0][0]

def euclidean_sim(a, b):
    return 1 / (1 + euclidean(a, b))

def dot_product_sim(a, b):
    return np.dot(a, b)

# 计算相似性并排序
similarity_methods = {
    "Cosine Similarity": cosine_sim,
    "Euclidean Similarity": euclidean_sim,
    "Dot Product": dot_product_sim
}

for method_name, sim_func in similarity_methods.items():
    print(f"\nUsing {method_name}:")
    similarities = [sim_func(mention_embedding, item_emb) for item_emb in item_embeddings]
    sorted_indices = np.argsort(similarities)[::-1]
    
    for i, idx in enumerate(sorted_indices):
        print(f"{i+1}. {items[idx][:50]}... (Similarity: {similarities[idx]:.4f})")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.



Using Cosine Similarity:
1. Thyroidectomy is Surgical removal of the thyroid g... (Similarity: 1.0000)
2. Pleural is Of or pertaining to the pleura. (NCI)... (Similarity: 1.0000)
3. Lungs is Either of the pair of organs occupying th... (Similarity: 1.0000)
4. Pneumothorax is An accumulation of air or gas in t... (Similarity: 1.0000)

Using Euclidean Similarity:
1. Thyroidectomy is Surgical removal of the thyroid g... (Similarity: 1.0000)
2. Pleural is Of or pertaining to the pleura. (NCI)... (Similarity: 0.9999)
3. Lungs is Either of the pair of organs occupying th... (Similarity: 0.9999)
4. Pneumothorax is An accumulation of air or gas in t... (Similarity: 0.9999)

Using Dot Product:
1. Pleural is Of or pertaining to the pleura. (NCI)... (Similarity: 23961.6815)
2. Lungs is Either of the pair of organs occupying th... (Similarity: 23961.6815)
3. Pneumothorax is An accumulation of air or gas in t... (Similarity: 23961.6815)
4. Thyroidectomy is Surgical removal of the thyroid g... (Sim

# BERT

In [16]:
import torch
from transformers import BertTokenizer, BertModel
from scipy.spatial.distance import cosine

# Load the pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Define the mention and items
mention = 'Pneumothorax'
items = ['Pneumothorax is An accumulation of air or gas in the PLEURAL CAVITY, which may occur spontaneously or as a result of trauma or a pathological process. The gas may also be introduced deliberately during PNEUMOTHORAX, ARTIFICIAL. (MSH)',
         'Lungs is Either of the pair of organs occupying the cavity of the thorax that effect the aeration of the blood. (MSH)',
         'Pleural is Of or pertaining to the pleura. (NCI)',
         'Thyroidectomy is Surgical removal of the thyroid gland.']

# Tokenize the mention and items
mention_tokens = tokenizer.tokenize(mention)
item_tokens = [tokenizer.tokenize(item) for item in items]

# Convert tokens to input IDs
mention_input_ids = tokenizer.convert_tokens_to_ids(mention_tokens)
item_input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in item_tokens]

# Pad input IDs to the same length
max_length = max(len(mention_input_ids), max(len(ids) for ids in item_input_ids))
mention_input_ids = mention_input_ids + [0] * (max_length - len(mention_input_ids))
item_input_ids = [ids + [0] * (max_length - len(ids)) for ids in item_input_ids]

# Convert input IDs to tensors
mention_input_ids = torch.tensor([mention_input_ids])
item_input_ids = torch.tensor(item_input_ids)

# Generate BERT embeddings
with torch.no_grad():
    mention_embeddings = model.embeddings.word_embeddings(mention_input_ids)
    item_embeddings = model.embeddings.word_embeddings(item_input_ids)

# Average the embeddings
mention_embedding = mention_embeddings.mean(dim=1)
item_embeddings = item_embeddings.mean(dim=1)

# Calculate cosine similarity
similarities = [1 - cosine(mention_embedding.squeeze().numpy(), item_embedding.squeeze().numpy()) for item_embedding in item_embeddings]

# Print the similarities
for i, item in enumerate(items):
    print(f"Similarity between '{mention}' and '{item}' is {similarities[i]}")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Similarity between 'Pneumothorax' and 'Pneumothorax is An accumulation of air or gas in the PLEURAL CAVITY, which may occur spontaneously or as a result of trauma or a pathological process. The gas may also be introduced deliberately during PNEUMOTHORAX, ARTIFICIAL. (MSH)' is 0.5690178275108337
Similarity between 'Pneumothorax' and 'Lungs is Either of the pair of organs occupying the cavity of the thorax that effect the aeration of the blood. (MSH)' is 0.9573487043380737
Similarity between 'Pneumothorax' and 'Pleural is Of or pertaining to the pleura. (NCI)' is 0.988681972026825
Similarity between 'Pneumothorax' and 'Thyroidectomy is Surgical removal of the thyroid gland.' is 0.9960650205612183
