# Find similar Columns

## Analize Similarity using pretrained BERT-Model

In [103]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')

# Define medical terms
term1 = "chemo_therapy_status"
term2 = "new_adjuvant_status"

# Convert terms into embeddings
embedding1 = model.encode(term1, convert_to_tensor=True)
embedding2 = model.encode(term2, convert_to_tensor=True)

# Calculate cosine similarity
similarity = util.pytorch_cos_sim(embedding1, embedding2).item()

# Output the similarity score
print(f"Similarity between '{term1}' and '{term2}': {similarity:.4f}")

Similarity between 'chemo_therapy_status' and 'new_adjuvant_status': 0.3512


## Analizing similarity using PubMedBERT
PubMedBERT, a BERT model trained on biomedical text

### Importing and Loading Models

In [104]:
#Import Libraries
from transformers import AutoTokenizer, AutoModel #AutoTokenizer Converts text into numbers that the model can understand and AutoModel Loads the pre-trained PubMedBERT model.
import torch #torch is a library for tensor operations (used for machine learning)
from sklearn.metrics.pairwise import cosine_similarity

# Load PubMedBERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext')
model = AutoModel.from_pretrained('microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext')

### Creating a Function to Generate Embeddings

In [106]:
def get_embedding(text):
    """Generate embedding for a single text using PubMedBERT."""
    with torch.no_grad():  # Disable gradient tracking, the model should not calculate gradients (saves resources since we're not training).

        inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
        outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).cpu().numpy()

### Getting Embeddings for Both Terms

In [107]:
term_1 = "chemo_therapy_status"
term_2 = "new_adjuvant_status"

# Get embeddings for each term (their corresponding vectors)
embedding1 = get_embedding(term_1)
embedding2 = get_embedding(term_2)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


### Calculate cosine similarity

In [111]:
similarity_score = cosine_similarity(embedding1, embedding2)[0][0]

print(f"Cosine similarity between '{term_1}' and '{term_2}': {similarity_score:.4f}")

Cosine similarity between 'chemo_therapy_status' and 'new_adjuvant_status': 0.9758


### PROBLEMS 
Note that the similairty is very high, even when words are not as similar. That's why the numers must be finetuned. 

In [113]:
term_3 = "Pacient ID"
term_4 = "Pacient Name"

embedding1 = get_embedding(term_3)
embedding2 = get_embedding(term_4)

similarity_score = cosine_similarity(embedding1, embedding2)[0][0]

print(f"Cosine similarity between '{term_1}' and '{term_2}': {similarity_score:.4f}")

Cosine similarity between 'chemo_therapy_status' and 'new_adjuvant_status': 0.9760


### Code to compare column headers

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Load PubMedBERT tokenizer and model once
tokenizer = AutoTokenizer.from_pretrained('microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext')
model = AutoModel.from_pretrained('microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext')

def get_embeddings(texts):
    """Generate embeddings for a list of texts in batches using PubMedBERT."""
    with torch.no_grad():
        inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
        outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).cpu().numpy()

def compare_headers(file_path, sheet_name=None, reference_list=None, similarity_threshold=0.90):
    # Load Excel file and extract headers
    df = pd.read_excel(file_path, sheet_name=sheet_name)
    headers = df.columns.tolist()

    # Generate embeddings for headers
    header_embeddings = get_embeddings(headers)
    
    # If reference list is provided, compare headers with it
    if reference_list:
        reference_embeddings = get_embeddings(reference_list)
        similarities = cosine_similarity(header_embeddings, reference_embeddings)
        results = []

        for i, header in enumerate(headers):
            for j, reference in enumerate(reference_list):
                similarity_score = similarities[i][j]
                if similarity_score > similarity_threshold:
                    results.append([header, reference, similarity_score])
    else:
        # Compare headers among themselves
        similarities = cosine_similarity(header_embeddings)
        results = []

        for i, header in enumerate(headers):
            for j, header_check in enumerate(headers):
                if i != j:
                    similarity_score = similarities[i][j]
                    if similarity_score > similarity_threshold:
                        results.append([header, header_check, similarity_score])

    # Create DataFrame for results
    df_results = pd.DataFrame(results, columns=['Header', 'Matched Header/Reference', 'Cosine Similarity'])
    return df_results

## Different trained Models

These models are designed specifically to handle medical and clinical data, making them the best choice for medical text similarity

### PubMedBERT

This model is specifically trained on PubMed data (abstracts and full-text biomedical literature), making it highly effective for biomedical and medical text mining tasks, including text similarity, named entity recognition (NER), and relation extraction.

In [135]:
tokenizer = AutoTokenizer.from_pretrained('microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext')
model = AutoModel.from_pretrained('microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext')


In [137]:
def get_embedding(text):
    """Generate embedding for a single text using PubMedBERT."""
    with torch.no_grad():  # Disable gradient tracking, the model should not calculate gradients (saves resources since we're not training).

        inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
        outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        
term_1 = "chemo_therapy_status"
term_2 = "new_adjuvant_status"

# Get embeddings for each term (their corresponding vectors)
embedding1 = get_embedding(term_1)
embedding2 = get_embedding(term_2)

similarity_score = cosine_similarity(embedding1, embedding2)[0][0]

print(f"Cosine similarity between '{term_1}' and '{term_2}': {similarity_score:.4f}")

term_3 = "Pacient ID"
term_4 = "Pacient Name"

embedding1 = get_embedding(term_3)
embedding2 = get_embedding(term_4)

similarity_score = cosine_similarity(embedding1, embedding2)[0][0]

print(f"Cosine similarity between '{term_3}' and '{term_4}': {similarity_score:.4f}")
        

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Cosine similarity between 'chemo_therapy_status' and 'new_adjuvant_status': 0.9758
Cosine similarity between 'chemo_therapy_status' and 'new_adjuvant_status': 0.9760


### BioBERT

BioBERT is a domain-specific version of BERT, pre-trained on large-scale biomedical corpora (such as PubMed and PMC). It excels at biomedical named entity recognition (NER), question answering, and other medical NLP tasks.

In [139]:
tokenizer = AutoTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.1')
model = AutoModel.from_pretrained('dmis-lab/biobert-base-cased-v1.1')


In [141]:
def get_embedding(text):
    """Generate embedding for a single text using PubMedBERT."""
    with torch.no_grad():  # Disable gradient tracking, the model should not calculate gradients (saves resources since we're not training).

        inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
        outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        
term_1 = "chemo_therapy_status"
term_2 = "new_adjuvant_status"

# Get embeddings for each term (their corresponding vectors)
embedding1 = get_embedding(term_1)
embedding2 = get_embedding(term_2)

similarity_score = cosine_similarity(embedding1, embedding2)[0][0]

print(f"Cosine similarity between '{term_1}' and '{term_2}': {similarity_score:.4f}")

term_3 = "Pacient ID"
term_4 = "Pacient Name"

embedding1 = get_embedding(term_3)
embedding2 = get_embedding(term_4)

similarity_score = cosine_similarity(embedding1, embedding2)[0][0]

print(f"Cosine similarity between '{term_3}' and '{term_4}': {similarity_score:.4f}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Cosine similarity between 'chemo_therapy_status' and 'new_adjuvant_status': 0.9313
Cosine similarity between 'chemo_therapy_status' and 'new_adjuvant_status': 0.9739


### CLINICALBERT

ClinicalBERT is fine-tuned specifically on clinical notes from patient records. It helps in processing text from clinical environments, such as electronic medical records (EMRs) or health assessments.

In [143]:
tokenizer = AutoTokenizer.from_pretrained('emilyalsentzer/Bio_ClinicalBERT')
model = AutoModel.from_pretrained('emilyalsentzer/Bio_ClinicalBERT')


In [145]:
def get_embedding(text):
    """Generate embedding for a single text using PubMedBERT."""
    with torch.no_grad():  # Disable gradient tracking, the model should not calculate gradients (saves resources since we're not training).

        inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
        outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        
term_1 = "chemo_therapy_status"
term_2 = "new_adjuvant_status"

# Get embeddings for each term (their corresponding vectors)
embedding1 = get_embedding(term_1)
embedding2 = get_embedding(term_2)

similarity_score = cosine_similarity(embedding1, embedding2)[0][0]

print(f"Cosine similarity between '{term_1}' and '{term_2}': {similarity_score:.4f}")

term_3 = "Pacient ID"
term_4 = "Pacient Name"

embedding1 = get_embedding(term_3)
embedding2 = get_embedding(term_4)

similarity_score = cosine_similarity(embedding1, embedding2)[0][0]

print(f"Cosine similarity between '{term_3}' and '{term_4}': {similarity_score:.4f}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Cosine similarity between 'chemo_therapy_status' and 'new_adjuvant_status': 0.9261
Cosine similarity between 'Pacient ID' and 'Pacient Name': 0.8963


### BlueBERT

BlueBERT is trained on both PubMed data and the MIMIC-III clinical dataset, making it effective for tasks that involve both biomedical research and clinical notes. It is particularly useful for tasks such as clinical text classification and similarity detection.

In [153]:
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained('bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12')
model = AutoModel.from_pretrained('bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12')


config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/441M [00:00<?, ?B/s]

In [155]:
def get_embedding(text):
    """Generate embedding for a single text using PubMedBERT."""
    with torch.no_grad():  # Disable gradient tracking, the model should not calculate gradients (saves resources since we're not training).

        inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
        outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        
term_1 = "chemo_therapy_status"
term_2 = "new_adjuvant_status"

# Get embeddings for each term (their corresponding vectors)
embedding1 = get_embedding(term_1)
embedding2 = get_embedding(term_2)

similarity_score = cosine_similarity(embedding1, embedding2)[0][0]

print(f"Cosine similarity between '{term_1}' and '{term_2}': {similarity_score:.4f}")

term_3 = "Pacient ID"
term_4 = "Pacient Name"

embedding1 = get_embedding(term_3)
embedding2 = get_embedding(term_4)

similarity_score = cosine_similarity(embedding1, embedding2)[0][0]

print(f"Cosine similarity between '{term_3}' and '{term_4}': {similarity_score:.4f}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Cosine similarity between 'chemo_therapy_status' and 'new_adjuvant_status': 0.9229
Cosine similarity between 'Pacient ID' and 'Pacient Name': 0.9428


### BioELECTRA 
BioELECTRA is a variation of BioBERT that uses the ELECTRA framework, which is faster and more efficient in pretraining. It's fine-tuned for biomedical text, making it ideal for large-scale biomedical NLP tasks.

In [157]:
tokenizer = AutoTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.2')
model = AutoModel.from_pretrained('dmis-lab/biobert-base-cased-v1.2')



config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [159]:
def get_embedding(text):
    """Generate embedding for a single text using PubMedBERT."""
    with torch.no_grad():  # Disable gradient tracking, the model should not calculate gradients (saves resources since we're not training).

        inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
        outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        
term_1 = "chemo_therapy_status"
term_2 = "new_adjuvant_status"

# Get embeddings for each term (their corresponding vectors)
embedding1 = get_embedding(term_1)
embedding2 = get_embedding(term_2)

similarity_score = cosine_similarity(embedding1, embedding2)[0][0]

print(f"Cosine similarity between '{term_1}' and '{term_2}': {similarity_score:.4f}")

term_3 = "Pacient ID"
term_4 = "Pacient Name"

embedding1 = get_embedding(term_3)
embedding2 = get_embedding(term_4)

similarity_score = cosine_similarity(embedding1, embedding2)[0][0]

print(f"Cosine similarity between '{term_3}' and '{term_4}': {similarity_score:.4f}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Cosine similarity between 'chemo_therapy_status' and 'new_adjuvant_status': 0.9305
Cosine similarity between 'Pacient ID' and 'Pacient Name': 0.9164


### MedBERT
MedBERT is another variation of BERT optimized for medical NLP tasks. It is trained on a variety of medical datasets and excels at understanding and processing medical terminology.


In [151]:
tokenizer = AutoTokenizer.from_pretrained('alobha/medbert')
model = AutoModel.from_pretrained('alobha/medbert')



OSError: alobha/medbert is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`