In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sentence_transformers import util
import torch
import inflect
import string
import re
import numpy as np

# Preprocessing - Alternative

In [3]:
# Load the raw attribute/value table. The file is tab-separated and is read with
# explicit column names (`attr`, `text`), so no header row is consumed.
metadata = pd.read_csv("~/mlxai-2024-team-connor-gunti/inputs/selected_attributes.tsv.gz",
                       sep = '\t', names=['attr', 'text'])
# Work on an independent copy so that later cleaning steps never alter `metadata`;
# a plain `cleaned_data = metadata` would make both names refer to the same frame.
cleaned_data = metadata.copy()

Unnamed: 0,attr,text
0,bio_material_sam,feces
1,bio_material_sam,F123
2,bio_material_sam,PBMCs
3,bio_material_sam,SM10632
4,bio_material_sam,feces
...,...,...
2095353,ww_surv_target_1_conc_sam,31485.699730221182
2095354,ww_surv_target_1_conc_sam,44935
2095355,ww_surv_target_1_conc_sam,6410.0
2095356,ww_surv_target_1_conc_sam,80000


In [5]:
# Remove rows containing missing values, then remove exact duplicate rows.
# The result is rebound to `cleaned_data` instead of using `inplace=True`, so any
# other name still pointing at the previously loaded frame is left unchanged.
cleaned_data = cleaned_data.dropna().drop_duplicates()
# Number of rows that survive the filtering (displayed as the cell output).
len(cleaned_data)

631515

In [9]:
# Character class matching any single ASCII punctuation character.
pattern = fr'[{re.escape(string.punctuation)}]'
# Built once and reused for every row: a translation table that maps each
# punctuation character to a space, and the inflect engine that spells out numbers.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
_number_engine = inflect.engine()


def _normalize_text(text):
    """Return `text` with numbers spelled out, punctuation blanked and lower-cased.

    Steps, applied in order:
      1. If `text` consists only of digits once its first '.' is removed
         (e.g. "44935" or "6410.0"), replace it with the wording produced by
         inflect's `number_to_words`.
      2. If the text (after step 1) contains any ASCII punctuation, replace every
         punctuation character with a single space.
      3. Lower-case the result.

    Parameters
    ----------
    text : str
        One value of the `text` column.

    Returns
    -------
    str
        The normalised text.
    """
    if text.replace('.', '', 1).isdigit():
        text = _number_engine.number_to_words(text)
    if re.search(pattern, text):
        text = text.translate(_PUNCT_TO_SPACE)
    return text.lower()


# Compute the whole cleaned column in one pass and attach it with `assign`.
# Writing through `cleaned_data.iloc[i].text = ...` is chained assignment onto a
# temporary row object, which pandas does not guarantee to propagate back into the
# frame, so the column is replaced as a unit instead.
cleaned_data = cleaned_data.assign(text=cleaned_data['text'].map(_normalize_text))

# PubMedBERT Model

In [5]:
# Read the tab-separated (name, text) table and discard rows with missing values.
# Both `df` and `cleaned_data` refer to the resulting frame; from this point on
# `cleaned_data` no longer refers to the frame built in the preprocessing section.
df = cleaned_data = pd.read_csv(
    '/home/sagemaker-user/mlxai-2024-team-connor-gunti/preprocessed/attribute.seqs.tsv',
    sep='\t',
    names=['name', 'text'],
).dropna()

In [6]:
# Load the sentence-embedding model identified by "NeuML/pubmedbert-base-embeddings";
# the similarity cell below calls `model.encode` on the attribute texts.
model = SentenceTransformer("NeuML/pubmedbert-base-embeddings")

2024-02-29 20:22:28.879236: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
# Create the two output directories (no error if they already exist) that the
# similarity cell writes its matrices and similar-pair tables into.
!mkdir -p ../results/pubmedbert-similarity-matrics ../results/pubmedbert-similar-pairs

In [10]:
# Prefix used in every output directory and file name.
NAMESPACE = 'pubmedbert'
# Pairs whose similarity score is strictly greater than this value are reported.
SIMILARITY_THRESHOLD = 0.85

# For every attribute name: embed its distinct texts, save the full pairwise
# similarity matrix, and write the pairs scoring above the threshold to a TSV file.
# `groupby(..., sort=False)` visits the names in order of first appearance without
# re-filtering the whole frame for each name.
for a, texts in cleaned_data.groupby('name', sort=False)['text']:
    # `unique()` de-duplicates while keeping first-appearance order, so row/column i
    # of the saved matrix maps to the same sentence on every run. A `set` would give
    # an order that can change between interpreter sessions (string hash randomisation).
    sentences = texts.unique().tolist()
    embeddings = model.encode(sentences, show_progress_bar=True, normalize_embeddings=True)
    # The embeddings are requested normalised, so this product of the embedding
    # matrix with its transpose yields the pairwise similarity scores.
    m = np.matmul(embeddings, embeddings.T)
    np.save(f'../results/{NAMESPACE}-similarity-matrics/{NAMESPACE}-{a}.similarity-score-matrix', m, allow_pickle = False)
    # Keep only the strict lower triangle: each unordered pair is considered once
    # and the diagonal (a sentence compared with itself) is excluded.
    x = np.tril(m, k=-1)
    scores = [[a, x[r, c], sentences[r], sentences[c]]
              for r, c in zip(*np.where(x > SIMILARITY_THRESHOLD))]
    pd.DataFrame(scores, columns=['name',
                                  'score',
                                  's1',
                                  's2']).to_csv(f'../results/{NAMESPACE}-similar-pairs/{NAMESPACE}-{a}.similar-pairs.tsv', sep = '\t', index=False)

Batches:   0%|          | 0/291 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/47 [00:00<?, ?it/s]

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/65 [00:00<?, ?it/s]

Batches:   0%|          | 0/15 [00:00<?, ?it/s]

Batches:   0%|          | 0/130 [00:00<?, ?it/s]

Batches:   0%|          | 0/539 [00:00<?, ?it/s]

Batches:   0%|          | 0/43 [00:00<?, ?it/s]

Batches:   0%|          | 0/42 [00:00<?, ?it/s]

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Batches:   0%|          | 0/31 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Batches:   0%|          | 0/141 [00:00<?, ?it/s]

Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Batches:   0%|          | 0/336 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Batches:   0%|          | 0/9 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/109 [00:00<?, ?it/s]

Batches:   0%|          | 0/53 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/17 [00:00<?, ?it/s]

Batches:   0%|          | 0/105 [00:00<?, ?it/s]

Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]