In [None]:
# Install the Hugging Face `transformers` package and SentencePiece into the runtime
# (SentencePiece is presumably required by the XLNet tokenizer loaded further down).
# NOTE(review): the packages are not version-pinned, so results may drift between runs.
!pip install transformers SentencePiece
# Log in to the Hugging Face Hub from inside the notebook before any checkpoints are downloaded.
from huggingface_hub import notebook_login
notebook_login()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import torch
import re
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel, XLNetTokenizer, XLNetModel, AutoTokenizer, AutoModel
from scipy.spatial.distance import cosine
from ast import literal_eval
from google.colab import drive
# Mount Google Drive at /content/gdrive so the project folder can be read and written.
drive.mount('/content/gdrive')

# DATA_PATH is the prefix prepended to every data/embedding/prediction path below.
# Exactly one of the following assignments should be active.

# Run locally (paths are then relative to the working directory)
#DATA_PATH = ''

# KW (presumably one collaborator's Drive layout -- confirm)
DATA_PATH = '/content/gdrive/MyDrive/CSI6900/'

# FZ (presumably another collaborator's Drive layout -- confirm)
#DATA_PATH = 'gdrive/MyDrive/CSI6900/'

# Pick the compute device: Apple MPS if it is usable, otherwise the first CUDA GPU,
# otherwise the CPU.
# `torch.has_mps` is deprecated and only reports whether PyTorch was *built* with MPS,
# so the runtime availability check from `torch.backends.mps` is used instead.
# `getattr` keeps this working on PyTorch builds that predate the MPS backend.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

## Select the first 10 search terms from file

In [None]:
# Read the search-term file and keep the first 10 entries of its `searchTerm` column.
# `.head(10)` is positional, so it works for any index and simply returns fewer
# terms (instead of raising) when the file holds fewer than 10 rows.
terms = pd.read_csv(DATA_PATH + 'data/Most-popular-search-terms.csv')
search_terms = terms['searchTerm'].head(10).tolist()
search_terms

## Load data

In [None]:
# Load the service records and discard the columns that are not used in this notebook.
records = pd.read_json(DATA_PATH + 'data/records.json').drop(
    columns=[
        'created_at', 'updated_at', 'deleted_at', 'publish',
        'academic_credentials', 'age_max', 'age_min', 'last_name',
        'latitude', 'longitude', 'name_of_private_practice',
        'fee_description', 'fee_type', 'first_name', 'languages',
        'organization_type', 'original_id', 'record_type',
        'salutation_type', 'website',
    ]
)

# Load the taxonomy headings and discard their unused columns in the same way.
taxonomy = pd.read_json(DATA_PATH + 'data/taxonomy_headings.json').drop(
    columns=[
        'created_at', 'updated_at', 'deleted_at',
        'alias_of_id', 'short_description', 'original_id',
    ]
)

# Preview the first taxonomy rows.
taxonomy.head(10)

## Pre-processing

In [None]:
# Copy the project's pre_processing.py from Google Drive into /content
# (presumably the notebook's working directory on Colab) so it can be imported.
# NOTE(review): the source path is hard-coded and does not follow DATA_PATH.
!cp /content/gdrive/MyDrive/CSI6900/pre_processing.py /content

In [None]:
# PreProcessing comes from the project-local pre_processing.py module.
from pre_processing import PreProcessing

# Run the project's preprocessing over both frames and rebind the names to the results.
# NOTE(review): what preprocess() changes (rows, columns, index) is not visible here;
# the prediction cells below look rows up by the row number of the embedding tensors.
records, taxonomy = PreProcessing(records, taxonomy).preprocess()

## Helper functions

In [None]:
def load_embeddings(record_file_path, taxonomy_file_path, map_location='cpu'):
    """
    Loads the precomputed record and taxonomy embeddings from ``.pt`` files.

    The objects are mapped onto ``map_location`` (CPU by default) so that files
    saved from a GPU session can be read on a CPU-only host and handed straight
    to NumPy/SciPy routines.

    ``torch.load`` unpickles the file contents, so only load files from a
    trusted source.

    :param record_file_path: Path to the saved record embeddings
    :param taxonomy_file_path: Path to the saved taxonomy embeddings
    :param map_location: Device to load onto, forwarded to ``torch.load`` (default: 'cpu')
    :return: Tuple ``(record_embeddings, taxonomy_embeddings)``
    """
    record_embeddings = torch.load(record_file_path, map_location=map_location)
    taxonomy_embeddings = torch.load(taxonomy_file_path, map_location=map_location)
    return record_embeddings, taxonomy_embeddings

def get_highest_numbers_with_indices(numbers, n=10):
    """
    Returns the n highest numbers in a list along with their indices.

    The result is sorted in descending order of ``(number, index)``. A candidate
    only displaces the current smallest kept entry when it is strictly greater.

    :param numbers: Iterable of numbers
    :param n: Number of highest numbers to retrieve (default: 10)
    :return: List of ``(number, index)`` tuples, at most n long; empty when
             ``n <= 0`` or ``numbers`` is empty
    """
    # Nothing can be selected; previously this case raised IndexError on `[-1]`.
    if n <= 0:
        return []

    top = []
    for index, value in enumerate(numbers):
        if len(top) >= n:
            # The list is full: skip values that do not beat the smallest kept one.
            if not value > top[-1][0]:
                continue
            top.pop()
        top.append((value, index))
        # Keep the list in descending order so that top[-1] is always the smallest.
        top.sort(reverse=True)
    return top

## BERT

In [None]:
# Load the tokenizer and encoder of the AIMH/mental-bert-large-uncased checkpoint,
# move the encoder to the selected device and switch it to evaluation mode.
# NOTE: `tokenizer` and `model` are rebound by each model section below.
tokenizer = BertTokenizer.from_pretrained("AIMH/mental-bert-large-uncased")
model = BertModel.from_pretrained("AIMH/mental-bert-large-uncased").to(device)
model.eval()

Some weights of the model checkpoint at AIMH/mental-bert-large-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 1024, padding_idx=0)
    (position_embeddings): Embedding(512, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-23): 24 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inpl

### Generate embeddings for search terms

In [None]:
# Tokenize all search terms as one batch; every sequence is padded or truncated
# to exactly 512 tokens and the tensors are moved to the selected device.
tokenized_search_terms = tokenizer(
    search_terms,
    padding='max_length',
    max_length=512,
    truncation=True,
    return_tensors='pt',
).to(device)

# Run the encoder without gradient tracking and average the final hidden states
# over the sequence axis to get one vector per search term.
# NOTE(review): the mean covers all 512 positions, padding included. Confirm that the
# stored record/taxonomy embeddings were pooled the same way before changing this.
with torch.no_grad():
    model_output = model(
        input_ids=tokenized_search_terms['input_ids'],
        attention_mask=tokenized_search_terms['attention_mask'],
        token_type_ids=tokenized_search_terms['token_type_ids'],
    )
    embedding = model_output.last_hidden_state.mean(dim=1).cpu()

# Plain nested lists, one embedding per search term.
search_term_embeddings = embedding.tolist()

### Read embeddings from file

In [None]:
# Load the precomputed BERT embeddings for the records and the taxonomy headings,
# then print both shapes as a sanity check.
record_embeddings, taxonomy_embeddings = load_embeddings(DATA_PATH + 'data/embeddings/bert_records_embeddings.pt', DATA_PATH + 'data/embeddings/bert_taxonomy_embeddings.pt')
print(record_embeddings.shape)
print(taxonomy_embeddings.shape)

torch.Size([6239, 1024])
torch.Size([192, 1024])


### Prediction

In [None]:
# Fresh column-oriented accumulators (column name -> list of values) for the
# record matches and the taxonomy matches; each dict gets its own empty lists.
records_result, taxonomy_result = (
    {
        column: []
        for column in ('Search Term', 'Record ID', 'Name', 'Description', 'Similarity score')
    }
    for _ in range(2)
)

In [None]:
# For every search term, rank all records and then all taxonomy headings by cosine
# similarity (1 - cosine distance) and append the top matches to the result dicts.
# The loop variable is no longer called `id`, which used to shadow the builtin for
# every later cell, and the records/taxonomy passes share one code path.
# 'Record ID' holds the row number of the match inside the embeddings tensor.
# NOTE(review): name/description are looked up by index label, which assumes the
# DataFrame labels equal the tensor row numbers -- confirm after preprocessing.
for idx, term_embedding in enumerate(search_term_embeddings):
    for embeddings, frame, result in (
        (record_embeddings, records, records_result),
        (taxonomy_embeddings, taxonomy, taxonomy_result),
    ):
        cos_sim = [1 - cosine(term_embedding, row) for row in embeddings]

        for score, position in get_highest_numbers_with_indices(cos_sim):
            result['Search Term'].append(search_terms[idx])
            result['Record ID'].append(position)
            result['Name'].append(frame['name'][position])
            result['Description'].append(frame['description'][position])
            result['Similarity score'].append(score)

In [None]:
# Turn the accumulated column lists into DataFrames and write the BERT rankings to CSV.
# NOTE(review): this rebinds records_result/taxonomy_result from dicts to DataFrames, so
# the prediction cell should not be re-run without first re-running the cell that resets them.
records_result = pd.DataFrame(records_result)
records_result.to_csv(DATA_PATH + 'data/predictions/bert_records.csv', index=False)
taxonomy_result = pd.DataFrame(taxonomy_result)
taxonomy_result.to_csv(DATA_PATH + 'data/predictions/bert_taxonomy.csv', index=False)

## RoBERTa

In [None]:
# Replace the global tokenizer/model with the AIMH/mental-roberta-large checkpoint,
# move the encoder to the selected device and switch it to evaluation mode.
tokenizer = RobertaTokenizer.from_pretrained('AIMH/mental-roberta-large')
model = RobertaModel.from_pretrained('AIMH/mental-roberta-large').to(device)
model.eval()

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/328 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of the model checkpoint at AIMH/mental-roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 1024, padding_idx=1)
    (position_embeddings): Embedding(514, 1024, padding_idx=1)
    (token_type_embeddings): Embedding(1, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-23): 24 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      

### Generate embeddings for search terms

In [None]:
# Tokenize all search terms as one batch; every sequence is padded or truncated
# to exactly 512 tokens and the tensors are moved to the selected device.
tokenized_search_terms = tokenizer(
    search_terms,
    padding='max_length',
    max_length=512,
    truncation=True,
    return_tensors='pt',
).to(device)

# Run the encoder without gradient tracking (only input ids and the attention mask
# are passed here) and average the final hidden states over the sequence axis.
# NOTE(review): the mean covers all 512 positions, padding included. Confirm that the
# stored record/taxonomy embeddings were pooled the same way before changing this.
with torch.no_grad():
    model_output = model(
        input_ids=tokenized_search_terms['input_ids'],
        attention_mask=tokenized_search_terms['attention_mask'],
    )
    embedding = model_output.last_hidden_state.mean(dim=1).cpu()

# Plain nested lists, one embedding per search term.
search_term_embeddings = embedding.tolist()

### Read embeddings from file

In [None]:
# Load the precomputed RoBERTa embeddings for the records and the taxonomy headings,
# then print both shapes as a sanity check.
record_embeddings, taxonomy_embeddings = load_embeddings(DATA_PATH + 'data/embeddings/roberta_records_embeddings.pt', DATA_PATH + 'data/embeddings/roberta_taxonomy_embeddings.pt')
print(record_embeddings.shape)
print(taxonomy_embeddings.shape)

torch.Size([6239, 1024])
torch.Size([192, 1024])


### Predictions

In [None]:
# Fresh column-oriented accumulators (column name -> list of values) for the
# record matches and the taxonomy matches; each dict gets its own empty lists.
records_result, taxonomy_result = (
    {
        column: []
        for column in ('Search Term', 'Record ID', 'Name', 'Description', 'Similarity score')
    }
    for _ in range(2)
)

In [None]:
# For every search term, rank all records and then all taxonomy headings by cosine
# similarity (1 - cosine distance) and append the top matches to the result dicts.
# The loop variable is no longer called `id`, which used to shadow the builtin for
# every later cell, and the records/taxonomy passes share one code path.
# 'Record ID' holds the row number of the match inside the embeddings tensor.
# NOTE(review): name/description are looked up by index label, which assumes the
# DataFrame labels equal the tensor row numbers -- confirm after preprocessing.
for idx, term_embedding in enumerate(search_term_embeddings):
    for embeddings, frame, result in (
        (record_embeddings, records, records_result),
        (taxonomy_embeddings, taxonomy, taxonomy_result),
    ):
        cos_sim = [1 - cosine(term_embedding, row) for row in embeddings]

        for score, position in get_highest_numbers_with_indices(cos_sim):
            result['Search Term'].append(search_terms[idx])
            result['Record ID'].append(position)
            result['Name'].append(frame['name'][position])
            result['Description'].append(frame['description'][position])
            result['Similarity score'].append(score)

In [None]:
# Turn the accumulated column lists into DataFrames and write the RoBERTa rankings to CSV.
# NOTE(review): this rebinds records_result/taxonomy_result from dicts to DataFrames, so
# the prediction cell should not be re-run without first re-running the cell that resets them.
records_result = pd.DataFrame(records_result)
records_result.to_csv(DATA_PATH + 'data/predictions/roberta_records.csv', index=False)
taxonomy_result = pd.DataFrame(taxonomy_result)
taxonomy_result.to_csv(DATA_PATH + 'data/predictions/roberta_taxonomy.csv', index=False)

## XLNet

In [None]:
# Replace the global tokenizer/model with the AIMH/mental-xlnet-base-cased checkpoint,
# move the encoder to the selected device and switch it to evaluation mode.
tokenizer = XLNetTokenizer.from_pretrained('AIMH/mental-xlnet-base-cased')
model = XLNetModel.from_pretrained('AIMH/mental-xlnet-base-cased').to(device)
model.eval()

Downloading spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/489 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/921 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of the model checkpoint at AIMH/mental-xlnet-base-cased were not used when initializing XLNetModel: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


XLNetModel(
  (word_embedding): Embedding(32000, 768)
  (layer): ModuleList(
    (0-11): 12 x XLNetLayer(
      (rel_attn): XLNetRelativeAttention(
        (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): XLNetFeedForward(
        (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (layer_1): Linear(in_features=768, out_features=3072, bias=True)
        (layer_2): Linear(in_features=3072, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (activation_function): GELUActivation()
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (dropout): Dropout(p=0.1, inplace=False)
)

### Generate embeddings for search terms

In [None]:
# Tokenize all search terms as one batch; every sequence is padded or truncated
# to exactly 512 tokens and the tensors are moved to the selected device.
tokenized_search_terms = tokenizer(
    search_terms,
    padding='max_length',
    max_length=512,
    truncation=True,
    return_tensors='pt',
).to(device)

# Run the encoder without gradient tracking and average the final hidden states
# over the sequence axis to get one vector per search term.
# NOTE(review): the mean covers all 512 positions, padding included. Confirm that the
# stored record/taxonomy embeddings were pooled the same way before changing this.
with torch.no_grad():
    model_output = model(
        input_ids=tokenized_search_terms['input_ids'],
        attention_mask=tokenized_search_terms['attention_mask'],
        token_type_ids=tokenized_search_terms['token_type_ids'],
    )
    embedding = model_output.last_hidden_state.mean(dim=1).cpu()

# Plain nested lists, one embedding per search term.
search_term_embeddings = embedding.tolist()

### Read embeddings from file

In [None]:
# Load the precomputed XLNet embeddings for the records and the taxonomy headings,
# then print both shapes as a sanity check.
record_embeddings, taxonomy_embeddings = load_embeddings(DATA_PATH + 'data/embeddings/xlnet_records_embeddings.pt', DATA_PATH + 'data/embeddings/xlnet_taxonomy_embeddings.pt')
print(record_embeddings.shape)
print(taxonomy_embeddings.shape)

torch.Size([6239, 768])
torch.Size([192, 768])


### Prediction

In [None]:
# Fresh column-oriented accumulators (column name -> list of values) for the
# record matches and the taxonomy matches; each dict gets its own empty lists.
records_result, taxonomy_result = (
    {
        column: []
        for column in ('Search Term', 'Record ID', 'Name', 'Description', 'Similarity score')
    }
    for _ in range(2)
)

In [None]:
# For every search term, rank all records and then all taxonomy headings by cosine
# similarity (1 - cosine distance) and append the top matches to the result dicts.
# The loop variable is no longer called `id`, which used to shadow the builtin for
# every later cell, and the records/taxonomy passes share one code path.
# 'Record ID' holds the row number of the match inside the embeddings tensor.
# NOTE(review): name/description are looked up by index label, which assumes the
# DataFrame labels equal the tensor row numbers -- confirm after preprocessing.
for idx, term_embedding in enumerate(search_term_embeddings):
    for embeddings, frame, result in (
        (record_embeddings, records, records_result),
        (taxonomy_embeddings, taxonomy, taxonomy_result),
    ):
        cos_sim = [1 - cosine(term_embedding, row) for row in embeddings]

        for score, position in get_highest_numbers_with_indices(cos_sim):
            result['Search Term'].append(search_terms[idx])
            result['Record ID'].append(position)
            result['Name'].append(frame['name'][position])
            result['Description'].append(frame['description'][position])
            result['Similarity score'].append(score)

In [None]:
# Turn the accumulated column lists into DataFrames and write the XLNet rankings to CSV.
# NOTE(review): this rebinds records_result/taxonomy_result from dicts to DataFrames, so
# the prediction cell should not be re-run without first re-running the cell that resets them.
records_result = pd.DataFrame(records_result)
records_result.to_csv(DATA_PATH + 'data/predictions/xlnet_records.csv', index=False)
taxonomy_result = pd.DataFrame(taxonomy_result)
taxonomy_result.to_csv(DATA_PATH + 'data/predictions/xlnet_taxonomy.csv', index=False)

## SGPT

In [None]:
# Replace the global tokenizer/model with the SGPT checkpoint
# Muennighoff/SGPT-1.3B-weightedmean-nli-bitfit, loaded through the Auto* classes,
# move the model to the selected device and switch it to evaluation mode.
tokenizer = AutoTokenizer.from_pretrained("Muennighoff/SGPT-1.3B-weightedmean-nli-bitfit")
model = AutoModel.from_pretrained("Muennighoff/SGPT-1.3B-weightedmean-nli-bitfit").to(device)

model.eval()

### Generate embeddings for search terms

In [None]:
def getEmbeddings(input_ids, attention_mask):
    """
    Embeds a tokenized batch with position-weighted mean pooling.

    Token i (1-based) of every sequence gets weight i, masked-out tokens get
    weight 0, and the final hidden states are averaged with those weights.
    Uses the notebook-level ``model``.

    :param input_ids: Token id tensor of shape [bs, seq_len]
    :param attention_mask: Mask tensor of shape [bs, seq_len]; 1 marks real
                           tokens, 0 marks padding
    :return: CPU tensor of shape [bs, hid_dim] with one embedding per sequence
    """
    with torch.no_grad():
        # Final hidden state of shape [bs, seq_len, hid_dim]. Only this tensor is
        # used, so the per-layer hidden states are not requested (saves memory).
        last_hidden_state = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True,
        ).last_hidden_state

    # Position weights 1..seq_len of shape [1, seq_len, 1], created directly on the
    # model's device; broadcasting replaces the explicit expand to [bs, seq_len, hid_dim].
    weights = torch.arange(
        1,
        last_hidden_state.shape[1] + 1,
        dtype=torch.float32,
        device=last_hidden_state.device,
    ).view(1, -1, 1)

    # Attention mask of shape [bs, seq_len, 1]; padding positions get weight 0.
    weighted_mask = attention_mask.unsqueeze(-1).float() * weights

    # Weighted mean pooling across seq_len: [bs, seq_len, hid_dim] -> [bs, hid_dim]
    sum_embeddings = torch.sum(last_hidden_state * weighted_mask, dim=1)
    sum_mask = torch.sum(weighted_mask, dim=1)

    # NOTE: a row whose mask is all zeros divides by zero here, as it did before.
    embeddings = sum_embeddings / sum_mask

    return embeddings.cpu()

In [10]:
# Tokenize all search terms as one batch; every sequence is padded or truncated
# to exactly 600 tokens and the tensors are moved to the selected device.
tokenized_search_terms = tokenizer(
    search_terms,
    padding='max_length',
    max_length=600,
    truncation=True,
    return_tensors='pt',
).to(device)

# Pool the SGPT hidden states into one vector per search term and keep them
# as plain nested lists.
pooled_search_terms = getEmbeddings(
    tokenized_search_terms['input_ids'],
    attention_mask=tokenized_search_terms['attention_mask'],
)
search_term_embeddings = pooled_search_terms.tolist()

torch.Size([10, 2048])

### Read embeddings from file

In [None]:
# Load the precomputed SGPT embeddings for the records and the taxonomy headings,
# then print both shapes as a sanity check.
# NOTE(review): the recorded output of the search-term cell shows 2048-wide vectors while
# the shapes recorded for this cell are 768 wide. The cosine computation below needs equal
# lengths, so confirm which output is stale.
record_embeddings, taxonomy_embeddings = load_embeddings(DATA_PATH + 'data/embeddings/sgpt_records_embeddings.pt', DATA_PATH + 'data/embeddings/sgpt_taxonomy_embeddings.pt')
print(record_embeddings.shape)
print(taxonomy_embeddings.shape)

torch.Size([6239, 768])
torch.Size([192, 768])


### Prediction

In [None]:
# Fresh column-oriented accumulators (column name -> list of values) for the
# record matches and the taxonomy matches; each dict gets its own empty lists.
records_result, taxonomy_result = (
    {
        column: []
        for column in ('Search Term', 'Record ID', 'Name', 'Description', 'Similarity score')
    }
    for _ in range(2)
)

In [None]:
# For every search term, rank all records and then all taxonomy headings by cosine
# similarity (1 - cosine distance) and append the top matches to the result dicts.
# The loop variable is no longer called `id`, which used to shadow the builtin for
# every later cell; the records/taxonomy passes share one code path and the
# commented-out DataFrame-based variant has been removed.
# 'Record ID' holds the row number of the match inside the embeddings tensor.
# NOTE(review): name/description are looked up by index label, which assumes the
# DataFrame labels equal the tensor row numbers -- confirm after preprocessing.
for idx, term_embedding in enumerate(search_term_embeddings):
    for embeddings, frame, result in (
        (record_embeddings, records, records_result),
        (taxonomy_embeddings, taxonomy, taxonomy_result),
    ):
        cos_sim = [1 - cosine(term_embedding, row) for row in embeddings]

        for score, position in get_highest_numbers_with_indices(cos_sim):
            result['Search Term'].append(search_terms[idx])
            result['Record ID'].append(position)
            result['Name'].append(frame['name'][position])
            result['Description'].append(frame['description'][position])
            result['Similarity score'].append(score)

In [None]:
# Turn the accumulated column lists into DataFrames and write the SGPT rankings to CSV.
# NOTE(review): this rebinds records_result/taxonomy_result from dicts to DataFrames, so
# the prediction cell should not be re-run without first re-running the cell that resets them.
records_result = pd.DataFrame(records_result)
records_result.to_csv(DATA_PATH + 'data/predictions/sgpt_records.csv', index=False)
taxonomy_result = pd.DataFrame(taxonomy_result)
taxonomy_result.to_csv(DATA_PATH + 'data/predictions/sgpt_taxonomy.csv', index=False)