In [None]:
!pip install transformers SentencePiece
from huggingface_hub import notebook_login
notebook_login()

# Evaluate models using map_records_under_heading

`map_records_under_heading.csv` contains the links between taxonomy and records.

Each taxonomy heading is linked to one or more records.

In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel, XLNetTokenizer, XLNetModel, AutoTokenizer, AutoModel
from scipy.spatial.distance import cosine
import random
from google.colab import drive
# Mount Google Drive so the project files under DATA_PATH can be read.
drive.mount('/content/gdrive')

DATA_PATH = '/content/gdrive/MyDrive/CSI6900/'

# Pick the accelerator in order of preference: Apple MPS, then CUDA, then CPU.
# torch.has_mps is deprecated and only reports whether the build includes MPS,
# so ask the backend whether MPS is actually usable at runtime instead.
_mps_backend = getattr(torch.backends, 'mps', None)  # absent on old torch releases
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

Mounted at /content/gdrive


device(type='cuda', index=0)

## Load data

In [None]:
# Read the table that links taxonomy headings to records and preview its first rows.
mapping_csv_path = DATA_PATH + 'data/map_records_under_heading.csv'
map_records_heading = pd.read_csv(mapping_csv_path)
map_records_heading.head(10)

Unnamed: 0,id,created_at,updated_at,parent_id,child_id,item_order
0,1,2020-12-28 01:28:05,2020-12-28 01:28:05,3,1,0
1,2,2020-12-28 01:28:05,2020-12-28 01:28:05,8,1,0
2,3,2020-12-28 01:28:05,2020-12-28 01:28:05,12,1,0
3,4,2020-12-28 01:28:05,2020-12-28 01:28:05,13,1,0
4,5,2020-12-28 01:28:05,2020-12-28 01:28:05,14,1,0
5,6,2020-12-28 01:28:05,2020-12-28 01:28:05,16,1,0
6,7,2020-12-28 01:28:05,2020-12-28 01:28:05,17,1,0
7,8,2020-12-28 01:28:05,2020-12-28 01:28:05,19,1,0
8,9,2020-12-28 01:28:05,2020-12-28 01:28:05,20,1,0
9,10,2020-12-28 01:28:05,2020-12-28 01:28:05,21,1,0


In [None]:
# Ground-truth links as a list of (parent_id, child_id) tuples, one per CSV row.
taxonomy_records = list(
    map_records_heading[['parent_id', 'child_id']].itertuples(index=False, name=None)
)

In [None]:
# Load the records and discard the columns this evaluation does not use.
records = pd.read_json(DATA_PATH + 'data/records.json').drop(
    columns=[
        'created_at', 'updated_at', 'deleted_at', 'publish', 'academic_credentials',
        'age_max', 'age_min', 'last_name', 'latitude', 'longitude',
        'name_of_private_practice', 'fee_description', 'fee_type', 'first_name',
        'languages', 'organization_type', 'original_id', 'record_type',
        'salutation_type', 'website',
    ]
)

# Load the taxonomy headings and discard the columns this evaluation does not use.
taxonomy = pd.read_json(DATA_PATH + 'data/taxonomy_headings.json').drop(
    columns=[
        'created_at', 'updated_at', 'deleted_at', 'alias_of_id',
        'short_description', 'original_id',
    ]
)

taxonomy.head(10)

Unnamed: 0,id,name,description,translations
0,1,Root,Root,"{""name"":{""en"":""Root"",""fr"":null},""description"":..."
1,2,All Mental Health Resources,<p>\r\n\tThe listings of mental health resourc...,"{""name"":{""en"":""All Mental Health Resources"",""f..."
2,3,Crisis and Emergency,<p>\r\n\tRefers to all programs that provide i...,"{""name"":{""en"":""Crisis and Emergency"",""fr"":""Res..."
3,4,"System Navigation, including Information and R...","<p>\r\n\tAre you looking for help, but don&#39...","{""name"":{""en"":""System Navigation, including In..."
4,5,Child Welfare including Children's Aid Society...,<p>The child welfare / child protection system...,"{""name"":{""en"":""Child Welfare including Childre..."
5,6,Emergency Shelter and Housing,<p>\r\n\tThere are various shelters that peopl...,"{""name"":{""en"":""Emergency Shelter and Housing"",..."
6,7,Hospital Emergency Department,<p>\r\n\tIs there an emergency such as medical...,"{""name"":{""en"":""Hospital Emergency Department"",..."
7,8,"Crisis Lines including Telephone, Online and Chat",<p>\r\n\tAre you in a crisis? Crisis lines off...,"{""name"":{""en"":""Crisis Lines including Telephon..."
8,9,Psychiatrists,<p>\r\n\tPsychiatrists are medical doctors who...,"{""name"":{""en"":""Psychiatrists"",""fr"":""Psychiatre..."
9,10,A-Z Mental Health Conditions and Topics,<p>\r\n\tAlphabetical list of mental health to...,"{""name"":{""en"":""A-Z Mental Health Conditions an..."


## Pre-processing

In [None]:
!cp /content/gdrive/MyDrive/CSI6900/pre_processing.py /content

In [None]:
from pre_processing import PreProcessing

# Run the project's pre-processing over both frames; the returned frames replace the
# raw ones, so re-running this cell feeds already-processed data back in.
preprocessor = PreProcessing(records, taxonomy)
records, taxonomy = preprocessor.preprocess()

Length of records before preprocessing: 6406
Length of taxonomy before preprocessing: 277
Length of records after preprocessing: 6239
Length of taxonomy after preprocessing: 192


## Randomly choose ten taxonomies

In [None]:
# Draw ten distinct row positions into `taxonomy` (positions, not database ids).
# - sample() never repeats a position, unlike ten independent randint() draws.
# - the range follows len(taxonomy) instead of a hard-coded upper bound.
# - a fixed seed gives the same search terms on every run, so the precision
#   figures of the different models below are computed on the same queries.
rng = random.Random(42)
random_ids = rng.sample(range(len(taxonomy)), k=min(10, len(taxonomy)))
search_terms = []
for each in random_ids:
    row = taxonomy.iloc[each]
    print(str(each) + ': ' + row['name'] + '\n' + row['description'] + '\n')
    search_terms.append(row['name'])

179: Visual Stress
Visual stress is a visual perceptual processing condition that affects how visual information is interpreted by the brain and interferes with reading, attention, coordination, general health and behaviour. This is different from problems involving sight or sharpness of vision and can occur despite normal vision. Classic symptoms include light sensitivity, headaches from reading, and problems reading because the white page appears too bright or the words appear to be moving, flashing, or jumping on the page. As reading is such a key skill for school and life in general, problems with reading can thus lead to significant impairment. The good news is that appropriate intervention can make a significant improvement and for many individuals, one of the interventions is as simple as specific colour filters.

134: Outpatient/community mental health services
Outpatient Mental Health Services is where a person receives services by going to an office or clinic, without having 

## Helper functions

In [None]:
def load_embeddings(record_file_path, taxonomy_file_path):
    """
    Loads the pre-computed record and taxonomy embeddings from two .pt files.
    :param record_file_path: Path of the file holding the record embeddings
    :param taxonomy_file_path: Path of the file holding the taxonomy embeddings
    :return: Tuple (record_embeddings, taxonomy_embeddings) as returned by torch.load
    """
    # map_location='cpu' lets files that were saved from a GPU session load on any
    # machine and keeps the tensors on the host, where scipy can read them.
    # NOTE(review): torch.load unpickles the file content; only load trusted files.
    record_embeddings = torch.load(record_file_path, map_location='cpu')
    taxonomy_embeddings = torch.load(taxonomy_file_path, map_location='cpu')
    return record_embeddings, taxonomy_embeddings

def get_highest_numbers_with_indices(numbers, n=10):
    """
    Returns the n highest numbers in a list along with their indices.
    :param numbers: List of numbers
    :param n: Number of highest numbers to retrieve (default: 10)
    :return: List of (number, index) tuples sorted in descending order;
             empty when n <= 0 or when numbers is empty
    """
    import heapq

    # Nothing can be selected; without this guard the comparison against the
    # smallest kept entry below would index into an empty container.
    if n <= 0:
        return []

    # Min-heap of the best (number, index) pairs seen so far: kept[0] is always the
    # smallest pair, i.e. the one to evict when a strictly larger number shows up.
    kept = []
    for i, num in enumerate(numbers):
        if len(kept) < n:
            heapq.heappush(kept, (num, i))
        elif num > kept[0][0]:
            # Strict comparison: a number equal to the current minimum does not
            # displace an entry that was found earlier.
            heapq.heapreplace(kept, (num, i))
    return sorted(kept, reverse=True)

## BERT

In [None]:
# Load the BERT tokenizer and encoder, move the encoder to `device` and switch it
# to evaluation mode; the trailing expression displays the module structure.
tokenizer = BertTokenizer.from_pretrained("AIMH/mental-bert-large-uncased")
model = BertModel.from_pretrained("AIMH/mental-bert-large-uncased").to(device).eval()
model

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/322 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/641 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at AIMH/mental-bert-large-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at AIMH/mental-bert-large-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.poo

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 1024, padding_idx=0)
    (position_embeddings): Embedding(512, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-23): 24 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inpl

### Generate embeddings for search_terms

In [None]:
# Tokenize the search terms; every sequence is padded or truncated to 512 tokens.
tokenized_search_terms = tokenizer(search_terms, padding='max_length', max_length=512, truncation=True, return_tensors='pt').to(device)

# Generate embeddings: mean of the last hidden state over the sequence axis.
# NOTE(review): the mean runs over all 512 positions, padding included (the attention
# mask is not used for pooling) — confirm the stored record embeddings were pooled
# the same way before changing this.
with torch.no_grad():
    model_output = model(input_ids=tokenized_search_terms['input_ids'],
                         attention_mask=tokenized_search_terms['attention_mask'],
                         token_type_ids=tokenized_search_terms['token_type_ids'])
    embedding = model_output.last_hidden_state.mean(dim=1).cpu()

# One list of floats per search term.
search_term_embeddings = embedding.tolist()

### Read embeddings from file

In [None]:
# Read the stored BERT embeddings and report the shape of each tensor.
record_embeddings, taxonomy_embeddings = load_embeddings(
    DATA_PATH + 'data/embeddings/bert_records_embeddings.pt',
    DATA_PATH + 'data/embeddings/bert_taxonomy_embeddings.pt',
)
for loaded_embeddings in (record_embeddings, taxonomy_embeddings):
    print(loaded_embeddings.shape)

torch.Size([6239, 1024])
torch.Size([192, 1024])


### Prediction

In [None]:
# Retrieve the ten most similar records for every search term.
records_result = {'taxonomy_record': [], 'Similarity score': []}

for term_pos, term_embedding in enumerate(search_term_embeddings):
    # Cosine similarity (1 - cosine distance) between this term and every record embedding
    cos_sim = [1 - cosine(term_embedding, record_embedding) for record_embedding in record_embeddings]

    # The loop names deliberately avoid `id`, which would shadow the builtin for the
    # rest of the notebook session.
    for score, record_pos in get_highest_numbers_with_indices(cos_sim):
        # Pair = (row position of the taxonomy term, row position of the record embedding)
        records_result['taxonomy_record'].append((random_ids[term_pos], record_pos))
        records_result['Similarity score'].append(score)

records_result = pd.DataFrame(records_result)
records_result.head(10)

Unnamed: 0,taxonomy_record,Similarity score
0,"(89, 842)",0.775565
1,"(89, 5693)",0.754079
2,"(89, 3456)",0.752779
3,"(89, 1487)",0.743351
4,"(89, 1749)",0.736981
5,"(89, 2540)",0.736856
6,"(89, 5147)",0.736615
7,"(89, 1488)",0.73191
8,"(89, 3761)",0.712234
9,"(89, 1898)",0.711902


In [None]:
# Precision = share of retrieved (taxonomy, record) pairs found among the known links.
# NOTE(review): the retrieved pairs are row positions (random_ids index taxonomy.iloc,
# record indices come from enumerating record_embeddings) while taxonomy_records holds
# (parent_id, child_id) values from the CSV — confirm both use the same id space,
# otherwise matches counted here are coincidental.
predicted_pairs = records_result['taxonomy_record'].to_list()
retrieved_relevant = sum(1 for pair in predicted_pairs if pair in taxonomy_records)
print('Precision: ' + str(retrieved_relevant / len(records_result)))

Precision: 0.06


## RoBERTa

In [None]:
# Load the RoBERTa tokenizer and encoder, move the encoder to `device` and switch it
# to evaluation mode; the trailing expression displays the module structure.
tokenizer = RobertaTokenizer.from_pretrained('AIMH/mental-roberta-large')
model = RobertaModel.from_pretrained('AIMH/mental-roberta-large').to(device).eval()
model

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/328 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of the model checkpoint at AIMH/mental-roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at AIMH/mental-roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 1024, padding_idx=1)
    (position_embeddings): Embedding(514, 1024, padding_idx=1)
    (token_type_embeddings): Embedding(1, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-23): 24 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      

### Generate embeddings for search_terms

In [None]:
# Tokenize the search terms; every sequence is padded or truncated to 512 tokens.
tokenized_search_terms = tokenizer(search_terms, padding='max_length', max_length=512, truncation=True, return_tensors='pt').to(device)

# Generate embeddings: mean of the last hidden state over the sequence axis.
# NOTE(review): the mean runs over all 512 positions, padding included (the attention
# mask is not used for pooling) — confirm the stored record embeddings were pooled
# the same way before changing this.
with torch.no_grad():
    model_output = model(input_ids=tokenized_search_terms['input_ids'],
                         attention_mask=tokenized_search_terms['attention_mask'])
    embedding = model_output.last_hidden_state.mean(dim=1).cpu()

# One list of floats per search term.
search_term_embeddings = embedding.tolist()

### Read embeddings from file

In [None]:
# Read the stored RoBERTa embeddings and report the shape of each tensor.
record_embeddings, taxonomy_embeddings = load_embeddings(
    DATA_PATH + 'data/embeddings/roberta_records_embeddings.pt',
    DATA_PATH + 'data/embeddings/roberta_taxonomy_embeddings.pt',
)
for loaded_embeddings in (record_embeddings, taxonomy_embeddings):
    print(loaded_embeddings.shape)

torch.Size([6239, 1024])
torch.Size([192, 1024])


### Prediction

In [None]:
# Retrieve the ten most similar records for every search term.
records_result = {'taxonomy_record': [], 'Similarity score': []}

for term_pos, term_embedding in enumerate(search_term_embeddings):
    # Cosine similarity (1 - cosine distance) between this term and every record embedding
    cos_sim = [1 - cosine(term_embedding, record_embedding) for record_embedding in record_embeddings]

    # The loop names deliberately avoid `id`, which would shadow the builtin for the
    # rest of the notebook session.
    for score, record_pos in get_highest_numbers_with_indices(cos_sim):
        # Pair = (row position of the taxonomy term, row position of the record embedding)
        records_result['taxonomy_record'].append((random_ids[term_pos], record_pos))
        records_result['Similarity score'].append(score)

records_result = pd.DataFrame(records_result)
records_result.head(10)

Unnamed: 0,taxonomy_record,Similarity score
0,"(89, 5147)",0.806375
1,"(89, 5560)",0.794889
2,"(89, 842)",0.773059
3,"(89, 1409)",0.76051
4,"(89, 2285)",0.756035
5,"(89, 2242)",0.755511
6,"(89, 5170)",0.755273
7,"(89, 4123)",0.752227
8,"(89, 2583)",0.749207
9,"(89, 3255)",0.746239


In [None]:
# Precision = share of retrieved (taxonomy, record) pairs found among the known links.
# NOTE(review): the retrieved pairs are row positions (random_ids index taxonomy.iloc,
# record indices come from enumerating record_embeddings) while taxonomy_records holds
# (parent_id, child_id) values from the CSV — confirm both use the same id space,
# otherwise matches counted here are coincidental.
predicted_pairs = records_result['taxonomy_record'].to_list()
retrieved_relevant = sum(1 for pair in predicted_pairs if pair in taxonomy_records)
print('Precision: ' + str(retrieved_relevant / len(records_result)))

Precision: 0.04


## XLNet

In [None]:
# Load the XLNet tokenizer and encoder, move the encoder to `device` and switch it
# to evaluation mode; the trailing expression displays the module structure.
tokenizer = XLNetTokenizer.from_pretrained('AIMH/mental-xlnet-base-cased')
model = XLNetModel.from_pretrained('AIMH/mental-xlnet-base-cased').to(device).eval()
model

Downloading spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/489 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/921 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of the model checkpoint at AIMH/mental-xlnet-base-cased were not used when initializing XLNetModel: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


XLNetModel(
  (word_embedding): Embedding(32000, 768)
  (layer): ModuleList(
    (0-11): 12 x XLNetLayer(
      (rel_attn): XLNetRelativeAttention(
        (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): XLNetFeedForward(
        (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (layer_1): Linear(in_features=768, out_features=3072, bias=True)
        (layer_2): Linear(in_features=3072, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (activation_function): GELUActivation()
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (dropout): Dropout(p=0.1, inplace=False)
)

### Generate embeddings for search_terms

In [None]:
# Tokenize the search terms; every sequence is padded or truncated to 512 tokens.
tokenized_search_terms = tokenizer(search_terms, padding='max_length', max_length=512, truncation=True, return_tensors='pt').to(device)

# Generate embeddings: mean of the last hidden state over the sequence axis.
# NOTE(review): the mean runs over all 512 positions, padding included (the attention
# mask is not used for pooling) — confirm the stored record embeddings were pooled
# the same way before changing this.
with torch.no_grad():
    model_output = model(input_ids=tokenized_search_terms['input_ids'],
                         attention_mask=tokenized_search_terms['attention_mask'],
                         token_type_ids=tokenized_search_terms['token_type_ids'])
    embedding = model_output.last_hidden_state.mean(dim=1).cpu()

# One list of floats per search term.
search_term_embeddings = embedding.tolist()

### Read embeddings from file

In [None]:
# Read the stored XLNet embeddings and report the shape of each tensor.
record_embeddings, taxonomy_embeddings = load_embeddings(
    DATA_PATH + 'data/embeddings/xlnet_records_embeddings.pt',
    DATA_PATH + 'data/embeddings/xlnet_taxonomy_embeddings.pt',
)
for loaded_embeddings in (record_embeddings, taxonomy_embeddings):
    print(loaded_embeddings.shape)

torch.Size([6239, 768])
torch.Size([192, 768])


### Prediction

In [None]:
# Retrieve the ten most similar records for every search term.
records_result = {'taxonomy_record': [], 'Similarity score': []}

for term_pos, term_embedding in enumerate(search_term_embeddings):
    # Cosine similarity (1 - cosine distance) between this term and every record embedding
    cos_sim = [1 - cosine(term_embedding, record_embedding) for record_embedding in record_embeddings]

    # The loop names deliberately avoid `id`, which would shadow the builtin for the
    # rest of the notebook session.
    for score, record_pos in get_highest_numbers_with_indices(cos_sim):
        # Pair = (row position of the taxonomy term, row position of the record embedding)
        records_result['taxonomy_record'].append((random_ids[term_pos], record_pos))
        records_result['Similarity score'].append(score)

records_result = pd.DataFrame(records_result)
records_result.head(10)

Unnamed: 0,taxonomy_record,Similarity score
0,"(89, 1761)",0.859594
1,"(89, 5693)",0.858888
2,"(89, 5088)",0.854702
3,"(89, 5351)",0.848961
4,"(89, 5662)",0.848724
5,"(89, 3940)",0.843557
6,"(89, 4897)",0.84343
7,"(89, 899)",0.841146
8,"(89, 104)",0.840499
9,"(89, 5169)",0.836508


In [None]:
# Precision = share of retrieved (taxonomy, record) pairs found among the known links.
# NOTE(review): the retrieved pairs are row positions (random_ids index taxonomy.iloc,
# record indices come from enumerating record_embeddings) while taxonomy_records holds
# (parent_id, child_id) values from the CSV — confirm both use the same id space,
# otherwise matches counted here are coincidental.
predicted_pairs = records_result['taxonomy_record'].to_list()
retrieved_relevant = sum(1 for pair in predicted_pairs if pair in taxonomy_records)
print('Precision: ' + str(retrieved_relevant / len(records_result)))

Precision: 0.07


## SGPT

In [None]:
# Load the SGPT tokenizer and model, move the model to `device` and switch it to
# evaluation mode; the trailing expression displays the module structure.
tokenizer = AutoTokenizer.from_pretrained("Muennighoff/SGPT-1.3B-weightedmean-nli-bitfit")
model = AutoModel.from_pretrained("Muennighoff/SGPT-1.3B-weightedmean-nli-bitfit").to(device).eval()

model

Downloading (…)okenizer_config.json:   0%|          | 0.00/658 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/5.36G [00:00<?, ?B/s]

GPTNeoModel(
  (wte): Embedding(50257, 2048)
  (wpe): Embedding(2048, 2048)
  (drop): Dropout(p=0.0, inplace=False)
  (h): ModuleList(
    (0-23): 24 x GPTNeoBlock(
      (ln_1): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
      (attn): GPTNeoAttention(
        (attention): GPTNeoSelfAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (out_proj): Linear(in_features=2048, out_features=2048, bias=True)
        )
      )
      (ln_2): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
      (mlp): GPTNeoMLP(
        (c_fc): Linear(in_features=2048, out_features=8192, bias=True)
        (c_proj): Linear(in_features=8192, out_features=2048, bias=True)
        (act): NewGE

### Generate embeddings for search_terms

In [None]:
def getEmbeddings(input_ids, attention_mask):
    """
    Embeds a tokenized batch with position-weighted mean pooling.

    Runs the notebook-level `model` without gradients and averages its last hidden
    state over the sequence axis. The token at 1-based position i gets weight i;
    positions whose attention_mask entry is 0 get weight 0.
    :param input_ids: Token ids of shape [bs, seq_len]
    :param attention_mask: Mask of shape [bs, seq_len], on the same device as the
                           model output; 1 for tokens to include, 0 for tokens to ignore
    :return: CPU tensor of shape [bs, hid_dim]
    """
    with torch.no_grad():
        # Only the final layer is pooled, so the per-layer hidden states are not
        # requested. Hidden state has shape [bs, seq_len, hid_dim].
        last_hidden_state = model(input_ids=input_ids, attention_mask=attention_mask, return_dict=True).last_hidden_state

    # Position weights 1..seq_len of shape [1, seq_len, 1], created directly on the
    # device of the hidden state; broadcasting replaces the explicit expand.
    seq_len = last_hidden_state.shape[1]
    position_weights = torch.arange(
        1, seq_len + 1, dtype=torch.float, device=last_hidden_state.device
    ).view(1, -1, 1)

    # Per-token weight of shape [bs, seq_len, 1]: the position weight where the mask
    # is 1, zero elsewhere.
    token_weights = attention_mask.unsqueeze(-1).float() * position_weights

    # Weighted mean pooling across seq_len: bs, seq_len, hidden_dim -> bs, hidden_dim
    sum_embeddings = torch.sum(last_hidden_state * token_weights, dim=1)
    sum_mask = torch.sum(token_weights, dim=1)

    embeddings = sum_embeddings / sum_mask

    return embeddings.cpu()

In [None]:
# Tokenize the search terms; every sequence is padded or truncated to 600 tokens.
tokenized_search_terms = tokenizer(search_terms, padding='max_length', max_length=600, truncation=True, return_tensors='pt').to(device)

# Generate embeddings with the weighted-mean pooling helper, one list of floats per term.
sgpt_embeddings = getEmbeddings(
    tokenized_search_terms['input_ids'],
    attention_mask=tokenized_search_terms['attention_mask'],
)
search_term_embeddings = sgpt_embeddings.tolist()

### Read embeddings from file

In [None]:
# Read the stored SGPT embeddings and report the shape of each tensor.
record_embeddings, taxonomy_embeddings = load_embeddings(
    DATA_PATH + 'data/embeddings/sgpt_records_embeddings.pt',
    DATA_PATH + 'data/embeddings/sgpt_taxonomy_embeddings.pt',
)
for loaded_embeddings in (record_embeddings, taxonomy_embeddings):
    print(loaded_embeddings.shape)

torch.Size([6239, 2048])
torch.Size([192, 2048])


### Prediction

In [None]:
# Retrieve the ten most similar records for every search term.
records_result = {'taxonomy_record': [], 'Similarity score': []}

for term_pos, term_embedding in enumerate(search_term_embeddings):
    # Cosine similarity (1 - cosine distance) between this term and every record embedding
    cos_sim = [1 - cosine(term_embedding, record_embedding) for record_embedding in record_embeddings]

    # The loop names deliberately avoid `id`, which would shadow the builtin for the
    # rest of the notebook session.
    for score, record_pos in get_highest_numbers_with_indices(cos_sim):
        # Pair = (row position of the taxonomy term, row position of the record embedding)
        records_result['taxonomy_record'].append((random_ids[term_pos], record_pos))
        records_result['Similarity score'].append(score)

records_result = pd.DataFrame(records_result)
records_result.head(10)

Unnamed: 0,taxonomy_record,Similarity score
0,"(179, 4477)",0.511494
1,"(179, 3758)",0.382165
2,"(179, 5821)",0.378124
3,"(179, 5518)",0.374686
4,"(179, 6224)",0.366329
5,"(179, 2535)",0.362946
6,"(179, 4936)",0.360907
7,"(179, 5470)",0.360685
8,"(179, 1502)",0.359349
9,"(179, 1898)",0.359178


In [None]:
# Precision = share of retrieved (taxonomy, record) pairs found among the known links.
# NOTE(review): the retrieved pairs are row positions (random_ids index taxonomy.iloc,
# record indices come from enumerating record_embeddings) while taxonomy_records holds
# (parent_id, child_id) values from the CSV — confirm both use the same id space,
# otherwise matches counted here are coincidental.
predicted_pairs = records_result['taxonomy_record'].to_list()
retrieved_relevant = sum(1 for pair in predicted_pairs if pair in taxonomy_records)
print('Precision: ' + str(retrieved_relevant / len(records_result)))

Precision: 0.05
