# Evaluate models using map_records_under_heading

`map_records_under_heading.csv` contains the links between taxonomy and records.

Each taxonomy corresponds to one or more records.

In [2]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel, XLNetTokenizer, XLNetModel, AutoTokenizer, AutoModel
from scipy.spatial.distance import cosine
from pre_processing import PreProcessing
import random

device = torch.device("mps" if getattr(torch,'has_mps',False) else "cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

## Load data

In [3]:
map_records_heading = pd.read_csv('data/map_records_under_heading.csv')
map_records_heading.head(10)

Unnamed: 0,id,created_at,updated_at,parent_id,child_id,item_order
0,1,2020-12-28 01:28:05,2020-12-28 01:28:05,3,1,0
1,2,2020-12-28 01:28:05,2020-12-28 01:28:05,8,1,0
2,3,2020-12-28 01:28:05,2020-12-28 01:28:05,12,1,0
3,4,2020-12-28 01:28:05,2020-12-28 01:28:05,13,1,0
4,5,2020-12-28 01:28:05,2020-12-28 01:28:05,14,1,0
5,6,2020-12-28 01:28:05,2020-12-28 01:28:05,16,1,0
6,7,2020-12-28 01:28:05,2020-12-28 01:28:05,17,1,0
7,8,2020-12-28 01:28:05,2020-12-28 01:28:05,19,1,0
8,9,2020-12-28 01:28:05,2020-12-28 01:28:05,20,1,0
9,10,2020-12-28 01:28:05,2020-12-28 01:28:05,21,1,0


In [4]:
taxonomy_records = list(zip(map_records_heading['parent_id'], map_records_heading['child_id']))

In [5]:
records = pd.read_json('data/records.json')
records = records.drop(['created_at', 'updated_at', 'deleted_at', 'publish', 'academic_credentials', 'age_max', 'age_min', 'last_name', 
                        'latitude', 'longitude', 'name_of_private_practice', 'fee_description',	'fee_type',	'first_name',	'languages',
                        'organization_type', 'original_id',	'record_type',	'salutation_type', 'website'], axis=1)

taxonomy = pd.read_json('data/taxonomy_headings.json')
taxonomy = taxonomy.drop(['created_at',	'updated_at',	'deleted_at', 'alias_of_id', 'short_description',	'original_id'], axis=1)

taxonomy.head(10)

Unnamed: 0,id,name,description,translations
0,1,Root,Root,"{""name"":{""en"":""Root"",""fr"":null},""description"":..."
1,2,All Mental Health Resources,<p>\r\n\tThe listings of mental health resourc...,"{""name"":{""en"":""All Mental Health Resources"",""f..."
2,3,Crisis and Emergency,<p>\r\n\tRefers to all programs that provide i...,"{""name"":{""en"":""Crisis and Emergency"",""fr"":""Res..."
3,4,"System Navigation, including Information and R...","<p>\r\n\tAre you looking for help, but don&#39...","{""name"":{""en"":""System Navigation, including In..."
4,5,Child Welfare including Children's Aid Society...,<p>The child welfare / child protection system...,"{""name"":{""en"":""Child Welfare including Childre..."
5,6,Emergency Shelter and Housing,<p>\r\n\tThere are various shelters that peopl...,"{""name"":{""en"":""Emergency Shelter and Housing"",..."
6,7,Hospital Emergency Department,<p>\r\n\tIs there an emergency such as medical...,"{""name"":{""en"":""Hospital Emergency Department"",..."
7,8,"Crisis Lines including Telephone, Online and Chat",<p>\r\n\tAre you in a crisis? Crisis lines off...,"{""name"":{""en"":""Crisis Lines including Telephon..."
8,9,Psychiatrists,<p>\r\n\tPsychiatrists are medical doctors who...,"{""name"":{""en"":""Psychiatrists"",""fr"":""Psychiatre..."
9,10,A-Z Mental Health Conditions and Topics,<p>\r\n\tAlphabetical list of mental health to...,"{""name"":{""en"":""A-Z Mental Health Conditions an..."


## Pre-processing

In [6]:
records, taxonomy = PreProcessing(records, taxonomy).preprocess()

Length of records before preprocessing: 6406
Length of taxonomy before preprocessing: 277
Length of records after preprocessing: 6239
Length of taxonomy after preprocessing: 192


## Randomly chose ten taxonomies

In [7]:
# Generate ten random integers between 0 and 191
random_ids = [random.randint(0, 191) for i in range(10)]
search_terms = []
for each in random_ids:
    print(str(each) + ': ' + taxonomy.iloc[each]['name'] + '\n' + taxonomy.iloc[each]['description'] + '\n')
    search_terms.append(taxonomy.iloc[each]['name'])

53: Psychotherapists
Psychotherapists provide psychotherapy (aka talk therapy) and help individuals with difficulties by listening and giving support.Special training is required in order to become a psychotherapist.Various types of professionals such asSocial Workers or Psychologists provide psychotherapy, and can thus be viewed as being psychotherapists as well.

120: Cancer
Living with cancer can mean living with a wide range of emotions and psychological stresses that can have an impact on mental health and well-being.

58: Counselling and Therapy
Counselling and therapy can be helpful for dealing with a wide variety of issues (e.g. stress and coping, relationship problems)and mental health conditions (e.g. depression, anxiety, etc.)

89: Grief and Bereavement
Grief and bereavement refers to the sadness and loneliness that result from the loss of a loved one.

175: Acceptance and Commitment Therapy (ACT)
ACT is a type of therapy that helps people by using acceptance and mindfulness

## Helper functions

In [8]:
def load_embeddings(record_file_path, taxonomy_file_path):
    # Load embeddings from pt
    record_embeddings = torch.load(record_file_path)
    taxonomy_embeddings = torch.load(taxonomy_file_path)
    return record_embeddings, taxonomy_embeddings

def get_highest_numbers_with_indices(numbers, n=10):
    """
    Returns the n highest numbers in a list along with their indices.
    :param numbers: List of numbers
    :param n: Number of highest numbers to retrieve (default: 10)
    :return: List of tuples containing the highest numbers and their indices
    """
    highest_numbers_with_indices = []
    for i, num in enumerate(numbers):
        if len(highest_numbers_with_indices) < n:
            highest_numbers_with_indices.append((num, i))
            highest_numbers_with_indices.sort(reverse=True)
        else:
            if num > highest_numbers_with_indices[-1][0]:
                highest_numbers_with_indices.pop()
                highest_numbers_with_indices.append((num, i))
                highest_numbers_with_indices.sort(reverse=True)
    return highest_numbers_with_indices

## BERT

In [9]:
tokenizer = BertTokenizer.from_pretrained("AIMH/mental-bert-large-uncased")
model = BertModel.from_pretrained("AIMH/mental-bert-large-uncased").to(device)
model.eval()

Some weights of the model checkpoint at AIMH/mental-bert-large-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at AIMH/mental-bert-large-uncased and are newly initialized: ['bert.pooler.dense.weight', 'bert.p

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 1024, padding_idx=0)
    (position_embeddings): Embedding(512, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-23): 24 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inpl

### Generate embeddings for search_terms

In [10]:
tokenized_search_terms = tokenizer(search_terms, padding='max_length', max_length=512, truncation=True, return_tensors='pt').to(device)

# Generate embeddings
search_term_embeddings = []
with torch.no_grad():
    embedding = model(input_ids=tokenized_search_terms['input_ids'],
                    attention_mask=tokenized_search_terms['attention_mask'],
                    token_type_ids=tokenized_search_terms['token_type_ids'])
    embedding = embedding.last_hidden_state.mean(dim=1).cpu()
    
search_term_embeddings = embedding.tolist()

### Read embeddings from file

In [11]:
record_embeddings, taxonomy_embeddings = load_embeddings('data/embeddings/bert_records_embeddings.pt', 'data/embeddings/bert_taxonomy_embeddings.pt')
print(record_embeddings.shape)
print(taxonomy_embeddings.shape)

torch.Size([6239, 1024])
torch.Size([192, 1024])


### Prediction

In [12]:
records_result = {'taxonomy_record': [], 'Similarity score': []}

for idx in range(len(search_term_embeddings)):
    # Records
    cos_sim = []
    for each in record_embeddings:
        cos_sim.append(1 - cosine(search_term_embeddings[idx], each))

    lst = get_highest_numbers_with_indices(cos_sim)

    for id in lst:
        records_result['taxonomy_record'].append((random_ids[idx], id[1]))
        records_result['Similarity score'].append(id[0])

records_result = pd.DataFrame(records_result)
records_result.head(10)

Unnamed: 0,taxonomy_record,Similarity score
0,"(53, 5693)",0.80746
1,"(53, 1634)",0.807221
2,"(53, 3809)",0.777932
3,"(53, 3940)",0.777818
4,"(53, 620)",0.773732
5,"(53, 618)",0.773732
6,"(53, 617)",0.773732
7,"(53, 616)",0.773732
8,"(53, 614)",0.773732
9,"(53, 613)",0.773732


In [13]:
retrieved_relevant = 0
for each in records_result['taxonomy_record'].to_list():
    if each in taxonomy_records:
        retrieved_relevant += 1
print('Precision: ' + str(retrieved_relevant / len(records_result)))

Precision: 0.01


## Roberta

In [14]:
tokenizer = RobertaTokenizer.from_pretrained('AIMH/mental-roberta-large')
model = RobertaModel.from_pretrained('AIMH/mental-roberta-large').to(device)
model.eval()

Some weights of the model checkpoint at AIMH/mental-roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at AIMH/mental-roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 1024, padding_idx=1)
    (position_embeddings): Embedding(514, 1024, padding_idx=1)
    (token_type_embeddings): Embedding(1, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-23): 24 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      

### Generate embeddings for search_terms

In [15]:
tokenized_search_terms = tokenizer(search_terms, padding='max_length', max_length=512, truncation=True, return_tensors='pt').to(device)

# Generate embeddings
search_term_embeddings = []
with torch.no_grad():
    embedding = model(input_ids=tokenized_search_terms['input_ids'],
                    attention_mask=tokenized_search_terms['attention_mask'])
    embedding = embedding.last_hidden_state.mean(dim=1).cpu()
    
search_term_embeddings = embedding.tolist()

### Read embeddings from file

In [16]:
record_embeddings, taxonomy_embeddings = load_embeddings('data/embeddings/roberta_records_embeddings.pt', 'data/embeddings/roberta_taxonomy_embeddings.pt')
print(record_embeddings.shape)
print(taxonomy_embeddings.shape)

torch.Size([6239, 1024])
torch.Size([192, 1024])


### Prediction

In [17]:
records_result = {'taxonomy_record': [], 'Similarity score': []}

for idx in range(len(search_term_embeddings)):
    # Records
    cos_sim = []
    for each in record_embeddings:
        cos_sim.append(1 - cosine(search_term_embeddings[idx], each))

    lst = get_highest_numbers_with_indices(cos_sim)

    for id in lst:
        records_result['taxonomy_record'].append((random_ids[idx], id[1]))
        records_result['Similarity score'].append(id[0])

records_result = pd.DataFrame(records_result)
records_result.head(10)

Unnamed: 0,taxonomy_record,Similarity score
0,"(53, 1634)",0.877737
1,"(53, 5147)",0.877365
2,"(53, 5463)",0.865646
3,"(53, 2683)",0.853112
4,"(53, 5693)",0.844657
5,"(53, 3940)",0.837582
6,"(53, 1806)",0.826826
7,"(53, 1761)",0.814847
8,"(53, 1525)",0.802147
9,"(53, 2455)",0.800418


In [18]:
retrieved_relevant = 0
for each in records_result['taxonomy_record'].to_list():
    if each in taxonomy_records:
        retrieved_relevant += 1
print('Precision: ' + str(retrieved_relevant / len(records_result)))

Precision: 0.05


## XLNet

In [19]:
tokenizer = XLNetTokenizer.from_pretrained('AIMH/mental-xlnet-base-cased')
model = XLNetModel.from_pretrained('AIMH/mental-xlnet-base-cased').to(device)
model.eval()

Some weights of the model checkpoint at AIMH/mental-xlnet-base-cased were not used when initializing XLNetModel: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


XLNetModel(
  (word_embedding): Embedding(32000, 768)
  (layer): ModuleList(
    (0-11): 12 x XLNetLayer(
      (rel_attn): XLNetRelativeAttention(
        (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): XLNetFeedForward(
        (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (layer_1): Linear(in_features=768, out_features=3072, bias=True)
        (layer_2): Linear(in_features=3072, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (activation_function): GELUActivation()
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (dropout): Dropout(p=0.1, inplace=False)
)

### Generate embeddings for search_terms

In [20]:
tokenized_search_terms = tokenizer(search_terms, padding='max_length', max_length=512, truncation=True, return_tensors='pt').to(device)

# Generate embeddings
search_term_embeddings = []
with torch.no_grad():
    embedding = model(input_ids=tokenized_search_terms['input_ids'],
                    attention_mask=tokenized_search_terms['attention_mask'],
                    token_type_ids=tokenized_search_terms['token_type_ids'])
    embedding = embedding.last_hidden_state.mean(dim=1).cpu()
    
search_term_embeddings = embedding.tolist()

### Read embeddings from file

In [21]:
record_embeddings, taxonomy_embeddings = load_embeddings('data/embeddings/xlnet_records_embeddings.pt', 'data/embeddings/xlnet_taxonomy_embeddings.pt')
print(record_embeddings.shape)
print(taxonomy_embeddings.shape)

torch.Size([6239, 768])
torch.Size([192, 768])


### Prediction

In [22]:
records_result = {'taxonomy_record': [], 'Similarity score': []}

for idx in range(len(search_term_embeddings)):
    # Records
    cos_sim = []
    for each in record_embeddings:
        cos_sim.append(1 - cosine(search_term_embeddings[idx], each))

    lst = get_highest_numbers_with_indices(cos_sim)

    for id in lst:
        records_result['taxonomy_record'].append((random_ids[idx], id[1]))
        records_result['Similarity score'].append(id[0])

records_result = pd.DataFrame(records_result)
records_result.head(10)

Unnamed: 0,taxonomy_record,Similarity score
0,"(53, 5794)",0.820551
1,"(53, 1749)",0.804476
2,"(53, 4354)",0.764771
3,"(53, 3945)",0.761853
4,"(53, 5016)",0.748078
5,"(53, 5946)",0.740098
6,"(53, 3144)",0.735447
7,"(53, 5463)",0.734411
8,"(53, 3130)",0.732243
9,"(53, 3362)",0.731013


In [23]:
retrieved_relevant = 0
for each in records_result['taxonomy_record'].to_list():
    if each in taxonomy_records:
        retrieved_relevant += 1
print('Precision: ' + str(retrieved_relevant / len(records_result)))

Precision: 0.02


## SGPT

In [24]:
# Get SGPT
tokenizer = AutoTokenizer.from_pretrained("Muennighoff/SGPT-1.3B-weightedmean-nli-bitfit")
model = AutoModel.from_pretrained("Muennighoff/SGPT-1.3B-weightedmean-nli-bitfit").to(device)

model.eval()

: 

: 

### Generate embeddings for search_terms

In [None]:
def getEmbeddings(input_ids, attention_mask):
    # Get the embeddings
    with torch.no_grad():
        # Get hidden state of shape [bs, seq_len, hid_dim]
        last_hidden_state = model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True, return_dict=True).last_hidden_state

    # Get weights of shape [bs, seq_len, hid_dim]
    weights = (
        torch.arange(start=1, end=last_hidden_state.shape[1] + 1)
        .unsqueeze(0)
        .unsqueeze(-1)
        .expand(last_hidden_state.size())
        .float().to(last_hidden_state.device)
    )

    # Get attn mask of shape [bs, seq_len, hid_dim]
    input_mask_expanded = (
        attention_mask
        .unsqueeze(-1)
        .expand(last_hidden_state.size())
        .float()
    )

    # Perform weighted mean pooling across seq_len: bs, seq_len, hidden_dim -> bs, hidden_dim
    sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded * weights, dim=1)
    sum_mask = torch.sum(input_mask_expanded * weights, dim=1)

    embeddings = sum_embeddings / sum_mask

    return embeddings.cpu()

In [None]:
tokenized_search_terms = tokenizer(search_terms, padding='max_length', max_length=600, truncation=True, return_tensors='pt').to(device)

# Generate embeddings
search_term_embeddings = getEmbeddings(tokenized_search_terms['input_ids'], attention_mask=tokenized_search_terms['attention_mask']).tolist()

### Read embeddings from file

In [None]:
record_embeddings, taxonomy_embeddings = load_embeddings('data/embeddings/sgpt_records_embeddings.pt', 'data/embeddings/sgpt_taxonomy_embeddings.pt')
print(record_embeddings.shape)
print(taxonomy_embeddings.shape)

torch.Size([6239, 768])
torch.Size([192, 768])


### Prediction

In [None]:
records_result = {'taxonomy_record': [], 'Similarity score': []}

for idx in range(len(search_term_embeddings)):
    # Records
    cos_sim = []
    for each in record_embeddings:
        cos_sim.append(1 - cosine(search_term_embeddings[idx], each))

    lst = get_highest_numbers_with_indices(cos_sim)

    for id in lst:
        records_result['taxonomy_record'].append((random_ids[idx], id[1]))
        records_result['Similarity score'].append(id[0])

records_result = pd.DataFrame(records_result)
records_result.head(10)

Unnamed: 0,taxonomy_record,Similarity score
0,"(184, 1634)",0.780829
1,"(184, 5000)",0.702536
2,"(184, 6055)",0.656871
3,"(184, 2586)",0.646647
4,"(184, 4978)",0.613448
5,"(184, 1761)",0.61158
6,"(184, 3809)",0.606449
7,"(184, 5016)",0.600357
8,"(184, 5880)",0.595736
9,"(184, 5279)",0.595647


In [None]:
retrieved_relevant = 0
for each in records_result['taxonomy_record'].to_list():
    if each in taxonomy_records:
        retrieved_relevant += 1
print('Precision: ' + str(retrieved_relevant / len(records_result)))

Precision: 0.06
