# Evaluate models using map_records_under_heading

`map_records_under_heading.csv` contains the links between taxonomy and records.

Each taxonomy corresponds to one or more records.

In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel, XLNetTokenizer, XLNetModel, AutoTokenizer, AutoModel
from scipy.spatial.distance import cosine
from pre_processing import PreProcessing
import random

device = torch.device("mps" if getattr(torch,'has_mps',False) else "cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

## Load data

In [2]:
map_records_heading = pd.read_csv('data/map_records_under_heading.csv')
map_records_heading.head(10)

Unnamed: 0,id,created_at,updated_at,parent_id,child_id,item_order
0,1,2020-12-28 01:28:05,2020-12-28 01:28:05,3,1,0
1,2,2020-12-28 01:28:05,2020-12-28 01:28:05,8,1,0
2,3,2020-12-28 01:28:05,2020-12-28 01:28:05,12,1,0
3,4,2020-12-28 01:28:05,2020-12-28 01:28:05,13,1,0
4,5,2020-12-28 01:28:05,2020-12-28 01:28:05,14,1,0
5,6,2020-12-28 01:28:05,2020-12-28 01:28:05,16,1,0
6,7,2020-12-28 01:28:05,2020-12-28 01:28:05,17,1,0
7,8,2020-12-28 01:28:05,2020-12-28 01:28:05,19,1,0
8,9,2020-12-28 01:28:05,2020-12-28 01:28:05,20,1,0
9,10,2020-12-28 01:28:05,2020-12-28 01:28:05,21,1,0


In [3]:
taxonomy_records = list(zip(map_records_heading['parent_id'], map_records_heading['child_id']))

In [4]:
records = pd.read_json('data/records.json')
records = records.drop(['created_at', 'updated_at', 'deleted_at', 'publish', 'academic_credentials', 'age_max', 'age_min', 'last_name', 
                        'latitude', 'longitude', 'name_of_private_practice', 'fee_description',	'fee_type',	'first_name',	'languages',
                        'organization_type', 'original_id',	'record_type',	'salutation_type', 'website'], axis=1)

taxonomy = pd.read_json('data/taxonomy_headings.json')
taxonomy = taxonomy.drop(['created_at',	'updated_at',	'deleted_at', 'alias_of_id', 'short_description',	'original_id'], axis=1)

taxonomy.head(10)

Unnamed: 0,id,name,description,translations
0,1,Root,Root,"{""name"":{""en"":""Root"",""fr"":null},""description"":..."
1,2,All Mental Health Resources,<p>\r\n\tThe listings of mental health resourc...,"{""name"":{""en"":""All Mental Health Resources"",""f..."
2,3,Crisis and Emergency,<p>\r\n\tRefers to all programs that provide i...,"{""name"":{""en"":""Crisis and Emergency"",""fr"":""Res..."
3,4,"System Navigation, including Information and R...","<p>\r\n\tAre you looking for help, but don&#39...","{""name"":{""en"":""System Navigation, including In..."
4,5,Child Welfare including Children's Aid Society...,<p>The child welfare / child protection system...,"{""name"":{""en"":""Child Welfare including Childre..."
5,6,Emergency Shelter and Housing,<p>\r\n\tThere are various shelters that peopl...,"{""name"":{""en"":""Emergency Shelter and Housing"",..."
6,7,Hospital Emergency Department,<p>\r\n\tIs there an emergency such as medical...,"{""name"":{""en"":""Hospital Emergency Department"",..."
7,8,"Crisis Lines including Telephone, Online and Chat",<p>\r\n\tAre you in a crisis? Crisis lines off...,"{""name"":{""en"":""Crisis Lines including Telephon..."
8,9,Psychiatrists,<p>\r\n\tPsychiatrists are medical doctors who...,"{""name"":{""en"":""Psychiatrists"",""fr"":""Psychiatre..."
9,10,A-Z Mental Health Conditions and Topics,<p>\r\n\tAlphabetical list of mental health to...,"{""name"":{""en"":""A-Z Mental Health Conditions an..."


## Pre-processing

In [5]:
records, taxonomy = PreProcessing(records, taxonomy).preprocess()

Length of records before preprocessing: 6406
Length of taxonomy before preprocessing: 277
Length of records after preprocessing: 6239
Length of taxonomy after preprocessing: 192


## Randomly chose ten taxonomies

In [6]:
# Generate ten random integers between 0 and 191
random_ids = [random.randint(0, 191) for i in range(10)]
search_terms = []
for each in random_ids:
    print(str(each) + ': ' + taxonomy.iloc[each]['name'] + '\n' + taxonomy.iloc[each]['description'] + '\n')
    search_terms.append(taxonomy.iloc[each]['name'])

176: Medication Treatment
Medications can sometimes be useful for mental health issues, generally when other non-medication treatments have not been ineffective.

167: Hoarding
Compulsive hoarding (aka pathological hoarding) is acquiring possessions along with the failure to discard them, even if the items are worthless, no longer useful. Hoarded items fill the persons home, and can cause severe problems with day-to-day activities and relationships, and even pose a danger to life through being a fire and safety hazard.

39: Speech and Language Pathologists
Speech and language pathologists (SLP) are professionals who work with individuals having specific needs with speech and language, and may also help with feeding and swallowing issues.

5: Emergency Shelter and Housing
There are various shelters that people can use when they have no place to go.Emergency shelters are places for people to live temporarily when they dont have a place to stay. Emergency shelters (such as womens shelters

## Helper functions

In [7]:
def load_embeddings(record_file_path, taxonomy_file_path):
    # Load embeddings from pt
    record_embeddings = torch.load(record_file_path)
    taxonomy_embeddings = torch.load(taxonomy_file_path)
    return record_embeddings, taxonomy_embeddings

def get_highest_numbers_with_indices(numbers, n=10):
    """
    Returns the n highest numbers in a list along with their indices.
    :param numbers: List of numbers
    :param n: Number of highest numbers to retrieve (default: 10)
    :return: List of tuples containing the highest numbers and their indices
    """
    highest_numbers_with_indices = []
    for i, num in enumerate(numbers):
        if len(highest_numbers_with_indices) < n:
            highest_numbers_with_indices.append((num, i))
            highest_numbers_with_indices.sort(reverse=True)
        else:
            if num > highest_numbers_with_indices[-1][0]:
                highest_numbers_with_indices.pop()
                highest_numbers_with_indices.append((num, i))
                highest_numbers_with_indices.sort(reverse=True)
    return highest_numbers_with_indices

## BERT

In [None]:
tokenizer = BertTokenizer.from_pretrained("AIMH/mental-bert-large-uncased")
model = BertModel.from_pretrained("AIMH/mental-bert-large-uncased").to(device)
model.eval()

### Generate embeddings for search_terms

In [None]:
tokenized_search_terms = tokenizer(search_terms, padding='max_length', max_length=512, truncation=True, return_tensors='pt').to(device)

# Generate embeddings
search_term_embeddings = []
with torch.no_grad():
    embedding = model(input_ids=tokenized_search_terms['input_ids'],
                    attention_mask=tokenized_search_terms['attention_mask'],
                    token_type_ids=tokenized_search_terms['token_type_ids'])
    embedding = embedding.last_hidden_state.mean(dim=1).cpu()
    
search_term_embeddings = embedding.tolist()

### Read embeddings from file

In [None]:
record_embeddings, taxonomy_embeddings = load_embeddings('data/embeddings/bert_records_embeddings.pt', 'data/embeddings/bert_taxonomy_embeddings.pt')
print(record_embeddings.shape)
print(taxonomy_embeddings.shape)

### Prediction

In [None]:
records_result = {'taxonomy_record': [], 'Similarity score': []}

for idx in range(len(search_term_embeddings)):
    # Records
    cos_sim = []
    for each in record_embeddings:
        cos_sim.append(1 - cosine(search_term_embeddings[idx], each))

    lst = get_highest_numbers_with_indices(cos_sim)

    for id in lst:
        records_result['taxonomy_record'].append((random_ids[idx], id[1]))
        records_result['Similarity score'].append(id[0])

records_result = pd.DataFrame(records_result)
records_result.head(10)

In [None]:
retrieved_relevant = 0
for each in records_result['taxonomy_record'].to_list():
    if each in taxonomy_records:
        retrieved_relevant += 1
print('Precision: ' + str(retrieved_relevant / len(records_result)))

## Roberta

In [None]:
tokenizer = RobertaTokenizer.from_pretrained('AIMH/mental-roberta-large')
model = RobertaModel.from_pretrained('AIMH/mental-roberta-large').to(device)
model.eval()

### Generate embeddings for search_terms

In [None]:
tokenized_search_terms = tokenizer(search_terms, padding='max_length', max_length=512, truncation=True, return_tensors='pt').to(device)

# Generate embeddings
search_term_embeddings = []
with torch.no_grad():
    embedding = model(input_ids=tokenized_search_terms['input_ids'],
                    attention_mask=tokenized_search_terms['attention_mask'])
    embedding = embedding.last_hidden_state.mean(dim=1).cpu()
    
search_term_embeddings = embedding.tolist()

### Read embeddings from file

In [None]:
record_embeddings, taxonomy_embeddings = load_embeddings('data/embeddings/roberta_records_embeddings.pt', 'data/embeddings/roberta_taxonomy_embeddings.pt')
print(record_embeddings.shape)
print(taxonomy_embeddings.shape)

### Prediction

In [None]:
records_result = {'taxonomy_record': [], 'Similarity score': []}

for idx in range(len(search_term_embeddings)):
    # Records
    cos_sim = []
    for each in record_embeddings:
        cos_sim.append(1 - cosine(search_term_embeddings[idx], each))

    lst = get_highest_numbers_with_indices(cos_sim)

    for id in lst:
        records_result['taxonomy_record'].append((random_ids[idx], id[1]))
        records_result['Similarity score'].append(id[0])

records_result = pd.DataFrame(records_result)
records_result.head(10)

In [None]:
retrieved_relevant = 0
for each in records_result['taxonomy_record'].to_list():
    if each in taxonomy_records:
        retrieved_relevant += 1
print('Precision: ' + str(retrieved_relevant / len(records_result)))

## XLNet

In [None]:
tokenizer = XLNetTokenizer.from_pretrained('AIMH/mental-xlnet-base-cased')
model = XLNetModel.from_pretrained('AIMH/mental-xlnet-base-cased').to(device)
model.eval()

### Generate embeddings for search_terms

In [None]:
tokenized_search_terms = tokenizer(search_terms, padding='max_length', max_length=512, truncation=True, return_tensors='pt').to(device)

# Generate embeddings
search_term_embeddings = []
with torch.no_grad():
    embedding = model(input_ids=tokenized_search_terms['input_ids'],
                    attention_mask=tokenized_search_terms['attention_mask'],
                    token_type_ids=tokenized_search_terms['token_type_ids'])
    embedding = embedding.last_hidden_state.mean(dim=1).cpu()
    
search_term_embeddings = embedding.tolist()

### Read embeddings from file

In [None]:
record_embeddings, taxonomy_embeddings = load_embeddings('data/embeddings/xlnet_records_embeddings.pt', 'data/embeddings/xlnet_taxonomy_embeddings.pt')
print(record_embeddings.shape)
print(taxonomy_embeddings.shape)

### Prediction

In [None]:
records_result = {'taxonomy_record': [], 'Similarity score': []}

for idx in range(len(search_term_embeddings)):
    # Records
    cos_sim = []
    for each in record_embeddings:
        cos_sim.append(1 - cosine(search_term_embeddings[idx], each))

    lst = get_highest_numbers_with_indices(cos_sim)

    for id in lst:
        records_result['taxonomy_record'].append((random_ids[idx], id[1]))
        records_result['Similarity score'].append(id[0])

records_result = pd.DataFrame(records_result)
records_result.head(10)

In [None]:
retrieved_relevant = 0
for each in records_result['taxonomy_record'].to_list():
    if each in taxonomy_records:
        retrieved_relevant += 1
print('Precision: ' + str(retrieved_relevant / len(records_result)))

## SGPT

In [8]:
# Get SGPT
tokenizer = AutoTokenizer.from_pretrained("Muennighoff/SGPT-1.3B-weightedmean-nli-bitfit")
model = AutoModel.from_pretrained("Muennighoff/SGPT-1.3B-weightedmean-nli-bitfit").to(device)

model.eval()

GPTNeoModel(
  (wte): Embedding(50257, 2048)
  (wpe): Embedding(2048, 2048)
  (drop): Dropout(p=0.0, inplace=False)
  (h): ModuleList(
    (0-23): 24 x GPTNeoBlock(
      (ln_1): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
      (attn): GPTNeoAttention(
        (attention): GPTNeoSelfAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (out_proj): Linear(in_features=2048, out_features=2048, bias=True)
        )
      )
      (ln_2): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
      (mlp): GPTNeoMLP(
        (c_fc): Linear(in_features=2048, out_features=8192, bias=True)
        (c_proj): Linear(in_features=8192, out_features=2048, bias=True)
        (act): NewGE

### Generate embeddings for search_terms

In [9]:
def getEmbeddings(input_ids, attention_mask):
    # Get the embeddings
    with torch.no_grad():
        # Get hidden state of shape [bs, seq_len, hid_dim]
        last_hidden_state = model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True, return_dict=True).last_hidden_state

    # Get weights of shape [bs, seq_len, hid_dim]
    weights = (
        torch.arange(start=1, end=last_hidden_state.shape[1] + 1)
        .unsqueeze(0)
        .unsqueeze(-1)
        .expand(last_hidden_state.size())
        .float().to(last_hidden_state.device)
    )

    # Get attn mask of shape [bs, seq_len, hid_dim]
    input_mask_expanded = (
        attention_mask
        .unsqueeze(-1)
        .expand(last_hidden_state.size())
        .float()
    )

    # Perform weighted mean pooling across seq_len: bs, seq_len, hidden_dim -> bs, hidden_dim
    sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded * weights, dim=1)
    sum_mask = torch.sum(input_mask_expanded * weights, dim=1)

    embeddings = sum_embeddings / sum_mask

    return embeddings.cpu()

In [10]:
tokenized_search_terms = tokenizer(search_terms, padding='max_length', max_length=600, truncation=True, return_tensors='pt').to(device)

# Generate embeddings
search_term_embeddings = getEmbeddings(tokenized_search_terms['input_ids'], attention_mask=tokenized_search_terms['attention_mask']).tolist()

### Read embeddings from file

In [11]:
record_embeddings, taxonomy_embeddings = load_embeddings('data/embeddings/sgpt_records_embeddings.pt', 'data/embeddings/sgpt_taxonomy_embeddings.pt')
print(record_embeddings.shape)
print(taxonomy_embeddings.shape)

torch.Size([6239, 2048])
torch.Size([192, 2048])


### Prediction

In [12]:
records_result = {'taxonomy_record': [], 'Similarity score': []}

for idx in range(len(search_term_embeddings)):
    # Records
    cos_sim = []
    for each in record_embeddings:
        cos_sim.append(1 - cosine(search_term_embeddings[idx], each))

    lst = get_highest_numbers_with_indices(cos_sim)

    for id in lst:
        records_result['taxonomy_record'].append((random_ids[idx], id[1]))
        records_result['Similarity score'].append(id[0])

records_result = pd.DataFrame(records_result)
records_result.head(10)

Unnamed: 0,taxonomy_record,Similarity score
0,"(176, 1778)",0.561347
1,"(176, 1530)",0.557538
2,"(176, 1239)",0.554829
3,"(176, 1795)",0.550969
4,"(176, 2115)",0.546171
5,"(176, 1764)",0.541424
6,"(176, 1761)",0.540933
7,"(176, 5922)",0.540312
8,"(176, 3597)",0.538645
9,"(176, 2493)",0.536855


In [13]:
retrieved_relevant = 0
for each in records_result['taxonomy_record'].to_list():
    if each in taxonomy_records:
        retrieved_relevant += 1
print('Precision: ' + str(retrieved_relevant / len(records_result)))

Precision: 0.01
