In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import re
import torch
from transformers import AutoModel, AutoTokenizer
import math
from scipy.spatial.distance import cosine
from google.colab import drive
# Mount Google Drive at /content/gdrive so the data folder is reachable from this runtime.
drive.mount('/content/gdrive')

# DATA_PATH is the folder prefix that later cells prepend to relative file names
# (e.g. DATA_PATH + 'data/records.json'). Keep exactly one assignment active.

# Run locally: empty prefix, so files resolve relative to the working directory.
#DATA_PATH = ''

# KW: absolute path under the Drive mount point used above.
DATA_PATH = '/content/gdrive/MyDrive/CSI6900/'

# FZ: relative path variant (presumably for a runtime whose working directory
# contains the `gdrive` mount -- confirm before enabling).
#DATA_PATH = 'gdrive/MyDrive/CSI6900/'

# Pick the compute device in order of preference: Apple-Silicon MPS, then CUDA, then CPU.
# `torch.backends.mps.is_available()` checks that MPS can actually be used at runtime;
# the `torch.has_mps` attribute is deprecated and only reflects build-time support.
# The getattr guard keeps this cell working on torch builds without the MPS backend module.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


device(type='cuda', index=0)

## Import data

In [None]:
# Service records: read the JSON export and discard the columns that are
# dropped up front and therefore never reach the later cells.
records = pd.read_json(DATA_PATH + 'data/records.json').drop(
    columns=[
        'created_at', 'updated_at', 'deleted_at', 'publish',
        'academic_credentials', 'age_max', 'age_min', 'last_name',
        'latitude', 'longitude', 'name_of_private_practice',
        'fee_description', 'fee_type', 'first_name', 'languages',
        'organization_type', 'original_id', 'record_type',
        'salutation_type', 'website',
    ]
)

# Taxonomy headings: same treatment, keeping the remaining columns.
taxonomy = pd.read_json(DATA_PATH + 'data/taxonomy_headings.json').drop(
    columns=[
        'created_at', 'updated_at', 'deleted_at',
        'alias_of_id', 'short_description', 'original_id',
    ]
)

# Preview the first taxonomy rows.
taxonomy.head(10)

Unnamed: 0,id,name,description,translations
0,1,Root,Root,"{""name"":{""en"":""Root"",""fr"":null},""description"":..."
1,2,All Mental Health Resources,<p>\r\n\tThe listings of mental health resourc...,"{""name"":{""en"":""All Mental Health Resources"",""f..."
2,3,Crisis and Emergency,<p>\r\n\tRefers to all programs that provide i...,"{""name"":{""en"":""Crisis and Emergency"",""fr"":""Res..."
3,4,"System Navigation, including Information and R...","<p>\r\n\tAre you looking for help, but don&#39...","{""name"":{""en"":""System Navigation, including In..."
4,5,Child Welfare including Children's Aid Society...,<p>The child welfare / child protection system...,"{""name"":{""en"":""Child Welfare including Childre..."
5,6,Emergency Shelter and Housing,<p>\r\n\tThere are various shelters that peopl...,"{""name"":{""en"":""Emergency Shelter and Housing"",..."
6,7,Hospital Emergency Department,<p>\r\n\tIs there an emergency such as medical...,"{""name"":{""en"":""Hospital Emergency Department"",..."
7,8,"Crisis Lines including Telephone, Online and Chat",<p>\r\n\tAre you in a crisis? Crisis lines off...,"{""name"":{""en"":""Crisis Lines including Telephon..."
8,9,Psychiatrists,<p>\r\n\tPsychiatrists are medical doctors who...,"{""name"":{""en"":""Psychiatrists"",""fr"":""Psychiatre..."
9,10,A-Z Mental Health Conditions and Topics,<p>\r\n\tAlphabetical list of mental health to...,"{""name"":{""en"":""A-Z Mental Health Conditions an..."


## Pre-processing

In [None]:
# Remove rows whose description is the empty string, reporting sizes before and after.
print('Length of records:', len(records))
print('Length of taxonomy', len(taxonomy))

# Select rows by value with a boolean mask: this does not depend on the index
# being 0..n-1, unlike mixing index labels with positional lookups.
# reset_index(drop=True) renumbers the rows so labels and positions coincide
# for the positional lookups done in later cells.
records = records[records['description'] != ''].reset_index(drop=True)
taxonomy = taxonomy[taxonomy['description'] != ''].reset_index(drop=True)

print('Length of records:', len(records))
print('Length of taxonomy', len(taxonomy))

Length of records: 6406
Length of taxonomy 277
Length of records: 6239
Length of taxonomy 193


In [None]:
# Matches one HTML tag. DOTALL lets a tag whose attributes wrap onto the next
# line be matched too. Compiled once instead of on every call.
_TAG_PATTERN = re.compile(r'<.*?>', re.DOTALL)


def cleanhtml(raw_html):
    """Convert an HTML fragment to plain text.

    Tags are removed, and character references are decoded rather than
    deleted, so ``don&#39;t`` becomes ``don't`` and ``&amp;`` becomes ``&``.
    Non-breaking spaces (``&nbsp;``) become ordinary spaces so that words
    stay separated.

    :param raw_html: HTML text (str)
    :return: the text with tags stripped and entities decoded
    """
    import html  # local import: keeps this cell independent of the import cell

    # Strip tags first so that an escaped "&lt;b&gt;" stays literal text
    # instead of being decoded into a tag and then removed.
    without_tags = _TAG_PATTERN.sub('', raw_html)
    return html.unescape(without_tags).replace('\xa0', ' ')

# Strip HTML markup from every description.
records['description'] = records['description'].apply(cleanhtml)
taxonomy['description'] = taxonomy['description'].apply(cleanhtml)

# A description that consisted only of markup is blank after cleaning. Such rows
# have no text to embed (they would presumably be tokenized to padding only), so
# they are dropped here and the index is renumbered to keep labels == positions.
records = records[records['description'].str.strip() != ''].reset_index(drop=True)
taxonomy = taxonomy[taxonomy['description'].str.strip() != ''].reset_index(drop=True)

## Get embedding

* `Muennighoff/SGPT-125M-weightedmean-nli-bitfit`
  - 4 minutes
  - batch_size = 64

* `Muennighoff/SGPT-1.3B-weightedmean-nli-bitfit`
  - 45 minutes
  - batch_size = 16

* `Muennighoff/SGPT-5.8B-weightedmean-nli-bitfit`
  - Not enough RAM

In [None]:
# Load the SGPT tokenizer and encoder from a single checkpoint name.
# For best performance: Muennighoff/SGPT-5.8B-weightedmean-nli-bitfit
SGPT_CHECKPOINT = "Muennighoff/SGPT-1.3B-weightedmean-nli-bitfit"

tokenizer = AutoTokenizer.from_pretrained(SGPT_CHECKPOINT)
model = AutoModel.from_pretrained(SGPT_CHECKPOINT)
model = model.to(device)

# Put the module in evaluation mode; the returned module is shown as the cell output.
model.eval()

GPTNeoModel(
  (wte): Embedding(50257, 2048)
  (wpe): Embedding(2048, 2048)
  (drop): Dropout(p=0.0, inplace=False)
  (h): ModuleList(
    (0-23): 24 x GPTNeoBlock(
      (ln_1): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
      (attn): GPTNeoAttention(
        (attention): GPTNeoSelfAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (out_proj): Linear(in_features=2048, out_features=2048, bias=True)
        )
      )
      (ln_2): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
      (mlp): GPTNeoMLP(
        (c_fc): Linear(in_features=2048, out_features=8192, bias=True)
        (c_proj): Linear(in_features=8192, out_features=2048, bias=True)
        (act): NewGE

In [None]:
# Plain Python lists of the description texts, in row order, ready to be sliced into batches.
records_description = list(records['description'])
taxonomy_description = list(taxonomy['description'])

In [None]:
def getEmbeddings(tokens):
    """Return position-weighted mean-pooled embeddings for a tokenized batch.

    The global ``model`` is run on the batch, and its last hidden state is
    averaged over the sequence axis. Token ``i`` (1-based) gets weight ``i``,
    and positions whose attention mask is 0 get weight 0.

    :param tokens: mapping of tokenizer outputs; must contain ``attention_mask``
        and is unpacked as keyword arguments into ``model``
    :return: list with one embedding (list of floats) per batch row. A row whose
        attention mask is all zeros yields a zero vector instead of NaN.
    """
    with torch.no_grad():
        # Only the final layer is pooled, so per-layer hidden states are not requested.
        # Hidden state has shape [bs, seq_len, hid_dim].
        last_hidden_state = model(**tokens, return_dict=True).last_hidden_state

    seq_len = last_hidden_state.shape[1]

    # Position weights 1..seq_len, shaped [1, seq_len, 1] so they broadcast
    # over the batch and hidden dimensions.
    position_weights = torch.arange(
        1, seq_len + 1, dtype=torch.float32, device=last_hidden_state.device
    ).view(1, seq_len, 1)

    # Attention mask times position weight, shape [bs, seq_len, 1]; zero on padding.
    weighted_mask = tokens["attention_mask"].unsqueeze(-1).float() * position_weights

    # Weighted mean pooling across seq_len: [bs, seq_len, hid_dim] -> [bs, hid_dim]
    sum_embeddings = torch.sum(last_hidden_state * weighted_mask, dim=1)
    # Clamp the denominator so an all-padding row divides by a tiny number, not zero.
    sum_mask = torch.sum(weighted_mask, dim=1).clamp(min=1e-9)

    embeddings = sum_embeddings / sum_mask

    return embeddings.cpu().tolist()

In [None]:
batch_size = 16


def _embed_in_batches(texts, batch_size, max_length=600):
    """Embed a list of texts with the global tokenizer and getEmbeddings.

    :param texts: list of strings to embed
    :param batch_size: number of texts tokenized and embedded per forward pass
    :param max_length: every text is padded or truncated to this many tokens
    :return: list with one embedding per input text, in input order
        (empty list for empty input)
    """
    vectors = []
    for start in range(0, len(texts), batch_size):
        batch_tokens = tokenizer(
            texts[start:start + batch_size],
            padding='max_length',
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        ).to(device)
        vectors += getEmbeddings(batch_tokens)
    return vectors


# Records
records['embeddings'] = _embed_in_batches(records_description, batch_size)

# Taxonomy
taxonomy['embeddings'] = _embed_in_batches(taxonomy_description, batch_size)

## Cosine Sim

In [None]:
def get_highest_numbers_with_indices(numbers, n=10):
    """
    Returns the n highest numbers in a list along with their indices.

    NaN entries are skipped: they cannot be ordered and would otherwise
    occupy a result slot that no later value could displace.

    :param numbers: List of numbers
    :param n: Number of highest numbers to retrieve (default: 10)
    :return: List of (number, index) tuples sorted from highest to lowest;
             equal numbers are ordered by descending index. Holds fewer than
             n tuples when fewer non-NaN numbers exist, and is empty when
             n <= 0 or the input is empty.
    """
    import heapq  # local import: keeps this cell independent of the import cell

    if n <= 0:
        return []

    candidates = (
        (num, i) for i, num in enumerate(numbers) if not math.isnan(num)
    )
    # nlargest selects the top n in one pass instead of re-sorting on every insertion.
    return heapq.nlargest(n, candidates)

### Records

In [None]:
# Embed the free-text query with the same tokenizer settings used for the corpus.
input_text = ['How to treat depression?']
batch_tokens = tokenizer(
    input_text,
    padding='max_length',
    max_length=600,
    truncation=True,
    return_tensors="pt",
).to(device)
embedding = getEmbeddings(batch_tokens)[0]

# Cosine similarity (1 - cosine distance) between the query and every record, in row order.
cos_sim = [
    1 - cosine(embedding, record_embedding)
    for record_embedding in records['embeddings']
]

lst = get_highest_numbers_with_indices(cos_sim)

# Each entry is (similarity, row position); the position is what gets printed as the id.
for score, position in lst:
    print('Record id: %d,\nDescription: %s,\nSimilarity score: %5f\n' % (position, records['description'][position], score))

Record id: 4382,
Description: Counselling services for depression, anxiety, and stress management,
Similarity score: 0.770836

Record id: 2350,
Description: Provides cognitive-behavior therapy for adults seeking help with depression, anxiety, and stress.,
Similarity score: 0.764854

Record id: 3908,
Description: THE 10 BEST-EVER DEPRESSION MANAGEMENT TECHNIQUES
Understanding the origin of your clients’ depression is essential to their successful treatment. In this practical one-day workshop, Dr. Wehrenberg will empower you to identify the root causes of depression, explore the symptoms they manifest, and address how these symptoms can be managed so that clients can progress to recovery—without medication.  

Learning Objectives: 
Discover 10 categories of interventions you can immediately apply to help reduce depression symptoms in your clients.
Explore psychotherapy methods that can improve the physiology, cognition and behaviour of depressed clients. 
Understand the aspects of 

### Taxonomy

In [None]:
# Embed the free-text query with the same tokenizer settings used for the corpus.
input_text = ['How to treat depression?']
batch_tokens = tokenizer(input_text, padding='max_length', max_length=600, truncation=True, return_tensors="pt").to(device)
embedding = getEmbeddings(batch_tokens)[0]

# Cosine similarity (1 - cosine distance) between the query and every taxonomy heading, in row order.
cos_sim = [
    1 - cosine(embedding, heading_embedding)
    for heading_embedding in taxonomy['embeddings']
]

lst = get_highest_numbers_with_indices(cos_sim)

# Each entry is (similarity, row position). The position indexes `taxonomy`,
# so the id and description are read from that frame, not from `records`.
for score, position in lst:
    print('Taxonomy id: %d,\nDescription: %s,\nSimilarity score: %5f\n' % (taxonomy['id'].iloc[position], taxonomy['description'].iloc[position], score))

Record id: 138,
Description: "Playing with Baby" is an open drop-in time specifically for parents/caregivers with children, birth to one year of age, to play and learn together. Program includes a group time consisting of songs, rhymes, and finger plays.

DATE: Tuesdays
TIME: 1:30 - 3:00 p.m.,
Similarity score: 1.000000

Record id: 18,
Description: Children's mental health organization in Ontario that provides help to children and youth, families and communities.,
Similarity score: 0.748547

Record id: 106,
Description: The one day Fun FRIENDS training will certify professionals to deliver the program in school, day care, and clinic settings. Fun FRIENDS is an anxiety prevention and intervention program designed by a clinical psychologist specifically for 4 to 7 year old children. Fun FRIENDS is a play based program that helps children cope with feelings of fear, worry and depression by building resilience and self-esteem and teaching cognitive, behavioural and emotional skills in a