In [None]:
# Colab setup: install the Hugging Face `transformers` package, which provides the
# AutoModel / AutoTokenizer classes imported in the next cell.
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import re
import torch
from transformers import AutoModel, AutoTokenizer
import math
from scipy.spatial.distance import cosine
import random
from google.colab import drive
# Mount Google Drive so the project folder referenced by DATA_PATH is reachable.
drive.mount('/content/gdrive')

# Run locally
#DATA_PATH = ''

# KW
DATA_PATH = '/content/gdrive/MyDrive/CSI6900/'

# FZ
#DATA_PATH = 'gdrive/MyDrive/CSI6900/'

# Pick the compute device: Apple MPS first, then CUDA, then CPU.
# `torch.has_mps` is deprecated and only reports that the build includes MPS,
# not that the backend can be used on this machine, so ask the backend itself.
# getattr keeps this working on older torch builds without `torch.backends.mps`.
_mps_backend = getattr(torch.backends, 'mps', None)
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


device(type='cuda', index=0)

## Load data

In [None]:
# Load the info sheets and report how many values are missing per column.
infoSheet_raw = pd.read_csv(DATA_PATH + "data/infoSheets_2023-05-18.csv")
print(infoSheet_raw.isnull().sum())

# Rows without an English abstract are dropped. The index is reset so that row
# labels equal row positions: the embeddings built later are positional (they
# come from `.tolist()`), and a gapped index makes label lookups such as
# `infoSheet[col][i]` raise KeyError. Building a new frame instead of mutating
# in place also keeps this cell idempotent on re-run.
infoSheet = infoSheet_raw.dropna(subset=['abstract_en']).reset_index(drop=True)
print('\nNumber of rows: ', len(infoSheet.index))
infoSheet.head(5)

ID                        0
name_en                   0
name_fr                 138
abstract_en              20
abstract_fr             146
description_en            0
description_fr          140
taxonomy heading ids      0
dtype: int64

Number of rows:  326


Unnamed: 0,ID,name_en,name_fr,abstract_en,abstract_fr,description_en,description_fr,taxonomy heading ids
0,84606,ADHD Medication Side Effects: Low Appetite and...,,Stimulants prescribed for ADHD can lead to red...,,Background\r\nStimulant medications for attent...,,0
1,92619,5-HTP (5-hydroxytryptophan),,5-HTP (5-Hydroxytryptophan) is a natural subst...,,What is 5-HTP?\r\n5-HTP (5-Hydroxytryptophan) ...,,0
2,50150,A Simple Way to Swallow Pills: The Head Postur...,Truc simple pour avaler les pilules: La techni...,"Swallowing pills can hard for many children, y...","Il n’est pas seul! Beaucoup d’enfants, de jeun...",\r\n\t\r\n\t\tDoes your child or teen have pro...,\r\n\t\r\n\t\tVotre enfant a-t-il de la diffic...,0
3,8920,Abuse and Domestic Violence,Maltraitance et violence familiale,"Abuse is behaviour used to intimidate, isolate...",La maltraitance est un comportement visant à i...,\r\n\tWhat is Abuse and Domestic Violence?\r\n...,\r\n\tQu&#39;est-ce que la maltraitance et la ...,21958876509365437
4,69660,"ADHD in Children, Youth and Adults: Informatio...",,Attention deficit hyperactivity disorder (ADHD...,,"\r\n\tAbbreviations\r\n\r\n\tADHD, attention-d...",,13


In [None]:
# Columns removed from each table right after loading.
record_columns_to_drop = [
    'created_at', 'updated_at', 'deleted_at', 'publish', 'academic_credentials',
    'age_max', 'age_min', 'last_name', 'latitude', 'longitude',
    'name_of_private_practice', 'fee_description', 'fee_type', 'first_name',
    'languages', 'organization_type', 'original_id', 'record_type',
    'salutation_type', 'website',
]
taxonomy_columns_to_drop = [
    'created_at', 'updated_at', 'deleted_at', 'alias_of_id',
    'short_description', 'original_id',
]

# Service records.
records = pd.read_json(DATA_PATH + 'data/records.json')
records = records.drop(columns=record_columns_to_drop)

# Taxonomy headings.
taxonomy = pd.read_json(DATA_PATH + 'data/taxonomy_headings.json')
taxonomy = taxonomy.drop(columns=taxonomy_columns_to_drop)

taxonomy.head(10)

Unnamed: 0,id,name,description,translations
0,1,Root,Root,"{""name"":{""en"":""Root"",""fr"":null},""description"":..."
1,2,All Mental Health Resources,<p>\r\n\tThe listings of mental health resourc...,"{""name"":{""en"":""All Mental Health Resources"",""f..."
2,3,Crisis and Emergency,<p>\r\n\tRefers to all programs that provide i...,"{""name"":{""en"":""Crisis and Emergency"",""fr"":""Res..."
3,4,"System Navigation, including Information and R...","<p>\r\n\tAre you looking for help, but don&#39...","{""name"":{""en"":""System Navigation, including In..."
4,5,Child Welfare including Children's Aid Society...,<p>The child welfare / child protection system...,"{""name"":{""en"":""Child Welfare including Childre..."
5,6,Emergency Shelter and Housing,<p>\r\n\tThere are various shelters that peopl...,"{""name"":{""en"":""Emergency Shelter and Housing"",..."
6,7,Hospital Emergency Department,<p>\r\n\tIs there an emergency such as medical...,"{""name"":{""en"":""Hospital Emergency Department"",..."
7,8,"Crisis Lines including Telephone, Online and Chat",<p>\r\n\tAre you in a crisis? Crisis lines off...,"{""name"":{""en"":""Crisis Lines including Telephon..."
8,9,Psychiatrists,<p>\r\n\tPsychiatrists are medical doctors who...,"{""name"":{""en"":""Psychiatrists"",""fr"":""Psychiatre..."
9,10,A-Z Mental Health Conditions and Topics,<p>\r\n\tAlphabetical list of mental health to...,"{""name"":{""en"":""A-Z Mental Health Conditions an..."


## Pre-processing

In [None]:
# Copy the project's pre_processing.py module from the mounted Drive into /content
# (presumably the notebook's working directory on Colab) so that the next cell
# can import from it.
!cp /content/gdrive/MyDrive/CSI6900/pre_processing.py /content

In [None]:
from pre_processing import remove_empty, remove_HTML, remove_new_line


def _clean_text_column(column):
    """Apply remove_HTML and then remove_new_line to every value of a Series."""
    return column.apply(remove_HTML).apply(remove_new_line)


# Info sheets: clean the English abstracts.
infoSheet['abstract_en'] = _clean_text_column(infoSheet['abstract_en'])

# Taxonomy: clean the descriptions. The remove_empty step is intentionally left
# disabled here, so the row count is the same before and after.
print('Length of taxonomy before preprocessing:', len(taxonomy.index))
taxonomy['description'] = _clean_text_column(taxonomy['description'])
# taxonomy = remove_empty('description', taxonomy)
print('Length of taxonomy after preprocessing:', len(taxonomy.index))

# Records: clean the descriptions, then pass the frame through remove_empty.
print('Length of records before preprocessing:', len(records.index))
records['description'] = _clean_text_column(records['description'])
records = remove_empty('description', records)
print('Length of records after preprocessing:', len(records.index))

Length of taxonomy before preprocessing: 277
Length of taxonomy after preprocessing: 277
Length of records before preprocessing: 6406
Length of records after preprocessing: 6239


## Generate embeddings

In [None]:
# Get SGPT: tokenizer and model are loaded from the same Hugging Face checkpoint.
sgpt_checkpoint = "Muennighoff/SGPT-1.3B-weightedmean-nli-bitfit"
tokenizer = AutoTokenizer.from_pretrained(sgpt_checkpoint)
model = AutoModel.from_pretrained(sgpt_checkpoint)
model = model.to(device)

# Switch to inference mode; as the last expression this also displays the model.
model.eval()

GPTNeoModel(
  (wte): Embedding(50257, 2048)
  (wpe): Embedding(2048, 2048)
  (drop): Dropout(p=0.0, inplace=False)
  (h): ModuleList(
    (0-23): 24 x GPTNeoBlock(
      (ln_1): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
      (attn): GPTNeoAttention(
        (attention): GPTNeoSelfAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (out_proj): Linear(in_features=2048, out_features=2048, bias=True)
        )
      )
      (ln_2): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
      (mlp): GPTNeoMLP(
        (c_fc): Linear(in_features=2048, out_features=8192, bias=True)
        (c_proj): Linear(in_features=8192, out_features=2048, bias=True)
        (act): NewGE

In [None]:
def getEmbeddings(input_ids, attention_mask):
    """Embed a batch of token sequences with position-weighted mean pooling.

    The module-level `model` is run without gradient tracking and its last
    hidden state is averaged over the sequence axis. Token i (1-based) gets
    weight i, and tokens whose attention-mask entry is 0 get weight 0.

    Args:
        input_ids: token ids of shape [bs, seq_len].
        attention_mask: mask of shape [bs, seq_len]; 1 for real tokens, 0 for
            padding. Must live on the same device as the model output.

    Returns:
        A CPU tensor of shape [bs, hid_dim] with one embedding per sequence.
        A row whose mask is entirely 0 yields a zero vector rather than NaN.
    """
    with torch.no_grad():
        # Only the final layer is pooled, so do not ask the model to also
        # return every intermediate layer (output_hidden_states=True would keep
        # all of them alive for no benefit).
        last_hidden_state = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True,
        ).last_hidden_state  # [bs, seq_len, hid_dim]

    seq_len = last_hidden_state.shape[1]

    # Position weights 1..seq_len, created directly on the target device and
    # shaped [1, seq_len, 1] so they broadcast instead of being expanded.
    position_weights = torch.arange(
        1, seq_len + 1, device=last_hidden_state.device, dtype=torch.float32
    ).view(1, seq_len, 1)

    # Per-token weight with padding zeroed out: [bs, seq_len, 1].
    token_weights = attention_mask.unsqueeze(-1).float() * position_weights

    # Weighted mean across seq_len: [bs, seq_len, hid_dim] -> [bs, hid_dim].
    sum_embeddings = torch.sum(last_hidden_state * token_weights, dim=1)
    # Any real token contributes a weight >= 1, so the clamp only affects rows
    # that are fully masked, where it prevents a 0/0 division.
    sum_weights = token_weights.sum(dim=1).clamp(min=1e-9)

    embeddings = sum_embeddings / sum_weights

    return embeddings.cpu()

### Generate embeddings for name and abstract of infoSheets

In [None]:
infoSheet_abstract = infoSheet['abstract_en'].tolist()
infoSheet_name = infoSheet['name_en'].tolist()

# One text per info sheet, in the form "<name>: <abstract>".
infoSheet_data = [
    sheet_name + ': ' + sheet_abstract
    for sheet_name, sheet_abstract in zip(infoSheet_name, infoSheet_abstract)
]

# Every text is padded or truncated to exactly 512 tokens.
tokenized_infoSheet = tokenizer(
    infoSheet_data,
    padding='max_length',
    max_length=512,
    truncation=True,
    return_tensors='pt',
).to(device)

In [None]:
batch_size = 16

infoSheet_total = len(tokenized_infoSheet.input_ids)
num_batches = math.ceil(infoSheet_total / batch_size)

# Embed the tokenized info sheets with the SGPT model, one batch at a time.
infoSheet_batches = []
for batch_start in range(0, infoSheet_total, batch_size):
    # The last batch may be shorter than batch_size.
    batch_stop = min(batch_start + batch_size, infoSheet_total)
    rows = slice(batch_start, batch_stop)

    ids_on_device = tokenized_infoSheet['input_ids'][rows].to(device)
    mask_on_device = tokenized_infoSheet['attention_mask'][rows].to(device)

    infoSheet_batches.append(getEmbeddings(ids_on_device, mask_on_device))

# Stack the per-batch results into one [num_infoSheets, hid_dim] tensor.
infoSheet_embeddings = torch.cat(infoSheet_batches, dim=0)
infoSheet_embeddings.shape

torch.Size([326, 2048])

### Save embeddings

In [None]:
# Persist the info sheet embeddings under the project data folder.
infoSheet_embeddings_path = DATA_PATH + 'data/new_embeddings/sgpt_infoSheet_embeddings.pt'
torch.save(infoSheet_embeddings, infoSheet_embeddings_path)

torch.Size([326, 2048])


### Generate embeddings for name and description of records

In [None]:
records_description = records['description'].tolist()
records_name = records['name'].tolist()

# One text per record, in the form "<name>: <description>".
records_data = [
    record_name + ': ' + record_description
    for record_name, record_description in zip(records_name, records_description)
]

# Every text is padded or truncated to exactly 512 tokens.
tokenized_record = tokenizer(
    records_data,
    padding='max_length',
    max_length=512,
    truncation=True,
    return_tensors="pt",
).to(device)

In [None]:
batch_size = 16

records_total = len(tokenized_record.input_ids)
num_batches = math.ceil(records_total / batch_size)

# Embed the tokenized records with the SGPT model, one batch at a time.
records_batches = []
for batch_start in range(0, records_total, batch_size):
    # The last batch may be shorter than batch_size.
    batch_stop = min(batch_start + batch_size, records_total)
    rows = slice(batch_start, batch_stop)

    ids_on_device = tokenized_record['input_ids'][rows].to(device)
    mask_on_device = tokenized_record['attention_mask'][rows].to(device)

    records_batches.append(getEmbeddings(ids_on_device, mask_on_device))

# Stack the per-batch results into one [num_records, hid_dim] tensor.
records_embeddings = torch.cat(records_batches, dim=0)
records_embeddings.shape

### Save embeddings

In [None]:
# Persist the record embeddings under the project data folder.
records_embeddings_path = DATA_PATH + 'data/new_embeddings/sgpt_records_embeddings.pt'
torch.save(records_embeddings, records_embeddings_path)

NameError: ignored

### Generate embeddings for name and description of taxonomy

In [None]:
taxonomy_description = taxonomy['description'].tolist()
taxonomy_name = taxonomy['name'].tolist()

# One text per taxonomy heading, in the form "<name>: <description>".
taxonomy_data = [
    heading_name + ': ' + heading_description
    for heading_name, heading_description in zip(taxonomy_name, taxonomy_description)
]

# Every text is padded or truncated to exactly 512 tokens.
tokenized_taxonomy = tokenizer(
    taxonomy_data,
    padding='max_length',
    max_length=512,
    truncation=True,
    return_tensors="pt",
).to(device)

In [None]:
batch_size = 16

taxonomy_total = len(tokenized_taxonomy.input_ids)
num_batches = math.ceil(taxonomy_total / batch_size)

# Embed the tokenized taxonomy headings with the SGPT model, one batch at a time.
taxonomy_batches = []
for batch_start in range(0, taxonomy_total, batch_size):
    # The last batch may be shorter than batch_size.
    batch_stop = min(batch_start + batch_size, taxonomy_total)
    rows = slice(batch_start, batch_stop)

    ids_on_device = tokenized_taxonomy['input_ids'][rows].to(device)
    mask_on_device = tokenized_taxonomy['attention_mask'][rows].to(device)

    taxonomy_batches.append(getEmbeddings(ids_on_device, mask_on_device))

# Stack the per-batch results into one [num_headings, hid_dim] tensor.
taxonomy_embeddings = torch.cat(taxonomy_batches, dim=0)
taxonomy_embeddings.shape

torch.Size([277, 2048])

### Save embeddings

In [None]:
# Persist the taxonomy embeddings under the project data folder.
taxonomy_embeddings_path = DATA_PATH + 'data/new_embeddings/sgpt_taxonomy_embeddings.pt'
torch.save(taxonomy_embeddings, taxonomy_embeddings_path)

torch.Size([192, 2048])


## Prediction

In [None]:
def find_largest_numbers(lst, k=10):
    """Return the k largest values of `lst` together with their positions.

    Args:
        lst: sequence of mutually comparable numbers.
        k: how many entries to return. Defaults to 10, the value that used to
            be hard-coded. Must not be negative.

    Returns:
        A list of `(value, index)` tuples ordered from largest to smallest
        value. It holds fewer than k tuples when `lst` is shorter than k.
        Equal values keep their original relative order (the sort is stable).

    Raises:
        ValueError: if k is negative.
    """
    if k < 0:
        raise ValueError('k must be non-negative, got ' + str(k))

    # Sort (index, value) pairs by value, largest first, and keep the top k.
    ranked = sorted(enumerate(lst), key=lambda pair: pair[1], reverse=True)[:k]

    # Callers expect (value, index) ordering inside each tuple.
    return [(value, index) for index, value in ranked]

In [None]:
# Randomly choose 10 distinct info sheets, identified by row position.
# random.randint includes its upper bound, so randint(0, len(...)) could return
# an index one past the last row; it could also pick the same sheet twice.
# random.sample over range(len(...)) avoids both problems.
random.seed(42)  # fixed seed so the same sheets are drawn on every fresh run
num_search_terms = min(10, len(infoSheet.index))
search_term_indices = random.sample(range(len(infoSheet.index)), num_search_terms)
search_term_indices

[90, 262, 287, 271, 287, 22, 220, 323, 27, 14]

In [None]:
# Minimum cosine similarity for a taxonomy heading to count as a prediction.
SIMILARITY_THRESHOLD = 0.7

predictions = {'infoSheet_id': [], 'pred_taxonomy_id': [], 'similarity_score': [], 'gold_taxonomy_id': []}

for search_term_idx in search_term_indices:
    # Cosine similarity between this info sheet and every taxonomy heading.
    query_embedding = infoSheet_embeddings[search_term_idx]
    cos_sim = [1 - cosine(query_embedding, taxonomy_embedding) for taxonomy_embedding in taxonomy_embeddings]

    # search_term_idx is a row *position* (the embeddings were built from
    # `.tolist()`), while infoSheet's index may have gaps left by dropna, so a
    # label lookup raises KeyError. Use .iloc for positional access.
    raw_gold = str(infoSheet['taxonomy heading ids'].iloc[search_term_idx])
    # Store gold ids as ints so they are comparable with the integer predictions.
    gold_ids = [int(part) for part in raw_gold.split(',') if part.strip().isdigit()]

    # Keep the top-ranked headings whose similarity reaches the threshold.
    for score, taxonomy_idx in find_largest_numbers(cos_sim):
        if score >= SIMILARITY_THRESHOLD:
            predictions['infoSheet_id'].append(search_term_idx)
            # Row position in `taxonomy`, not the heading's `id` value.
            predictions['pred_taxonomy_id'].append(taxonomy_idx)
            predictions['similarity_score'].append(score)
            predictions['gold_taxonomy_id'].append(list(gold_ids))

print('Length of predictions: ', len(predictions['infoSheet_id']))
predictions = pd.DataFrame(predictions)
predictions.head(10)

KeyError: ignored

## Evaluation

In [None]:
# Show the info sheet row(s) whose index label is 192 (empty if there is none).
infoSheet[infoSheet.index == 192]

Unnamed: 0,ID,name_en,name_fr,abstract_en,abstract_fr,description_en,description_fr,taxonomy heading ids


In [None]:
# Show the taxonomy row(s) whose index label is 219.
taxonomy[taxonomy.index == 219]

Unnamed: 0,id,name,description,translations
219,220,Independent Schools,Independent schools (also known as private sch...,"{""name"":{""en"":""Independent Schools"",""fr"":""Les ..."


In [None]:
# Precision = predictions that hit a gold taxonomy heading / all predictions.
retrieved_relevant = 0

for _, row in predictions.iterrows():
    # pred_taxonomy_id is a 0-based row position in `taxonomy`; the +1 turns it
    # into a heading id (assumes ids run 1..N in row order - TODO confirm).
    predicted_heading_id = str(int(row['pred_taxonomy_id']) + 1)

    # Gold ids may be stored as strings (from str.split) or as ints. Compare on
    # a normalised string form, otherwise an int is never "in" a list of
    # strings and the precision is always 0.
    gold_heading_ids = {str(gold_id).strip() for gold_id in row['gold_taxonomy_id']}

    if predicted_heading_id in gold_heading_ids:
        retrieved_relevant += 1

print(retrieved_relevant)
if len(predictions.index) > 0:
    print('Precision: ' + str(retrieved_relevant / len(predictions.index)))
else:
    # No prediction passed the similarity threshold; avoid dividing by zero.
    print('Precision: undefined (no predictions)')

0
Precision: 0.0
