In [None]:
# Mount Google Drive at /content/drive; every data path used in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%cd drive/MyDrive/NLP Project/Code

In [None]:
!pip install transformers
!pip install rank_bm25
!pip install stop_words
!pip install sklearn_crfsuite

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import re
import torch
import pickle
import numpy as np
import pandas as pd
from stop_words import get_stop_words
from rank_bm25 import BM25Okapi


import transformers
from transformers import BertTokenizer, BertModel, BertForTokenClassification

import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

import torch
from torch.utils.data import DataLoader, TensorDataset

# Custom functions
from bert_text_pre_processing import add_labels
from CRF_utils import sent2features

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# Pick the first GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda:0


## **Data processing**

We will use the labeled and unlabeled data in order to assign a cluster to each of the tokens. Start with pre-processing. 

In [None]:
# Load the raw review tables from Drive: two unlabeled splits and one labeled file
unlabeled_reviews_train = pd.read_csv('/content/drive/MyDrive/NLP Project/Data/Unsupervised drug reviews/drugsComTrain_raw.csv')
unlabeled_reviews_test = pd.read_csv('/content/drive/MyDrive/NLP Project/Data/Unsupervised drug reviews/drugsComTest_raw.csv')
labeled_drug_reviews = pd.read_csv("/content/drive/MyDrive/NLP Project/Data/Unsupervised drug reviews/Copy of combined_df_1.csv")

# Stack the two unlabeled splits row-wise; ignore_index gives the result a fresh 0..n-1 index
unlabeled_drug_reviews = pd.concat(
    [unlabeled_reviews_train, unlabeled_reviews_test],
    axis=0,
    ignore_index=True,
)

In [None]:
# Preview the first rows of the concatenated unlabeled reviews
unlabeled_drug_reviews.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37


In [None]:
# Preview the first rows of the labeled reviews
labeled_drug_reviews.head()

Unnamed: 0,txt_id,text
0,LIPITOR.86,"headaches, pain in throat , tingling in side o..."
1,LIPITOR.92,"Muscle aches and weakness in neck, arms, shoul..."
2,LIPITOR.952,I have taken Lipitor for 4 years with no probl...
3,LIPITOR.946,"Aches and pain from head to toe, very grouchy...."
4,LIPITOR.45,"Cannot be sure it is the drug, but around the ..."


In [None]:
# Put the review text of both sources (unlabeled first, then labeled) into one Series
# with a fresh 0..n-1 index
review_columns = [unlabeled_drug_reviews["review"], labeled_drug_reviews["text"]]
combined_dataset = pd.concat(review_columns, axis=0, ignore_index=True)

# Plain Python list of strings. Every entry is passed through str() because
# some reviews are not strings.
combined_dataset_list = list(map(str, combined_dataset.to_list()))


The pre-trained BERT model requires the data to be tokenized in the same way as the training data used for the model. Special considerations: 

1. Each sentence must start with "[CLS]" and end with "[SEP]". 
2. All sentences must have the same number of tokens: some will be padded, others truncated. 
3. An attention mask is needed to distinguish the true tokens from the padding ones. 


In [None]:
# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Tokenize the whole corpus into BERT-ready tensors: special tokens are added and every
# review is padded or truncated to exactly 506 tokens, with an attention mask returned.
tokenized_sentences = tokenizer(
    combined_dataset_list,
    add_special_tokens=True,
    padding='max_length',
    truncation=True,
    max_length=506,
    return_attention_mask=True,
    return_tensors='pt',
)

In [None]:
# Pull the three tensors produced by the tokenizer out of the encoding
input_ids, token_type_ids, attention_mask = (
    tokenized_sentences[key] for key in ("input_ids", "token_type_ids", "attention_mask")
)

# Wrap them in a dataset and iterate in corpus order, 100 reviews per batch
dataset = TensorDataset(input_ids, token_type_ids, attention_mask)
dataloader = DataLoader(dataset, batch_size=100, shuffle=False)

## **Extracting the embeddings**

In BERT, each unique token will have a different embedding depending on the context in which it is presented. This leads to context-dependent segmentation, which proved to be an advantage, for example, for polysemic words. 

In this model, BERT embeddings will be further split into clusters, and those cluster assignments built on a larger corpus of data will serve as a richer feature representation of the text that needs to be classified.

Due to limited computational resources, we could not keep track of all the different embeddings for each unique token in our text corpus. We found that the tokens with the largest number of distinct embeddings were stop words. Additionally, the mean Euclidean distance between the different embeddings of the same ADR token was found to be significantly smaller than the distance between the embeddings of different words. 

So we set the embedding of each unique token to be the mean of the different embeddings encountered. 

In [None]:
# Load pre-trained BERT with all hidden layers exposed and move it to the selected device
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True).to(device)

# Switch to evaluation mode
model.eval()

In [None]:
# Distinct token ids that occur anywhere in the tokenized corpus (sorted ascending)
unique_tokens = tokenized_sentences["input_ids"].unique()
n_unique_tokens = unique_tokens.shape[0]

# One 768-dimensional accumulator row and one occurrence counter per distinct token
unique_embeddings = torch.zeros(n_unique_tokens, 768)
unique_tokens_counter = torch.zeros(n_unique_tokens, 1)

# Keep track of the row index of each token id in the unique_embeddings tensor
unique_tokens_dict = {token_id: row for row, token_id in enumerate(unique_tokens.numpy())}

In [None]:
# Compute, for every distinct token id of the corpus, the mean over all of its occurrences of
# the sum of the last four BERT hidden layers.
#
# The running totals live in fresh buffers created here, so re-running this cell starts from
# zero instead of adding to the (already averaged) values left behind by a previous run.
# float64 sums and integer counts limit the rounding error for very frequent tokens.
n_unique_tokens = len(unique_tokens_dict)
embedding_sums = torch.zeros((n_unique_tokens, model.config.hidden_size), dtype=torch.float64)
token_counts = torch.zeros(n_unique_tokens, dtype=torch.long)

with torch.no_grad():

  # The batch tensors get their own names so that the corpus-wide `input_ids`,
  # `token_type_ids` and `attention_mask` tensors are not overwritten by the loop.
  for batch_input_ids, batch_token_type_ids, batch_attention_mask in dataloader:

    # Forward pass through BERT. The arguments are passed by keyword on purpose: the positional
    # order of BertModel.forward is (input_ids, attention_mask, token_type_ids), so handing over
    # the tensors positionally in dataset order would swap the mask and the segment ids.
    outputs = model(input_ids=batch_input_ids.to(device),
                    attention_mask=batch_attention_mask.to(device),
                    token_type_ids=batch_token_type_ids.to(device))
    hidden_states = outputs[2] # Per-layer hidden states (the model was built with output_hidden_states=True)

    # For each token, sum the representation of the last 4 layers of BERT,
    # then drop the sentence dimension - (total n tokens in batch, vector dim)
    last_four_sum = torch.stack(hidden_states[-4:], dim=0).sum(dim=0) # (n sentences, n tokens, vector dim)
    token_vecs_sum = last_four_sum.reshape(-1, last_four_sum.shape[-1]).to(device="cpu", dtype=torch.float64)

    # Group the token positions of the batch by token id. `inverse` maps every position to the
    # row of its token id inside `batch_unique_tokens`.
    flat_ids = batch_input_ids.flatten() # The dataloader yields CPU tensors, no copy needed
    batch_unique_tokens, inverse = torch.unique(flat_ids, return_inverse=True)

    # Per-token-id sum of embeddings and number of occurrences within this batch
    batch_sums = torch.zeros((batch_unique_tokens.shape[0], token_vecs_sum.shape[1]), dtype=torch.float64)
    batch_sums.index_add_(0, inverse, token_vecs_sum)
    batch_counts = torch.bincount(inverse, minlength=batch_unique_tokens.shape[0])

    # Find the position of the batch unique tokens in the corpus-wide accumulators
    pos_unique_tokens = torch.tensor([unique_tokens_dict[token_id.item()] for token_id in batch_unique_tokens])

    # Every row index occurs exactly once in pos_unique_tokens, so indexed += is safe here
    embedding_sums[pos_unique_tokens] += batch_sums
    token_counts[pos_unique_tokens] += batch_counts

# Take the mean of the embeddings by dividing by the number of times each token appeared in the sentences
unique_tokens_counter = token_counts.unsqueeze(1).to(torch.float32)
unique_embeddings = (embedding_sums / token_counts.unsqueeze(1)).to(torch.float32)

In [None]:
# Find the tokens corresponding to the unique token ids in the corpus
BERT_vocab = list(tokenizer.vocab.keys()) # Vocabulary as a list; kept for cells that index it by token id

# Let the tokenizer translate ids to token strings instead of relying on the iteration order
# of the vocabulary dict matching the ids.
unique_token_words = tokenizer.convert_ids_to_tokens([int(token_id) for token_id in unique_tokens])
unique_token_words = np.array(unique_token_words)

In [None]:
# Save the unique tokens and corresponding embeddings
#np.savetxt("/content/drive/MyDrive/NLP Project/Data/BERT_embeddings/unique_embeddings", unique_embeddings)
#np.savetxt("/content/drive/MyDrive/NLP Project/Data/BERT_embeddings/unique_tokens", list(unique_tokens))

## **Experiment**

The BERT embeddings are context-dependent. So each unique token will have a set of different embeddings. Check properties for the same batch. 

In [None]:
# Take the first batch of the corpus and compute one embedding per token position.
# NOTE: from here on `input_ids`, `token_type_ids` and `attention_mask` refer to this single
# batch (on `device`), no longer to the corpus-wide tensors.
dataiter = iter(dataloader)
input_ids, token_type_ids, attention_mask = next(dataiter)

with torch.no_grad():

    input_ids, token_type_ids, attention_mask = input_ids.to(device), token_type_ids.to(device), attention_mask.to(device)

    # Pass through BERT and get the hidden layers. The arguments are passed by keyword on
    # purpose: the positional order of BertModel.forward is
    # (input_ids, attention_mask, token_type_ids), so a positional call in dataset order
    # would swap the mask and the segment ids.
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
    hidden_states = outputs[2]

    # Obtain the embeddings
    token_embeddings = torch.stack(hidden_states, dim=0) # (layers, n sentences, n tokens, vector dim)
    n_layers, hidden_size = token_embeddings.shape[0], token_embeddings.shape[-1]
    embeddings_flat = token_embeddings.reshape(n_layers, -1, hidden_size).permute(1,0,2) # (total n tokens, layers, vector dim)

    # Sum the vectors from the last four layers for every token at once and bring the
    # result to the CPU - (total n tokens, vector dim)
    token_vecs_sum = embeddings_flat[:, -4:, :].sum(dim=1).cpu()

    token_vecs_array = token_vecs_sum.numpy()

In [None]:
# Investigate, first, how many times each unique token occurs in the batch. Every occurrence
# has its own context-dependent embedding, so this is also the number of embeddings per token.

# Copy the batch ids to the host and flatten them once, instead of once per unique token
flat_batch_ids = input_ids.to("cpu").numpy().flatten()

# NOTE: this rebinds `unique_tokens` to the set of token ids of this batch; the following
# cells index into this set's iteration order, so it is kept as a set.
unique_tokens = set(flat_batch_ids)

# Number of positions holding each token id, in `unique_tokens` iteration order
no_diff_embeddings = np.array([np.count_nonzero(flat_batch_ids == token) for token in unique_tokens])

print(f"The mean number of embeddings per unique token is {int(no_diff_embeddings.mean())}, with a standard deviation of {int(no_diff_embeddings.std())}")

The mean number of embeddings per unique token is 24.889326119035907, with a standard deviation of 841.3776180541691


In [None]:
# Show the 30 tokens of the batch that have the most embeddings, most frequent first
BERT_vocab = list(tokenizer.vocab.keys()) # The IDs are in order, so the list position is the token id

# Positions (within `no_diff_embeddings`) of the 30 largest counts, ascending
max_no_embeddings_idx = np.argsort(no_diff_embeddings)[-30:]

# Walk those positions from the largest count down and translate position -> id -> token
batch_token_ids = list(unique_tokens)
max_tokens_ids = [batch_token_ids[position] for position in reversed(max_no_embeddings_idx)]
max_tokens = [BERT_vocab[token_id] for token_id in max_tokens_ids]

# One-row table: tokens as columns, counts in descending order
top_counts_desc = np.flip(no_diff_embeddings[max_no_embeddings_idx])
max_words_df = pd.DataFrame([top_counts_desc], columns = max_tokens, index=['Number of distinct embeddings'])
max_words_df

Unnamed: 0,[PAD],.,i,the,and,",",to,it,"""",my,...,on,[SEP],[CLS],is,!,but,this,me,so,in
Number of distinct embeddings,37933,642,543,286,273,262,258,213,199,196,...,102,100,100,97,86,84,83,80,79,79


In [None]:
# Token string of every unique token id of the batch, in `unique_tokens` iteration order
words = [BERT_vocab[i] for i in unique_tokens]
words_array = np.array(words)

# Now let's choose 5 ADRs. One ordered list drives both the index lookup and the column labels,
# so each count always ends up under the name of its own token.
ADR_names = ['vomiting', 'blood', 'headache', 'fever', 'cough']

# Position of each ADR token within `words`; raises IndexError if a token is not in the batch
ADR_positions = {name: np.where(words_array == name)[0][0] for name in ADR_names}

vomit_idx = ADR_positions['vomiting']
headache_idx = ADR_positions['headache']
fever_idx = ADR_positions['fever']
cough_idx = ADR_positions['cough']
blood_idx = ADR_positions['blood']

ADRs_idx = [ADR_positions[name] for name in ADR_names]
ADR_embeddings = no_diff_embeddings[ADRs_idx]
ADR_df = pd.DataFrame([ADR_embeddings], columns = ADR_names, index=['Number of distinct embeddings'])
ADR_df

Unnamed: 0,vomiting,headache,fever,cough,blood
Number of distinct embeddings,1,5,5,1,3


In [None]:
# Now check how far apart the context-dependent embeddings of 'headache' are within the batch
flat_batch_ids = input_ids.to("cpu").numpy().flatten()

# Same construction as the set used to build `words`, so `headache_idx` addresses the same token id
unique_tokens_list = list(set(flat_batch_ids))
headache_token = unique_tokens_list[headache_idx]

# Flat positions at which the token occurs
headache_idx_sent = np.where(flat_batch_ids == headache_token)[0]

# Extract corresponding embeddings
headache_embeddings = token_vecs_array[headache_idx_sent, :]
n_headache = headache_embeddings.shape[0]
dist = np.zeros((n_headache, n_headache))

# Euclidean distance for every pair; the matrix is symmetric, so each pair is computed once
# and written to both halves
for row in range(n_headache):
  for col in range(row, n_headache):
    pair_dist = np.linalg.norm(headache_embeddings[row] - headache_embeddings[col])
    dist[row, col] = pair_dist
    dist[col, row] = pair_dist

# The mean is taken over the full matrix, i.e. the zero diagonal is included
print(dist.mean())
    

18.699090118408204


In [None]:
# Repeat the pairwise-distance check for the embeddings of 'blood'
blood_token = unique_tokens_list[blood_idx]
flat_batch_ids = input_ids.to("cpu").numpy().flatten()
blood_idx_sent = np.where(flat_batch_ids == blood_token)[0]

# Extract corresponding embeddings
blood_embeddings = token_vecs_array[blood_idx_sent, :]
n_blood = blood_embeddings.shape[0]
dist_blood = np.zeros((n_blood, n_blood))

# Euclidean distance for every pair; the matrix is symmetric, so each pair is computed once
# and written to both halves
for row in range(n_blood):
  for col in range(row, n_blood):
    pair_dist = np.linalg.norm(blood_embeddings[row] - blood_embeddings[col])
    dist_blood[row, col] = pair_dist
    dist_blood[col, row] = pair_dist

# The mean is taken over the full matrix, i.e. the zero diagonal is included
print(dist_blood.mean())
    


38.2617790222168


In [None]:
# Reference scale for the distances above.
# NOTE(review): token_vecs_array is (total n tokens, vector dim), so axis=0 yields one norm per
# embedding dimension taken across all tokens of the batch, not one norm per token vector
# (that would be axis=1) - confirm which of the two is the intended comparison.
column_norms = np.linalg.norm(token_vecs_array, axis = 0)
column_norms.mean()

502.25906