In [8]:
# Mount Google Drive; the data, model and code paths used throughout this
# notebook all live under /content/drive/MyDrive/NLP Project.
from google.colab import drive
drive.mount('/content/drive')

# Switch to the project's code folder (presumably so the local helper modules
# imported below, bert_text_pre_processing and CRF_utils, resolve -- confirm).
%cd /content/drive/MyDrive/NLP Project/Code

# Third-party packages used by this notebook.
# NOTE(review): versions are not pinned, and the recorded run of this cell ended
# with "ModuleNotFoundError: No module named 'numpy.strings'" -- presumably a
# numpy version mismatch in the runtime; confirm and pin versions.
!pip install transformers
!pip install rank_bm25
!pip install stop_words
!pip install sklearn_crfsuite

import re
import torch
import pickle
import numpy as np
import pandas as pd
import joblib
from stop_words import get_stop_words
from rank_bm25 import BM25Okapi
from gensim.models import Word2Vec


# import transformers
from transformers import BertTokenizer, BertModel, BertForTokenClassification

import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

import torch
from torch.utils.data import DataLoader, TensorDataset

# Custom functions
from bert_text_pre_processing import add_labels
from CRF_utils import sent2features, find_cluster, preprocessing

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/.shortcut-targets-by-id/1p6nih715zcy9UuO638of1o9bkGKrO7ch/NLP Project/Code


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


ModuleNotFoundError: No module named 'numpy.strings'

In [None]:
def preprocessing(content, remove_sw):
    """Normalize a text and return its lemmatized word tokens.

    Steps: lowercase, strip every character that is not a-z or whitespace,
    tokenize on word characters, optionally drop English stop words, then
    lemmatize each token with WordNet.

    NOTE(review): this definition shadows the ``preprocessing`` imported from
    ``CRF_utils`` in the first cell.

    Parameters
    ----------
    content : str
        Raw text (a review or a lexicon entry).
    remove_sw : bool
        When truthy, tokens found in the ``stop_words`` English list are removed.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order.
    """
    # convert the text to lowercase
    content = content.lower()

    # remove non-alphabetical characters; the raw string keeps "\s" from being
    # treated as an (invalid) Python string escape
    content = re.sub(r'[^a-z\s]+', '', content)

    # https://www.adamsmith.haus/python/answers/how-to-remove-all-punctuation-marks-with-nltk-in-python
    # tokenize on word characters (which will be the same as 1-grams)
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    one_grams = tokenizer.tokenize(content)

    # remove stopwords; the list is fetched once and turned into a set instead
    # of being rebuilt for every token
    if remove_sw:
        stop_words = set(get_stop_words('english'))
        one_grams = [token for token in one_grams if token not in stop_words]

    # lemmatize
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in one_grams]

## **Lexicon-Based Information Retrieval Models**

In this section, we use lexicon-based approaches, namely TF-IDF and BM25, to retrieve the relevant ADRs for our reviews.

In [9]:
# Read the tab-separated ADR lexicon, supplying the column names explicitly
adr_lexicon = pd.read_csv(
    '/content/drive/MyDrive/NLP Project/Data/ADR_lexicon.txt',
    sep='\t',
    names=['id', 'reaction', 'source'],
)
# Plain Python list holding the lexicon's "reaction" column
lexicon_list = adr_lexicon['reaction'].to_list()

ModuleNotFoundError: No module named 'numpy.rec'

In [None]:
# Load the reviews dataframe and pull out the text and id columns as lists
reviews_df = pd.read_csv(
    '/content/drive/MyDrive/NLP Project/Data/Combined Datasets/combined_df_1.csv'
)
reviews_list = reviews_df['text'].to_list()
reviews_id = reviews_df['txt_id'].to_list()

# 2388 reviews in total

In [None]:
# Map each review id to its text, then collect the ids whose text is a float
# (the original author treats these as NaN, i.e. missing, reviews)
d = dict(zip(reviews_id, reviews_list))
list_nan = [
    review_id
    for review_id, review_text in d.items()
    if isinstance(review_text, float)
]

In [None]:
# Keep only the reviews whose id is not in the NaN list, then refresh the lists
is_nan_review = reviews_df['txt_id'].isin(list_nan)
reviews_df = reviews_df[~is_nan_review]
reviews_list = reviews_df['text'].to_list()
reviews_id = reviews_df['txt_id'].to_list()

# 2254 after removing nan

In [None]:
# Load the ADR annotations and drop the rows that belong to NaN reviews
adr_df = pd.read_csv(
    '/content/drive/MyDrive/NLP Project/Data/Combined Datasets/combined_df_2.csv'
)
belongs_to_nan_review = adr_df['txt_id'].isin(list_nan)
adr_df = adr_df[~belongs_to_nan_review]

In [None]:
# Initial ADR dictionary: review id -> list of annotated symptoms
# (empty list when the review has no row in adr_df)
ADRs = {
    review_id: adr_df.loc[adr_df['txt_id'] == review_id, 'symptom'].to_list()
    for review_id in reviews_id
}

In [None]:
# Review ids present in the review dataframe but with no symptom in the ADR dataframe
no_adr_reviews = [
    review_id
    for review_id, symptoms in ADRs.items()
    if symptoms in (None, "", [])
]

In [None]:
# Drop the reviews without annotated ADRs and refresh the derived lists
is_unannotated = reviews_df['txt_id'].isin(no_adr_reviews)
reviews_df = reviews_df[~is_unannotated]
reviews_list = reviews_df['text'].to_list()
reviews_id = reviews_df['txt_id'].to_list()

# 2058 after removing the not annotated ones

# Apply the same id filter to the ADR dataframe
adr_df = adr_df[~adr_df['txt_id'].isin(no_adr_reviews)]

In [None]:
# Final ADR dictionary, rebuilt over the filtered reviews:
# review id -> list of annotated symptoms
ADRs = {
    review_id: adr_df.loc[adr_df['txt_id'] == review_id, 'symptom'].to_list()
    for review_id in reviews_id
}

In [None]:
def get_ADRs_for_new_review(model, review, lexicon_list=lexicon_list, threshold_bm25=0.78, threshold_tf_idf=0.32):
    """Retrieve the lexicon ADRs that match a review with a lexicon-based model.

    Parameters
    ----------
    model : str
        ``'bm25'`` or ``'tf_idf'``.
    review : str
        Raw review text.
    lexicon_list : list of str
        Candidate ADR terms; defaults to the notebook-level ``lexicon_list``
        as it was when this function was defined.
    threshold_bm25 : float
        Cut-off applied to the min-max normalized BM25 scores of the ADRs
        with a non-zero score.
    threshold_tf_idf : float
        Cut-off applied to the cosine similarity between the review and each
        ADR in TF-IDF space.

    Returns
    -------
    list of str
        ADR terms whose score is strictly above the model's threshold.

    Raises
    ------
    ValueError
        If ``model`` is neither ``'bm25'`` nor ``'tf_idf'`` (previously this
        surfaced as an UnboundLocalError at the return statement).
    """
    # choose model
    if model == 'bm25':
        # Only BM25 consumes the token lists, so the (costly) preprocessing of
        # the whole lexicon is done here rather than on every call.
        preprocessed_ADRs = [preprocessing(i, remove_sw=True) for i in lexicon_list]
        tokenized_review = preprocessing(review, remove_sw=True)

        bm25 = BM25Okapi(preprocessed_ADRs)

        # get the scores for every ADRs for this specific review
        score_list = bm25.get_scores(tokenized_review)

        # build dataframe with scores
        scores_df = pd.DataFrame({'ADR': lexicon_list, 'score': score_list})

        # remove rows with score=0; copy so the column added below is written
        # to an independent frame instead of a slice of the original
        scores_df = scores_df[scores_df['score'] != 0].copy()

        # nothing matched at all
        if scores_df.empty:
            return []

        # normalize scores (min-max); when every remaining score is identical
        # (e.g. a single match) the range is 0, so treat them all as top
        # scorers instead of producing NaN and silently discarding them
        score_min = scores_df['score'].min()
        score_range = scores_df['score'].max() - score_min
        if score_range == 0:
            scores_df['normalized_score'] = 1.0
        else:
            scores_df['normalized_score'] = (scores_df['score'] - score_min) / score_range

        # keep only scores over threshold
        scores_df = scores_df[scores_df['normalized_score'] > threshold_bm25]

        # get final list of ADRs obtained from model
        model_ADRs = scores_df.ADR.to_list()

    elif model == 'tf_idf':
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(lexicon_list)

        query_tfidf = vectorizer.transform([review])

        # get scores
        cosine_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()

        # build dataframe with scores
        scores_df = pd.DataFrame({'ADR': lexicon_list, 'score': cosine_similarities})

        # keep only scores over threshold
        scores_df = scores_df[scores_df['score'] > threshold_tf_idf]

        # get final list of ADRs obtained from model
        model_ADRs = scores_df.ADR.to_list()

    else:
        raise ValueError(f"Unknown model {model!r}; expected 'bm25' or 'tf_idf'")

    return model_ADRs

In [None]:
# Example drug review used to demonstrate the lexicon-based retrieval
input_string ="Within 5 minutes of taking drug, developed severe colon and uterine cramping. Cold sweat, fainting, heart palpitations. Will never use again or recommend. "

In [None]:
# ADRs retrieved for the example review with the TF-IDF model (default threshold)
print(get_ADRs_for_new_review('tf_idf', input_string))

['cold sweat', 'uterine cramping', 'fainting']


In [None]:
# ADRs retrieved for the example review with the BM25 model (default threshold)
print(get_ADRs_for_new_review('bm25', input_string))

['cold sweat', 'uterine cramping', 'fainting']


## **ADR-Mine and extensions**

In this section, the ADR-Mine and ADR-Mine with BERT models are used to extract the ADRs from an input string. All the extracted ADRs are shown, as well as their cluster assignments and all the tokens within each of the clusters.

### **ADR-Mine**

In [None]:
# Example drug review fed to the ADR-Mine models (same text as the example
# used in the lexicon-based section)
input_string ="Within 5 minutes of taking drug, developed severe colon and uterine cramping. Cold sweat, fainting, heart palpitations. Will never use again or recommend. "

In [None]:
# Load pre-trained word2vec, KMeans and CRF models
# NOTE: joblib.load unpickles the files, which can execute arbitrary code --
# only load model files from a trusted source.
word2vec = Word2Vec.load("/content/drive/MyDrive/NLP Project/Models/word2vec.model")
kmeans = joblib.load("/content/drive/MyDrive/NLP Project/Models/model.pkl")
crf_W2v = joblib.load("/content/drive/MyDrive/NLP Project/Models/CRF_word2vec")

# Array of the word2vec vocabulary words; later cells index it with masks built
# from kmeans.labels_ (assumes labels_ is aligned with this order -- TODO confirm)
vocab = np.array(list(word2vec.wv.key_to_index.keys()))

In [None]:
# The notebook defines its own two-argument preprocessing(content, remove_sw),
# which shadows the function of the same name imported from CRF_utils and makes
# a one-argument call fail with a TypeError. Import the CRF_utils version under
# a distinct name so the call below reaches it.
# NOTE(review): assumes CRF_utils.preprocessing accepts a single argument, as
# this call was originally written -- confirm.
from CRF_utils import preprocessing as crf_preprocessing

# Tokenize the input string
tokenized_sentence = crf_preprocessing(input_string)

# Extract the input features necessary for CRF processing.
CRF_data = sent2features(tokenized_sentence, vocab,  kmeans.labels_)

In [None]:
# Predict with CRF (a single-sentence batch, hence the wrapping list)
y_pred = crf_W2v.predict([CRF_data])

# Convert tokenized sentence and predictions into arrays
tokenized_array, prediction_array = np.array(tokenized_sentence), np.array(y_pred[0])

# Extract ADRs from prediction: keep the tokens whose predicted label is the string "1"
# NOTE(review): this rebinds ADRs, which earlier in the notebook holds the
# review-id -> symptoms dictionary.
ADRs = tokenized_array[prediction_array == "1"]

# Find the cluster assignment for each ADR
ADR_cluster_ids = [find_cluster(word, vocab, kmeans.labels_) for word in ADRs]

# Find the words in each of the clusters containing the ADRs in the input string
cluster_words = [vocab[kmeans.labels_ == cluster_id] for cluster_id in ADR_cluster_ids]

# Report results: one row per extracted ADR (object array so each cell can hold
# a string, a cluster id, or an array of cluster members)
ADR_df = pd.DataFrame(np.array([ADRs, ADR_cluster_ids, cluster_words], dtype=object).T,  columns = ['Extracted ADR', 'Cluster assignment', 'Cluster members'])
ADR_df

Unnamed: 0,Extracted ADR,Cluster assignment,Cluster members
0,cramping,30,"[effect, headache, cramp, nausea, swing, cramp..."
1,cold,114,"[migraine, infection, cold, cough, bowel, yeas..."
2,sweat,30,"[effect, headache, cramp, nausea, swing, cramp..."
3,fainting,15,"[especially, vision, breath, lack, general, br..."
4,heart,34,"[pressure, level, heart, sugar, system, rate, ..."
5,palpitation,15,"[especially, vision, breath, lack, general, br..."


### **ADR-Mine with BERT embeddings**

In [None]:
# Load embeddings and unique tokens
# NOTE(review): np.loadtxt parses values as floats by default, so unique_tokens
# is presumably a file of numeric token ids -- confirm. Neither array is
# referenced again in the cells shown below.
unique_embeddings = np.loadtxt("/content/drive/MyDrive/NLP Project/Data/BERT_embeddings/unique_embeddings")
unique_tokens = np.loadtxt("/content/drive/MyDrive/NLP Project/Data/BERT_embeddings/unique_tokens")

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Tokenizer vocabulary as a list of token strings; later cells index it by
# position (assumes position i corresponds to token id i -- TODO confirm)
BERT_vocab = list(tokenizer.vocab.keys())

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Load k-means and CRF models
KMeans_BERT = pickle.load(open(f"/content/drive/MyDrive/NLP Project/Models/BERT_Kmeans_800.pkl", "rb"))
crf_BERT = joblib.load("/content/drive/MyDrive/NLP Project/Models/CRF_BERT")

In [None]:
# Tokenize input sentence into BERT word pieces and look up their vocabulary ids
tokenized_sentence_BERT = tokenizer.tokenize(input_string)
token_ids = tokenizer.convert_tokens_to_ids(tokenized_sentence_BERT)

# Extract the input features necessary for CRF processing.
CRF_data_BERT = sent2features(tokenized_sentence_BERT, BERT_vocab,  KMeans_BERT.labels_, max_cluster = 800)

In [None]:
# Predict with CRF (a single-sentence batch, hence the wrapping list)
y_pred_BERT = crf_BERT.predict([CRF_data_BERT])

# Convert tokenized sentence and predictions into arrays
tokenized_array_BERT, prediction_array_BERT = np.array(tokenized_sentence_BERT), np.array(y_pred_BERT[0])

# Extract ADRs from prediction: keep the positions whose predicted label is the string "1"
ADRs_BERT = tokenized_array_BERT[prediction_array_BERT == "1"] # Get predicted ADRs
ADRs_id = np.array(token_ids)[prediction_array_BERT == "1"] # Get ADRs tokens

# Find the cluster assignment of each ADR and the rest of the words in that cluster.
# Token ids beyond the clustered range get the sentinel 801
# (presumably an id outside the 800 fitted clusters -- confirm).
cluster_assignment = [KMeans_BERT.labels_[i] if i < len(KMeans_BERT.labels_) else 801 for i in ADRs_id]

# Convert the vocabulary to an array once instead of once per extracted ADR
BERT_vocab_array = np.array(BERT_vocab)
cluster_words_BERT = [BERT_vocab_array[np.where(KMeans_BERT.labels_ == cluster_id)[0]] for cluster_id in cluster_assignment]

# Report results: one row per extracted ADR
ADR_df_BERT = pd.DataFrame(np.array([ADRs_BERT, cluster_assignment, cluster_words_BERT], dtype=object).T,  columns = ['Extracted ADR', 'Cluster assignment', 'Cluster members'])
ADR_df_BERT

Unnamed: 0,Extracted ADR,Cluster assignment,Cluster members
0,sweat,275,"[ி, 林, people, staring, performances, check, e..."
1,",",212,"[,, n, ي, き, post, lord, whose, ##ina, 4th, ri..."
2,faint,107,"[lower, cases, incorporated, stream, laughing,..."
3,##ing,210,"[##ing, systems, failed, otherwise, gradually,..."


## **Fine-tuned BERT**



In [None]:
def pre_process_review(sentence):
  '''
  Tokenize a review with the BERT tokenizer and build single-row model inputs.

  Returns a tuple (attention mask, labels, token ids, word pieces). The three
  tensors have shape (1, number of word pieces); the attention mask and the
  labels are filled with ones.
  '''

  # Load pre-trained model tokenizer (vocabulary)
  bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

  # split the sentence into word pieces
  word_pieces = bert_tokenizer.tokenize(sentence)
  ones = [1] * len(word_pieces)

  # build each feature as a 1-D tensor and prepend the batch dimension
  mask_batch = torch.tensor(ones).unsqueeze(0)
  label_batch = torch.tensor(ones).unsqueeze(0)
  id_batch = torch.tensor(bert_tokenizer.convert_tokens_to_ids(word_pieces)).unsqueeze(0)

  return mask_batch, label_batch, id_batch, word_pieces




def make_prediction(att_mask_r, labels_r, token_ids_r, model):
  '''
  Run the token-classification model on one prepared input and return the
  predicted label index of every token.

  att_mask_r, labels_r, token_ids_r: tensors passed to the model as
  attention_mask, labels and input ids.
  model: callable model exposing eval(); because labels are supplied, the
  element at index 1 of its output is taken as the per-token scores.

  Returns a numpy array holding the argmax over the last (label) axis.
  '''
  # switch to evaluation mode
  model.eval()

  # inference only: disable autograd so no computation graph is built
  with torch.no_grad():
    output = model(token_ids_r,
                   token_type_ids=None,
                   attention_mask=att_mask_r,
                   labels=labels_r)

  # index 0 is the loss (labels were passed), index 1 the per-token scores
  eval_prediction = output[1]

  eval_prediction = np.argmax(eval_prediction.detach().to('cpu').numpy(), axis = 2)

  return eval_prediction


def extract_ADR(eval_prediction, tokenized_sentence):
  '''
  Collect the tokens of a sentence whose predicted label equals 1.

  Only the first row of eval_prediction is read; tokens are returned in
  sentence order.
  '''
  first_row = eval_prediction[0]
  return [
    tokenized_sentence[position]
    for position, label in enumerate(first_row)
    if label == 1
  ]

In [None]:
# Initialize the model
# BertForTokenClassification is imported by name in the first cell; the bare
# `import transformers` there is commented out, so referring to
# `transformers.BertForTokenClassification` raises a NameError.
model = BertForTokenClassification.from_pretrained('bert-base-uncased',  num_labels = 3)

# Load the fine-tuned weights onto the CPU
# NOTE: torch.load unpickles the checkpoint -- only load files from a trusted source.
model.load_state_dict(torch.load('/content/drive/MyDrive/NLP Project/Models/bert_model_4.pt', map_location=torch.device('cpu')))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

<All keys matched successfully>

In [None]:
# Prepare the input features (attention mask, labels, token ids) and keep the
# word pieces of the example review
att_mask_r, labels_r, token_ids_r, tokenized_sentence = pre_process_review(input_string)

In [None]:
# Get the predicted label index for every word piece
eval_prediction = make_prediction(att_mask_r, labels_r, token_ids_r, model)

In [None]:
# Extract the ADRs: the word pieces whose predicted label is 1
extracted_adr = extract_ADR(eval_prediction, tokenized_sentence)

# Report results: a single-column frame with one row per extracted word piece
ADR_df_BERT_ft = pd.DataFrame(np.array([extracted_adr], dtype=object).T,  columns = ['Extracted ADR'])
ADR_df_BERT_ft

Unnamed: 0,Extracted ADR
0,colon
1,ut
2,##erine
3,cr
4,##amp
5,##ing
6,cold
7,sweat
8,faint
9,##ing
