In [1]:
#installations that need to be done every time this Notebook is ran
!pip install transformers
!pip install rank_bm25
!pip install stop_words

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m55.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m67.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.4
Looking in indexes: https://pypi.org/simple, http

In [2]:
#mount to google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#import libraries
import pandas as pd
import torch
import transformers
from transformers import BertTokenizer, BertModel, BertForTokenClassification
import numpy as np
import nltk
import re
from stop_words import get_stop_words
from nltk.stem import WordNetLemmatizer
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import nltk
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [9]:
#Functions

#Function that checks if an ADR is in text
def is_included(row):
    list1, list2 = row['tokenized'], row['tokenized_adr']
    result = []

    i = 0
    while i < len(list1):

        if list1[i:i+len(list2)] == list2:
          for j in range(len(list2)):
            result.append(1)
            i += 1
        else:
            result.append(0)
            i += 1
            
    return result


# define a function to sum the lists
def sum_lists(lst):
    return [sum(x) for x in zip(*lst)]



#convert to BERT format
def convert_BERT_format(df_1, column):

  # add the symbol [CLS] as the beginning of each sentence
  df_1[column] = df_1.apply(lambda x: '[CLS]' + ' ' + str(x[column]), axis = 1)
  # add the symbol [SEP] at the end of each sentence in the review 
  df_1[column] = df_1.apply(lambda x: str(x[column]).replace('.', '. [SEP]'), axis = 1)

  return df_1


def remove_whitespace(df_1, column):
  
  #remove \n and \t (for some reason we have to do this when we load back in the data)
  df_1[column] = df_1.apply(lambda x: str(x[column]).replace("\n", " "), axis = 1)
  df_1[column] = df_1.apply(lambda x: str(x[column]).replace("\t", " "), axis = 1)


def preprocessing(content, remove_sw):
    # convert the text to lowercase
    content = content.lower()
    regex = re.compile('[^a-z\s]+')

    # remove all commas so that constructions such as $70,000 maintain their meaning and do not get split:'70', '000'
    content = regex.sub('', content)

    # https://www.adamsmith.haus/python/answers/how-to-remove-all-punctuation-marks-with-nltk-in-python
    # remove punctuation and tokenize (which will be the same as 1-grams)
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    one_grams = tokenizer.tokenize(content)

    #remove stopwords
    if remove_sw == True:
        one_grams = [i for i in one_grams if i not in get_stop_words('english')]

    # lemmatize
    lemmatizer = WordNetLemmatizer()
    words = []
    for word in one_grams:
        words.append(lemmatizer.lemmatize(word))   

    return words


def add_labels(df_1, df_2, tokenizer_type, column, id_column, symptom, remove_stopw):
  '''
  A function that geiven input of a review and symptoms, returns a preprocessed data set
  where we have the review tokenized, and a corresponding columns that shows at which position
  an ADR can be dfound. For examle if we have "My myscles hurt", the returned value would be
  one column ['my', 'muslces', 'hurt'] and another column [0,1,0] with the 1 indicating the position
  of the ADR
  Inputs:
        df_1 - dataframe which contains the reviwes (dataframe)
        df_2 - a dataframe that contains the ADRs (dataframe)
        tokenizer_type - input "BERT" for bert tokenizer and "Other" for regular (str)
        column - the column in which the text review is found in df_1 dataset (str)
        id_column - the column by which df_1 and df_2 can be merged (str)
        symptom - the name of the column which contains the AD in df_2 (str)
        remove_stopw - set True to remove and False not to remove (boolen)
                      Note: for BERT, we never remove the stop words
  Output:
        preprocess_data - the datframe whith the processed reviews, and correposing
        location of the ADR (dataframe)
  '''
  #remove white space
  remove_whitespace(df_1, column)
  
  #BERT Tokenizer
  if tokenizer_type == "BERT":
    convert_BERT_format(df_1, column)

    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    df_1['tokenized'] = df_1.apply(lambda x: tokenizer.tokenize(x[column]), axis =1)
    df_2['tokenized_adr'] = df_2.apply(lambda x: tokenizer.tokenize(str(x[symptom])), axis =1)

  #Non-BERT tokenizer
  else:

    
    #convert the columns to list
    df_1_list = df_1[column].to_list()
    df_2_list = df_2[symptom].to_list()

    #preprocess the text
    preprocessed_1 = [preprocessing(str(i), remove_sw=remove_stopw) for i in df_1_list]
    preprocessed_2 = [preprocessing(str(i), remove_sw=remove_stopw) for i in df_2_list]

    df_1['tokenized'] = preprocessed_1
    df_2['tokenized_adr'] = preprocessed_2


  #merge the 2 dataframes
  merged_df = df_1.merge(df_2, on= id_column, how='left')
  merged_df = merged_df[merged_df['tokenized_adr'].notna()]

  merged_df['token_included'] = merged_df.apply(is_included, axis=1)

  # group the dataframe by "group_col" and apply the "sum_lists" function to "Tokens"
  result = merged_df.groupby(id_column)['token_included'].apply(sum_lists).reset_index()

  # rename the columns in the result dataframe
  result.columns = [id_column, 'pre_processed_tokens']

  #merge back with the rest of the data to get final output
  preprocess_data = df_1.merge(result, on=id_column, how='left')
  preprocess_data = preprocess_data[preprocess_data['pre_processed_tokens'].notna()]
  
  #replace all values higher than 1 with 0
  list_tokenized =  preprocess_data['pre_processed_tokens'].to_list()
  for i in list_tokenized:
    for j in range(len(i)):

      if i[j] > 1: 
        i[j] = 1

  return preprocess_data


def split_long_reviews(tokens, labels, txt_id_l, dataset_l, text_l):

  '''
  This functiont akes as input the subset of the dataframe to be split into 
  smaller subset if the maximum length of the input size is exceeded

  Inputs:
    tokens - a list of the column that includes the tokenized text (list)
    lables - a list of the column that includes the labels (list)
    txt_id_1 - a list of the column which includes the text id (list)
    dataset_1 - a list of the column that includes the dataset name (list)
    text_l - a list of the column that includes the text (list)
  
  Outoput:
    dubset_df - a dataframe which has split the long texts into smaller ones
                presented as a dataframe of the same format
  '''

  new_tokens = []
  new_labels = []
  new_txt_id = []
  new_dataset = []
  new_text = []

  for i in range(len(tokens)):

    #split them the tokens and the labels in half

    tokens_1 = tokens[i][:len(tokens[i])//2]
    tokens_2 = tokens[i][len(tokens[i])//2:]

    labels_1 = labels[i][:len(labels[i])//2]
    labels_2 = labels[i][len(labels[i])//2:]

    #add the [SEP] token to the tokens_1 and [CLS] to tokens_2
    tokens_1.append('[SEP]')
    tokens_2.insert(0, '[CLS]')

    labels_1.append(0)
    labels_2.insert(0, 0)

    #append the newly created tokens
    new_tokens.append(tokens_1)
    new_tokens.append(tokens_2)

    new_labels.append(labels_1)
    new_labels.append(labels_2)

    new_txt_id.append(txt_id_l[i] + '_1')
    new_txt_id.append(txt_id_l[i] + '_2')

    new_dataset.append(dataset_l[i])
    new_dataset.append(dataset_l[i])

    new_text.append(text_l[i])
    new_text.append(text_l[i])

  #create a dataframe with these new inputs
  subset = {'txt_id': new_txt_id, 'text': new_text, 'dataset': new_dataset, 'tokenized': new_tokens, 'pre_processed_tokens': new_labels}
  subset_df = pd.DataFrame(data=subset)

  #return the new dataframe with split columns
  return subset_df



def introduce_padding(max_len, t, symbol):
  '''
  A function that pads the inputs.

  Inputs: max_len - the maximum length until which to pad (int)
          t - the list to be added with padding (list)
          symbol - the symbol to be padded with. For example
          if we are padding a list of tokens this would be [PAD]
          if we are padding a list of labels this would be 0 (int/str)
  
  Output: t - the padded list (list)
  '''
  #get the current length of the list
  t_len = len(t)

  #find the difference between the max len an the list len
  diff = max_len - t_len

  #padd in the place of all the difference
  for i in range(diff):
    t.append(symbol)

  #return the padded list
  return t


def convert_data_to_tensor(df, label, mask, token_id):

  ''''
  This function converts the necessary data for train/test
  and converts it into tensors

  Inputs: df - the dataframe which contains the columns (dataframe)
          label - the label column (str)
          mask - the column which contains the mask (str)
          token_id - the column which contains the column id (str)
  Outputs:
          padded_att_mask - tensor of the mask matrix (tensor)
          padded_token_ids - tensor of the token matrix (tensor)
          padded_labels - tensor of the label matrix (tensor)
  '''

  #get the data needed for the data loader (test)
  padded_att_mask = np.stack(df[mask].values, axis=0)
  padded_token_ids = np.stack(df[token_id].values, axis=0)
  padded_labels = np.stack(df[label].values, axis=0)

  #convert the data to tensor (test)
  padded_att_mask = torch.from_numpy(padded_att_mask)
  padded_token_ids = torch.from_numpy(padded_token_ids)
  padded_labels = torch.from_numpy(padded_labels)

  return padded_att_mask, padded_token_ids, padded_labels



def evaluate_model(dataloader_test, model): 

  '''
  A function that evaluates the performance of the model
  It tests it on the test datasets and returns the 
  predicted and true labels

  Inputs: dataloader_test - dataloader which contains the test data
          model - the trained model 
  
  Outputs: predictions - an n x m numpy array which contains the predictions (numpy array)
           true_labels - an n x m numpy array with contains the true labels (numpy array)
  '''

  model.eval()
    
  eval_loss = 0
  predictions = np.array([], dtype = np.int64).reshape(0, max_len)
  true_labels = np.array([], dtype = np.int64).reshape(0, max_len)


  with torch.no_grad():
    for i, (padded_att_mask, padded_token_ids, padded_labels) in enumerate(dataloader_test):

      #set to available device
      padded_att_mask = padded_att_mask.to(device)
      padded_token_ids = padded_token_ids.to(device)
      padded_labels = padded_labels.to(device)

      #make predictions
      output = model(padded_token_ids, 
                        token_type_ids=None,
                        attention_mask=padded_att_mask,
                        labels=padded_labels)
      

      step_loss = output[0]
      eval_prediction = output[1]

      eval_loss += step_loss

      eval_prediction = np.argmax(eval_prediction.detach().to('cpu').numpy(), axis = 2)
      actual = padded_labels.to('cpu').numpy()

      predictions = np.concatenate((predictions, eval_prediction), axis = 0)
      true_labels = np.concatenate((true_labels, actual), axis = 0)


    return predictions, true_labels


In [10]:
#load the combined files
df_1 = pd.read_csv(r'/content/drive/MyDrive/NLP Project/Data/Combined Datasets/combined_df_1.csv')
df_2 = pd.read_csv(r'/content/drive/MyDrive/NLP Project/Data/Combined Datasets/combined_df_2.csv')

In [11]:
df_1.head()

Unnamed: 0,txt_id,text,dataset
0,LIPITOR.86,"headaches, pain in throat , tingling in side o...",CADEC
1,LIPITOR.92,"Muscle aches and weakness in neck, arms, shoul...",CADEC
2,LIPITOR.952,I have taken Lipitor for 4 years with no probl...,CADEC
3,LIPITOR.946,"Aches and pain from head to toe, very grouchy....",CADEC
4,LIPITOR.45,"Cannot be sure it is the drug, but around the ...",CADEC


In [12]:
len(df_1)

2388

In [13]:
df_2.head()

Unnamed: 0,symptom,txt_id,dataframe,start,end
0,little blurred vision,ARTHROTEC.1,CADEC,29,50
1,feel a bit weird,ARTHROTEC.1,CADEC,437,453
2,gastric problems,ARTHROTEC.1,CADEC,62,78
3,bit drowsy,ARTHROTEC.1,CADEC,9,19
4,Hunger pangs,ARTHROTEC.10,CADEC,0,12


In [14]:
pre_processed = add_labels(df_1, df_2, 'BERT', 'text', 'txt_id', 'symptom', False)

pre_processed.head()

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Unnamed: 0,txt_id,text,dataset,tokenized,pre_processed_tokens
0,LIPITOR.86,"[CLS] headaches, pain in throat , tingling in ...",CADEC,"[[CLS], headache, ##s, ,, pain, in, throat, ,,...","[0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, ..."
1,LIPITOR.92,"[CLS] Muscle aches and weakness in neck, arms,...",CADEC,"[[CLS], muscle, ache, ##s, and, weakness, in, ...","[0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,LIPITOR.946,"[CLS] Aches and pain from head to toe, very gr...",CADEC,"[[CLS], ache, ##s, and, pain, from, head, to, ...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, ..."
4,LIPITOR.45,"[CLS] Cannot be sure it is the drug, but aroun...",CADEC,"[[CLS], cannot, be, sure, it, is, the, drug, ,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5,LIPITOR.775,[CLS] Muscle and joint pain developed after ap...,CADEC,"[[CLS], muscle, and, joint, pain, developed, a...","[0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [15]:
#get the length of each of the reviews
pre_processed['r_lenght'] = pre_processed.apply(lambda x: len(x.tokenized), axis = 1)
#find the maximum lenthg of the tokens
max_len = max(pre_processed['r_lenght'])
print("Max length is", max_len)

Max length is 977


In [16]:
#There are 7 reviews that have tokens larger than the max input size of BERT. 
reviews_long = pre_processed[pre_processed['r_lenght'] >= 512]
reviews_long.head()

Unnamed: 0,txt_id,text,dataset,tokenized,pre_processed_tokens,r_lenght
686,ARTHROTEC.137,[CLS] Ate a decent meal and took one pill at 4...,CADEC,"[[CLS], ate, a, decent, meal, and, took, one, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",569
991,LIPITOR.59,"[CLS] Severe pain & cramping in ankles, feet a...",CADEC,"[[CLS], severe, pain, &, cr, ##amp, ##ing, in,...","[0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",566
1002,LIPITOR.780,[CLS] SEVERE FATIGUE IN MY BODY AND TIREDNESS ...,CADEC,"[[CLS], severe, fatigue, in, my, body, and, ti...","[0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, ...",678
1228,LIPITOR.546,[CLS] I had been taking Lipitor 10mg for 2 yea...,CADEC,"[[CLS], i, had, been, taking, lip, ##itor, 10,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",977
1427,lexapro.178,[CLS] First two weeks were a nightmare. [SEP] ...,PsyTar,"[[CLS], first, two, weeks, were, a, nightmare,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",542


In [17]:
#split the tokens which have size of more than 512 in half
tokens = reviews_long['tokenized'].to_list()
labels = reviews_long['pre_processed_tokens'].to_list()
txt_id_l = reviews_long['txt_id'].to_list()
dataset_l = reviews_long['dataset'].to_list()
text_l = reviews_long['text'].to_list()

split_df = split_long_reviews(tokens, labels, txt_id_l, dataset_l, text_l)

#include a column for the length
split_df['r_lenght'] = split_df.apply(lambda x: len(x.tokenized), axis = 1)
split_df.head()

Unnamed: 0,txt_id,text,dataset,tokenized,pre_processed_tokens,r_lenght
0,ARTHROTEC.137_1,[CLS] Ate a decent meal and took one pill at 4...,CADEC,"[[CLS], ate, a, decent, meal, and, took, one, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",285
1,ARTHROTEC.137_2,[CLS] Ate a decent meal and took one pill at 4...,CADEC,"[[CLS], will, carefully, take, ib, ##up, ##ro,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",286
2,LIPITOR.59_1,"[CLS] Severe pain & cramping in ankles, feet a...",CADEC,"[[CLS], severe, pain, &, cr, ##amp, ##ing, in,...","[0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",284
3,LIPITOR.59_2,"[CLS] Severe pain & cramping in ankles, feet a...",CADEC,"[[CLS], ., [SEP], besides, the, toll, that, st...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",284
4,LIPITOR.780_1,[CLS] SEVERE FATIGUE IN MY BODY AND TIREDNESS ...,CADEC,"[[CLS], severe, fatigue, in, my, body, and, ti...","[0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, ...",340


In [18]:
# remove the longer queries from the final data
preprocessed_same_length = pre_processed[pre_processed['r_lenght'] < 512]
#append the new splitdataframe to this dataframe
preprocessed_comibined = pd.concat([preprocessed_same_length, split_df]).reset_index()
preprocessed_comibined.head()

Unnamed: 0,index,txt_id,text,dataset,tokenized,pre_processed_tokens,r_lenght
0,0,LIPITOR.86,"[CLS] headaches, pain in throat , tingling in ...",CADEC,"[[CLS], headache, ##s, ,, pain, in, throat, ,,...","[0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, ...",39
1,1,LIPITOR.92,"[CLS] Muscle aches and weakness in neck, arms,...",CADEC,"[[CLS], muscle, ache, ##s, and, weakness, in, ...","[0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",300
2,3,LIPITOR.946,"[CLS] Aches and pain from head to toe, very gr...",CADEC,"[[CLS], ache, ##s, and, pain, from, head, to, ...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, ...",101
3,4,LIPITOR.45,"[CLS] Cannot be sure it is the drug, but aroun...",CADEC,"[[CLS], cannot, be, sure, it, is, the, drug, ,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",110
4,5,LIPITOR.775,[CLS] Muscle and joint pain developed after ap...,CADEC,"[[CLS], muscle, and, joint, pain, developed, a...","[0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",89


In [19]:
#write a padding function
print("The maximum lenthg is", max(preprocessed_comibined['r_lenght']))

The maximum lenthg is 506


In [20]:
preprocessed_comibined.head()

Unnamed: 0,index,txt_id,text,dataset,tokenized,pre_processed_tokens,r_lenght
0,0,LIPITOR.86,"[CLS] headaches, pain in throat , tingling in ...",CADEC,"[[CLS], headache, ##s, ,, pain, in, throat, ,,...","[0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, ...",39
1,1,LIPITOR.92,"[CLS] Muscle aches and weakness in neck, arms,...",CADEC,"[[CLS], muscle, ache, ##s, and, weakness, in, ...","[0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",300
2,3,LIPITOR.946,"[CLS] Aches and pain from head to toe, very gr...",CADEC,"[[CLS], ache, ##s, and, pain, from, head, to, ...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, ...",101
3,4,LIPITOR.45,"[CLS] Cannot be sure it is the drug, but aroun...",CADEC,"[[CLS], cannot, be, sure, it, is, the, drug, ,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",110
4,5,LIPITOR.775,[CLS] Muscle and joint pain developed after ap...,CADEC,"[[CLS], muscle, and, joint, pain, developed, a...","[0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",89


In [21]:
#add an attention mask such that all tokens are 1 and later when we padd the input we itnroduce 0 for the padded portions
preprocessed_comibined['att_mask'] = preprocessed_comibined.apply(lambda x: x.r_lenght * [1], axis = 1)
preprocessed_comibined.head()

Unnamed: 0,index,txt_id,text,dataset,tokenized,pre_processed_tokens,r_lenght,att_mask
0,0,LIPITOR.86,"[CLS] headaches, pain in throat , tingling in ...",CADEC,"[[CLS], headache, ##s, ,, pain, in, throat, ,,...","[0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, ...",39,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,1,LIPITOR.92,"[CLS] Muscle aches and weakness in neck, arms,...",CADEC,"[[CLS], muscle, ache, ##s, and, weakness, in, ...","[0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",300,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,3,LIPITOR.946,"[CLS] Aches and pain from head to toe, very gr...",CADEC,"[[CLS], ache, ##s, and, pain, from, head, to, ...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, ...",101,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,4,LIPITOR.45,"[CLS] Cannot be sure it is the drug, but aroun...",CADEC,"[[CLS], cannot, be, sure, it, is, the, drug, ,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",110,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,5,LIPITOR.775,[CLS] Muscle and joint pain developed after ap...,CADEC,"[[CLS], muscle, and, joint, pain, developed, a...","[0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",89,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [22]:
#Introduce padding to the inputs
max_len = max(preprocessed_comibined['r_lenght']) #the max length for the padding would be the longest list
#padd the tokens
preprocessed_comibined['padded_tokens'] = preprocessed_comibined.apply(lambda x: introduce_padding(max_len, x.tokenized, '[PAD]'), axis = 1)
#padd the labels
preprocessed_comibined['padded_labels'] = preprocessed_comibined.apply(lambda x: introduce_padding(max_len, x.pre_processed_tokens, 2), axis = 1)
#padd the attention mask
preprocessed_comibined['padded_att_mask'] = preprocessed_comibined.apply(lambda x: introduce_padding(max_len, x.att_mask, 0), axis = 1)
preprocessed_comibined.head()

Unnamed: 0,index,txt_id,text,dataset,tokenized,pre_processed_tokens,r_lenght,att_mask,padded_tokens,padded_labels,padded_att_mask
0,0,LIPITOR.86,"[CLS] headaches, pain in throat , tingling in ...",CADEC,"[[CLS], headache, ##s, ,, pain, in, throat, ,,...","[0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, ...",39,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[CLS], headache, ##s, ,, pain, in, throat, ,,...","[0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,1,LIPITOR.92,"[CLS] Muscle aches and weakness in neck, arms,...",CADEC,"[[CLS], muscle, ache, ##s, and, weakness, in, ...","[0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",300,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[CLS], muscle, ache, ##s, and, weakness, in, ...","[0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,3,LIPITOR.946,"[CLS] Aches and pain from head to toe, very gr...",CADEC,"[[CLS], ache, ##s, and, pain, from, head, to, ...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, ...",101,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[CLS], ache, ##s, and, pain, from, head, to, ...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,4,LIPITOR.45,"[CLS] Cannot be sure it is the drug, but aroun...",CADEC,"[[CLS], cannot, be, sure, it, is, the, drug, ,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",110,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[CLS], cannot, be, sure, it, is, the, drug, ,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,5,LIPITOR.775,[CLS] Muscle and joint pain developed after ap...,CADEC,"[[CLS], muscle, and, joint, pain, developed, a...","[0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",89,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[CLS], muscle, and, joint, pain, developed, a...","[0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [23]:
len(preprocessed_comibined['padded_att_mask'][0])

506

In [24]:
#double check the length to confirm that the padding was done correctly
preprocessed_comibined['new_len'] = preprocessed_comibined.apply(lambda x: len(x.padded_tokens), axis = 1)
preprocessed_comibined.head()

Unnamed: 0,index,txt_id,text,dataset,tokenized,pre_processed_tokens,r_lenght,att_mask,padded_tokens,padded_labels,padded_att_mask,new_len
0,0,LIPITOR.86,"[CLS] headaches, pain in throat , tingling in ...",CADEC,"[[CLS], headache, ##s, ,, pain, in, throat, ,,...","[0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, ...",39,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[CLS], headache, ##s, ,, pain, in, throat, ,,...","[0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",506
1,1,LIPITOR.92,"[CLS] Muscle aches and weakness in neck, arms,...",CADEC,"[[CLS], muscle, ache, ##s, and, weakness, in, ...","[0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",300,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[CLS], muscle, ache, ##s, and, weakness, in, ...","[0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",506
2,3,LIPITOR.946,"[CLS] Aches and pain from head to toe, very gr...",CADEC,"[[CLS], ache, ##s, and, pain, from, head, to, ...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, ...",101,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[CLS], ache, ##s, and, pain, from, head, to, ...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",506
3,4,LIPITOR.45,"[CLS] Cannot be sure it is the drug, but aroun...",CADEC,"[[CLS], cannot, be, sure, it, is, the, drug, ,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",110,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[CLS], cannot, be, sure, it, is, the, drug, ,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",506
4,5,LIPITOR.775,[CLS] Muscle and joint pain developed after ap...,CADEC,"[[CLS], muscle, and, joint, pain, developed, a...","[0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",89,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[CLS], muscle, and, joint, pain, developed, a...","[0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",506


In [25]:
#Introduce the token ids into the dataset

#define the tokenizer one more time
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

#get the token ids for each of the tokens
preprocessed_comibined['token_ids'] = preprocessed_comibined.apply(lambda x: tokenizer.convert_tokens_to_ids(x.padded_tokens), axis = 1)

preprocessed_comibined.head()

Unnamed: 0,index,txt_id,text,dataset,tokenized,pre_processed_tokens,r_lenght,att_mask,padded_tokens,padded_labels,padded_att_mask,new_len,token_ids
0,0,LIPITOR.86,"[CLS] headaches, pain in throat , tingling in ...",CADEC,"[[CLS], headache, ##s, ,, pain, in, throat, ,,...","[0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, ...",39,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[CLS], headache, ##s, ,, pain, in, throat, ,,...","[0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",506,"[101, 14978, 2015, 1010, 3255, 1999, 3759, 101..."
1,1,LIPITOR.92,"[CLS] Muscle aches and weakness in neck, arms,...",CADEC,"[[CLS], muscle, ache, ##s, and, weakness, in, ...","[0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",300,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[CLS], muscle, ache, ##s, and, weakness, in, ...","[0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",506,"[101, 6740, 12336, 2015, 1998, 11251, 1999, 33..."
2,3,LIPITOR.946,"[CLS] Aches and pain from head to toe, very gr...",CADEC,"[[CLS], ache, ##s, and, pain, from, head, to, ...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, ...",101,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[CLS], ache, ##s, and, pain, from, head, to, ...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",506,"[101, 12336, 2015, 1998, 3255, 2013, 2132, 200..."
3,4,LIPITOR.45,"[CLS] Cannot be sure it is the drug, but aroun...",CADEC,"[[CLS], cannot, be, sure, it, is, the, drug, ,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",110,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[CLS], cannot, be, sure, it, is, the, drug, ,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",506,"[101, 3685, 2022, 2469, 2009, 2003, 1996, 4319..."
4,5,LIPITOR.775,[CLS] Muscle and joint pain developed after ap...,CADEC,"[[CLS], muscle, and, joint, pain, developed, a...","[0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",89,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[CLS], muscle, and, joint, pain, developed, a...","[0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",506,"[101, 6740, 1998, 4101, 3255, 2764, 2044, 2248..."


In [26]:
#Split the dataset into test and train
np.random.seed(100)
train_df, valid_df = train_test_split(preprocessed_comibined, test_size=0.3)

train_df = train_df.reset_index(drop=True)
valid_df = valid_df.reset_index(drop=True)

print(pre_processed.shape)
print(train_df.shape, valid_df.shape)

(2162, 6)
(1518, 13) (651, 13)


In [28]:
#get the train data as tensors
padded_att_mask_train, padded_token_ids_train, padded_labels_train = convert_data_to_tensor(train_df,'padded_labels', 'padded_att_mask', 'token_ids')
#get the test data as tensors
padded_att_mask_test, padded_token_ids_test, padded_labels_test = convert_data_to_tensor(valid_df,'padded_labels', 'padded_att_mask', 'token_ids')

In [29]:
#create data loaders

#train loader
dataset = TensorDataset(padded_att_mask_train, padded_token_ids_train, padded_labels_train)
dataloader_train = DataLoader(dataset, batch_size=10, shuffle=True)

#test loader
dataset = TensorDataset(padded_att_mask_test, padded_token_ids_test, padded_labels_test)
dataloader_test = DataLoader(dataset, batch_size=10, shuffle=False)

In [5]:
# Initialize the model
model = transformers.BertForTokenClassification.from_pretrained('bert-base-uncased',  num_labels = 3)

#check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

#define the optimizer
params = model.parameters()
optimizer = torch.optim.Adam(params, lr= 3e-5)

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [None]:
#Train the model
epoch = 4

for i in range(epoch):

  train_loss = 0

  for j, (padded_att_mask, padded_token_ids, padded_labels) in enumerate(dataloader_train):

    #set to available device
    padded_att_mask = padded_att_mask.to(device)
    padded_token_ids = padded_token_ids.to(device)
    padded_labels = padded_labels.to(device)

    #make predictions
    output = model(padded_token_ids, 
                       token_type_ids=None,
                       attention_mask=padded_att_mask,
                       labels=padded_labels)
    

    step_loss = output[0]
    prediction = output[1]
        
    step_loss.sum().backward()
    optimizer.step()        
    train_loss += step_loss
    optimizer.zero_grad()

  print(f"Epoch {i} , Train loss: {train_loss.sum()}")

Epoch 0 , Train loss: 12.335312843322754
Epoch 1 , Train loss: 6.028031826019287
Epoch 2 , Train loss: 3.773472547531128
Epoch 3 , Train loss: 2.484161853790283


In [None]:
#save the model 
torch.save(model.state_dict(), 'bert_model_4.pt')

In [7]:
#load model
model.load_state_dict(torch.load('/content/drive/MyDrive/NLP Project/Models/bert_model_4.pt', map_location=torch.device('cpu')))

<All keys matched successfully>

In [30]:
#evaluate model
predictions, true_labels = evaluate_model(dataloader_test, model)

In [31]:
#remove the paddings before calculating the evaulation metrics
true_labels_2classes = []
true_pred_2classes = []

for i in range(len(true_labels)):

  for j in range(len(true_labels[i])):

    if true_labels[i][j] != 2:
      true_labels_2classes.append(true_labels[i][j])
      true_pred_2classes.append(predictions[i][j])



true_final_labels = []
pred_final_labels = []

for i in range(len(true_pred_2classes)):

  if true_pred_2classes[i] != 2:
    true_final_labels.append(true_labels_2classes[i])
    pred_final_labels.append(true_pred_2classes[i])

In [None]:
#Calculate the f1 score, percision, recall and accuracy

#calculate the f1score
f1_score_r = f1_score(true_final_labels, pred_final_labels, pos_label = 1)

#calculate percision and recall
precision = precision_score(true_final_labels, pred_final_labels, pos_label = 1)
recall = recall_score(true_final_labels, pred_final_labels, pos_label = 1)

#calculate the accuracy
accuracy = accuracy_score(true_final_labels, pred_final_labels)

print("The F1 score is", f1_score_r)
print("Precision is", precision)
print("Recall is", recall)
print("Acuracy is", accuracy)

The F1 score is 0.7672791296104012
Precision is 0.7781338705246772
Recall is 0.7567230632235085
Acuracy is 0.9340691666240344


In [33]:
predictions[0]

array([0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

In [56]:
valid_df.head(20)

Unnamed: 0,index,txt_id,text,dataset,tokenized,pre_processed_tokens,r_lenght,att_mask,padded_tokens,padded_labels,padded_att_mask,new_len,token_ids,predictions
0,1178,LIPITOR.779,[CLS] within 3 days I experienced extreme musc...,CADEC,"[[CLS], within, 3, days, i, experienced, extre...","[0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, ...",225,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[CLS], within, 3, days, i, experienced, extre...","[0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",506,"[101, 2306, 1017, 2420, 1045, 5281, 6034, 6740...","[0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, ..."
1,2164,23,[CLS] I've been taking Evista for four years w...,Annotated_dataset,"[[CLS], i, ', ve, been, taking, ev, ##ista, fo...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",196,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[CLS], i, ', ve, been, taking, ev, ##ista, fo...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",506,"[101, 1045, 1005, 2310, 2042, 2635, 23408, 119...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,1871,cymbalta.190,[CLS] I felt a little nausea the first day. [S...,PsyTar,"[[CLS], i, felt, a, little, nausea, the, first...","[0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, ...",187,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[CLS], i, felt, a, little, nausea, the, first...","[0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",506,"[101, 1045, 2371, 1037, 2210, 19029, 1996, 203...","[0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, ..."
3,2174,33,"[CLS] severe foot pain, weight gain, raised ch...",Annotated_dataset,"[[CLS], severe, foot, pain, ,, weight, gain, ,...","[0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, ...",73,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[CLS], severe, foot, pain, ,, weight, gain, ,...","[0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",506,"[101, 5729, 3329, 3255, 1010, 3635, 5114, 1010...","[0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, ..."
4,1735,cymbalta.54,[CLS] nan,PsyTar,"[[CLS], nan, [PAD], [PAD], [PAD], [PAD], [PAD]...","[0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...",2,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[CLS], nan, [PAD], [PAD], [PAD], [PAD], [PAD]...","[0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",506,"[101, 16660, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
5,697,ARTHROTEC.81,[CLS] frequent heartburn. [SEP] take it once a...,CADEC,"[[CLS], frequent, heart, ##burn, ., [SEP], tak...","[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",21,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[CLS], frequent, heart, ##burn, ., [SEP], tak...","[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",506,"[101, 6976, 2540, 8022, 1012, 102, 2202, 2009,...","[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6,1543,zoloft.75,"[CLS] Lack of sex drive, no orgasms. [SEP] Wei...",PsyTar,"[[CLS], lack, of, sex, drive, ,, no, orgasm, #...","[0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, ...",57,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[CLS], lack, of, sex, drive, ,, no, orgasm, #...","[0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",506,"[101, 3768, 1997, 3348, 3298, 1010, 2053, 1389...","[0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, ..."
7,2383,242,"[CLS] Only hot flashes at first, then more and...",Annotated_dataset,"[[CLS], only, hot, flashes, at, first, ,, then...","[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",64,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[CLS], only, hot, flashes, at, first, ,, then...","[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",506,"[101, 2069, 2980, 16121, 2012, 2034, 1010, 205...","[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
8,673,LIPITOR.736,[CLS] After a few months of Lipitor and Tricor...,CADEC,"[[CLS], after, a, few, months, of, lip, ##itor...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...",79,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[CLS], after, a, few, months, of, lip, ##itor...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",506,"[101, 2044, 1037, 2261, 2706, 1997, 5423, 1566...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ..."
9,2066,effexorXR.154,"[CLS] Night sweats, interrupted sleep most nig...",PsyTar,"[[CLS], night, sweat, ##s, ,, interrupted, sle...","[0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, ...",149,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[CLS], night, sweat, ##s, ,, interrupted, sle...","[0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",506,"[101, 2305, 7518, 2015, 1010, 7153, 3637, 2087...","[0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, ..."


In [35]:
valid_df['predictions'] = predictions.tolist()

In [36]:
valid_df.head()

Unnamed: 0,index,txt_id,text,dataset,tokenized,pre_processed_tokens,r_lenght,att_mask,padded_tokens,padded_labels,padded_att_mask,new_len,token_ids,predictions
0,1178,LIPITOR.779,[CLS] within 3 days I experienced extreme musc...,CADEC,"[[CLS], within, 3, days, i, experienced, extre...","[0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, ...",225,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[CLS], within, 3, days, i, experienced, extre...","[0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",506,"[101, 2306, 1017, 2420, 1045, 5281, 6034, 6740...","[0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, ..."
1,2164,23,[CLS] I've been taking Evista for four years w...,Annotated_dataset,"[[CLS], i, ', ve, been, taking, ev, ##ista, fo...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",196,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[CLS], i, ', ve, been, taking, ev, ##ista, fo...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",506,"[101, 1045, 1005, 2310, 2042, 2635, 23408, 119...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,1871,cymbalta.190,[CLS] I felt a little nausea the first day. [S...,PsyTar,"[[CLS], i, felt, a, little, nausea, the, first...","[0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, ...",187,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[CLS], i, felt, a, little, nausea, the, first...","[0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",506,"[101, 1045, 2371, 1037, 2210, 19029, 1996, 203...","[0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, ..."
3,2174,33,"[CLS] severe foot pain, weight gain, raised ch...",Annotated_dataset,"[[CLS], severe, foot, pain, ,, weight, gain, ,...","[0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, ...",73,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[CLS], severe, foot, pain, ,, weight, gain, ,...","[0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",506,"[101, 5729, 3329, 3255, 1010, 3635, 5114, 1010...","[0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, ..."
4,1735,cymbalta.54,[CLS] nan,PsyTar,"[[CLS], nan, [PAD], [PAD], [PAD], [PAD], [PAD]...","[0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...",2,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[CLS], nan, [PAD], [PAD], [PAD], [PAD], [PAD]...","[0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",506,"[101, 16660, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."


In [55]:
valid_df['padded_tokens'][1][19]

'gain'

In [54]:
valid_df['predictions'][1][19]

1