## Drug Review - Rating Classification

In [None]:
import pandas as pd

from tqdm.auto import tqdm

from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, BertForSequenceClassification, BertModel, get_scheduler, logging

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import AdamW
import torch.optim as optim
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler
from sklearn.model_selection import train_test_split

import plotly.express as px

logging.set_verbosity_error()

In [None]:
# Load the raw train and test CSV files; the paths are relative to the
# notebook's working directory.
df_train = pd.read_csv("datasets/drugComTrain_raw.csv")
df_test = pd.read_csv("datasets/drugComTest_raw.csv")

### Data Analysis

##### First look

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Summary of the training frame: columns, dtypes and non-null counts.
df_train.info()

In [None]:
# Full review text of the row labelled 4.
df_train.loc[4,'review']

In [None]:
# Full review text of the row labelled 35696.
df_train.loc[35696, 'review']

#### Check for inconsistencies in String values

In [None]:
# Names of the columns holding string values; the consistency checks below
# iterate over these columns.
string_columns = ['drugName', 'condition', 'review']

##### Checking for Non ASCII char

In [None]:
# For every string column, flag whether each value is pure ASCII (True) or
# contains at least one non-ASCII character (False).
# Rows with a missing condition are skipped: a NaN is not a string and has no
# .isascii().
# Column-wise Series.map is used instead of the deprecated DataFrame.applymap.
# The original cell also evaluated a row filter whose result was discarded
# (and whose mask selected ASCII rows); that dead statement is removed.
df_aux = (
    df_train.loc[df_train["condition"].notna(), string_columns]
    .apply(lambda column: column.map(lambda text: text.isascii()))
    .astype(bool)
)

non_ascii_review = int((~df_aux['review']).sum())
non_ascii_name_or_condition = int((~df_aux['drugName']).sum()) + int((~df_aux['condition']).sum())

print("The number of occurence of a non ASCII char in review is = " + str(non_ascii_review))
print("The number of occurence of a non ASCII char in drugName or condition is = " + str(non_ascii_name_or_condition))

In [None]:
# Example of a non-ASCII character in a review: the smile emoji at the end of
# this review is not an ASCII char.
df_train.loc[117, 'review']

In [None]:
# Another example: the zero width space (\u200b) is not an ASCII char.
df_train.loc[168, 'review']

In [None]:
def search_non_ASCII_char(str):
    """Return the first non-ASCII character found in ``str``.

    The string is scanned left to right; the empty string is returned when
    every character is ASCII (including when ``str`` itself is empty).
    """
    return next((char for char in str if not char.isascii()), "")

In [None]:
# Frequency of the first non-ASCII character met in each review; the empty
# string stands for reviews that are entirely ASCII.
df_train["review"].map(search_non_ASCII_char).value_counts()

In [None]:
# Print every distinct "first non-ASCII character" next to its escaped byte
# representation, so that invisible characters can be told apart.
first_non_ascii_chars = df_train["review"].map(search_non_ASCII_char).unique()
for c in first_non_ascii_chars:
    escaped = c.encode('raw_unicode_escape')
    print(str(escaped) + str(c), end="                ")

##### Check for reviews that start or end with the character "

In [None]:
# Count how many reviews start, and how many end, with a double quote.
df_aux = df_train['review'].map(lambda review: review.startswith('"'))
starting_count = int(df_aux.sum())
print("""Number of review starting with " """ + str(starting_count))

df_aux = df_train['review'].map(lambda review: review.endswith('"'))
ending_count = int(df_aux.sum())
print("""Number of review ending with " """ + str(ending_count))

##### Checking HTML entities

In [None]:
def search_HTML_entities(str):
    """Return the first HTML entity found in ``str``, or "" if there is none.

    An HTML entity is taken to be a substring that starts with '&', ends with
    ';' and contains neither whitespace nor another '&' in between
    (e.g. ``&amp;`` or ``&#039;``).

    Parameters
    ----------
    str : the text to scan.

    Returns
    -------
    The matched entity, delimiters included, or the empty string.
    """
    start_index = -1  # position of the pending '&', or -1 when none is pending

    for i, c in enumerate(str):
        if c == '&':
            # A new '&' always (re)starts the candidate: without this,
            # "AT&T&amp;" would be reported as the bogus entity "&T&amp;".
            start_index = i
        elif start_index != -1:
            if c == ';':
                return str[start_index : i + 1]
            if c.isspace():
                # Any whitespace (not only ' ') cancels the pending candidate.
                start_index = -1

    return ""

In [None]:
# First HTML entity found in each review ("" when there is none), followed by
# the distinct values that occur.
df_aux = df_train["review"].map(search_HTML_entities)

df_aux.unique()

##### Checking for HTML tags

In [None]:
import re

# Flag, for every string column, the values containing something shaped like
# an HTML tag: a '<', any run of non-whitespace characters, then a '>'.
# The pattern is a raw string because "\S" is not a valid escape sequence in a
# plain string literal.
html_tag_pattern = re.compile(r"<\S*>")

# Rows with a missing condition are skipped (NaN cannot be searched).
# Column-wise Series.map is used instead of the deprecated DataFrame.applymap.
df_aux = (
    df_train.loc[df_train["condition"].notna(), string_columns]
    .apply(lambda column: column.map(lambda text: bool(html_tag_pattern.search(text))))
    .astype(bool)
)

print("The number of rows having an occurece of char '<' or '>' in condition = " + str(int(df_aux['condition'].sum())))
print("The number of rows having an occurece of char '<' or '>' in review = " + str(int(df_aux['review'].sum())))
print("While in the drugName is " + str(int(df_aux['drugName'].sum())))

In [None]:
# Index labels of the rows whose condition was flagged by the tag check, the
# number of such rows, and the distinct condition values they hold.
flagged_condition_idx = df_aux.index[df_aux['condition'].to_numpy()]
flagged_conditions = df_train.loc[flagged_condition_idx, 'condition']
print(len(flagged_conditions))
flagged_conditions.unique()

The review and drugName values corresponding to these kinds of condition are regular.

In [None]:
# drugName of the 11th row (position 10) among those with a flagged condition.
df_train.loc[df_aux.index[df_aux['condition'].to_numpy()][10], 'drugName']

In [None]:
# Review of the 11th row (position 10) among those with a flagged condition.
df_train.loc[df_aux.index[df_aux['condition'].to_numpy()][10], 'review']

#### Feature Analysis

rating

In [None]:
# Distribution of the raw ratings (imbalanced).
df_train['rating'].hist()

condition

In [None]:
# Number of distinct condition values; a missing value counts as one entry.
df_train['condition'].nunique(dropna=False)

In [None]:
# Histogram of the mean rating computed separately for each condition.
mean_rating_by_condition = df_train.groupby('condition')['rating'].mean()
mean_rating_by_condition.hist()

drugName

In [None]:
# Histogram of the mean rating computed separately for each drugName.
mean_rating_by_drug = df_train.groupby('drugName')['rating'].mean()
mean_rating_by_drug.hist()

#### Train-Test data distribution

In [None]:
# Unique condition values in each split (a missing value, if any, shows up as
# one unique entry).
train_condition = pd.Series(df_train.condition.unique())
test_condition = pd.Series(df_test.condition.unique())

# Conditions that appear in only one of the two splits.
test_only_condition = test_condition[~test_condition.isin(train_condition)]
train_only_condition = train_condition[~train_condition.isin(test_condition)]

# The two messages were swapped in the original cell: each count is now
# printed under the label that matches what is actually computed.
print("Number of condition that occur in test but not in train " + str(len(test_only_condition)))
print("For a total of " + str(len(df_test[df_test.condition.isin(test_only_condition)])) + ' rows')
print("")
print("Number of condition that occur in train but not in test " + str(len(train_only_condition)))
print("For a total of " + str(len(df_train[df_train.condition.isin(train_only_condition)])) + ' rows')

In [None]:
# Unique drugName values in each split.
train_drugName = pd.Series(df_train.drugName.unique())
test_drugName = pd.Series(df_test.drugName.unique())

# Drug names that appear in only one of the two splits.
test_only_drugName = test_drugName[~test_drugName.isin(train_drugName)]
train_only_drugName = train_drugName[~train_drugName.isin(test_drugName)]

# The two messages were swapped in the original cell: each count is now
# printed under the label that matches what is actually computed.
print("Number of drugName that occur in test but not in train " + str(len(test_only_drugName)))
print("For a total of " + str(len(df_test[df_test.drugName.isin(test_only_drugName)])) + ' rows')
print("")
print("Number of drugName that occur in train but not in test " + str(len(train_only_drugName)))
print("For a total of " + str(len(df_train[df_train.drugName.isin(train_only_drugName)])) + ' rows')

### Cleaning and Feature engineering

In [None]:
# Remove the rows with any missing value to keep the rest of the notebook
# simple. Rebinding the name (instead of inplace=True) keeps the cell free of
# in-place mutation.
df_train = df_train.dropna()
df_test = df_test.dropna()

# Drop the rows having a wrong condition value
# (e.g. '2</span> users found this comment helpful.').
# regex=False: "</span>" is matched as a literal substring.
# .copy(): the filtered frames get columns assigned further down; copying
# makes them independent frames so those assignments do not trigger
# SettingWithCopyWarning.
df_train = df_train[~df_train.condition.str.contains("</span>", regex=False)].copy()
df_test = df_test[~df_test.condition.str.contains("</span>", regex=False)].copy()

In [None]:
import html

def cleaning_string(str):
    """Normalize one raw text field.

    Steps, in order:
      1. strip the double quotes wrapping the value;
      2. decode HTML entities (``&#039;`` -> ``'``, ``&amp;`` -> ``&`` ...);
      3. collapse every run of whitespace into a single space and trim both
         ends.

    Whitespace is normalized *after* decoding so that whitespace produced by
    entities (e.g. ``&nbsp;`` or ``&#10;``) is collapsed as well. Quotes are
    stripped *before* decoding so that an encoded ``&quot;`` at either end is
    kept as part of the text.

    Parameters
    ----------
    str : the raw string to clean.

    Returns
    -------
    The cleaned string.
    """
    text = str.strip('"')       # removing quotes from beginning / end
    text = html.unescape(text)  # decoding HTML entities
    # split() with no argument splits on any whitespace run and drops
    # leading / trailing whitespace, so no separate strip() is needed.
    return " ".join(text.split())

In [None]:
# Names of the columns holding string values.
string_columns = ['drugName', 'condition', 'review']

# Clean every string cell of both splits, element by element.
for split_frame in (df_train, df_test):
    split_frame[string_columns] = split_frame[string_columns].applymap(cleaning_string)

In [None]:
def rating_to_binary(rating):
    """Map a rating to a binary label: 1 when it is strictly greater than 5, else 0."""
    return 1 if rating > 5 else 0

# Replace the rating by a binary label (1 = rating above 5, 0 otherwise) in
# both splits.
# NOTE: running this cell twice maps every label to 0, since the values are
# already 0/1 after the first run.
df_train['rating'] = df_train['rating'].map(rating_to_binary)
df_test['rating'] = df_test['rating'].map(rating_to_binary)

### Splitting & Tokenizing the dataset

In [None]:
# creating validation set
# Hold out 20% of the training rows as a validation set.
# random_state makes the split reproducible across kernel restarts (same seed
# value as the dataset shuffles further down).
# NOTE: df_train is overwritten, so re-running this cell shrinks it again.
df_train, df_val = train_test_split(df_train, test_size=0.2, random_state=42)

In [None]:
# Wrap the three pandas splits into Hugging Face Dataset objects (only the
# review text and its label are kept, the pandas index is dropped) to gain
# access to the tokenization utilities.
model_columns = ['review', 'rating']
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(split_frame[model_columns], preserve_index=False)
    for split_frame in (df_train, df_val, df_test)
)

ds = DatasetDict({'train': train_dataset, 'val': val_dataset, 'test': test_dataset})
ds

In [None]:
# Column holding the raw text; it is removed from the dataset once tokenized.
cols = 'review'
# tokenizing the reviews
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
def tokenize(str):
    """Tokenize the 'review' field of a batch of examples (padded and truncated)."""
    # TODO -> switch to dynamic padding for less memory usage
    return tokenizer(str['review'], padding='max_length', truncation=True)
# batched=True: `tokenize` receives several examples per call; num_proc=2 runs
# the mapping in two processes.
tokenized_ds = ds.map(tokenize, batched=True,remove_columns=cols, num_proc=2)
# setting the return format to the torch one
tokenized_ds.set_format("torch")
# The training cells below read the target from the "labels" key.
tokenized_ds = tokenized_ds.rename_column("rating", "labels")
tokenized_ds

### Classification

#### Defining training components

In [None]:
batch_size = 1
# CHANGE! : number of samples drawn from the training and validation datasets
# (here just 3, for a quick smoke run). Set to 0 to use the full datasets.
training_subset = 3

# Pick the train / validation data: a small fixed-seed random subset, or the
# whole split.
if training_subset > 0:
    small_train_ds = tokenized_ds["train"].shuffle(seed=42).select(range(training_subset))
    small_val_ds = tokenized_ds["val"].shuffle(seed=42).select(range(training_subset))
    train_split, val_split = small_train_ds, small_val_ds
else:
    train_split, val_split = tokenized_ds["train"], tokenized_ds["val"]

# Constructing pytorch dataloaders. The test dataloader is built in every
# case: it used to be created only when training_subset was 0, which made
# get_dataloader() fail with a NameError in subset mode.
train_dataloader = DataLoader(train_split, shuffle=True, batch_size=batch_size)
val_dataloader = DataLoader(val_split, batch_size=batch_size)
test_dataloader = DataLoader(tokenized_ds["test"], batch_size=batch_size)

def get_dataloader():
    """Return the (train, validation, test) dataloaders built in this cell."""
    return train_dataloader, val_dataloader, test_dataloader

In [None]:
# Use the GPU when one is available, the CPU otherwise.
# `device` is a torch.device object, not a plain string.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
class BERT(nn.Module):
    """Pretrained BERT backbone followed by a single linear classification layer.

    Parameters
    ----------
    pretrained_name : name of the checkpoint loaded through
                      ``BertModel.from_pretrained``
                      (default: "bert-base-uncased").
    num_classes     : number of output logits (default: 2).
    """

    def __init__(self, pretrained_name="bert-base-uncased", num_classes=2):
        super().__init__()
        self.bert_model = BertModel.from_pretrained(pretrained_name)
        # Read the input width from the backbone configuration instead of
        # hard-coding 768, so that other checkpoint sizes work as well.
        self.out = nn.Linear(self.bert_model.config.hidden_size, num_classes)

    def forward(self, input_ids, token_type_ids, attention_mask):
        """Return the raw class logits for a batch of tokenized reviews.

        The three arguments are forwarded unchanged to the backbone. With
        ``return_dict=False`` the backbone returns a tuple; its second element
        (presumably the pooled [CLS] representation - confirm against the
        transformers documentation) is fed to the linear layer.
        """
        _, cls_embedding = self.bert_model(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        return self.out(cls_embedding)

# Build the classifier and freeze the BERT backbone, so that only the linear
# head receives gradients.
model = BERT()
model.bert_model.requires_grad_(False)
model.to(device)

# Loss and optimizer.
loss_fun = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=4e-4)

#### Training

In [None]:
# The losses are logged every `n_batch_track` training mini-batches; the
# validation loss is estimated on at most `n_batch_track` validation batches.
n_batch_track = 3

# One value is appended to each list at every logging step.
training_loss = []
validation_loss = []

for epoch in range(1):  # loop over the dataset (a single epoch here)

    running_loss = 0.0
    for i, batch in enumerate(train_dataloader, 0):

        # zero the parameter gradients
        optimizer.zero_grad()

        # move every tensor of the batch to the selected device
        batch_gpu = {key: value.to(device) for key, value in batch.items()}

        # forward
        outputs = model(input_ids=batch_gpu['input_ids'],
                        token_type_ids=batch_gpu['token_type_ids'],
                        attention_mask=batch_gpu['attention_mask'])

        # backward
        # The labels are already a tensor on `device`, so they are passed as
        # they are: wrapping them in torch.tensor() only made a copy (and
        # raised a warning). The former `device == 'cuda'` branch compared a
        # torch.device with a str and was therefore never taken.
        loss = loss_fun(outputs, batch_gpu['labels'])
        loss.backward()
        # optimize
        optimizer.step()

        # compute metrics
        running_loss += loss.item()

        if (i+1) % n_batch_track == 0:    # track the losses every n_batch_track mini-batches

            running_loss_val = 0.0
            n_val_batches = 0

            # compute the loss on (at most) n_batch_track validation batches
            with torch.no_grad():
                for n_val_batches, val_batch in enumerate(val_dataloader, 1):
                    val_batch_gpu = {key: value.to(device) for key, value in val_batch.items()}
                    val_outputs = model(input_ids=val_batch_gpu['input_ids'],
                                        token_type_ids=val_batch_gpu['token_type_ids'],
                                        attention_mask=val_batch_gpu['attention_mask'])
                    running_loss_val += loss_fun(val_outputs, val_batch_gpu['labels']).item()
                    # Stop after exactly n_batch_track batches (the previous
                    # check let one extra batch through).
                    if n_val_batches >= n_batch_track:
                        break

            mean_train_loss = running_loss / n_batch_track
            # Average over the batches actually evaluated; max() guards
            # against an empty validation dataloader.
            mean_val_loss = running_loss_val / max(n_val_batches, 1)

            print(f'[{epoch + 1}, {i + 1}]\
                    training_loss: {mean_train_loss:.3f} \
                    validation_loss = {mean_val_loss:.3f}')

            validation_loss.append(mean_val_loss)
            training_loss.append(mean_train_loss)

            running_loss = 0.0

print('Finished Training')

In [None]:
# Gather both loss histories in one frame (one row per logging step) and draw
# them on the same line chart.
df_loss = pd.DataFrame({'training_loss': training_loss, 'validation_loss': validation_loss})
fig = px.line(df_loss, x=df_loss.index, y=df_loss.columns)
fig.show()

#### Validation

In [None]:
# Accuracy on the validation dataloader. The true labels and the predicted
# classes are collected as plain lists for the confusion matrix.
correct = 0
total = 0

predictions = []
labels = []

with torch.no_grad():
    for batch in val_dataloader:
        batch_gpu = {key: value.to(device) for key, value in batch.items()}

        outputs = model(
            input_ids=batch_gpu['input_ids'],
            token_type_ids=batch_gpu['token_type_ids'],
            attention_mask=batch_gpu['attention_mask'],
        )
        # predicted class = index of the largest logit of each row
        predicted = outputs.argmax(dim=1)
        targets = batch_gpu['labels']

        total += targets.size(0)
        correct += (predicted == targets).sum().item()
        labels.extend(targets.tolist())
        predictions.extend(predicted.tolist())

print(f'Accuracy of the network: {100 * correct // total} %')

#### Confusion matrix definition

In [None]:
import numpy as np


def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """
    given a sklearn confusion matrix (cm), make a nice plot

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix
                  (rows = true labels, columns = predicted labels)

    target_names: given classification classes such as [0, 1, 2]
                  the class names, for example: ['high', 'medium', 'low']
                  (None leaves the default numeric ticks)

    title:        the text to display at the top of the matrix

    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm
                  see http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues
                  (None selects 'Blues')

    normalize:    If False, plot the raw numbers
                  If True, plot the proportions (each row divided by its sum;
                  a row without any sample is shown as zeros)

    Usage
    -----
    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
                                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,                # show proportions
                          target_names = y_labels_vals,       # list of names of the classes
                          title        = best_estimator_name) # title of graph

    Citation
    --------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    """
    import matplotlib.pyplot as plt
    import numpy as np
    import itertools

    cm = np.asarray(cm)

    # Accuracy is computed on the raw counts, before any normalization.
    n_samples = cm.sum()
    accuracy = np.trace(cm) / float(n_samples) if n_samples else float('nan')
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    if normalize:
        # Divide every row by its total. A class without any true sample has a
        # zero total: dividing by 1 instead leaves that row at 0 rather than
        # filling it with NaN.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = cm.astype('float') / np.where(row_totals == 0, 1, row_totals)

    # The image is drawn from the same matrix as the cell texts, so that the
    # text colour threshold below matches the cell shading.
    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # Cells above the threshold get white text, the others black text.
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    cell_format = "{:0.4f}" if normalize else "{:,}"
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.show()

##### Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix
# labels=[0, 1] forces a 2x2 matrix even when one of the two classes is absent
# from the validation labels and predictions (likely on a tiny subset);
# otherwise the matrix would not match the two class names.
plot_confusion_matrix(confusion_matrix(labels, predictions, labels=[0, 1]), ["not satisfied","satisfied"])

### Next considerations

- trying to accurately classify the different review's score (from 1 to 10):
    - using an ordinal multi label loss

- improving the classifier by:
    - better hyperparameter search
    - training for longer
    - using different backbone models such as: biobert or bert pretrained on some sentiment analysis task

- improving the performance evaluation
  - trying to change the train, validation & test split to analyze whether the model is able to generalize to new drugs or new conditions