In [0]:
# Fetch the Drugs.com review archive from the UCI repository.
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip
# Fetch the word2vec binary that later cells load as 'PMC-w2v.bin'.
!wget http://evexdb.org/pmresources/vec-space-models/PMC-w2v.bin
# Unpack the archive into the working directory
# (presumably yields the drugsCom*_raw.tsv files read later -- verify).
!unzip drugsCom_raw.zip
# NOTE: the glob removes every .zip in the working directory, not only the one downloaded above.
!rm *.zip
# Show what is now in the working directory.
!ls

In [0]:
# Folder that receives one .npy file per encoded review ("data/out<i>.npy").
# -p keeps the cell re-runnable: no error when the folder already exists.
!mkdir -p data

In [0]:
import pandas as pd
import numpy as np
import keras
from keras.preprocessing.text import Tokenizer,  text_to_word_sequence
from keras.engine.topology import Layer
from keras import initializers as initializers, regularizers, constraints
from keras.callbacks import Callback, ModelCheckpoint, EarlyStopping
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding, Input, Dense, LSTM, GRU, Bidirectional, TimeDistributed, Dropout,Concatenate,Lambda
from keras import backend as K
from keras import optimizers
from keras.utils import to_categorical
from keras.models import Model,clone_model,load_model
import tensorflow as tf
import nltk
nltk.download('popular')
nltk.download('stopwords')
import re
import matplotlib.pyplot as plt
import sys
from nltk import tokenize
import os
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix  
from keras.backend.tensorflow_backend import set_session  
from scipy.spatial.distance import euclidean
from sklearn.neighbors import NearestNeighbors
import random
from nltk.corpus import stopwords
from gensim.models.keyedvectors import KeyedVectors

In [0]:
# Exclusive upper bound on a word2vec vocabulary index that the encoding cells
# will write into the data arrays; words with a larger index are skipped.
MAX_FEATURES=200000
# Size of the last axis of each encoded review (token slots per sentence).
MAX_SENT_LEN = 40
# Size of the middle axis of each encoded review (sentence slots per review).
MAX_SENT_NUM = 40
# Not referenced by any other cell visible in this notebook.
EMBED_SIZE=100
# Maximum number of epochs passed to model.fit_generator.
NUM_EPOCHS=25
# L2 penalty factor used for the kernel regularizer of the LSTM/Dense layers.
REG_PARAM = 1e-13
# Width of the softmax output layer; load_data only buckets ratings into
# negative/neutral/positive classes when this equals 3.
OUTPUT_DIM = 3

In [0]:
def dot_product(x, kernel):
    """
    Backend-agnostic dot product between an input tensor and a weight tensor.

    On the TensorFlow backend the kernel gets a trailing axis before the
    product and that axis is squeezed out of the result again; on any other
    backend (e.g. Theano) ``K.dot`` is applied directly.

    Args:
        x (): input
        kernel (): weights
    Returns:
        The result of ``K.dot`` for the active backend.
    """
    if K.backend() != 'tensorflow':
        return K.dot(x, kernel)

    expanded_kernel = K.expand_dims(kernel)
    product = K.dot(x, expanded_kernel)
    return K.squeeze(product, axis=-1)

class AttentionWithContext(Layer):
    """
    Attention operation, with a context/query vector, for temporal data.
    Supports Masking.
    Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf]
    "Hierarchical Attention Networks for Document Classification"
    by using a context vector to assist the attention
    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.
    # Arguments
        W_regularizer, u_regularizer, b_regularizer: regularizer identifiers
            (anything accepted by `regularizers.get`) for the projection
            matrix `W`, the context vector `u` and the bias `b`.
        W_constraint, u_constraint, b_constraint: constraint identifiers
            (anything accepted by `constraints.get`) for the same weights.
        bias: whether a bias vector is added before the tanh.
    How to use:
    Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
    The dimensions are inferred based on the output shape of the RNN.
    Note: The layer has been tested with Keras 2.0.6
    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(AttentionWithContext())
        # next add a Dense layer (for classification/regression) or whatever...
    """

    def __init__(self,
                 W_regularizer=None, u_regularizer=None, b_regularizer=None,
                 W_constraint=None, u_constraint=None, b_constraint=None,
                 bias=True, **kwargs):

        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.u_regularizer = regularizers.get(u_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.u_constraint = constraints.get(u_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        super(AttentionWithContext, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create W (features x features), the optional bias b and the context vector u."""
        assert len(input_shape) == 3

        # `shape` is passed by keyword: newer Keras signatures take `name`
        # as the first positional argument, so a positional shape only works
        # through the legacy-compatibility shim.
        self.W = self.add_weight(shape=(input_shape[-1], input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight(shape=(input_shape[-1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)

        self.u = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_u'.format(self.name),
                                 regularizer=self.u_regularizer,
                                 constraint=self.u_constraint)

        super(AttentionWithContext, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over its steps axis (axis 1)."""
        uit = dot_product(x, self.W)

        if self.bias:
            uit += self.b

        uit = K.tanh(uit)
        # One unnormalised score per step: similarity with the context vector u.
        ait = dot_product(uit, self.u)

        a = K.exp(ait)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number to the sum.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # Broadcast the per-step weights over the feature axis.
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]

    def get_config(self):
        """Return the constructor arguments so a saved model can rebuild the layer.

        Without this, only the base `Layer` config is stored and any
        non-default regularizer, constraint or `bias` setting is silently
        replaced by the defaults when the model is loaded again.
        """
        config = {
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'u_regularizer': regularizers.serialize(self.u_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'u_constraint': constraints.serialize(self.u_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(AttentionWithContext, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


In [0]:
class DataGenerator(keras.utils.Sequence):
    """Keras Sequence that serves batches of encoded reviews stored as .npy files.

    Each sample is read from ``data/out<ID>.npy`` and reshaped to ``dim``;
    its label is looked up in ``labels`` and one-hot encoded.

    Args:
        list_IDs: sequence of sample IDs this generator may serve.
        labels: mapping from sample ID to integer class label.
        batch_size: number of samples per batch.
        dim: shape of a single sample.
        n_classes: width of the one-hot label vectors.
        shuffle: whether the sample order is reshuffled at the end of every epoch.
    """
    def __init__(self, list_IDs, labels, batch_size=32, dim=(32,32,32),
                 n_classes=10, shuffle=True):
        """Store the configuration and draw the first sample order."""
        self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.list_IDs = list_IDs
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        """Number of full batches per epoch (a trailing partial batch is dropped)."""
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        """Return batch number `index` as an ``(X, y)`` pair."""
        # Positions (into list_IDs) that make up this batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Translate positions into sample IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        X, y = self.__data_generation(list_IDs_temp)

        return X, y

    def on_epoch_end(self):
        """Reset the sample order and reshuffle it when `shuffle` is set."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        """Load the given samples; returns X of shape (n_samples, *dim) and one-hot y."""
        # Size the buffers by the IDs actually received, so a short batch can
        # never expose uninitialised rows left behind by np.empty.
        n_samples = len(list_IDs_temp)
        X = np.empty((n_samples, *self.dim))
        y = np.empty((n_samples), dtype=int)

        for i, ID in enumerate(list_IDs_temp):
            # Reshape to the configured sample shape rather than to module-level
            # constants, so the generator honours its own `dim` argument.
            X[i,] = np.load('data/out' + str(ID) + '.npy').reshape(self.dim)

            y[i] = self.labels[ID]

        return X, keras.utils.to_categorical(y, num_classes=self.n_classes)

In [0]:
def create_embed():
  """Load the 'PMC-w2v.bin' word2vec vectors and wrap them in a Keras embedding layer.

  The layer is requested with ``train_embeddings=False``.
  """
  keyed_vectors = KeyedVectors.load_word2vec_format('PMC-w2v.bin', binary=True)
  frozen_embedding = keyed_vectors.get_keras_embedding(train_embeddings=False)
  return frozen_embedding

In [0]:
def load_data(mode='train'):
    """Load the reviews and labels of one split of the Drugs.com dataset.

    Args:
        mode: ``'train'`` reads 'drugsComTrain_raw.tsv', ``'test'`` reads
            'drugsComTest_raw.tsv'.

    Returns:
        ``(reviews, labels)`` where ``reviews`` is a one-column DataFrame
        ('review'). For ``'train'`` the labels are a 1-D array of the rating
        column; for ``'test'`` they are the one-hot matrix produced by
        ``pd.get_dummies`` on the rating column.

        When ``OUTPUT_DIM == 3`` ratings 1-4 become class 0 (negative),
        5-6 class 1 (neutral) and 7-10 class 2 (positive); otherwise the
        ratings are left as read.

    Raises:
        ValueError: if ``mode`` is neither ``'train'`` nor ``'test'``.

    Notes:
        The training split is undersampled by dropping a fixed number of
        randomly chosen rows with rating >= 7 and then with rating <= 4.
        The draw uses the global NumPy RNG, so seed ``np.random`` beforehand
        for a reproducible subset. ``np.random.choice`` raises if the file
        holds fewer such rows than requested.
    """
    rating_to_sentiment = {1:'negative',2:'negative',3:'negative',4:'negative',
                           5:'neutral',6:'neutral',
                           7:'positive',8:'positive',9:'positive',10:'positive'}
    sentiment_to_class = {'positive':2,'neutral':1,'negative':0}

    def _bucket_ratings(frame):
        """Map the rating column to 3 sentiment classes; leave every other column alone."""
        if OUTPUT_DIM != 3:
            return frame
        # Restricting the replacement to 'rating' keeps it from rewriting
        # unrelated cells (ids, counts, or a review that is literally
        # "positive"). The two hops via sentiment names avoid any overlap
        # between the original ratings and the target class ids.
        bucketed = frame['rating'].replace(to_replace=rating_to_sentiment)
        bucketed = bucketed.replace(to_replace=sentiment_to_class)
        return frame.assign(rating=bucketed)

    if mode == 'train':
        df = pd.read_csv('drugsComTrain_raw.tsv', sep='\t')
        # drop=True: the old index is not needed and would otherwise pile up
        # as extra 'index' / 'level_0' columns.
        to_remove = np.random.choice(df[df['rating']>=7].index, size=92510, replace=False)
        df = df.drop(to_remove).reset_index(drop=True)
        to_remove = np.random.choice(df[df['rating']<=4].index, size=25719, replace=False)
        df = df.drop(to_remove).reset_index(drop=True)
        df = _bucket_ratings(df)
        return pd.DataFrame(df['review']), df['rating'].to_numpy()

    if mode == 'test':
        df = pd.read_csv('drugsComTest_raw.tsv', sep='\t')
        df = _bucket_ratings(df)
        return pd.DataFrame(df['review']), pd.get_dummies(df['rating']).to_numpy()

    # Previously an unknown mode fell through and returned None, which only
    # surfaced later as an unpacking error at the call site.
    raise ValueError("mode must be 'train' or 'test', got {!r}".format(mode))

In [0]:
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    # st_w = set(stopwords.words('english'))
    st_w = ['ourselves', 'hers', 'between', 'yourself', 'again', 'there', 'about', 'once', 'during', 'out', 'very', 
            'having', 'with', 'they', 'own', 'an', 'be', 'some', 'for','its', 'yours', 'such', 'into', 'of', 'most', 
            'itself', 'other', 'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 'themselves', 'until', 
            'below',  'we', 'these', 'your', 'his', 'through', 'me', 'were', 'her', 'more', 'himself', 
            'this', 'down',  'our', 'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'she', 'all', 
             'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on',  
            'yourselves', 'then', 'that', 'what', 'over', 'why', 'so', 'now', 'under', 
            'he', 'you', 'herself', 'just', 'where', 'too', 'only', 'myself', 'which', 'those', 'i', 'after', 
            'few', 'whom', 'being', 'if', 'theirs', 'my',  'a', 'by', 'doing', 'it', 'how', 'further', 'here', 'than']
    try:
        string = re.sub(r"\\", "", string)    
        string = re.sub(r"\'", "", string)    
        string = re.sub(r"\"", "", string) 
        string = re.sub(r"&#039;", "'",string)
        string = re.sub(r"\r","",string)
        string = re.sub(r"\n","",string)
        string = string.split()
        string = ' '.join([w for w in string if w not in st_w])
    except:
        print(string)
    return string.strip().lower()

In [0]:
def f1(y_true, y_pred):
    """F1 metric computed over the whole batch.

    Counts are obtained by clipping to [0, 1] and rounding, then summing over
    every element of the batch, so this is a batch-wise value rather than a
    per-class average. ``K.epsilon()`` is added to every denominator to avoid
    division by zero.
    """
    eps = K.epsilon()

    # Elements that are both predicted and actually positive.
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    # Everything the model marked positive / everything that is positive.
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))

    precision = true_positives / (predicted_positives + eps)
    recall = true_positives / (possible_positives + eps)
    return 2*((precision*recall)/(precision+recall+eps))

In [0]:
# Training reviews and their labels as returned by load_data.
train,labels = load_data('train')

# texts[n] is the cleaned review n; paras[n] is that review split into sentences.
texts = [clean_str(review) for review in train['review']]
paras = [tokenize.sent_tokenize(cleaned) for cleaned in texts]

In [0]:
# Encode every training review as a (MAX_SENT_NUM, MAX_SENT_LEN) grid of
# word2vec vocabulary indices; unused slots stay 0.
data = np.zeros((len(texts), MAX_SENT_NUM, MAX_SENT_LEN), dtype='int32')
w2v_model = KeyedVectors.load_word2vec_format('PMC-w2v.bin',binary=True)
w2v_vocab = w2v_model.wv.vocab
for i, sentences in enumerate(paras):
    # Sentences beyond MAX_SENT_NUM are ignored.
    for j, sent in enumerate(sentences[:MAX_SENT_NUM]):
        # NOTE(review): slot 0 of every sentence receives the document index,
        # not a token id. Kept as-is so this encoding stays identical to the
        # one used for the test set -- confirm whether it is intended.
        data[i, j, 0] = i
        k = 1
        for word in text_to_word_sequence(sent):
            # Stop once the sentence row is full. The former `k <= MAX_SENT_LEN`
            # check allowed an out-of-range write whose IndexError was hidden
            # by a bare `except: pass`.
            if k >= MAX_SENT_LEN:
                break
            if word in w2v_vocab:
                word_index = w2v_vocab[word].index
                # Out-of-vocabulary and too-rare words are skipped entirely.
                if word_index < MAX_FEATURES:
                    data[i, j, k] = word_index
                    k += 1

# Persist one file per review; the folder is created here so the cell does
# not depend on a separate shell cell having been run.
os.makedirs('data', exist_ok=True)
for i in range(len(data)):
    np.save("data/out"+str(i),data[i])

In [0]:
# Shuffle the review indices and split them 80/20 into train and validation.
idx = np.arange(len(data))
np.random.shuffle(idx)
train_samples = int(0.8*(len(data)))
partition = {
    'train': idx[:train_samples],
    'validation': idx[train_samples:],
}

# Review index -> label, the lookup shape the data generator expects.
label_dict = {doc_id: labels[doc_id] for doc_id in range(len(data))}

In [0]:
# Keyword arguments shared by the training and validation generators.
# n_classes follows OUTPUT_DIM so the one-hot label width always matches the
# width of the model's softmax layer instead of a second hard-coded 3.
params = {'dim': (MAX_SENT_NUM,MAX_SENT_LEN),
          'batch_size': 128,
          'n_classes': OUTPUT_DIM,
          'shuffle': True}

In [0]:
# One generator per split, both built with the shared params
# (training first, then validation).
training_generator, validation_generator = (
    DataGenerator(partition[split], label_dict, **params)
    for split in ('train', 'validation')
)

In [0]:
# word-level encoder: turns one sentence (MAX_SENT_LEN token ids) into a
# single vector via embedding -> BiLSTM -> per-step Dense -> attention.

# The same L2 regularizer object is shared by all LSTM/Dense kernels below.
l2_reg = regularizers.l2(REG_PARAM)
embedding_layer = create_embed()
# Token ids are declared as float32 here; the generator also yields float arrays.
word_input = Input(shape=(MAX_SENT_LEN,),dtype='float32')
word_sequences = embedding_layer(word_input)
word_lstm = Bidirectional(LSTM(200,return_sequences = True,kernel_regularizer=l2_reg))(word_sequences)
word_dense = TimeDistributed(Dense(200,kernel_regularizer=l2_reg))(word_lstm)
# Attention collapses the steps axis, leaving one vector per sentence.
word_att = AttentionWithContext()(word_dense)
wordEncoder = Model(word_input,word_att)

# sentence-level encoder: applies wordEncoder to each of the MAX_SENT_NUM
# sentences of a review, then repeats BiLSTM -> Dense -> attention over sentences.
sentence_input = Input(shape=(MAX_SENT_NUM,MAX_SENT_LEN),dtype='float32')
sentence_encoder = TimeDistributed(wordEncoder)(sentence_input)
sentence_lstm = Bidirectional(LSTM(200,return_sequences=True,kernel_regularizer=l2_reg))(sentence_encoder)
sentence_dense = TimeDistributed(Dense(200,kernel_regularizer=l2_reg))(sentence_lstm)
# Dropout (rate 0.5) is applied to the attended review vector.
sentence_att = Dropout(0.5)(AttentionWithContext()(sentence_dense))

# Fully connected head: 50 ReLU units, then a softmax over OUTPUT_DIM classes.
preds_1 = Dense(50,activation='relu')(sentence_att)
preds = Dense(OUTPUT_DIM,activation='softmax')(preds_1)

In [0]:
# Full review-level model: encoded review in, class probabilities out.
model = Model(sentence_input, preds)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['acc', f1],
)

# Keep only the weights with the best validation loss on disk ...
checkpoint = ModelCheckpoint(
    'han_pmc_unskewed.h5',
    monitor='val_loss',
    mode='auto',
    save_best_only=True,
    verbose=0,
)
# ... and stop after 5 epochs without improvement, restoring the best weights.
earlystop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    min_delta=0,
    restore_best_weights=True,
    verbose=0,
)

history = model.fit_generator(
    generator=training_generator,
    validation_data=validation_generator,
    epochs=NUM_EPOCHS,
    callbacks=[checkpoint, earlystop],
    use_multiprocessing=True,
)


In [0]:
# Reload the checkpoint file written by ModelCheckpoint during training.
# custom_objects maps the names of the custom layer and metric to their
# definitions in this notebook so the saved model can be rebuilt.
model = load_model('han_pmc_unskewed.h5',custom_objects={'AttentionWithContext':AttentionWithContext,'f1':f1})

In [0]:
# Test reviews plus their labels; in 'test' mode load_data returns the labels
# as a one-hot matrix (pd.get_dummies of the rating column).
test, test_labels = load_data('test')

In [0]:
# test_texts[n] is the cleaned test review n; test_paras[n] is its sentence list.
test_texts, test_paras = [], []

# test_text / test_sentences stay bound to the last review after the loop,
# exactly as before.
for review in test['review']:
    test_text = clean_str(review)
    test_sentences = tokenize.sent_tokenize(test_text)
    test_texts.append(test_text)
    test_paras.append(test_sentences)

In [0]:
# Encode every test review as a (MAX_SENT_NUM, MAX_SENT_LEN) grid of word2vec
# vocabulary indices, mirroring the encoding of the training data.
test_data = np.zeros((len(test_texts), MAX_SENT_NUM, MAX_SENT_LEN), dtype='int32')
w2v_vocab = w2v_model.wv.vocab
for i, sentences in enumerate(test_paras):
    # Iterate over THIS review's sentences. The loop used to walk the stale
    # `test_sentences` variable, i.e. the sentences of the last test review,
    # for every review.
    for j, sent in enumerate(sentences[:MAX_SENT_NUM]):
        # NOTE(review): slot 0 of every sentence receives the document index,
        # not a token id. Kept because the training data is encoded the same
        # way -- confirm whether it is intended.
        test_data[i, j, 0] = i
        k = 1
        for word in text_to_word_sequence(sent):
            # Stop once the sentence row is full. The former `k <= MAX_SENT_LEN`
            # check allowed an out-of-range write whose IndexError was hidden
            # by a bare `except: pass`.
            if k >= MAX_SENT_LEN:
                break
            if word in w2v_vocab:
                word_index = w2v_vocab[word].index
                if word_index < MAX_FEATURES:
                    # Write into test_data. The token ids used to go into the
                    # training array `data`, leaving test_data without tokens.
                    test_data[i, j, k] = word_index
                    k += 1

In [0]:
# Score the encoded test reviews in batches of 500; the model's last layer is
# a softmax, so each row of `pred` holds one score per class.
pred = model.predict(test_data,batch_size=500)

In [0]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

# Collapse score rows / one-hot rows to class ids.
pred_labels = np.argmax(pred,axis=1)
act_labels = np.argmax(test_labels,axis=1)

# Headline numbers first, then the per-class breakdowns.
accuracy = accuracy_score(act_labels, pred_labels)
weighted_f1 = f1_score(act_labels, pred_labels, average='weighted')
print('Acc: ', accuracy)
print('F1: ', weighted_f1)
for report in (confusion_matrix, classification_report):
    print(report(act_labels, pred_labels))