In [0]:
!wget nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip
!wget http://evexdb.org/pmresources/vec-space-models/PMC-w2v.bin
!unzip drugsCom_raw.zip
!rm *.zip
!ls

--2019-11-14 17:51:00--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2019-11-14 17:51:01--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2019-11-14 17:51:01--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2019-1

In [0]:
!mkdir data

In [0]:
import numpy as np
import pandas as pd
import keras
from keras.preprocessing.text import Tokenizer,  text_to_word_sequence
from keras.layers import Embedding, Input, Dense, LSTM,Bidirectional
from keras import backend as K
from keras.models import Model
import nltk
from gensim.models import KeyedVectors
nltk.download('stopwords')
from sklearn.metrics import accuracy_score, f1_score
import re
from keras import regularizers
import tensorflow as tf
from nltk import tokenize
from nltk.corpus import stopwords
from keras.callbacks import EarlyStopping,ModelCheckpoint
from random import shuffle
from keras.models import load_model
%tensorflow_version 1.x

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
MAX_FEATURES=200000
MAX_SENT_LEN=100
EMBED_SIZE=50
NUM_EPOCHS=25
REG_PARAM = 1e-13
OUTPUT_DIM = 3

In [0]:
class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, list_IDs, labels, batch_size=32, dim=(32,32,32),
                 n_classes=3, shuffle=True):
        'Initialization'
        self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.list_IDs = list_IDs
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        # Generate data
        X, y = self.__data_generation(list_IDs_temp)

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        'Generates data containing batch_size samples' # X : (n_samples, *dim)
        # Initialization
        X = np.empty((self.batch_size, *self.dim))
        y = np.empty((self.batch_size), dtype=int)

        # Generate data
        for i, ID in enumerate(list_IDs_temp):
            # Store sample
            X[i,] = np.load('data/out' + str(ID) + '.npy').reshape(MAX_SENT_NUM,MAX_SENT_LEN)

            # Store class
            y[i] = self.labels[ID]

        return X, keras.utils.to_categorical(y, num_classes=self.n_classes)

In [0]:
def create_embed():
  embed_model = KeyedVectors.load_word2vec_format('PMC-w2v.bin',binary=True)
  return embed_model.get_keras_embedding(train_embeddings=False)

In [0]:
def load_data(mode='train'):
    if mode=='train':
        df = pd.read_csv('drugsComTrain_raw.tsv',sep='\t')
        to_remove = np.random.choice(df[df['rating']>=7].index,size=92510,replace=False)
        df = df.drop(to_remove).reset_index()
        to_remove = np.random.choice(df[df['rating']<=4].index,size=25719,replace=False)
        df = df.drop(to_remove).reset_index()
        if OUTPUT_DIM==3:
          df = df.replace(to_replace={1:'negative',2:'negative',3:'negative',4:'negative',5:'neutral',6:'neutral',7:'positive',8:'positive',9:'positive',10:'positive'})
          df = df.replace(to_replace={'positive':2,'neutral':1,'negative':0})
        return pd.DataFrame(df['review']),df['rating'].to_numpy()
    if mode=='test':
        df = pd.read_csv('drugsComTest_raw.tsv', sep='\t')
        if OUTPUT_DIM==3:
          df = df.replace(to_replace={1:'negative',2:'negative',3:'negative',4:'negative',5:'neutral',6:'neutral',7:'positive',8:'positive',9:'positive',10:'positive'})
          df = df.replace(to_replace={'positive':2,'neutral':1,'negative':0})
        return pd.DataFrame(df['review']), pd.get_dummies(df['rating']).to_numpy()

In [0]:
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    st_w = ['ourselves', 'hers', 'between', 'yourself', 'again', 'there', 'about', 'once', 'during', 'out',
            'having', 'with', 'they', 'own', 'an', 'be', 'some', 'for','its', 'yours', 'such', 'into', 'of', 
            'itself', 'other', 'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 'themselves', 'until', 
            'below',  'we', 'these', 'your', 'his', 'through', 'me', 'were', 'her',  'himself', 
            'this', 'down',  'our', 'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'she', 'all', 
             'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on',  
            'yourselves', 'then', 'that', 'what', 'over', 'why', 'so', 'now', 'under', 
            'he', 'you', 'herself', 'just', 'where', 'too', 'only', 'myself', 'which', 'those', 'i', 'after', 
            'few', 'whom', 'being', 'if', 'theirs', 'my',  'a', 'by', 'doing', 'it', 'how',  'here', 'than']
    try:
        string = re.sub(r"\\", "", string)    
        string = re.sub(r"\'", "", string)    
        string = re.sub(r"\"", "", string) 
        string = re.sub(r"&#039;", "'",string)
        string = re.sub(r"\r","",string)
        string = re.sub(r"\n","",string)
        string = string.split()
        string = ' '.join([w for w in string if w not in st_w])
    except:
        print(string)
    return string.strip().lower()

In [0]:
texts = []
train,labels = load_data('train')
for idx in range(train['review'].shape[0]):
    text = clean_str(train['review'][idx])
    texts.append(text)

In [0]:
tokenizer = Tokenizer(num_words=MAX_FEATURES, oov_token=True)
tokenizer.fit_on_texts(texts)

In [0]:
data = np.zeros((len(texts), MAX_SENT_LEN), dtype='int32')
for i, sent in enumerate(texts):
    wordTokens = text_to_word_sequence(sent)
    k=0
    for _, word in enumerate(wordTokens):
        try:
          if k<MAX_SENT_LEN:
              data[i,k] = tokenizer.word_index[word]
              k=k+1
        except:
              print(word)
              pass
for i in range(len(data)):
  np.save("data/out"+str(i),data[i])              

In [0]:
l2_reg = regularizers.l2(REG_PARAM)
word_index= tokenizer.word_index
embedding_layer = create_embed()
word_input = Input(shape=(MAX_SENT_LEN,), dtype='float32')
word_sequences = embedding_layer(word_input)
word_lstm = Bidirectional(LSTM(150, return_sequences=False, kernel_regularizer=l2_reg))(word_sequences)
preds = Dense(OUTPUT_DIM,activation='softmax')(word_lstm)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL













In [0]:
model = Model(word_input,preds)
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['acc'])
earlystop = EarlyStopping(monitor='val_loss',min_delta=0,patience=50,verbose=0,restore_best_weights=True)
checkpoint = ModelCheckpoint('textrnn.h5', verbose=0, monitor='val_loss',save_best_only=True, mode='auto') 
data_labels = list(zip(data,labels))
shuffle(data_labels)
data,labels = zip(*data_labels)
data = np.array(data)
labels = np.array(labels)
history = model.fit(data[1000:5000],labels[1000:5000],validation_data=(data[:1000],labels[:1000]),epochs=20, batch_size=500)#,callbacks=[earlystop,checkpoint]

Train on 4000 samples, validate on 1000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [0]:
test, test_labels = load_data('test')

In [0]:
test_texts=[]
for idx in range(test['review'].shape[0]):
    text = clean_str(test['review'][idx])
    test_texts.append(text)

test_data = np.zeros((len(test_texts), MAX_SENT_LEN), dtype='int32')
for i, sent in enumerate(test_texts):
    wordTokens = text_to_word_sequence(sent)
    k=0
    for _, word in enumerate(wordTokens):
        try:
            if k<MAX_SENT_LEN:
                test_data[i,k] = tokenizer.word_index[word]
                k=k+1
        except:
            # print(word)
            pass

In [0]:
score = model.evaluate(test_data, test_labels, batch_size=500)



In [0]:
print(score)

[0.9142238375943974, 0.6731949540066656]


In [0]:
pred = model.predict(test_data,batch_size=1000)

In [0]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
pred_labels = np.argmax(pred,axis=1)
act_labels = np.argmax(test_labels,axis=1)
print('Acc: ',accuracy_score(act_labels,pred_labels))
print('F1: ',f1_score(act_labels,pred_labels,average='weighted'))
print(confusion_matrix(act_labels,pred_labels))
print(classification_report(act_labels,pred_labels))