In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import Conv1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
# Fix NumPy's global random seed so NumPy-driven randomness repeats across runs.
# NOTE(review): only NumPy is seeded here; the TensorFlow backend presumably keeps
# its own RNG state, so training results may still vary between runs — confirm.
np.random.seed(7)
%matplotlib inline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


## Working with Original Data Format

In [2]:
# Load the labelled reviews from a CSV resolved relative to the working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like a
# saved row index — passing index_col=0 would presumably drop it; confirm first.
data = pd.read_csv('stanford_movie_data.csv')
# Preview the first rows (review text plus a sentiment label shown as 0/1).
data.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [3]:
# Positional split: the first 25,000 rows are used for training, the rest are held out.
# NOTE(review): these slices are taken before the later cell that overwrites
# data['review'] with preprocessed text, so they appear to keep the raw,
# uncleaned reviews — confirm that this is intended.
X_train = data['review'].iloc[0:25000]
y_train = data['sentiment'].iloc[0:25000]

X_test = data['review'].iloc[25000:]
y_test = data['sentiment'].iloc[25000:]

In [4]:
# Sanity check: every word in the Keras IMDB word index maps to a distinct integer id.
# The index is fetched once and reused instead of calling get_word_index() twice.
imdb_word_ids = list(imdb.get_word_index().values())
len(set(imdb_word_ids)) == len(imdb_word_ids)

True

In [5]:
import re # regex library
def preprocessor(text):
    """Normalise a raw review string for tokenisation.

    Steps, in order:
      1. remove HTML markup tags;
      2. collect emoticons such as ``:)``, ``;-(``, ``=D`` from the tag-free text;
      3. lowercase the text and collapse every run of non-word characters
         into a single space;
      4. append the collected emoticons (with any ``-`` nose removed),
         space-separated, at the end.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML.

    Returns
    -------
    str
        The cleaned text followed by the emoticons found in it.
    """
    # Raw strings: '\W', '\)' and '\(' are invalid escapes in a normal literal.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'\W+', ' ', text.lower())
    if emoticons:
        # Keep the first emoticon from being glued onto the final word
        # (e.g. "great:) film" used to become "great film:)").
        if cleaned and not cleaned.endswith(' '):
            cleaned += ' '
        cleaned += ' '.join(emoticons).replace('-', '')
    return cleaned

In [6]:
# Replace every review with its cleaned form produced by `preprocessor`.
# NOTE(review): this overwrites the raw text column, while X_train/X_test were
# sliced in an earlier cell — they presumably still hold the uncleaned text; verify.
data['review'] = data['review'].apply(preprocessor)

In [7]:
# Build a word -> integer index over the whole review column (all rows of `data`).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['review'])

In [8]:
# Number of distinct words the tokenizer indexed.
len(tokenizer.word_index)

103893

In [9]:
def text_to_int_sequence(text):
    """Convert a text into the list of integer ids of its words.

    The text is split with Keras' ``text_to_word_sequence`` and each word is
    looked up in the notebook-level ``tokenizer.word_index``. Words that are
    not in the index are skipped rather than raising ``KeyError``, so the
    function is safe on text the tokenizer was not fitted on.

    Parameters
    ----------
    text : str
        Text to encode.

    Returns
    -------
    list of int
        Word ids in order of appearance (unknown words omitted).
    """
    word_index = tokenizer.word_index
    return [word_index[word] for word in text_to_word_sequence(text) if word in word_index]

In [10]:
from keras.callbacks import Callback
class ValAccCheck(Callback):
    """Keras callback that evaluates the model on held-out data every ``interval`` batches.

    After every ``interval``-th training batch the model is evaluated on the
    supplied validation data; the resulting loss and accuracy are appended to
    ``val_losses`` / ``val_accs`` and printed.

    Parameters
    ----------
    validation_data : sequence
        ``(x_valid, y_valid)``; only the first two items are used.
    interval : int, default 25
        Number of training batches between two evaluations. Must be >= 1.

    Raises
    ------
    ValueError
        If ``interval`` is smaller than 1.

    Notes
    -----
    The model is expected to be compiled with exactly one metric so that
    ``model.evaluate`` returns ``(loss, acc)``. The epoch-level and
    batch-begin hooks are inherited unchanged from ``Callback``.
    """

    def __init__(self, validation_data, interval=25):
        super().__init__()
        if interval < 1:
            raise ValueError('interval must be a positive integer, got {!r}'.format(interval))
        self.validation_data = validation_data
        # Private reference used for the actual evaluation.
        # NOTE(review): some Keras versions appear to assign their own value to
        # `validation_data` on registered callbacks during fit — confirm for the
        # installed version. The private copy keeps the evaluation set stable.
        self._eval_data = validation_data
        self.val_accs = []
        self.val_losses = []
        self.interval = interval
        # Defined here as well so on_batch_end never sees a missing attribute.
        self.steps_completed = 0

    def on_train_begin(self, logs=None):
        # Restart the batch counter for each fit() call; histories keep accumulating.
        self.steps_completed = 0

    def on_batch_end(self, batch, logs=None):
        self.steps_completed += 1
        if self.steps_completed % self.interval != 0:
            return

        x_valid, y_valid = self._eval_data[0], self._eval_data[1]
        loss, acc = self.model.evaluate(x_valid, y_valid, verbose=0)
        self.val_accs.append(acc)
        self.val_losses.append(loss)
        print('\n')
        print('val_acc: {0}, val_loss: {1}'.format(acc, loss))

In [32]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import accuracy_score
from keras.preprocessing.text import hashing_trick

class LSTM_Sentiment_Classifier(BaseEstimator, ClassifierMixin):
    """Binary text classifier: Embedding -> optional Conv1D stages -> LSTM stack -> sigmoid.

    Parameters
    ----------
    embedding_vector_length : int
        Size of the learned word-embedding vectors.
    max_seq_length : int
        Every text is padded/truncated to this many tokens.
    lstm_layers : sequence of int
        Number of units of each stacked LSTM layer (at least one entry).
    batch_size : int, default 32
        Mini-batch size used for training.
    num_epochs : int, default 3
        Maximum number of training epochs (early stopping may end sooner).
    use_hash : bool, default False
        If True, encode words with Keras' ``hashing_trick`` (md5) instead of a
        fitted ``Tokenizer``.
    dropout : float or None, default None
        Dropout rate applied after the embedding, between LSTM layers and
        after the last LSTM. ``None`` disables dropout entirely.
    conv_params : dict or None, default None
        If given, two Conv1D + MaxPooling1D stages are inserted before the
        LSTMs. Required keys: ``'filters'``, ``'kernel_size'``, ``'pool_size'``.

    Attributes set by ``fit``
    -------------------------
    model : the compiled and trained Keras ``Sequential`` model.
    max_vocab : int, size of the embedding's input dimension.
    val_loss_history, val_acc_history : list of float
        Per-epoch validation loss / accuracy reported by Keras.

    Notes
    -----
    Constructor arguments are stored under their own names (``lstm_layers``,
    ``use_hash``) so that scikit-learn's ``get_params`` / ``clone`` / ``repr``
    work. The previous attribute names ``lstm_layer_sizes`` and
    ``use_hashing_trick`` remain available as aliases.
    """

    def __init__(self, embedding_vector_length, max_seq_length, lstm_layers, batch_size=32, num_epochs=3, use_hash=False,
                 dropout=None, conv_params=None):

        self.embedding_vector_length = embedding_vector_length
        self.max_seq_length = max_seq_length
        self.lstm_layers = lstm_layers
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.use_hash = use_hash
        if not use_hash:
            self.tokenizer = Tokenizer()
        self.dropout = dropout
        self.conv_params = conv_params

    # ------------------------------------------------------------------
    # Backward-compatible aliases for the attribute names used previously.
    # ------------------------------------------------------------------
    @property
    def lstm_layer_sizes(self):
        return self.lstm_layers

    @lstm_layer_sizes.setter
    def lstm_layer_sizes(self, value):
        self.lstm_layers = value

    @property
    def use_hashing_trick(self):
        return self.use_hash

    @use_hashing_trick.setter
    def use_hashing_trick(self, value):
        self.use_hash = value

    # ------------------------------------------------------------------
    # Text encoding
    # ------------------------------------------------------------------
    def _text_to_int_sequence(self, text):
        """Map a text to the ids of its words; words unknown to the tokenizer are skipped."""
        word_index = self.tokenizer.word_index
        return [word_index[word] for word in text_to_word_sequence(text) if word in word_index]

    def _encode(self, X):
        """Turn a string, a Series/iterable of strings or a one-column DataFrame
        into a padded integer matrix of shape (n_texts, max_seq_length)."""
        if isinstance(X, str):
            X = [X]  # a single text is a batch of one
        elif isinstance(X, pd.DataFrame):
            X = X.iloc[:, 0]  # texts are taken from the first column

        if self.use_hash:
            sequences = [hashing_trick(text, self.max_vocab, hash_function='md5') for text in X]
        else:
            sequences = [self._text_to_int_sequence(text) for text in X]
        return sequence.pad_sequences(sequences, maxlen=self.max_seq_length)

    # ------------------------------------------------------------------
    # Model construction
    # ------------------------------------------------------------------
    def _build_model(self):
        """Assemble and compile the network described by the constructor arguments."""
        layer_sizes = list(self.lstm_layers)
        if not layer_sizes:
            raise ValueError('lstm_layers must contain at least one layer size')

        model = Sequential()
        model.add(Embedding(self.max_vocab, self.embedding_vector_length, input_length=self.max_seq_length))
        if self.dropout is not None:
            model.add(Dropout(self.dropout))

        if self.conv_params is not None:
            # Two conv stages; the second one doubles the number of filters.
            for multiplier in (1, 2):
                model.add(Conv1D(filters=multiplier * self.conv_params['filters'],
                                 kernel_size=self.conv_params['kernel_size'], padding='same', activation='relu'))
                model.add(MaxPooling1D(pool_size=self.conv_params['pool_size']))

        # All but the last LSTM return full sequences so they can be stacked.
        for layer_size in layer_sizes[:-1]:
            model.add(LSTM(layer_size, return_sequences=True))
            if self.dropout is not None:
                model.add(Dropout(self.dropout))
        model.add(LSTM(layer_sizes[-1]))
        if self.dropout is not None:
            model.add(Dropout(self.dropout))

        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

    # ------------------------------------------------------------------
    # scikit-learn API
    # ------------------------------------------------------------------
    def fit(self, X, y, validation_data):
        """Fit the text encoder and train the network.

        Parameters
        ----------
        X : pandas.Series of str
            Training texts. The caller's Series is not modified.
        y : array-like of {0, 1}
            Training labels.
        validation_data : tuple
            ``(X_valid, y_valid)``; used for Keras validation and early stopping.
            The validation texts also contribute to the vocabulary.

        Returns
        -------
        self
        """
        X_valid, y_valid = validation_data[0], validation_data[1]
        all_X = pd.concat([X, X_valid])

        if self.use_hash:
            vocabulary = set()
            for text in all_X:
                vocabulary.update(text_to_word_sequence(text))
            # Head-room over the observed vocabulary to reduce hash collisions;
            # must be an int for both hashing_trick and the Embedding layer.
            self.max_vocab = int(len(vocabulary) * 1.3)
        else:
            print('Fitting Tokenizer...')
            # Fresh tokenizer so repeated fit() calls do not accumulate word counts.
            self.tokenizer = Tokenizer()
            self.tokenizer.fit_on_texts(all_X)
            self.max_vocab = len(self.tokenizer.word_index) + 20

        X_pad = self._encode(X)
        X_valid_pad = self._encode(X_valid)

        self.model = self._build_model()
        early_stopping = EarlyStopping(monitor='val_loss',
                                       min_delta=0,
                                       patience=1,
                                       verbose=0, mode='auto')
        self.model.summary()  # summary() prints by itself

        print('Fitting model...')
        history = self.model.fit(X_pad, y, validation_data=(X_valid_pad, y_valid),
                                 epochs=self.num_epochs, batch_size=self.batch_size,
                                 callbacks=[early_stopping])

        # Per-epoch validation curves as reported by Keras. The accuracy key is
        # 'val_acc' or 'val_accuracy' depending on the Keras version.
        recorded = history.history
        self.val_loss_history = list(recorded.get('val_loss', []))
        self.val_acc_history = list(recorded.get('val_acc', recorded.get('val_accuracy', [])))
        return self

    def predict_proba(self, X):
        """Return the model's sigmoid output for each text.

        Parameters
        ----------
        X : str, pandas.Series, one-column DataFrame or iterable of str

        Returns
        -------
        numpy.ndarray of shape (n_texts, 1)
            Probability of the positive class.
        """
        return self.model.predict(self._encode(X))

    def predict(self, X):
        """Return the predicted class label (0 or 1) for each text.

        A text is labelled 1 when its predicted probability exceeds 0.5.

        Returns
        -------
        numpy.ndarray of shape (n_texts,), dtype int32
        """
        return (self.predict_proba(X) > 0.5).astype('int32').ravel()

    def score(self, X, y):
        """Return the classification accuracy of ``predict(X)`` against ``y``."""
        return accuracy_score(y, self.predict(X))

In [33]:
# Conv + stacked-LSTM classifier: 32-d embeddings, sequences fixed at 500 tokens,
# two Conv1D/MaxPooling stages, then two 128-unit LSTM layers with 0.2 dropout.
# NOTE(review): the test split is passed as validation data and therefore drives
# early stopping inside fit(); it is not an untouched hold-out — confirm intent.
lstm_classifier = LSTM_Sentiment_Classifier(embedding_vector_length=32, max_seq_length=500, dropout=0.2, 
                                            lstm_layers=[128,128], num_epochs=2, use_hash=False,
                                           conv_params={'filters':32, 'kernel_size':3, 'pool_size':2})
lstm_classifier.fit(X_train, y_train, validation_data=(X_test, y_test))

Fitting Tokenizer...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 500, 32)           3325216   
_________________________________________________________________
dropout_40 (Dropout)         (None, 500, 32)           0         
_________________________________________________________________
conv1d_31 (Conv1D)           (None, 500, 32)           3104      
_________________________________________________________________
max_pooling1d_28 (MaxPooling (None, 250, 32)           0         
_________________________________________________________________
conv1d_32 (Conv1D)           (None, 250, 64)           6208      
_________________________________________________________________
max_pooling1d_29 (MaxPooling (None, 125, 64)           0         
_________________________________________________________________
lstm_28 (LSTM)               (None, 125, 128)          

## Very Deep CNNs for Text Classification (VDCNN)

In [29]:
from keras.engine import Layer, InputSpec
from keras.layers import Flatten
import tensorflow as tf

class KMaxPooling(Layer):
    """
    K-max pooling layer that extracts the k-highest activations from a sequence (2nd dimension).
    TensorFlow backend.

    Input is a 3-D tensor whose 2nd dimension is the sequence axis and whose
    3rd dimension is the channel axis. For every channel the ``k`` largest
    values along the sequence are kept (in descending order) and the result is
    flattened to 2-D with ``channels * k`` features per sample.

    Parameters
    ----------
    k : int, default 1
        Number of top activations kept per channel.
    **kwargs
        Forwarded to the Keras ``Layer`` constructor.
    """
    def __init__(self, k=1, **kwargs):
        super().__init__(**kwargs)
        self.input_spec = InputSpec(ndim=3)
        self.k = k

    def compute_output_shape(self, input_shape):
        # (batch, channels * k): k values are kept for each channel.
        return (input_shape[0], (input_shape[2] * self.k))

    def call(self, inputs):

        # swap last two dimensions since top_k will be applied along the last dimension
        shifted_input = tf.transpose(inputs, [0, 2, 1])

        # extract top_k, returns two tensors [values, indices]; only the values are used
        top_k = tf.nn.top_k(shifted_input, k=self.k, sorted=True, name=None)[0]

        # return flattened output
        return Flatten()(top_k)

    def get_config(self):
        # Include `k` so the layer can be rebuilt when a model is saved and reloaded.
        config = super().get_config()
        config['k'] = self.k
        return config

In [41]:
from keras.layers import BatchNormalization
from keras.utils.np_utils import to_categorical
from keras.optimizers import SGD

class VDCNN_Sentiment_Classifier(BaseEstimator, ClassifierMixin):
    """Binary text classifier built from a deep stack of 1-D convolutions.

    Architecture: Embedding -> Conv1D(64) -> four groups of convolutional
    blocks (64, 128, 256, 512 filters; max-pooling between groups) ->
    k-max pooling (k=8) -> three dense layers with dropout -> one sigmoid unit.

    Parameters
    ----------
    n_classes : int
        Stored on the estimator but not used by the network, which always
        ends in a single sigmoid output.
    embedding_vector_length : int
        Size of the learned word-embedding vectors.
    max_seq_length : int
        Every text is padded/truncated to this many tokens.
    batch_size : int, default 32
        Mini-batch size used for training.
    num_epochs : int, default 3
        Maximum number of training epochs (early stopping may end sooner).
    dropout : float, default 0.2
        Dropout rate applied after each hidden dense layer.

    Attributes set by ``fit``
    -------------------------
    model : the compiled and trained Keras ``Sequential`` model.
    max_vocab : int, size of the embedding's input dimension.
    """

    def __init__(self, n_classes, embedding_vector_length, max_seq_length, batch_size=32, num_epochs=3, dropout=0.2):

        self.n_classes = n_classes
        self.tokenizer = Tokenizer()
        self.embedding_vector_length = embedding_vector_length
        self.max_seq_length = max_seq_length
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.dropout = dropout

    def _text_to_int_sequence(self, text):
        """Map a text to the ids of its words; words unknown to the tokenizer are skipped."""
        word_index = self.tokenizer.word_index
        return [word_index[word] for word in text_to_word_sequence(text) if word in word_index]

    def _encode(self, X):
        """Turn a string, a Series/iterable of strings or a one-column DataFrame
        into a padded integer matrix of shape (n_texts, max_seq_length)."""
        if isinstance(X, str):
            X = [X]  # a single text is a batch of one
        elif isinstance(X, pd.DataFrame):
            X = X.iloc[:, 0]  # texts are taken from the first column
        sequences = [self._text_to_int_sequence(text) for text in X]
        return sequence.pad_sequences(sequences, maxlen=self.max_seq_length)

    def _create_convolutional_block(self, num_filters, kernel_size=3):
        """Append two (Conv1D + BatchNormalization) pairs to ``self.model``."""
        for _ in range(2):
            self.model.add(Conv1D(filters=num_filters, kernel_size=kernel_size,
                                  padding='same', activation='relu'))
            self.model.add(BatchNormalization())

    def fit(self, X, y, validation_data):
        """Fit the tokenizer and train the network.

        Parameters
        ----------
        X : pandas.Series of str
            Training texts.
        y : array-like of {0, 1}
            Training labels.
        validation_data : tuple
            ``(X_valid, y_valid)``; used for Keras validation and early stopping.
            The validation texts also contribute to the vocabulary.

        Returns
        -------
        self
        """
        X_valid, y_valid = validation_data[0], validation_data[1]
        all_X = pd.concat([X, X_valid])

        print('Fitting Tokenizer...')
        # Fresh tokenizer so repeated fit() calls do not accumulate word counts.
        self.tokenizer = Tokenizer()
        self.tokenizer.fit_on_texts(all_X)
        self.max_vocab = len(self.tokenizer.word_index) + 20

        X_pad = self._encode(X)
        X_valid_pad = self._encode(X_valid)

        self.model = Sequential()
        self.model.add(Embedding(self.max_vocab, self.embedding_vector_length, input_length=self.max_seq_length))
        self.model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation=None))

        filter_sizes = [64, 128, 256, 512]

        for filter_size in filter_sizes:
            self._create_convolutional_block(filter_size)
            self._create_convolutional_block(filter_size)
            # Down-sample between groups; the last group feeds k-max pooling instead.
            if filter_size != filter_sizes[-1]:
                self.model.add(MaxPooling1D(pool_size=3, strides=2))

        self.model.add(KMaxPooling(k=8))
        self.model.add(Dense(4096, activation='relu'))
        self.model.add(Dropout(self.dropout))
        self.model.add(Dense(2048, activation='relu'))
        self.model.add(Dropout(self.dropout))
        self.model.add(Dense(2048, activation='relu'))
        self.model.add(Dropout(self.dropout))
        self.model.add(Dense(1, activation='sigmoid'))

        sgd = SGD(lr=0.001, decay=1e-6, momentum=0.9, nesterov=False)
        self.model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])

        early_stopping = EarlyStopping(monitor='val_loss',
                                       min_delta=0,
                                       patience=1,
                                       verbose=0, mode='auto')

        self.model.summary()  # summary() prints by itself

        print('Fitting model...')
        self.model.fit(X_pad, y, validation_data=(X_valid_pad, y_valid),
                       epochs=self.num_epochs, batch_size=self.batch_size,
                       callbacks=[early_stopping])
        return self

    def predict_proba(self, X):
        """Return the model's sigmoid output for each text.

        Parameters
        ----------
        X : str, pandas.Series, one-column DataFrame or iterable of str

        Returns
        -------
        numpy.ndarray of shape (n_texts, 1)
            Probability of the positive class.
        """
        return self.model.predict(self._encode(X))

    def predict(self, X):
        """Return the predicted class label (0 or 1) for each text.

        A text is labelled 1 when its predicted probability exceeds 0.5.

        Returns
        -------
        numpy.ndarray of shape (n_texts,), dtype int32
        """
        return (self.predict_proba(X) > 0.5).astype('int32').ravel()

    def score(self, X, y):
        """Return the classification accuracy of ``predict(X)`` against ``y``."""
        return accuracy_score(y, self.predict(X))

In [42]:
# Deep-CNN classifier with 32-d embeddings and sequences fixed at 500 tokens.
# NOTE(review): the test split is passed as validation data and therefore drives
# early stopping inside fit(); the recorded output below ends in a
# KeyboardInterrupt, so this run does not appear to have completed.
vdcnn_classifier = VDCNN_Sentiment_Classifier(n_classes=2, embedding_vector_length=32, 
                                              max_seq_length=500, num_epochs=3, batch_size=32)
vdcnn_classifier.fit(X_train, y_train, validation_data=(X_test, y_test))

Fitting Tokenizer...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 500, 32)           3325216   
_________________________________________________________________
conv1d_172 (Conv1D)          (None, 500, 64)           6208      
_________________________________________________________________
conv1d_173 (Conv1D)          (None, 500, 64)           12352     
_________________________________________________________________
batch_normalization_161 (Bat (None, 500, 64)           256       
_________________________________________________________________
conv1d_174 (Conv1D)          (None, 500, 64)           12352     
_________________________________________________________________
batch_normalization_162 (Bat (None, 500, 64)           256       
_________________________________________________________________
conv1d_175 (Conv1D)          (None, 500, 64)           

KeyboardInterrupt: 

## Simple Multilayer Perceptron with TF-IDF Features

In [23]:
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

import time
# Cap on the number of TF-IDF features kept by the vectorizer.
max_feats = 10000
# Word uni- and bi-gram TF-IDF features. Lowercasing is left to `preprocessor`
# (which already lowercases), hence lowercase=False here.
tfidf = TfidfVectorizer(strip_accents=None, 
                        lowercase=False,
                        preprocessor=preprocessor,
                        min_df=2,
                        max_features=max_feats,
                        ngram_range=(1,2))

# Vectorizer followed by a multilayer perceptron with three 1000-unit hidden
# layers and logistic activations.
mlp_pipeline = Pipeline([('tfidf', tfidf), ('mlp', MLPClassifier(hidden_layer_sizes=[1000,1000,1000],
                                                                max_iter=1000,
                                                                activation='logistic'))])


# Wall-clock timing of the full pipeline fit (vectorizing + MLP training).
start = time.time()
print('Training model...')
mlp_pipeline.fit(X_train, y_train)
end = time.time()

print('Time elapsed: {} seconds'.format(end - start))

Training model...


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Time elapsed: 117.02909994125366 seconds




In [22]:
from sklearn.metrics import accuracy_score, classification_report

# Accuracy of the fitted TF-IDF + MLP pipeline on the held-out reviews.
# NOTE(review): this cell's execution count (22) precedes the training cell's (23),
# so the printed score may come from an earlier fit — re-run top to bottom to confirm.
pred = mlp_pipeline.predict(X_test)
print(accuracy_score(y_test, pred))

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


0.8668
