In [None]:
# Import standalone Keras and display the installed version.
# NOTE(review): this cell does not inspect the backend itself. Older Keras
# releases presumably print a "Using TensorFlow backend." banner on import;
# confirm that banner (or check keras.backend.backend()) to verify that
# TensorFlow is the active backend.
import keras
keras.__version__

## Machine Learning Guide for Text Classification
This has been adapted from the [text classification guide](https://developers.google.com/machine-learning/guides/text-classification/).

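# S/W < 1500 -> MLP Model

Here S is the number of samples and W is the median number of words per sample. When the ratio S/W is below 1500, the guide recommends tokenizing the text as n-grams and training a simple multi-layer perceptron (MLP).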

In [None]:
import os
import random
import numpy as np
import sklearn
import tensorflow as tf
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.preprocessing import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

In [None]:
# The first step is to gather data. The more training examples you
# have, the better you will be able to train a generalized model.
# You should also make sure that the classes are not imbalanced:
# there should be a fairly even representation of every class.
# To illustrate this workflow we will use the IMDB dataset.
data_path = '/Users/anjalisridhar/kdd2018/workshop/datasets'
imdb_data_path = os.path.join(data_path, 'aclImdb')
seed = 123


def _load_imdb_split(split):
    """Read all reviews of one IMDB split from disk.

    Args:
        split: Sub-directory of `imdb_data_path` to read ('train' or 'test').

    Returns:
        A `(texts, labels)` pair of equal-length lists. All 'pos' reviews come
        first, then all 'neg' reviews, each in sorted file-name order.
        Labels are 1 for 'pos' and 0 for 'neg'.
    """
    texts = []
    labels = []
    for category in ['pos', 'neg']:
        split_path = os.path.join(imdb_data_path, split, category)
        for fname in sorted(os.listdir(split_path)):
            if fname.endswith('.txt'):
                # Decode explicitly as UTF-8 so the result does not depend
                # on the platform's default encoding.
                with open(os.path.join(split_path, fname),
                          encoding='utf-8') as f:
                    texts.append(f.read())
                labels.append(0 if category == 'neg' else 1)
    return texts, labels


# Load the training data.
train_texts, train_labels = _load_imdb_split('train')

# Load the validation data (the IMDB 'test' split is used for validation).
test_texts, test_labels = _load_imdb_split('test')

# Shuffle the training data and labels.
# The data gathered is ordered by class, so we shuffle it before doing
# anything else. Re-seeding with the same value before each shuffle applies
# the same permutation to both lists, keeping texts and labels aligned.
random.seed(seed)
random.shuffle(train_texts)
random.seed(seed)
random.shuffle(train_labels)

data = ((train_texts, np.array(train_labels)),
        (test_texts, np.array(test_labels)))

In [None]:
# Split the loaded data into its training and validation halves, then
# unpack each half into its texts and labels.
train_split, val_split = data
train_texts, train_labels = train_split
val_texts, val_labels = val_split

In [None]:
# Verify the training labels: they are expected to be the integers
# 0..num_classes-1, every one of which must occur at least once, and there
# must be at least two classes.
num_classes = max(train_labels) + 1
missing_classes = [i for i in range(num_classes) if i not in train_labels]
if len(missing_classes):
    raise ValueError('Missing samples with label value(s) '
                     '{missing_classes}. Please make sure you have '
                     'at least one sample for every label value '
                     'in the range(0, {max_class})'.format(
                        missing_classes=missing_classes,
                        max_class=num_classes - 1))

if num_classes <= 1:
    # Note the space after the first sentence so the message reads correctly.
    raise ValueError('Invalid number of labels: {num_classes}. '
                     'Please make sure there are at least two classes '
                     'of samples'.format(num_classes=num_classes))

In [None]:
# Gather every validation label that lies outside range(num_classes) and
# fail fast if there are any, since no such class was seen during training.
unexpected_labels = [
    label for label in val_labels if label not in range(num_classes)
]
if unexpected_labels:
    raise ValueError(
        ('Unexpected label values found in the validation set:'
         ' {unexpected_labels}. Please make sure that the '
         'labels in the validation set are in the same range '
         'as training labels.').format(unexpected_labels=unexpected_labels))

In [None]:
# Tokenization and Vectorization
# Vectorization parameters
# Range (inclusive) of n-gram sizes for tokenizing text.
NGRAM_RANGE = (1, 2)
# Whether text should be split into word or character n-grams.
# One of 'word', 'char'.
TOKEN_MODE = 'word'
# Minimum document/corpus frequency below which a token will be discarded.
MIN_DOCUMENT_FREQUENCY = 2
# Limit on the number of features. We use the top 20K features.
TOP_K = 20000
# Create keyword arguments to pass to the 'tf-idf' vectorizer.
kwargs = {
        'ngram_range': NGRAM_RANGE,  # Use 1-grams + 2-grams.
        # Tf-idf weights are fractional, so the matrix dtype must be a float
        # type. An integer dtype makes scikit-learn emit a warning and fall
        # back to float64 anyway, so request float64 explicitly.
        'dtype': np.float64,
        'strip_accents': 'unicode',
        'decode_error': 'replace',
        'analyzer': TOKEN_MODE,  # Split text into word tokens.
        'min_df': MIN_DOCUMENT_FREQUENCY,
}
# Tokenizing samples into unigrams + bigrams provides good accuracy
# while taking less compute time.
# We use Tf-idf encoding for vectorization. This does better than
# one-hot encoding and count encoding in terms of accuracy
# (on average: 0.25-15% higher). Tf-idf uses floating point
# representation and takes more time to compute and uses more
# memory.
vectorizer = TfidfVectorizer(**kwargs)

# Learn vocabulary from training texts and vectorize training texts.
x_train = vectorizer.fit_transform(train_texts)

# Vectorize validation texts with the vocabulary learned from training.
x_val = vectorizer.transform(val_texts)

# When we convert texts to tokens we may end up with a large
# number of tokens. We want to drop rarely occurring tokens
# as well as tokens that don't contribute heavily to label
# predictions.
# We use the `f_classif` function to identify the top 20K features.
# Select top 'k' of the vectorized features (or all of them if the
# vocabulary has fewer than TOP_K entries).
selector = SelectKBest(f_classif, k=min(TOP_K, x_train.shape[1]))
selector.fit(x_train, train_labels)
x_train = selector.transform(x_train)
x_val = selector.transform(x_val)

# Down-cast to float32 before feeding the features to the model.
x_train = x_train.astype('float32')
x_val = x_val.astype('float32')

In [None]:
# Create model instance.
# Hyperparameters for the MLP model.
# Step size handed to the optimizer. It must be a plain float: a trailing
# comma after the value would silently turn it into a one-element tuple.
learning_rate = 1e-3
# TODO: Do I have to train epochs=1000?
# Try for more epochs
epochs = 10
# Number of samples per gradient update.
batch_size = 128
# Total number of Dense layers, counting the output layer.
layers = 2
# Width of each hidden Dense layer.
units = 64
# Fraction of inputs dropped by each Dropout layer.
dropout_rate = 0.2
# Per-sample input shape: everything after the leading (sample) axis.
input_shape = x_train.shape[1:]

# Output layer: one sigmoid unit for two classes, otherwise one softmax
# unit per class.
if num_classes != 2:
    op_activation, op_units = 'softmax', num_classes
else:
    op_activation, op_units = 'sigmoid', 1

# Stack the network: input dropout, then (layers - 1) hidden Dense + Dropout
# pairs, and finally the output layer.
model = keras.models.Sequential()
model.add(keras.layers.Dropout(input_shape=input_shape, rate=dropout_rate))

for _ in range(layers - 1):
    model.add(keras.layers.Dense(activation='relu', units=units))
    model.add(keras.layers.Dropout(rate=dropout_rate))

model.add(keras.layers.Dense(activation=op_activation, units=op_units))

In [None]:
# Pick the loss that matches the output layer (binary cross-entropy for two
# classes, sparse categorical cross-entropy otherwise), then compile the
# model with an Adam optimizer and accuracy as the tracked metric.
loss = ('binary_crossentropy' if num_classes == 2
        else 'sparse_categorical_crossentropy')
optimizer = keras.optimizers.Adam(lr=learning_rate)
model.compile(loss=loss, optimizer=optimizer, metrics=['acc'])

In [None]:
# Create callback for early stopping on validation loss. If the loss does
# not decrease in two consecutive tries, stop training.
# The model is built with the standalone `keras` package, so the callback
# must come from the same package rather than from `tf.keras`.
callbacks = [keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=2)]

# Train and validate model.
history = model.fit(
        x_train,
        train_labels,
        epochs=epochs,
        callbacks=callbacks,
        validation_data=(x_val, val_labels),
        verbose=2,  # Logs once per epoch.
        batch_size=batch_size)

# Print results from the last completed epoch.
history = history.history
print('Validation accuracy: {acc}, loss: {loss}'.format(
        acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

# Save model.
model.save('imdb_mlp_model.h5')
print(history['val_acc'][-1], history['val_loss'][-1])

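# S/W > 1500 -> sepCNN Model

Here S is the number of samples and W is the median number of words per sample. When the ratio S/W is above 1500, the guide recommends tokenizing the text as sequences and training a separable convolutional network (sepCNN).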

In [None]:
import os, shutil, zipfile
import random

import numpy as np
import pandas as pd
import keras

In [None]:
# Settings for reading the Rotten Tomatoes movie reviews dataset, which is
# used to demonstrate training a sequence model.
data_path = '/Users/anjalisridhar/kdd2018/workshop/datasets'
file_name = 'rotten_tomatoes_train.tsv'
separator = '\t'
header = 0
columns = (2, 3)  # 2 - Phrases, 3 - Sentiment.
validation_split = 0.2
seed = 123

# Seed NumPy so that the row shuffle below is reproducible.
np.random.seed(seed)

# Read only the phrase and sentiment columns, then shuffle the rows.
data_path = os.path.join(data_path, file_name)
data = pd.read_csv(data_path, sep=separator, header=header, usecols=columns)
data = data.reindex(np.random.permutation(data.index))

# Get the review phrase and sentiment values.
texts = list(data['Phrase'])
labels = np.array(data['Sentiment'])

# The first (1 - validation_split) share of the shuffled rows is used for
# training; the remainder is held out for validation.
num_training_samples = int((1 - validation_split) * len(texts))
data = (
    (texts[:num_training_samples], labels[:num_training_samples]),
    (texts[num_training_samples:], labels[num_training_samples:]),
)

In [None]:
# Hyperparameters for the sepCNN model.
# Optimizer and training-loop settings.
learning_rate = 1e-3
epochs = 1000
batch_size = 128

# Architecture settings.
blocks = 2
filters = 64
dropout_rate = 0.2
embedding_dim = 200
kernel_size = 3
pool_size = 3

# Unpack the training and validation splits.
(train_texts, train_labels), (val_texts, val_labels) = data

# Verify the training labels: they are expected to be the integers
# 0..num_classes-1, every one of which must occur at least once, and there
# must be at least two classes.
num_classes = max(train_labels) + 1
missing_classes = [i for i in range(num_classes) if i not in train_labels]
if len(missing_classes):
    raise ValueError('Missing samples with label value(s) '
                     '{missing_classes}. Please make sure you have '
                     'at least one sample for every label value '
                     'in the range(0, {max_class})'.format(
                        missing_classes=missing_classes,
                        max_class=num_classes - 1))

if num_classes <= 1:
    # Note the space after the first sentence so the message reads correctly.
    raise ValueError('Invalid number of labels: {num_classes}. '
                     'Please make sure there are at least two classes '
                     'of samples'.format(num_classes=num_classes))

# Verify that validation labels are in the same range as training labels.
unexpected_labels = [v for v in val_labels if v not in range(num_classes)]
if len(unexpected_labels):
    raise ValueError('Unexpected label values found in the validation set:'
                     ' {unexpected_labels}. Please make sure that the '
                     'labels in the validation set are in the same range '
                     'as training labels.'.format(
                         unexpected_labels=unexpected_labels))

In [None]:
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.preprocessing import text

# Turn the raw texts into fixed-length sequences of word indices.

# Vocabulary size limit: only the TOP_K most frequent words are kept.
TOP_K = 20000

# Upper bound on sequence length; longer sequences are truncated.
MAX_SEQUENCE_LENGTH = 500

# Build the vocabulary from the training texts only. Each word in the
# vocabulary is associated with an integer index.
tokenizer = text.Tokenizer(num_words=TOP_K)
tokenizer.fit_on_texts(train_texts)

# Replace every text by its list of word indices.
x_train = tokenizer.texts_to_sequences(train_texts)
x_val = tokenizer.texts_to_sequences(val_texts)

# Use the longest training sequence as the common length, capped at
# MAX_SEQUENCE_LENGTH.
max_length = min(max(len(seq) for seq in x_train), MAX_SEQUENCE_LENGTH)

# Bring every sequence to that length: shorter ones are padded at the
# beginning and longer ones are truncated at the beginning.
x_train = sequence.pad_sequences(x_train, maxlen=max_length)
x_val = sequence.pad_sequences(x_val, maxlen=max_length)

# The embedding input dimension is the vocabulary size plus one for the
# reserved index 0, but never more than TOP_K.
word_index = tokenizer.word_index
num_features = min(len(word_index) + 1, TOP_K)

In [None]:
from keras import models
from keras import initializers
from keras import regularizers

from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers import SeparableConv1D
from keras.layers import MaxPooling1D
from keras.layers import GlobalAveragePooling1D

# Build the sepCNN model.
input_shape = x_train.shape[1:]
use_pretrained_embedding = False
is_embedding_trainable = False
embedding_matrix = None

# Output layer: one sigmoid unit for two classes, otherwise one softmax
# unit per class.
if num_classes != 2:
    op_activation, op_units = 'softmax', num_classes
else:
    op_activation, op_units = 'sigmoid', 1

model = models.Sequential()

# The first layer is an embedding that maps each word index to a dense
# vector of size embedding_dim; its values are learned during training.
# When a pre-trained embedding is used, its matrix is supplied as the
# initial weights and is_embedding_trainable controls whether it is updated.
embedding_kwargs = dict(input_dim=num_features,
                        output_dim=embedding_dim,
                        input_length=input_shape[0])
if use_pretrained_embedding:
    embedding_kwargs.update(weights=[embedding_matrix],
                            trainable=is_embedding_trainable)
model.add(Embedding(**embedding_kwargs))

# Arguments shared by every separable convolution in the network.
separable_conv_kwargs = dict(kernel_size=kernel_size,
                             activation='relu',
                             bias_initializer='random_uniform',
                             depthwise_initializer='random_uniform',
                             padding='same')

# (blocks - 1) repetitions of: dropout, two separable convolutions, max pool.
for _ in range(blocks - 1):
    model.add(Dropout(rate=dropout_rate))
    model.add(SeparableConv1D(filters=filters, **separable_conv_kwargs))
    model.add(SeparableConv1D(filters=filters, **separable_conv_kwargs))
    model.add(MaxPooling1D(pool_size=pool_size))

# Final block: two convolutions with twice the filters, averaged over the
# sequence axis, followed by dropout and the classification layer.
model.add(SeparableConv1D(filters=filters * 2, **separable_conv_kwargs))
model.add(SeparableConv1D(filters=filters * 2, **separable_conv_kwargs))
model.add(GlobalAveragePooling1D())
model.add(Dropout(rate=dropout_rate))
model.add(Dense(op_units, activation=op_activation))

In [None]:
# Pick the loss that matches the output layer, then compile the model with
# an Adam optimizer and accuracy as the tracked metric.
loss = ('binary_crossentropy' if num_classes == 2
        else 'sparse_categorical_crossentropy')
optimizer = keras.optimizers.Adam(lr=learning_rate)
model.compile(loss=loss, optimizer=optimizer, metrics=['acc'])

# Stop early when the validation loss has not improved for two
# consecutive epochs.
callbacks = [keras.callbacks.EarlyStopping(patience=2, monitor='val_loss')]

# Fit on the training split and evaluate on the validation split after
# every epoch.
history = model.fit(
    x_train,
    train_labels,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_val, val_labels),
    callbacks=callbacks,
    verbose=2,  # Logs once per epoch.
)

# Report the metrics of the last completed epoch.
history = history.history
print('Validation accuracy: {acc}, loss: {loss}'.format(
    acc=history['val_acc'][-1],
    loss=history['val_loss'][-1]))

# Persist the trained model to disk.
model.save('rotten_tomatoes_sepcnn_model.h5')
print(history['val_acc'][-1], history['val_loss'][-1])