In this notebook, we'll build a model to classify online posts about baseball and hockey.

Below we download the online posts data.

In [None]:
from sklearn.datasets import fetch_20newsgroups
# from sklearn.feature_extraction.text import TfidfVectorizer
# The two newsgroups we want the classifier to tell apart.
groups = ['rec.sport.baseball', 'rec.sport.hockey']

# fetch_20newsgroups returns a sklearn.utils.Bunch object. Fields of interest:
# * newsgroups.DESCR: a description of the dataset.
# * newsgroups.data: a list of length 1993, each element a str holding one post.
# * newsgroups.target: a one-dimensional numpy.ndarray of length 1993 (int64), where
#   0 means rec.sport.baseball (994 posts) and 1 means rec.sport.hockey (999 posts).
# Headers, footers and quoted text are removed from every post.
newsgroups = fetch_20newsgroups(
    subset='all',
    categories=groups,
    remove=('headers', 'footers', 'quotes'),
)

In [None]:
# from sklearn.naive_bayes import MultinomialNB
# from sklearn import metrics
# newsgroups_test = fetch_20newsgroups(subset='test', remove = ['headers', 'footers', 'quotes'], categories = groups)
# vectors_test = vectorizer.transform(newsgroups_test.data)
# clf = MultinomialNB(alpha=.01)
# clf.fit(vectors, newsgroups_train.target)
# pred = clf.predict(vectors_test)
# metrics.f1_score(newsgroups_test.target, pred, average='macro')
# 0.88213592402729568 (full set)
# 0.9320767597087378

Next, we download GloVe vectors we will be using to represent our post data.

In [None]:
import os.path
if not os.path.exists('glove.6B.zip'):
    import shutil
    import urllib.request
    # Stream the archive straight to disk instead of buffering the whole download
    # in memory, and close the HTTP response when done.
    # The bytes go to a temporary name first and are renamed only after the copy
    # finishes, so an interrupted download never leaves a truncated 'glove.6B.zip'
    # that the existence check above would mistake for a complete file.
    partial_path = 'glove.6B.zip.part'
    with urllib.request.urlopen('http://nlp.stanford.edu/data/glove.6B.zip') as response, \
            open(partial_path, 'wb') as f:
        shutil.copyfileobj(response, f)
    os.replace(partial_path, 'glove.6B.zip')

Below we unzip the GloVe file we downloaded.

In [None]:
import os.path
# Extraction is skipped when the 100-dimensional vector file is already on disk.
glove_txt_present = os.path.exists('glove.6B.100d.txt')
if not glove_txt_present:
    import zipfile
    # Unpack every member of the archive into the current working directory.
    with zipfile.ZipFile('glove.6B.zip') as archive:
        archive.extractall()

Next, we load the GloVe vectors.

In [None]:
import numpy as np

embeddings_index = {}
# glove.6B.100d.txt is a text file of 400,000 lines.
# Each line has 101 tokens. The first token is a word, and the remaining tokens are
# floating point numbers making up the feature vector for the word.
# The `with` block guarantees the file is closed even if a line fails to parse.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines rather than failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# embeddings_index is dict[str, numpy.ndarray], mapping words to arrays of 100 float32 values each.
print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# Some example word similarities.
def similar(a, b):
    """Print words `a` and `b` followed by the cosine similarity of their vectors.

    Both words are looked up in the global `embeddings_index`; a word that is
    not a key of that dict raises KeyError.

    Adapted from L16qs.ipynb.
    """
    vec_a = embeddings_index[a]
    vec_b = embeddings_index[b]
    # Dot product divided by each vector's Euclidean norm in turn.
    dot_ab = vec_a @ vec_b
    cosine = dot_ab / np.sqrt(vec_a @ vec_a) / np.sqrt(vec_b @ vec_b)
    print(a, b, cosine)

# Each pair is annotated with the similarity recorded in the original notebook.
for first_word, second_word in [
    ('cat', 'dog'),     # 0.8798
    ('cat', 'person'),  # 0.3757
    ('up', 'down'),     # 0.9160
    ('the', 'a'),       # 0.7760
    ('the', 'up'),      # 0.7415
]:
    similar(first_word, second_word)

Next, we tokenize the posts and build an embedding matrix that holds the GloVe word vector for each word in our dataset's vocabulary.

In [None]:
# !pip install keras=='2.3.1'
import tensorflow as tf
# from tf.keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences
# from keras.utils import to_categorical

print('Preparing embedding matrix.')
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 100

# The Tokenizer breaks the text in newsgroups.data into words and assigns each word an
# integer index.
# * word_index is a dict[str, int] mapping every word seen to its index. Indexes start
#   at 1 (Keras reserves index 0), and more frequent words get smaller indexes.
# * tokenizer.index_word (unused here) is the inverse mapping, a dict[int, str].
# * sequences is a list[list[int]] with the word indexes of each article in
#   newsgroups.data; because of num_words, only indexes below MAX_NUM_WORDS appear.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(newsgroups.data)
sequences = tokenizer.texts_to_sequences(newsgroups.data)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Build the embedding matrix: row i holds the GloVe vector of the word whose index is i.
# The "+ 1" leaves room for the unused row 0.
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    # word_index still lists words beyond the MAX_NUM_WORDS cut-off; they get no row.
    if i < MAX_NUM_WORDS:
        embedding_vector = embeddings_index.get(word)
        # Words without a GloVe vector keep their all-zeros row.
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
# embedding_matrix is a 2-d array restricted to this dataset's vocabulary: words that
# appear in embeddings_index but not in word_index are omitted, and words in word_index
# that have no entry in embeddings_index are represented by a row of zeros.

Next, we'll build our dataset for training, `data` and `labels`, as well as our test set, `data_test` and `labels_test`.  We will limit our training set to 200 examples.

In [None]:
# from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# finally, vectorize the text samples into a 2D integer tensor
# * sequences (list[list[int]]) => data (numpy.ndarray of shape (1993, 1000))
# * newsgroups.target (numpy.ndarray of shape (1993,)) => labels (numpy.ndarray of shape (1993, 2))
#   A 0 in newsgroups.target becomes a [1., 0.] in labels, and a 1 becomes a [0., 1.].
MAX_SEQUENCE_LENGTH = 1000
# Seed for the train/test split. Without it a different random 200-example training
# subset is drawn on every run, so results cannot be compared between runs.
# (This fixes the split only; model weight initialization is not seeded here.)
SPLIT_SEED = 0
data = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(newsgroups.target))

print('Before split:')
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Keep 200 examples for training; everything else becomes the test set.
# Note that `data` and `labels` are rebound to the training subset from here on.
data, data_test, labels, labels_test = train_test_split(
    data, labels, train_size=200, random_state=SPLIT_SEED)

print()
print('After split:')
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)
print('Shape of data_test tensor:', data_test.shape)
print('Shape of label_test tensor:', labels_test.shape)

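Next, we'll define a `train` function that builds and trains the model. Its `pretrain` argument selects whether the embedding layer is initialized with the pretrained GloVe weights and kept fixed (`True`) or learned from the training data (`False`).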

In [None]:
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.initializers import Constant
# from keras.optimizers import RMSprop
# from keras.optimizers import Adam
from keras import optimizers

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
# num_words = len(vectorizer.vocabulary_)
# num_words = len(word_index)+1

def train(pretrain):
    """Build, compile and fit the 1D-convnet classifier on the training split.

    Args:
        pretrain: if truthy, the embedding layer is initialized from the global
            `embedding_matrix` and frozen (trainable=False); otherwise no
            initializer is supplied and the embedding weights are trained.

    Reads the module-level `num_words`, `EMBEDDING_DIM`, `MAX_SEQUENCE_LENGTH`,
    `groups`, `data`, `labels`, `data_test` and `labels_test`. Progress is
    reported by `model.fit`; the function itself returns nothing.
    """
    # Settings common to both embedding variants; only the pretrained one
    # additionally gets a constant initializer.
    embedding_options = {
        'input_length': MAX_SEQUENCE_LENGTH,
        'trainable': not pretrain,
    }
    if pretrain:
        embedding_options['embeddings_initializer'] = Constant(embedding_matrix)
    embedding_layer = Embedding(num_words, EMBEDDING_DIM, **embedding_options)
    print('Training model.')

    # 1D convnet with global max pooling: two conv + pool stages, then a final
    # conv whose output is pooled over the whole sequence.
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    x = embedding_layer(sequence_input)
    for _ in range(2):
        x = Conv1D(128, 5, activation='relu')(x)
        x = MaxPooling1D(5)(x)
    x = Conv1D(128, 5, activation='relu')(x)
    x = GlobalMaxPooling1D()(x)
    x = Dense(128, activation='relu')(x)
    # One softmax output per newsgroup.
    preds = Dense(len(groups), activation='softmax')(x)

    solver = optimizers.Adam(learning_rate=0.0005, beta_1=0.9, beta_2=0.999, amsgrad=False)

    model = Model(sequence_input, preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer=solver,
                  metrics=['acc'])

    # The held-out split is passed as validation data, so its loss and accuracy
    # are reported after every epoch.
    model.fit(data, labels,
              epochs=100,
              validation_data=(data_test, labels_test))

Below we train the model without pretrained weights.

In [None]:
# pretrain=False: the embedding layer gets no GloVe initializer and is trained with the rest of the model.
train(False)

Next we train the model with pretrained weights.

In [None]:
# pretrain=True: the embedding layer is initialized from embedding_matrix and kept frozen (trainable=False).
train(True)