# references
- text classification with word embeddings https://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/
- preparing reuters https://miguelmalvarez.com/2015/03/20/classifying-reuters-21578-collection-with-python-representing-the-data/
- classifying reuters with SVM https://miguelmalvarez.com/2016/11/07/classifying-reuters-21578-collection-with-python/
- MUSE for multilingual word embeddings https://github.com/facebookresearch/MUSE
- working with pretrained word vectors https://blog.manash.me/how-to-use-pre-trained-word-vectors-from-facebooks-fasttext-a71e6d55f27
- term frequency-inverse document frequency (word weighting heuristic) https://en.wikipedia.org/wiki/Tf%E2%80%93idf

In [2]:
import keras
import numpy as np

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Get data from nltk

In [3]:
import nltk
from nltk.corpus import reuters
#nltk.download()

### get word embeddings

In [4]:
import os
import urllib
if not os.path.exists('wiki.multi.en.vec'):
    urllib.urlretrieve ("https://s3.amazonaws.com/arrival/embeddings/wiki.multi.en.vec", "wiki.multi.en.vec")

In [5]:
from gensim.models import KeyedVectors
embedding = KeyedVectors.load_word2vec_format('wiki.multi.en.vec')

### get inputs


In [14]:
from helper_functions import tf_idf, feature_values, tokenize
from keras.preprocessing.sequence import pad_sequences
from functools import partial

In [15]:
vocab_tokenizer = partial(tokenize, vocab=embedding.vocab)

In [16]:
# split data and train tf_IDF
train_docs = []
test_docs = []
train_doc_ids = []
test_doc_ids = []
for doc in reuters.fileids():
    if doc.startswith("train"):
        train_doc_ids.append(doc)
        train_docs.append(reuters.raw(doc))
    else:
        test_doc_ids.append(doc)
        test_docs.append(reuters.raw(doc))
representer = tf_idf(train_docs, tokenize=vocab_tokenizer)

In [17]:
# set max length to the 99th percentile length of all documents
MAX_SEQ_LEN = int(np.percentile([len(i) for i in train_docs+test_docs], 90))

In [18]:
# convert train docs to input
# tokenize
tokenized_inputs = [tokenize(doc, vocab=embedding.wv.vocab) for doc in train_docs]
# convert to indices and pad
def convert_to_index(doc, embedding):
    return [embedding.index2word.index(w) for w in doc]
seqs = [convert_to_index(doc, embedding) for doc in tokenized_inputs]
inputs_pad = pad_sequences(seqs, maxlen=MAX_SEQ_LEN)

  This is separate from the ipykernel package so we can avoid doing imports until


In [19]:
# get tfidf representation
tfidf_train_docs=representer.transform(train_docs)
tfidf_test_docs=representer.transform(test_docs)

In [20]:
def get_embedding(doc, representer, weighted=True):
    tfidf_rep = representer.transform([doc])
    if np.sum(tfidf_rep) == 0:
        doc_embedding = np.zeros(embedding.vector_size)
    else:
        tfidf_words = representer.inverse_transform(tfidf_rep)[0]
        weights = tfidf_rep.data
        doc_word_embedding = np.vstack([embedding.get_vector(w) for w in tfidf_words])
        if weighted:
            doc_embedding = np.average(doc_word_embedding, axis=0, weights=weights)
        else:
            doc_embedding = np.sum(doc_word_embedding, axis=0)
    return doc_embedding

In [21]:
embedded_train_docs = [get_embedding(doc, representer) for doc in train_docs]
embedded_test_docs = [get_embedding(doc, representer) for doc in test_docs]

### get outputs

In [23]:
from sklearn.preprocessing import MultiLabelBinarizer
categories = reuters.categories()
train_categories = [reuters.categories(doc) for doc in train_doc_ids]
test_categories = [reuters.categories(doc) for doc in test_doc_ids]
# transform into multilabel binarized representation
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform([reuters.categories(doc_id)
                                  for doc_id in train_doc_ids])
test_labels = mlb.transform([reuters.categories(doc_id)
                             for doc_id in test_doc_ids])
# rare classes
rare_cutoff = 6
rare_classes = np.where(np.sum(train_labels,0)<rare_cutoff)[0]

In [24]:
# compute class weights
from sklearn.utils import class_weight
category_list = np.hstack([np.where(doc>0)[0] for doc in train_labels])
weighted_list = class_weight.compute_class_weight('balanced', np.unique(category_list), category_list)
class_weights = {i: weighted_list[i] for i in range(len(mlb.classes_))}

rebalance dataset

In [25]:
# rebalance and remove low frequency categories
single_class_labels = [(i,np.where(x==1)[0][0]) for i,x in enumerate(train_labels) if (np.sum(x)==1)]
single_class_labels = [x for x in single_class_labels if x[1] not in rare_classes]
x_subset = np.array([embedded_train_docs[i[0]] for i in single_class_labels])
y_subset = np.array([i[1] for i in single_class_labels])

# reference https://link.springer.com/content/pdf/10.1007%2F978-3-642-41822-8_42.pdf
# Managing Imbalanced Data Sets in Multi-label Problems: A Case Study with the SMOTE Algorithm
# reweight
# resample
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import RandomOverSampler
oversampler = RandomOverSampler()
resampled_train_docs,resampled_train_labels=oversampler.fit_sample(x_subset,y_subset)
# conver train_labels back to multilabel format
onehot_encoder = OneHotEncoder(n_values=len(test_labels[0]), sparse=False)
resampled_train_labels = onehot_encoder.fit_transform(np.expand_dims(resampled_train_labels,1))

### Plot data characteristics

### SVM classifier

train SVM on tf idf vectors

In [26]:
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

# inputs
x_train = tfidf_train_docs
x_test = tfidf_test_docs

# Classifier
classifier = OneVsRestClassifier(LinearSVC(random_state=42))
classifier.fit(x_train, train_labels)
 
predictions = classifier.predict(x_test)

In [27]:
from sklearn.metrics import f1_score, precision_score,recall_score
 
precision = precision_score(test_labels, predictions,
                            average='micro')
recall = recall_score(test_labels, predictions,
                      average='micro')
f1 = f1_score(test_labels, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}"
        .format(precision, recall, f1))
 
precision = precision_score(test_labels, predictions,
                            average='macro')
recall = recall_score(test_labels, predictions,
                      average='macro')
f1 = f1_score(test_labels, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}"
        .format(precision, recall, f1))

Micro-average quality numbers
Precision: 0.9441, Recall: 0.7978, F1-measure: 0.8648
Macro-average quality numbers
Precision: 0.6088, Recall: 0.3622, F1-measure: 0.4314


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


trian SVM on word embeddings

In [28]:
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

# inputs
x_train = embedded_train_docs
x_test = embedded_test_docs

# Classifier
classifier = OneVsRestClassifier(LinearSVC(random_state=42))
classifier.fit(x_train, train_labels)
 
predictions = classifier.predict(x_test)

In [29]:
from sklearn.metrics import f1_score, precision_score,recall_score
 
precision = precision_score(test_labels, predictions,
                            average='micro')
recall = recall_score(test_labels, predictions,
                      average='micro')
f1 = f1_score(test_labels, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}"
        .format(precision, recall, f1))
 
precision = precision_score(test_labels, predictions,
                            average='macro')
recall = recall_score(test_labels, predictions,
                      average='macro')
f1 = f1_score(test_labels, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}"
        .format(precision, recall, f1))

Micro-average quality numbers
Precision: 0.9428, Recall: 0.5812, F1-measure: 0.7191
Macro-average quality numbers
Precision: 0.2688, Recall: 0.0798, F1-measure: 0.1072


train on embedding with resampled dataset

In [30]:
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

# inputs
x_train = resampled_train_docs
x_test = embedded_test_docs

# Classifier
classifier = OneVsRestClassifier(LinearSVC(random_state=42))
classifier.fit(x_train, resampled_train_labels)
 
predictions = classifier.predict(x_test)

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


AttributeError: 'list' object has no attribute 'shape'

In [None]:
from sklearn.metrics import f1_score, precision_score,recall_score
 
precision = precision_score(test_labels, predictions,
                            average='micro')
recall = recall_score(test_labels, predictions,
                      average='micro')
f1 = f1_score(test_labels, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}"
        .format(precision, recall, f1))
 
precision = precision_score(test_labels, predictions,
                            average='macro')
recall = recall_score(test_labels, predictions,
                      average='macro')
f1 = f1_score(test_labels, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}"
        .format(precision, recall, f1))

### set up keras classifier

In [36]:
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, Conv1D, Dense, Flatten, MaxPooling1D
import keras.backend as K

In [37]:
import tensorflow as tf
def custom_objective(y_true, y_pred, prob):
    '''Just another crossentropy'''
    # Transform to logits
    epsilon = tf.convert_to_tensor(K.common._EPSILON, y_pred.dtype.base_dtype)
    y_pred = tf.clip_by_value(y_pred, epsilon, 1 - epsilon)
    y_pred = tf.log(y_pred / (1 - y_pred))
    loss = tf.nn.weighted_cross_entropy_with_logits(y_true, y_pred, prob)
    return tf.reduce_mean(loss, axis=-1)

loss = partial(custom_objective, prob=50)

In [None]:
from numpy.random import seed
seed(1)
embedding_layer = embedding.get_keras_embedding()
sequence_input = Input(shape=(MAX_SEQ_LEN,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(35)(x)  # global max pooling
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(len(mlb.classes_), activation='sigmoid')(x)

model = Model(sequence_input, preds)
model.compile(loss=loss,
              optimizer='adam',
              metrics=['acc'])

# happy learning!
history = model.fit(inputs_pad, train_labels,
                    validation_split=0.1, batch_size=128, epochs=2)


Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
Train on 6992 samples, validate on 777 samples
Epoch 1/2


In [None]:
#pyplot inline
import matplotlib.pyplot as plt
i = 10
predict = model.predict(inputs_pad[0:50])[i]
plt.plot(predict)
plt.plot(train_labels[i]*max(predict))

In [None]:
from sklearn.metrics import classification_report
num_examples = 50
predicted = model.predict(inputs_pad[:num_examples])>.5
y = train_labels[:num_examples]

report = classification_report(y, predicted)
print(report)