In [1]:
import tensorflow as tf
import tensorflow_addons as tfa
import numpy as np
from text_embeddings.bag_of_words import BagOfWords
from text_embeddings.tfidf import TFIDF, build_idf

from models.logistic_regression import LogisticRegressionModel
from data import *

## Create buffers for text embeddings

In [2]:
bow_embeddings_train = np.empty((x_train.shape[0], vocab_size))
bow_embeddings_test = np.empty((x_test.shape[0], vocab_size))

tfidf_embeddings_train = np.empty((x_train.shape[0], vocab_size))
tfidf_embeddings_test = np.empty((x_test.shape[0], vocab_size))

## Create bag of words embeddings

In [3]:
bag_of_words = BagOfWords(vocab_size)

for i in range(x_train.shape[0]):
    bow_embeddings_train[i] = bag_of_words(x_train[i])
for i in range(x_test.shape[0]):
    bow_embeddings_test[i] = bag_of_words(x_test[i])

## Create TF-IDF embeddings

In [4]:
idf = build_idf(x_train, vocab_size)

tf_idf = TFIDF(vocab_size, idf)

for i in range(x_train.shape[0]):
    tfidf_embeddings_train[i] = tf_idf(x_train[i])
for i in range(x_test.shape[0]):
    tfidf_embeddings_test[i] = tf_idf(x_test[i])

## Use logistic regression model for classification

In [5]:
bow_model = LogisticRegressionModel(num_classes=46)
tfidf_model = LogisticRegressionModel(num_classes=46)
bow_model.compile(loss='sparse_categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), metrics=['accuracy'])
tfidf_model.compile(loss='sparse_categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.005), metrics=['accuracy'])

## Fit the models

In [6]:
bow_model.fit(bow_embeddings_train, y_train, validation_data=(bow_embeddings_test, y_test), epochs=20, verbose=0)

<keras.callbacks.History at 0x29889238f10>

In [7]:
tfidf_model.fit(tfidf_embeddings_train, y_train, validation_data=(tfidf_embeddings_test, y_test), epochs=20, verbose=0)

<keras.callbacks.History at 0x298870f1550>

In [8]:
predictions = bow_model.predict(bow_embeddings_test)
f1_metric = tfa.metrics.F1Score(num_classes=46, average='macro')
f1_metric.update_state(tf.one_hot(y_test, depth=46), predictions)

bow_logistic_metrics = {
    'accuracy': tf.keras.metrics.sparse_categorical_accuracy(y_test, predictions),
    'top_3_accuracy' : tf.keras.metrics.sparse_top_k_categorical_accuracy(y_test, predictions, k=3),
    'top_5_accuracy' : tf.keras.metrics.sparse_top_k_categorical_accuracy(y_test, predictions, k=5)
}
bow_logistic_metrics = {k:tf.reduce_mean(v).numpy() for k,v in bow_logistic_metrics.items()}

print('\n'.join([f'{k}:\t{v*100:.2f}%' for k,v in bow_logistic_metrics.items()]))

accuracy:	79.03%
top_3_accuracy:	90.74%
top_5_accuracy:	93.54%


In [9]:
predictions = tfidf_model.predict(tfidf_embeddings_test)
f1_metric = tfa.metrics.F1Score(num_classes=46, average='macro')
f1_metric.update_state(tf.one_hot(y_test, depth=46), predictions)

tfidf_logistic_metrics = {
    'accuracy': tf.keras.metrics.sparse_categorical_accuracy(y_test, predictions),
    'top_3_accuracy' : tf.keras.metrics.sparse_top_k_categorical_accuracy(y_test, predictions, k=3),
    'top_5_accuracy' : tf.keras.metrics.sparse_top_k_categorical_accuracy(y_test, predictions, k=5)
}
tfidf_logistic_metrics = {k:tf.reduce_mean(v).numpy() for k,v in tfidf_logistic_metrics.items()}

print('\n'.join([f'{k}:\t{v*100:.2f}%' for k,v in tfidf_logistic_metrics.items()]))

accuracy:	80.10%
top_3_accuracy:	91.23%
top_5_accuracy:	93.41%


# Use feature selection to select most important tokens

In [18]:
from sklearn.feature_selection import SelectKBest, chi2

In [43]:
select_k_best_bow = SelectKBest(chi2, k=5000).fit(bow_embeddings_train, y_train)
select_k_best_tfidf = SelectKBest(chi2, k=5000).fit(tfidf_embeddings_train, y_train)

bow_embeddings_train_best = select_k_best_bow.transform(bow_embeddings_train)
bow_embeddings_test_best = select_k_best_bow.transform(bow_embeddings_test)

tfidf_embeddings_train_best = select_k_best_tfidf.transform(tfidf_embeddings_train)
tfidf_embeddings_test_best = select_k_best_tfidf.transform(tfidf_embeddings_test)

In [48]:
bow_model_kbest = LogisticRegressionModel(num_classes=46)
tfidf_model_kbest = LogisticRegressionModel(num_classes=46)
bow_model_kbest.compile(loss='sparse_categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.002), metrics=['accuracy'])
tfidf_model_kbest.compile(loss='sparse_categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), metrics=['accuracy'])

In [49]:
bow_model_kbest.fit(bow_embeddings_train_best, y_train, validation_data=(bow_embeddings_test_best, y_test), epochs=20, verbose=0);

In [50]:
tfidf_model_kbest.fit(tfidf_embeddings_train_best, y_train, validation_data=(tfidf_embeddings_test_best, y_test), epochs=20, verbose=0);

In [51]:
predictions = bow_model_kbest.predict(bow_embeddings_test_best)
f1_metric = tfa.metrics.F1Score(num_classes=46, average='macro')
f1_metric.update_state(tf.one_hot(y_test, depth=46), predictions)

bow_logistic_metrics = {
    'accuracy': tf.keras.metrics.sparse_categorical_accuracy(y_test, predictions),
    'top_3_accuracy' : tf.keras.metrics.sparse_top_k_categorical_accuracy(y_test, predictions, k=3),
    'top_5_accuracy' : tf.keras.metrics.sparse_top_k_categorical_accuracy(y_test, predictions, k=5)
}
bow_logistic_metrics = {k:tf.reduce_mean(v).numpy() for k,v in bow_logistic_metrics.items()}

print('\n'.join([f'{k}:\t{v*100:.2f}%' for k,v in bow_logistic_metrics.items()]))

accuracy:	78.45%
top_3_accuracy:	90.34%
top_5_accuracy:	93.14%


In [52]:
predictions = tfidf_model_kbest.predict(tfidf_embeddings_test_best)
f1_metric = tfa.metrics.F1Score(num_classes=46, average='macro')
f1_metric.update_state(tf.one_hot(y_test, depth=46), predictions)

tfidf_logistic_metrics = {
    'accuracy': tf.keras.metrics.sparse_categorical_accuracy(y_test, predictions),
    'top_3_accuracy' : tf.keras.metrics.sparse_top_k_categorical_accuracy(y_test, predictions, k=3),
    'top_5_accuracy' : tf.keras.metrics.sparse_top_k_categorical_accuracy(y_test, predictions, k=5)
}
tfidf_logistic_metrics = {k:tf.reduce_mean(v).numpy() for k,v in tfidf_logistic_metrics.items()}

print('\n'.join([f'{k}:\t{v*100:.2f}%' for k,v in tfidf_logistic_metrics.items()]))

accuracy:	80.99%
top_3_accuracy:	91.63%
top_5_accuracy:	93.90%
