In [None]:
!git clone https://github.com/JoannaMisztalRadecka/text_classification_tf.git

In [None]:
! pip install -r text_classification_tf/requirements.txt

In [None]:
# Load the TensorBoard notebook extension once; the %tensorboard magics in the
# model sections further down depend on it.
%load_ext tensorboard

In [None]:
import os
import datetime
import tensorflow as tf 

import sys
sys.path.append('text_classification_tf')

from text_classification.hypermodel import StandardTextClassificationHyperModel, \
TFHubEmbeddingTextClassificationHyperModel, BertTextClassificationHyperModel,\
get_best_model

from text_classification.dataset import get_dataset

In [None]:
# --- Model / objective ------------------------------------------------------
# `loss`, `metric` and `n_output_units` are handed to every hypermodel below.
loss = 'sparse_categorical_crossentropy'
metric = 'accuracy'
objective = "val_" + metric  # name of the quantity handed to get_best_model
n_output_units = 20

# --- Search / training budget (all forwarded to get_best_model) --------------
max_trials = 5
executions_per_trial = 1
epochs = 10

# --- Data loading -----------------------------------------------------------
batch_size = 64
seed = 123

# Archive-based dataset (IMDB reviews).
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset_dir = "aclImdb_v1"
train_dir = "aclImdb/train"

# Alternative archive (Stack Overflow questions); uncomment to switch.
# url = "https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz"
# dataset_dir = "stack_overflow_16k"
# train_dir = "train"

# Collects one {"model": ..., objective: ...} record per trained model.
results = []

In [None]:
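# Alternative loader (disabled): the repository's get_dataset helper, presumably
# equivalent to the explicit download/split cell that follows.
# train_ds, val_ds = get_dataset(url, dataset_dir, train_dir, batch_size, seed)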


In [None]:
# Download and unpack the archive at `url`; the cache location is the parent
# directory ('..') with no extra sub-directory.
dataset = tf.keras.utils.get_file(
    dataset_dir,
    url,
    untar=True,
    cache_dir='..',
    cache_subdir='',
)
train_dataset_dir = os.path.join(os.path.dirname(dataset), train_dir)

# Both splits read the same directory with the same seed and validation
# fraction, so only the `subset` name differs between the two calls.
# NOTE(review): train_ds / val_ds are reassigned further down from the
# 20 Newsgroups data, which replaces the datasets built here.
_split_options = dict(batch_size=batch_size, validation_split=0.2, seed=seed)
train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dataset_dir, subset='training', **_split_options)
val_ds = tf.keras.utils.text_dataset_from_directory(
    train_dataset_dir, subset='validation', **_split_options)

In [None]:
from sklearn.datasets import fetch_20newsgroups

# Load the predefined 'train' and 'test' partitions of 20 Newsgroups; the
# 'test' partition is the one stored as the validation data.
train_dataset, val_dataset = (
    fetch_20newsgroups(subset=part) for part in ('train', 'test')
)

In [None]:
import numpy as np

In [None]:
# Wrap the 20 Newsgroups splits (raw documents + integer labels) as tf.data
# pipelines.
#
# * The datasets are batched with `batch_size`: from_tensor_slices on its own
#   emits one unbatched (text, label) pair per step, whereas the directory-based
#   loader (text_dataset_from_directory(batch_size=batch_size)) yields
#   mini-batches for the same downstream code.
# * Documents are converted with tf.constant instead of
#   np.array(...).astype(str): NumPy stores str arrays as fixed-width unicode
#   padded to the longest document, while tf.string holds variable-length values.
train_ds = tf.data.Dataset.from_tensor_slices(
    (tf.constant(train_dataset.data, dtype=tf.string),
     np.array(train_dataset.target).astype(int))
).batch(batch_size)
val_ds = tf.data.Dataset.from_tensor_slices(
    (tf.constant(val_dataset.data, dtype=tf.string),
     np.array(val_dataset.target).astype(int))
).batch(batch_size)

In [None]:
# Apply the same input-pipeline tuning to both splits: cache the elements and
# prefetch with an automatically chosen buffer size.
train_ds, val_ds = (
    split.cache().prefetch(buffer_size=tf.data.AUTOTUNE)
    for split in (train_ds, val_ds)
)

## Baseline model

In [None]:
hypermodel_baseline = StandardTextClassificationHyperModel(train_ds, n_output_units, 
                                                  loss, metric)
log_dir_baseline = os.path.join('logs', 'baseline', datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

%tensorboard --logdir  $log_dir_baseline

In [None]:
# get_best_model receives the hypermodel, log directory, objective, both data
# splits and the search budget; its return value is kept as the baseline model.
best_model_baseline = get_best_model(
    hypermodel_baseline, log_dir_baseline, objective,
    train_ds, val_ds,
    epochs, max_trials, executions_per_trial,
)

# Entry [1] of evaluate()'s output is recorded under the `objective` key
# (presumably [loss, metric] — confirm against how the model is compiled).
baseline_eval = best_model_baseline.evaluate(val_ds)
results.append({"model": "baseline", objective: baseline_eval[1]})

## Model with pre-computed text embeddings from TF-Hub (transfer learning)

In [None]:
hypermodel_tf_hub = TFHubEmbeddingTextClassificationHyperModel(train_ds, n_output_units, 
                                                  loss, metric)

log_dir_tf_hub = os.path.join('logs', 'tf_hub', datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
%load_ext tensorboard
%tensorboard --logdir  $log_dir_tf_hub

In [None]:
# get_best_model receives the hypermodel, log directory, objective, both data
# splits and the search budget; its return value is kept as the TF-Hub model.
best_model_tf_hub = get_best_model(
    hypermodel_tf_hub, log_dir_tf_hub, objective,
    train_ds, val_ds,
    epochs, max_trials, executions_per_trial,
)

# Entry [1] of evaluate()'s output is recorded under the `objective` key
# (presumably [loss, metric] — confirm against how the model is compiled).
tf_hub_eval = best_model_tf_hub.evaluate(val_ds)
results.append({"model": "tf_hub", objective: tf_hub_eval[1]})

## Model with pre-computed BERT embeddings

In [None]:
hypermodel_bert = BertTextClassificationHyperModel(train_ds, n_output_units, 
                                                   loss, metric)
log_dir_bert = os.path.join('logs', 'bert', datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
%load_ext tensorboard
%tensorboard --logdir  $log_dir_bert



In [None]:
import tensorflow_text # required for importing Bert

# get_best_model receives the hypermodel, log directory, objective, both data
# splits and the search budget; its return value is kept as the BERT model.
best_model_bert = get_best_model(
    hypermodel_bert, log_dir_bert, objective,
    train_ds, val_ds,
    epochs, max_trials, executions_per_trial,
)

# Entry [1] of evaluate()'s output is recorded under the `objective` key
# (presumably [loss, metric] — confirm against how the model is compiled).
bert_eval = best_model_bert.evaluate(val_ds)
results.append({"model": "bert", objective: bert_eval[1]})

## Results comparison

In [None]:
import pandas as pd

# One row per trained model, built from the records collected in `results`;
# the frame is the cell's last expression so the notebook renders it.
results_df = pd.DataFrame(results)
results_df