In [1]:
!pip install -q -U tensorflow-text==2.7.3
!pip install -q tf-models-official==2.7.0

[K     |████████████████████████████████| 4.9 MB 4.0 MB/s 
[K     |████████████████████████████████| 495.0 MB 30 kB/s 
[K     |████████████████████████████████| 463 kB 54.9 MB/s 
[K     |████████████████████████████████| 1.3 MB 46.8 MB/s 
[K     |████████████████████████████████| 1.8 MB 4.0 MB/s 
[K     |████████████████████████████████| 1.1 MB 48.3 MB/s 
[K     |████████████████████████████████| 43 kB 2.0 MB/s 
[K     |████████████████████████████████| 234 kB 48.2 MB/s 
[K     |████████████████████████████████| 596 kB 59.6 MB/s 
[K     |████████████████████████████████| 352 kB 54.6 MB/s 
[K     |████████████████████████████████| 1.2 MB 48.9 MB/s 
[K     |████████████████████████████████| 47.7 MB 1.2 MB/s 
[K     |████████████████████████████████| 99 kB 9.8 MB/s 
[K     |████████████████████████████████| 90 kB 9.8 MB/s 
[?25h  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [2]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

# Restrict TensorFlow's Python logger to ERROR-level records to keep cell output readable.
tf.get_logger().setLevel('ERROR')

We want the classes in roughly comparable proportions, so we down-sample the larger ones: each class keeps at most `max_count_diff` times as many examples as the smallest class (10× with the setting below).

In [3]:
max_count_diff = 10  # largest allowed size ratio between any class and the smallest class
sample_seed = 42     # fixed so the down-sampling is reproducible across kernel restarts
import csv

df = pd.read_csv('train-data.csv')
# Per-class cap: `max_count_diff` times the number of rows in the rarest topic.
max_count = df['Topic'].value_counts().min() * max_count_diff
cats = np.array(pd.Categorical(df['Topic']).categories)

# Down-sample every class independently and concatenate once at the end.
# (DataFrame.append inside a loop is deprecated and copies the frame on each iteration.)
sampled_parts = []
for cat in cats:
  values = df[df['Topic']==cat]
  sampled_parts.append(
      values.sample(min(max_count, len(values)), random_state=sample_seed))

# pd.concat rejects an empty list, so fall back to an empty frame with the same columns.
new_df = pd.concat(sampled_parts) if sampled_parts else pd.DataFrame(columns=df.columns)

# df[['Topic']] = df[['Topic']].apply(lambda col: pd.Categorical(col).codes)

In [4]:
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 32

val_split = 0.2
train_split = 1 - val_split
split_seed = 42  # fixed so the train/validation partition is reproducible

# Split the class-balanced frame (`new_df`) rather than the raw `df`; otherwise the
# down-sampling done when `new_df` was built has no effect on training.
# train_test_split shuffles the rows itself, so no separate shuffle pass is needed
# and the raw `df` is left untouched.
train_df, val_df = train_test_split(
    new_df, test_size=val_split, shuffle=True, random_state=split_seed)

def make_pred_vector(topics, categories=None):
  """Encode topic labels as one-hot rows; the label 'none' becomes an all-zero row.

  Args:
    topics: Sized iterable of topic labels (for example the 'Topic' column).
    categories: Optional iterable of every known label. The entry 'none', if
      present, is skipped when assigning columns. Defaults to the module-level
      `cats`, which keeps existing one-argument calls working unchanged.

  Returns:
    A float NumPy array of shape (len(topics), number of non-'none' categories).
    Column order follows the order of `categories`.

  Raises:
    ValueError: if a label other than 'none' is not one of the categories.
  """
  if categories is None:
    categories = cats
  pred_categories = [cat for cat in categories if cat != 'none']

  # Label -> column lookup built once; setdefault keeps the first occurrence,
  # which matches what list.index() would have returned.
  column_of = {}
  for column, cat in enumerate(pred_categories):
    column_of.setdefault(cat, column)

  result = np.zeros((len(topics), len(pred_categories)))
  for i, value in enumerate(topics):
    if value != 'none':
      try:
        column = column_of[value]
      except KeyError:
        raise ValueError(f'unknown topic {value!r} at position {i}') from None
      result[i][column] = 1.0

  return result

def make_dataset(df):
  """Build a batched tf.data pipeline of (tweet, label-vector) pairs.

  Args:
    df: DataFrame with a 'Tweet' column (model input) and a 'Topic' column
      (label, encoded through make_pred_vector).

  Returns:
    A tf.data.Dataset yielding batches of `batch_size` (tweet, label) pairs.
  """
  x = df['Tweet']
  y = make_pred_vector(df['Topic'])

  ds = tf.data.Dataset.from_tensor_slices((x, y))
  # Cache the per-example tensors, then batch, and prefetch last so that whole
  # batches (not single examples) are prepared ahead of the consumer.
  ds = ds.cache().batch(batch_size).prefetch(buffer_size=AUTOTUNE)
  return ds

# Build the input pipelines for the training and validation partitions, in that order.
train_ds, val_ds = (make_dataset(split) for split in (train_df, val_df))


In [5]:
class Model(tf.keras.models.Model):
  """Tweet topic classifier: TF-Hub BERT encoder followed by a two-layer dense head.

  The head emits one softmax score per category other than 'none'.
  """

  def __init__(self, categories, *args, fine_tune_bert=False, **kwargs):
    """Create the preprocessing layer, the encoder and the classification head.

    Args:
      categories: Iterable of all topic labels; 'none' is excluded from the outputs.
      *args: Forwarded to tf.keras.models.Model.
      fine_tune_bert: If True, the BERT encoder weights are trainable. The default
        (False) keeps the encoder frozen, as hub.KerasLayer does when `trainable`
        is not given.
      **kwargs: Forwarded to tf.keras.models.Model.
    """
    super().__init__(*args, **kwargs)

    bert_preprocess_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
    bert_model_url = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'

    self.bert_preprocess = hub.KerasLayer(bert_preprocess_url)
    self.bert_model = hub.KerasLayer(bert_model_url, trainable=fine_tune_bert)

    self.categories = [cat for cat in categories if cat != 'none']
    self.num_classes = len(self.categories)

    # Activations are attached to the Dense layers themselves so the list holds
    # only Keras layers.
    self.dense_layers = [
      tf.keras.layers.Dense(1024, activation='relu'),
      tf.keras.layers.Dense(self.num_classes, activation='softmax'),
    ]

  def call(self, x):
    """Run the forward pass and return per-category softmax scores.

    Args:
      x: Model input, passed directly to the BERT preprocessing layer
        (presumably a batch of raw tweet strings -- confirm against the caller).
    """
    x = self.bert_preprocess(x)
    x = self.bert_model(x)['pooled_output']
    for layer in self.dense_layers:
      x = layer(x)
    return x

  def decode_predictions(self, x):
    """Map a single prediction vector to a {category: score} dict.

    Args:
      x: Indexable with at least `num_classes` entries, ordered like
        `self.categories`.
    """
    # The previous version called len() on the integer num_classes, which raises TypeError.
    return {cat: x[i] for i, cat in enumerate(self.categories)}


In [8]:
epochs = 10
# Upper bound on the number of batches run inside one tf.function call.
max_steps_per_execution = 50

# strategy = tf.distribute.TPUStrategy(resolver)

steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
validation_steps = tf.data.experimental.cardinality(val_ds).numpy()
num_train_steps = steps_per_epoch * epochs
# Linear warm-up over the first 10% of all optimisation steps.
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

# NOTE(review): make_pred_vector encodes 'none' tweets as all-zero label rows.
# Categorical cross-entropy is -sum(y_true * log(y_pred)), so those rows contribute
# zero loss and no gradient. If 'none' is meant to be learned, consider sigmoid
# outputs with BinaryCrossentropy or an explicit 'none' class -- confirm intent.
loss = tf.keras.losses.CategoricalCrossentropy()

# Keras runs at most one epoch per execution, so passing the total step count
# (all epochs) only had the effect of hiding per-batch progress for a whole epoch.
# Keep the value within one epoch and bounded so progress is reported regularly.
steps_per_execution = int(max(1, min(steps_per_epoch, max_steps_per_execution)))

# with strategy.scope():
model = Model(cats)
model.compile(optimizer=optimizer, loss=loss, steps_per_execution=steps_per_execution)

In [None]:
# Train for `epochs` passes, evaluating on the validation pipeline after each one.
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
    steps_per_epoch=steps_per_epoch,
    validation_steps=validation_steps,
)

Epoch 1/10
