In [8]:
!git clone https://github.com/Josepholaidepetro/Docict

Cloning into 'Docict'...
remote: Enumerating objects: 11, done.[K
remote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 11 (delta 0), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (11/11), done.


In [10]:
!mkdir data

In [None]:
!unzip "/content/Docict/data/bbc data.zip" -d "/content/data"

In [135]:
import glob
import json
import re
import numpy as np
import pandas as pd

In [113]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras import Model

In [87]:
data = []
for filename in glob.glob(f'/content/data/bbc-fulltext (document classification)/*/*/*.txt'):
  data.append(filename)

In [88]:
len(data)

2225

In [206]:
docs = []
for i in data:
  label = i.split('/')[-2]
  # {'business', 'entertainment', 'politics', 'sport', 'tech'}
  if label == 'business':
    label = 0
  elif label == 'entertainment':
    label = 1
  elif label == 'politics':
    label = 2
  elif label == 'sport':
    label = 3
  else:
    label = 4
  with open(i, 'r') as f:
    try:
      dataop = f.read()
      docs.append((dataop, label))
    except:
      pass 

In [207]:
np.random.shuffle(docs)
num_val_samples = int(0.2 * len(docs))
num_train_samples = len(docs) - 2 * num_val_samples
train_pairs = docs[:num_train_samples]
val_pairs = docs[num_train_samples : num_train_samples + num_val_samples]
test_pairs = docs[num_train_samples + num_val_samples :]

print(f"{len(docs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

2224 total pairs
1336 training pairs
444 validation pairs
444 test pairs


In [208]:
vocab_size = 25000
sequence_length = 200
batch_size = 64

In [209]:
def tf_clean(text):
  text = tf.strings.regex_replace(text, '\n', '')
  text = tf.strings.lower(text)
  # Keep space, a to z, and select punctuation.
  text = tf.strings.regex_replace(text, '[^ a-z.?!,¿]', '')
  # Add spaces around punctuation.
  text = tf.strings.regex_replace(text, '[.?!,¿]', '')
  return text

train_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
    standardize=tf_clean,
)
train_texts = [pair[0] for pair in docs]
train_vectorization.adapt(train_texts)

In [210]:
def format_dataset(doc, label):
    vec = train_vectorization(doc)
    return vec, label

In [211]:
def make_dataset(pairs):
    texts, labels = zip(*pairs)
    texts = list(texts)
    labels = list(labels)
    dataset = tf.data.Dataset.from_tensor_slices((texts, labels))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    return dataset.shuffle(2048).prefetch(16).cache()

In [217]:
train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

# Transformer Block

In [218]:
class Transformerlayer(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super(Transformerlayer, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, tf.newaxis, :], dtype="int32")
        attention_output = self.attention(
            query=inputs, value=inputs, key=inputs, attention_mask=padding_mask
        )
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

# Positional Embedding Layer

In [219]:
class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super(PositionalEmbedding, self).__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions
        
    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)

# Modelling

In [220]:
embed_dim = 64
latent_dim = 128
num_heads = 4

transformer_inputs = keras.Input(shape=(None,), dtype="int64", name="transformer_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(transformer_inputs)
transformer_outputs = Transformerlayer(embed_dim, latent_dim, num_heads)(x)
x = layers.GlobalAveragePooling1D()(transformer_outputs)
x = layers.Dropout(0.2)(x)
x = layers.Dense(64, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(5, activation="softmax")(x)

In [221]:
model = keras.Model(inputs=transformer_inputs, outputs=outputs)

In [222]:
model.compile(
    optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
history = model.fit(
    train_ds, epochs=20, validation_data=val_ds
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [223]:
test_ds = make_dataset(test_pairs)

In [225]:
model.evaluate(test_ds)



[0.11882788687944412, 0.9729729890823364]