# Milestone Project 2: SkimLit

The purpose of this notebook is to build an NLP model to make reading medical abstracts easier.

We're going to be replicating the paper https://arxiv.org/abs/1612.05251

## Get data
Since we'll be replicating the paper above (PubMed 200k RCT), let's download the dataset they used. We can do so through the authors' GitHub.

In [None]:
# Get helper functions file (downloaded only if it is not already in the working directory)
import os

if not os.path.exists("helper_functions.py"):
    # IPython shell escape: fetch the helper module from GitHub
    !wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
else:
    print("[INFO] 'helper_functions.py' already exists, skipping download.")


In [None]:
from helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys, calculate_results

In [None]:
!git clone https://github.com/Franck-Dernoncourt/pubmed-rct.git
!ls pubmed-rct

In [None]:
# Check what files are in the PubMed_20k dataset (variant with numbers replaced by "@")
!ls pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/

In [None]:
# Start our experiments using the 20k dataset with numbers replaced by "@" sign.
# The path is relative to the working directory the repository was cloned into
# (the same location the `!ls` above uses), so it is not tied to Colab's /content.
data_dir = 'pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/'

In [None]:
# Build the full path of every file in the target directory
import os
filenames = [data_dir + filename for filename in os.listdir(data_dir)]  # plain concatenation: data_dir ends with "/"
filenames

## Preprocess data

In [None]:
# Create function to read the lines of a document
def get_lines(filename):
  """
  Reads filename (a text file) and returns its lines as a list.

  Args:
    filename: a string containing the target filepath.

  Returns:
    A list of strings with one string per line of the target file. Each
    string keeps its trailing newline, as produced by `readlines()`.
  """
  # Decode explicitly as UTF-8 so the result does not depend on the
  # platform's default locale encoding.
  with open(filename, 'r', encoding='utf-8') as f:
    return f.readlines()

In [None]:
# Let's read in the raw training lines and preview the first 20 of them
train_lines = get_lines(data_dir+'train.txt')
train_lines[:20]

In [None]:
# How many raw lines does the training file contain?
len(train_lines)

In [None]:
def _abstract_to_samples(abstract_lines):
    """Convert the buffered "LABEL<TAB>sentence" lines of one abstract into sample dicts."""
    abstract_line_split = abstract_lines.splitlines()
    samples = []
    for abstract_line_number, abstract_line in enumerate(abstract_line_split):
        target_text_split = abstract_line.split('\t')
        samples.append({
            'target': target_text_split[0],                  # label before the tab
            'text': target_text_split[1].lower(),            # sentence text, lower-cased
            'line_number': abstract_line_number,             # zero-based position in the abstract
            'total_lines': len(abstract_line_split) - 1,     # index of the abstract's last line
        })
    return samples


def preprocess_text(filename):
    """
    Returns a list of dictionaries of abstract line data.

    Reads `filename` with `get_lines` and walks through it line by line:
    a line starting with '###' begins a new abstract, a whitespace-only
    line ends the current one, and every other line is expected to have
    the form "LABEL<TAB>sentence".

    Args:
        filename: a string containing the target filepath.

    Returns:
        A list with one dictionary per abstract sentence, holding:
        'target' (the label), 'text' (the lower-cased sentence),
        'line_number' (zero-based position of the sentence in its abstract)
        and 'total_lines' (number of sentences in the abstract minus one).

    Raises:
        IndexError: if a sentence line contains no tab character.
    """
    input_lines = get_lines(filename)  # get all lines from filename
    abstract_lines = ''  # text of the abstract currently being collected
    abstract_samples = []  # samples of all finished abstracts

    for line in input_lines:
        if line.startswith('###') or line.isspace():
            # An abstract boundary: emit whatever has been collected, then
            # clear the buffer. Clearing it here means repeated blank lines
            # cannot emit the same abstract twice, and an abstract that is
            # followed directly by the next '###' header is not lost.
            abstract_samples.extend(_abstract_to_samples(abstract_lines))
            abstract_lines = ''
        else:
            abstract_lines += line

    # Emit the final abstract when the file does not end with a blank line.
    abstract_samples.extend(_abstract_to_samples(abstract_lines))

    return abstract_samples

In [None]:
# Get data from file and preprocess it
%%time
train_samples = preprocess_text(data_dir + 'train.txt')
val_samples = preprocess_text(data_dir + 'dev.txt')
test_samples = preprocess_text(data_dir + 'test.txt')

print(len(train_samples), len(val_samples), len(test_samples))

In [None]:
# Check the first 16 sample dictionaries of our training data
# (each has the keys 'target', 'text', 'line_number' and 'total_lines')
train_samples[:16]

In [None]:
# Turn the lists of sample dictionaries into DataFrames (one row per abstract sentence)
import pandas as pd
train_df = pd.DataFrame(train_samples)
val_df = pd.DataFrame(val_samples)
test_df = pd.DataFrame(test_samples)
train_df.head(14)

In [None]:
# Distribution of labels in the training data
train_df.target.value_counts()

In [None]:
# Let's check the distribution of the `total_lines` value across training samples
train_df.total_lines.plot.hist()

### Get list of sentences

In [None]:
# Convert the 'text' column of each split into a plain Python list of sentences
train_sentences = train_df['text'].tolist()
val_sentences = val_df['text'].tolist()
test_sentences = test_df['text'].tolist()
len(train_sentences), len(val_sentences), len(test_sentences)  # number of sentences per split

In [None]:
# View the first 10 training sentences
train_sentences[:10]

## Make numeric labels (ML models require numeric labels)

In [None]:
# One hot encode labels
from sklearn.preprocessing import OneHotEncoder
one_hot_encoder = OneHotEncoder(sparse_output=False)  # dense NumPy array output instead of a sparse matrix
# Fit on the training labels only, then reuse the fitted encoder for the validation and test labels
train_labels_one_hot = one_hot_encoder.fit_transform(train_df['target'].to_numpy().reshape(-1,1))
val_labels_one_hot = one_hot_encoder.transform(val_df['target'].to_numpy().reshape(-1,1))
test_labels_one_hot = one_hot_encoder.transform(test_df['target'].to_numpy().reshape(-1,1))

train_labels_one_hot

In [None]:
# Extract labels ('target' column) and encode them into integers
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
# Fit on the training labels only, then reuse the fitted encoder for the validation and test labels
train_labels_encoded = label_encoder.fit_transform(train_df['target'].to_numpy())
val_labels_encoded = label_encoder.transform(val_df['target'].to_numpy())
test_labels_encoded = label_encoder.transform(test_df['target'].to_numpy())

# Check what training labels look like
train_labels_encoded

In [None]:
# Get class names and number of classes from the fitted LabelEncoder instance
num_classes = len(label_encoder.classes_)
class_names = label_encoder.classes_
num_classes,class_names

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Baseline model: TF-IDF features followed by a Multinomial Naive Bayes classifier
model_0 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB())
])

# Fit the pipeline on the raw training sentences and the integer-encoded labels
model_0.fit(train_sentences,train_labels_encoded)

In [None]:
# Predict labels for the validation sentences with the baseline pipeline
model_0_preds = model_0.predict(val_sentences)

In [None]:
# Score the baseline on the validation set
model_0.score(val_sentences,val_labels_encoded)

In [None]:
# Per-class report for the baseline predictions on the validation set
from sklearn.metrics import classification_report
print(classification_report(val_labels_encoded, model_0_preds))

In [None]:
# Compute the baseline's result metrics with the helper imported from helper_functions
model_0_results = calculate_results(val_labels_encoded,model_0_preds)

In [None]:
import io
import os
import re
import shutil
import string
import tensorflow as tf

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import TextVectorization

In [None]:
# How long is each sentence on average? (length = number of whitespace-separated tokens)
import numpy as np
sent_lens = [len(sentence.split()) for sentence in train_sentences]
avg_sent_len = np.mean(sent_lens)
avg_sent_len

In [None]:
# What does the distribution of sentence lengths look like?
import matplotlib.pyplot as plt
plt.hist(sent_lens, bins=20)

In [None]:
# How long of a sentence length covers 95% of examples?
# (95th percentile of the token counts, truncated to an int)
output_seq_len = int(np.percentile(sent_lens, 95))
output_seq_len

In [None]:
# Turn our data into TensorFlow Datasets of (sentence, one-hot label) pairs
train_dataset = tf.data.Dataset.from_tensor_slices((train_sentences,train_labels_one_hot))
valid_dataset = tf.data.Dataset.from_tensor_slices((val_sentences, val_labels_one_hot))
test_dataset = tf.data.Dataset.from_tensor_slices((test_sentences, test_labels_one_hot))

train_dataset

In [None]:
# Take the TensorSliceDatasets and turn them into batched (32 per batch) prefetch datasets.
# NOTE: this rebinds the same names, so the cell expects the unbatched datasets from the cell above.
train_dataset = train_dataset.batch(32).prefetch(tf.data.AUTOTUNE)
valid_dataset = valid_dataset.batch(32).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

train_dataset

In [None]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import TextVectorization, Embedding, Input, Conv1D, GlobalAveragePooling1D, Dense
from tensorflow.keras.models import Model

# Define constants
max_tokens = 68000  # upper bound on the vocabulary size kept by the TextVectorization layer
# Derive these from the data instead of re-hard-coding them, so they cannot
# drift from the values computed earlier in the notebook.
output_seq_len = int(np.percentile(sent_lens, 95))  # token length covering 95% of training sentences
num_classes = len(class_names)  # number of target classes

# Prepare training sentences
train_sentences = train_df['text'].to_list()  # Ensure this is a list of strings
train_sentences = np.array(train_sentences)  # Convert to numpy array

# Define and adapt the TextVectorization layer
vectorize_layer = TextVectorization(
    max_tokens=max_tokens,
    output_mode='int',
    output_sequence_length=output_seq_len
)
vectorize_layer.adapt(train_sentences)

In [None]:

# Get the vocabulary learned by the vectorization layer and its size
rct_20k_text_vocab = vectorize_layer.get_vocabulary()
vocab_size = len(rct_20k_text_vocab)
print(f"Vocabulary size after adaptation: {vocab_size}")  # Debug step


In [None]:
# Define the token embedding layer with input_dim matching the adapted vocabulary
embedding_layer = Embedding(
    input_dim=vocab_size,  # Use the adapted vocabulary size
    output_dim=128,  # size of each token embedding vector
    mask_zero=True,  # treat index 0 as padding to be masked
    name='token_embedding'
)


In [None]:
# Convert the sentence collections to NumPy arrays for the vectorization layer.
# `vectorize_layer` was already adapted to the training sentences in the cell
# that created it, so it is not adapted a second time here: repeating the
# adaptation on the same data only repeats the work.
import numpy as np
train_sentences = np.array(train_sentences)
val_sentences = np.array(val_sentences)

In [None]:
# Convert the text data to numerical format using the TextVectorization layer
# NOTE(review): `train_sentences_num` / `val_sentences_num` do not appear to be used by later cells
# in this notebook (the models vectorize their string inputs internally) — confirm before relying on them.
train_sentences_num = vectorize_layer(train_sentences)
val_sentences_num = vectorize_layer(val_sentences)

In [None]:
# Define the model: string input -> token ids -> token embeddings -> Conv1D -> average pooling -> softmax
inputs = Input(shape=(1,), dtype=tf.string)
text_vectors = vectorize_layer(inputs)
token_embedding = embedding_layer(text_vectors)
x = Conv1D(64, 5, padding='same', activation='relu')(token_embedding)
x = GlobalAveragePooling1D()(x)
outputs = Dense(num_classes, activation='softmax')(x)
model_1 = Model(inputs, outputs)

# Compile the model (categorical cross-entropy because the labels are one-hot encoded)
model_1.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

# Check the model summary
model_1.summary()

In [None]:

# Train the model, using only 10% of the training batches per epoch
# and 10% of the validation batches for each validation pass
history_model_1 = model_1.fit(
    train_dataset,
    steps_per_epoch=int(0.1 * len(train_dataset)),
    epochs=3,
    validation_data=valid_dataset,
    validation_steps=int(0.1 * len(valid_dataset))
)


In [None]:
# Evaluate on the whole validation dataset
model_1.evaluate(valid_dataset)

In [None]:
# Make predictions (softmax outputs, one row per validation sentence)
model_1_pred_probs = model_1.predict(valid_dataset)
model_1_pred_probs

In [None]:
# Convert prediction probabilities to class indices (index of the largest value in each row)
model_1_preds = tf.argmax(model_1_pred_probs, axis=1)
model_1_preds

In [None]:
# Calculate model_1 results on the validation set
model_1_results = calculate_results(val_labels_encoded,
                                    model_1_preds)
model_1_results

In [None]:
# Show the baseline results again for comparison with model_1
model_0_results

## Model 2: Feature extraction with pretrained token embeddings

In [None]:
# Download pretrained TensorFlow Hub USE
import tensorflow_hub as hub
# Load the Universal Sentence Encoder (trainable=False keeps its weights frozen)
use_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4", trainable=False)


In [None]:
# Define a custom Keras layer for the USE
class USEEmbedding(tf.keras.layers.Layer):
    """Keras layer that forwards its inputs through the module-level `use_layer`."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Keep a reference to the hub layer loaded in the previous cell
        self.use = use_layer

    def call(self, inputs):
        """Return the result of calling the wrapped hub layer on `inputs`."""
        embedded = self.use(inputs)
        return embedded

In [None]:
# Define the model using the Functional API:
# string input -> USE sentence embedding -> Dense(128) -> softmax over the classes
input_layer = tf.keras.Input(shape=[], dtype=tf.string)
embedding = USEEmbedding()(input_layer)
x = tf.keras.layers.Dense(128, activation='relu')(embedding)
# Use `num_classes` (as the other models in this notebook do) rather than a literal 5
outputs = tf.keras.layers.Dense(num_classes, activation="softmax")(x)

# Create the model
model_2 = tf.keras.Model(input_layer, outputs)

# Categorical cross-entropy because the labels are one-hot encoded
model_2.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])

# Check the model summary
model_2.summary()

In [None]:
# Fit the feature extraction model on 10% of the training batches per epoch
history_model_2 = model_2.fit(train_dataset,
                              epochs=3,
                              steps_per_epoch=int(0.1 * len(train_dataset)),
                              validation_data=valid_dataset,
                              validation_steps=int(0.1 * len(valid_dataset))
                              )

In [None]:
# Evaluate the feature extraction model on the whole validation dataset
model_2.evaluate(valid_dataset)

In [None]:
# Make predictions with the feature extraction model (one row of softmax outputs per sentence)
model_2_pred_probs = model_2.predict(valid_dataset)
model_2_pred_probs

In [None]:
# Convert the prediction probabilities of the feature extraction model to class indices
model_2_preds = tf.argmax(model_2_pred_probs, axis=1)
model_2_preds

In [None]:
# Calculate results of the TF Hub pretrained embeddings model on the validation set
model_2_results = calculate_results(val_labels_encoded,
                                    model_2_preds)
model_2_results

## Model 3: Conv1D with character embeddings
The paper that we're replicating states they used a combination of token and character embeddings.

Previously we made token embeddings, but we'll need to do similar steps for characters if we want to use char-level embeddings.

### Create a character level tokenizer

In [None]:
# Look at the first 5 training sentences before splitting them into characters
train_sentences[:5]

In [None]:
# Make function to split sentences into charactesr
def split_chars(text):
  """Return `text` with a single space inserted between each pair of adjacent characters."""
  spaced = " ".join(text)  # iterating a string yields its characters one by one
  return spaced

In [None]:
# Split sequence-level data splits into character-level data splits
# (each sentence becomes a string of space-separated characters)
train_chars = [split_chars(sentence) for sentence in train_sentences]
val_chars = [split_chars(sentence) for sentence in val_sentences]
test_chars = [split_chars(sentence) for sentence in test_sentences]

In [None]:
# What's the average character length? (measured on the original sentences, not the space-separated versions)
chars_lens = [len(sentence) for sentence in train_sentences]
mean_char_len = np.mean(chars_lens)
mean_char_len

In [None]:
# Check the distribution of our sequences at a character-level
import matplotlib.pyplot as plt
plt.hist(chars_lens, bins=7)

In [None]:

# Find character length that covers 95% of sequences
# NOTE: this rebinds `output_seq_len`, which earlier in the notebook held the token-level
# sequence length; re-running the token-level cells after this one picks up the character-level value.
output_seq_len = int(np.percentile(chars_lens, 95))
output_seq_len

In [None]:
# Get all keyboard characters (lowercase letters, digits and punctuation)
import string
alphabet = string.ascii_lowercase + string.digits + string.punctuation
alphabet

In [None]:
# Create char-level token vectorizer instance
# presumably the +2 leaves room for the padding and out-of-vocabulary tokens — TODO confirm
NUM_CHAR_TOKENS = len(alphabet) + 2
char_vectorizer = TextVectorization(max_tokens= NUM_CHAR_TOKENS,
                                    output_sequence_length = output_seq_len,
                                    standardize='lower_and_strip_punctuation',
                                    name='char_vectorizer')

In [None]:
# Adapt character vectorizer to training characters
char_vectorizer.adapt(train_chars)

In [None]:
# Vocabulary learned by the character vectorizer (its length sizes the character embedding below)
char_vocab = char_vectorizer.get_vocabulary()

In [None]:
# Character embedding layer: one 25-dimensional vector per entry of the character vocabulary
char_embed = tf.keras.layers.Embedding(input_dim=len(char_vocab),
                              output_dim=25,
                              mask_zero=True,
                              name='char_embed')

In [None]:
# Check character vocab stats
char_vocab = char_vectorizer.get_vocabulary()  # same call as in the earlier cell, repeated here
print(f"Number of different characters in character vocab: {len(char_vocab)}")
print(f"5 most common characters: {char_vocab[:5]}")
print(f"5 least common characters: {char_vocab[-5:]}")

In [None]:

# Try the character vectorizer on one randomly chosen training example
# (no random seed is set, so the chosen example differs from run to run)
import random
random_train_chars = random.choice(train_chars)
print(f"Charified text: \n {random_train_chars}")
print(f"Length of random sentence {len(random_train_chars.split())}")
vectorized_chars = char_vectorizer([random_train_chars])
print(f"Vectorized chars: \n {vectorized_chars}")
print(f"Length of vectorized chars: \n {len(vectorized_chars[0])}")

### Building a Conv1D to fit on character embeddings

In [None]:
# Model 3: string input -> character ids -> character embeddings -> Conv1D -> max pooling -> softmax
inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
char_vectors = char_vectorizer(inputs)
char_embeddings = char_embed(char_vectors)
x = tf.keras.layers.Conv1D(64,5, padding='same', activation='relu')(char_embeddings)
x = tf.keras.layers.GlobalMaxPooling1D()(x)
outputs = tf.keras.layers.Dense(num_classes, activation='softmax')(x)
model_3 = tf.keras.Model(inputs,outputs, name='model_3_conv1d_char_embeddings')

# Categorical cross-entropy because the labels are one-hot encoded
model_3.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])
model_3.summary()

In [None]:
# Create char level datasets of (character string, one-hot label) pairs, batched by 32 and prefetched
train_char_dataset = tf.data.Dataset.from_tensor_slices((train_chars, train_labels_one_hot)).batch(32).prefetch(tf.data.AUTOTUNE)
val_char_dataset = tf.data.Dataset.from_tensor_slices((val_chars, val_labels_one_hot)).batch(32).prefetch(tf.data.AUTOTUNE)
test_char_dataset = tf.data.Dataset.from_tensor_slices((test_chars, test_labels_one_hot)).batch(32).prefetch(tf.data.AUTOTUNE)

In [None]:
# Inspect the character-level training dataset
train_char_dataset

In [None]:
# Fit the model on chars only, using 10% of the training batches per epoch
model_3_history = model_3.fit(train_char_dataset,
                              epochs=3,
                              steps_per_epoch=int(0.1 * (len(train_char_dataset))),
                              validation_data=val_char_dataset,
                              validation_steps=int(0.1 * len(val_char_dataset)))

In [None]:
# Make predictions with the character-level model on the validation set
model_3_pred_probs = model_3.predict(val_char_dataset)
model_3_pred_probs

In [None]:
# Convert prediction probabilities to class indices
model_3_preds = tf.argmax(model_3_pred_probs, axis=1)
model_3_preds

In [None]:
# Calculate results of the character-level model on the validation set
model_3_results = calculate_results(val_labels_encoded,model_3_preds)
model_3_results

## Model 4: Combining pretrained token embeddings + character embeddings (hybrid approach)

In [None]:
# Download pretrained TensorFlow Hub USE
# NOTE(review): the module-level `tf_hub_embedding_layer` does not appear to be referenced by later
# cells in this notebook — the `USEEmbedding` class in the next cell creates its own hub.KerasLayer.
import tensorflow_hub as hub
tf_hub_embedding_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                        trainable=False,
                                        name="universal_sentence_encoder")

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
from tensorflow.keras.saving import register_keras_serializable

# Wrap the TensorFlow Hub layer in a custom Keras layer
@register_keras_serializable()  # Register the custom class for serialization
class USEEmbedding(layers.Layer):
    """Serializable Keras layer that owns its own Universal Sentence Encoder hub layer.

    This definition rebinds the name `USEEmbedding`, replacing the class of the
    same name defined earlier in the notebook.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Each instance loads its own frozen hub layer
        self.tf_hub_embedding_layer = hub.KerasLayer(
            "https://tfhub.dev/google/universal-sentence-encoder/4",
            trainable=False,
            name="universal_sentence_encoder",
        )

    def call(self, inputs):
        """Return the result of calling the wrapped hub layer on `inputs`."""
        embedded = self.tf_hub_embedding_layer(inputs)
        return embedded

    def get_config(self):
        """Return the base layer config unchanged (this layer adds no constructor arguments)."""
        return super().get_config()


# 1. Setup token inputs/model
token_inputs = layers.Input(shape=[], dtype=tf.string, name="token_input")
# Use the custom layer to apply the USE embeddings
token_embeddings = USEEmbedding()(token_inputs)
token_output = layers.Dense(128, activation="relu")(token_embeddings)
token_model = tf.keras.Model(inputs=token_inputs,
                             outputs=token_output)

# 2. Setup char inputs/model (reuses the char_vectorizer and char_embed layers created for model_3)
char_inputs = layers.Input(shape=(1,), dtype=tf.string, name="char_input")
char_vectors = char_vectorizer(char_inputs)
char_embeddings = char_embed(char_vectors)
char_bi_lstm = layers.Bidirectional(layers.LSTM(25))(char_embeddings) # bi-LSTM shown in Figure 1 of https://arxiv.org/pdf/1612.05251.pdf
char_model = tf.keras.Model(inputs=char_inputs,
                            outputs=char_bi_lstm)

# 3. Concatenate token and char inputs (create hybrid token embedding)
token_char_concat = layers.Concatenate(name="token_char_hybrid")([token_model.output,
                                                                  char_model.output])

# 4. Create output layers - addition of dropout discussed in 4.2 of https://arxiv.org/pdf/1612.05251.pdf
combined_dropout = layers.Dropout(0.5)(token_char_concat)
combined_dense = layers.Dense(200, activation="relu")(combined_dropout) # slightly different to Figure 1 due to different shapes of token/char embedding layers
final_dropout = layers.Dropout(0.5)(combined_dense)
output_layer = layers.Dense(num_classes, activation="softmax")(final_dropout)

# 5. Construct model with char and token inputs (input order: token sentences first, then characters)
model_4 = tf.keras.Model(inputs=[token_model.input, char_model.input],
                         outputs=output_layer,
                         name="model_4_token_and_char_embeddings")

In [None]:
# Get a summary of the hybrid token + character model
model_4.summary()

In [None]:
# Plot hybrid token and character model as a layer graph
from keras.utils import plot_model
plot_model(model_4,show_layer_names=True)

In [None]:
# Compile token char model (categorical cross-entropy because the labels are one-hot encoded)
model_4.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])

### Combining token and character data into a tf.data dataset

In [None]:
# Combine chars and tokens into a dataset
# The tuple order (sentences, chars) matches model_4's input order [token_model.input, char_model.input]
train_char_token_data = tf.data.Dataset.from_tensor_slices((train_sentences, train_chars)) # make data
train_char_token_labels = tf.data.Dataset.from_tensor_slices(train_labels_one_hot) # make labels
train_char_token_dataset = tf.data.Dataset.zip((train_char_token_data, train_char_token_labels)) # combine data and labels

# Prefetch and batch train data
train_char_token_dataset = train_char_token_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

# Repeat same steps validation data
val_char_token_data = tf.data.Dataset.from_tensor_slices((val_sentences, val_chars))
val_char_token_labels = tf.data.Dataset.from_tensor_slices(val_labels_one_hot)
val_char_token_dataset = tf.data.Dataset.zip((val_char_token_data, val_char_token_labels))
val_char_token_dataset = val_char_token_dataset.batch(32).prefetch(tf.data.AUTOTUNE)


In [None]:
# Inspect the combined token + character datasets
train_char_token_dataset, val_char_token_dataset

### Fitting a model on token and character-level sequences

In [None]:

# Fit the model on tokens and chars, using 10% of the training batches per epoch
model_4_history = model_4.fit(train_char_token_dataset, # train on dataset of token and characters
                              steps_per_epoch=int(0.1 * len(train_char_token_dataset)),
                              epochs=3,
                              validation_data=val_char_token_dataset,
                              validation_steps=int(0.1 * len(val_char_token_dataset)))


In [None]:
# Evaluate on the whole validation dataset
model_4.evaluate(val_char_token_dataset)

In [None]:
# Prediction probabilities of the hybrid model on the validation set
# (the variable is spelled `model_4_pred_prods`; the next cell reads it under that name)
model_4_pred_prods = model_4.predict(val_char_token_dataset)

In [None]:
# Convert prediction probabilities to class indices
model_4_preds = tf.argmax(model_4_pred_prods, axis=1)
model_4_preds

In [None]:
# Calculate results of the hybrid token + character model on the validation set
model_4_results = calculate_results(val_labels_encoded,
                                    model_4_preds)

In [None]:
# Show the hybrid model's results
model_4_results

## Model 5: Transfer learning with pretrained token embeddings + character embeddings + positional embeddings

### Create positional embeddings

In [None]:
# Look at the first 15 rows again; 'line_number' and 'total_lines' are the positional features used below
train_df.head(15)

In [None]:
# How many samples are there for each line number?
train_df['line_number'].value_counts()

In [None]:
# Check the distribution of the 'line_number' column
train_df.line_number.plot.hist()

In [None]:
# One-hot encode the 'line_number' feature with a fixed depth of 15
# (with tf.one_hot, indices outside [0, depth) produce an all-zero row)
train_line_numbers_one_hot = tf.one_hot(train_df['line_number'].to_numpy(), depth=15)
val_line_numbers_one_hot = tf.one_hot(val_df['line_number'].to_numpy(), depth=15)
test_line_numbers_one_hot = tf.one_hot(test_df['line_number'].to_numpy(), depth=15)
train_line_numbers_one_hot, train_line_numbers_one_hot.shape

In [None]:
# Check the distribution of the 'total_lines' column
train_df['total_lines'].plot.hist()

In [None]:
# What 'total_lines' value covers 98% of the training samples?
np.percentile(train_df.total_lines, 98)

In [None]:
# One-hot encode the 'total_lines' feature with a fixed depth of 20
# (with tf.one_hot, indices outside [0, depth) produce an all-zero row)
train_total_lines_one_hot = tf.one_hot(train_df['total_lines'].to_numpy(), depth=20)
test_total_lines_one_hot = tf.one_hot(test_df['total_lines'].to_numpy(), depth=20)
val_total_lines_one_hot = tf.one_hot(val_df['total_lines'].to_numpy(), depth=20)
train_total_lines_one_hot

### Building a tribrid embedding model

In [None]:
# 1. Token inputs: sentence string -> USE embedding -> Dense(128)
token_inputs = layers.Input(shape=[], dtype='string', name='token_inputs')
token_embeddings = USEEmbedding()(token_inputs)
# The Dense output is already 2D (batch, 128), so it feeds the concatenation
# below directly, exactly as in model_4.
token_outputs = layers.Dense(128, activation='relu')(token_embeddings)
token_model = tf.keras.Model(token_inputs,
                             token_outputs)

# 2. Char inputs: character string -> character ids -> character embeddings -> bi-LSTM
char_inputs = layers.Input(shape=(1,), dtype='string', name='char_inputs')
char_vectors = char_vectorizer(char_inputs)
char_embeddings = char_embed(char_vectors)
char_bi_lstm = layers.Bidirectional(layers.LSTM(24))(char_embeddings)
char_model = tf.keras.Model(char_inputs,
                            char_bi_lstm)

# 3. Line numbers model (input width 15 matches the one-hot depth used for 'line_number')
line_number_inputs = layers.Input(shape=(15,), dtype=tf.float32, name='line_number_input')
x = layers.Dense(32, activation='relu')(line_number_inputs)
line_number_model = tf.keras.Model(line_number_inputs,
                                   x)

# 4. Total lines model (input width 20 matches the one-hot depth used for 'total_lines')
total_lines_inputs = layers.Input(shape=(20,), dtype=tf.float32, name='total_lines_input')
y = layers.Dense(32, activation='relu')(total_lines_inputs)
total_line_model = tf.keras.Model(total_lines_inputs,
                                  y)

# 5. Combine token and char embeddings into a hybrid embedding
combined_embeddings = layers.Concatenate(name='char_token_hybrid_embedding')([token_model.output,
                                                                              char_model.output])
z = layers.Dense(256, activation='relu')(combined_embeddings)
z = layers.Dropout(0.5)(z)

# 6. Combine positional embeddings with combined token and char embeddings
tribrid_embeddings = layers.Concatenate(name='char_token_positional_embedding')([line_number_model.output,
                                                                                 total_line_model.output,
                                                                                 z])

# 7. Create output layer (sized by `num_classes`, like the other models, instead of a literal 5)
output_layer = layers.Dense(num_classes, activation='softmax', name='output_layer')(tribrid_embeddings)

# 8. Put together model
# Input order: line numbers, total lines, token sentences, characters —
# the datasets fed to this model must yield their inputs in the same order.
model_5 = tf.keras.Model([line_number_model.input,
                          total_line_model.input,
                          token_model.input,
                          char_model.input],
                         output_layer, name='tribrid_embedings_model_5')

In [None]:
# Get a summary of the tribrid embedding model
model_5.summary()

In [None]:
# Plot the tribrid embedding model as a layer graph
from tensorflow.keras.utils import plot_model
plot_model(model_5)

In [None]:
# Compile the tribrid model with label smoothing on the categorical cross-entropy loss.
# Explicitly set steps_per_execution to 1
model_5.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.2),
    optimizer='adam',
    metrics=['accuracy'],
    steps_per_execution=1  # Force single-step execution
)

In [None]:
# Convert lists to tensors and reshape if necessary.
# NOTE: this rebinds `train_chars` / `val_chars`; the isinstance and rank checks
# make the conversion a no-op when the cell is re-run.
if isinstance(train_chars, list):
    train_chars = tf.constant(train_chars)
if isinstance(val_chars, list):
    val_chars = tf.constant(val_chars)

if len(train_chars.shape) == 1:
    train_chars = tf.expand_dims(train_chars, axis=-1)  # Shape: (180040, 1)
if len(val_chars.shape) == 1:
    val_chars = tf.expand_dims(val_chars, axis=-1)

# 1. Create individual datasets for each of the 4 input tensors
train_line_numbers_dataset = tf.data.Dataset.from_tensor_slices(train_line_numbers_one_hot)  # (180040, 15), float32
train_total_lines_dataset = tf.data.Dataset.from_tensor_slices(train_total_lines_one_hot)    # (180040, 20), float32
train_sentences_dataset = tf.data.Dataset.from_tensor_slices(train_sentences)                # (180040,), string
train_chars_dataset = tf.data.Dataset.from_tensor_slices(train_chars)                        # (180040, 1), string

# 2. Create the labels dataset
train_labels_dataset = tf.data.Dataset.from_tensor_slices(train_labels_one_hot)              # (180040, 5), float64

# 3. Zip the 4 input datasets into a single tuple, then zip with labels.
# The order (line numbers, total lines, sentences, chars) matches the input order of model_5.
train_inputs_dataset = tf.data.Dataset.zip((
    train_line_numbers_dataset,
    train_total_lines_dataset,
    train_sentences_dataset,
    train_chars_dataset
))
train_char_token_pos_dataset = tf.data.Dataset.zip((train_inputs_dataset, train_labels_dataset))

# 4. Batch and prefetch
train_char_token_pos_dataset = train_char_token_pos_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

# Validation dataset (same steps as for the training data)
val_line_numbers_dataset = tf.data.Dataset.from_tensor_slices(val_line_numbers_one_hot)
val_total_lines_dataset = tf.data.Dataset.from_tensor_slices(val_total_lines_one_hot)
val_sentences_dataset = tf.data.Dataset.from_tensor_slices(val_sentences)
val_chars_dataset = tf.data.Dataset.from_tensor_slices(val_chars)
val_labels_dataset = tf.data.Dataset.from_tensor_slices(val_labels_one_hot)

val_inputs_dataset = tf.data.Dataset.zip((
    val_line_numbers_dataset,
    val_total_lines_dataset,
    val_sentences_dataset,
    val_chars_dataset
))
val_char_token_pos_dataset = tf.data.Dataset.zip((val_inputs_dataset, val_labels_dataset))

val_char_token_pos_dataset = val_char_token_pos_dataset.batch(32).prefetch(tf.data.AUTOTUNE)


In [None]:
# Sanity check: print the shape and dtype of every input and of the labels for one training batch
for (inputs, labels) in train_char_token_pos_dataset.take(1):
    print("Input shapes:")
    for i, inp in enumerate(inputs):
        print(f"Input {i+1}: {inp.shape}, {inp.dtype}")
    print(f"Labels: {labels.shape}, {labels.dtype}")

In [None]:
# Fit the tribrid model on 10% of the training batches per epoch.
# The history is stored under the model's own number (model_5).
history_model_5 = model_5.fit(
    train_char_token_pos_dataset,
    epochs=3,
    steps_per_epoch=int(0.1 * len(train_char_token_pos_dataset)),
    validation_data=val_char_token_pos_dataset,
    validation_steps=int(0.1 * len(val_char_token_pos_dataset))
)

In [None]:
# Make predictions with the tribrid model on the validation set
model_5_pred_probs = model_5.predict(val_char_token_pos_dataset)

In [None]:
# Convert prediction probabilities to class indices
model_5_preds = tf.argmax(model_5_pred_probs, axis=1)
model_5_preds

In [None]:
# Calculate results of the tribrid model on the validation set
model_5_results = calculate_results(val_labels_encoded,
                                    model_5_preds)
model_5_results

In [None]:
# Combine model results into a DataFrame (transposed so that each row is one model)
all_model_results = pd.DataFrame({'model_0_baseline': model_0_results,
                                 'model_1_custom_token_embedding': model_1_results,
                                 'model_2_pretrained_token_embedding': model_2_results,
                                 'model_3_custom_char_embedding':model_3_results,
                                 'model_4_hybrid+char_token_embedding': model_4_results,
                                 'model_5_pos_char_token_embedding': model_5_results}).T
all_model_results

In [None]:
# Reduce the accuracy to same scale as other metrics.
# Guarded so that re-running this cell does not divide the column by 100 a
# second time: values above 1 are taken to still be on the 0-100 scale.
if all_model_results['accuracy'].max() > 1:
    all_model_results['accuracy'] = all_model_results['accuracy']/100

In [None]:
# Grouped bar chart of every metric for every model
import matplotlib.pyplot as plt

all_model_results.plot(kind='bar', figsize=(12, 6), width=0.8)
plt.title('Model Performance Comparison (F1-Score and More)')
plt.xlabel('Models')
plt.ylabel('Score')
plt.xticks(rotation=45, ha='right')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', title='Metrics')  # place the legend outside the axes
plt.tight_layout()
plt.show()

In [None]:
# Sort the model results by F1-score (ascending) and plot only that metric
all_model_results.sort_values('f1', ascending=True)['f1'].plot(kind='bar', figsize=(10,7), ylabel='F1-score')

In [None]:
# Save the tribrid model in the .keras format
import os

os.makedirs('skimlit_tribrid', exist_ok=True)  # create the target folder if it does not exist yet

model_5.save('skimlit_tribrid/model_5.keras')

In [None]:
# Load the saved tribrid model back in
# NOTE(review): presumably this relies on the custom `USEEmbedding` layer having been registered with
# `register_keras_serializable` earlier in the session — confirm when loading in a fresh kernel.
loaded_model = tf.keras.models.load_model('skimlit_tribrid/model_5.keras')

In [None]:
# Make predictions with the loaded model on the validation set and convert them to class indices
loaded_pred_probs = loaded_model.predict(val_char_token_pos_dataset)
loaded_preds = tf.argmax(loaded_pred_probs, axis=1)
loaded_preds

In [None]:
# Calculate results of the loaded model (for comparison with model_5_results)
loaded_model_results = calculate_results(val_labels_encoded,
                                         loaded_preds)
loaded_model_results