# Model Inference

- final projects
- problem set 03
- grades update
- reading
- missing problem sets (50 percent recovery)
- Informal presentations

# PubMed 200k RCT: a Dataset for Sequential Sentence Classification in Medical Abstracts

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import os

In [None]:
!nvidia-smi -L

## Download the data

In [None]:
!git clone https://github.com/Franck-Dernoncourt/pubmed-rct.git
!ls pubmed-rct

In [None]:
!ls pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign

In [None]:
from pathlib import Path
data_dir = 'pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign'

files = [str(x) for x in Path(data_dir).glob('*.txt')]
files

### Preprocess the data

In [None]:
# Create function to read the lines of a document
def get_lines(filename):
  """
  Reads filename (a text file) and returns the lines of text as a list.

  The file is decoded as UTF-8 explicitly so the result does not depend on
  the default locale encoding of the machine running the notebook.

  Args:
      filename: a string containing the target filepath to read.

  Returns:
      A list of strings with one string per line from the target filename.
      Each string keeps its line terminator, exactly as produced by
      ``readlines()``. For example:
      ["this is the first line of filename",
       "this is the second line of filename",
       "..."]

  Raises:
      OSError: if the file cannot be opened.
      UnicodeDecodeError: if the file content is not valid UTF-8.
  """
  with open(filename, "r", encoding="utf-8") as f:
    return f.readlines()

In [None]:
train_lines = get_lines(data_dir + "/train.txt")
val_lines = get_lines(data_dir + "/dev.txt")
test_lines = get_lines(data_dir + "/test.txt")

train_lines[:20]

In [None]:
def preprocess_text(data):
  """Returns a list of dictionaries of abstract line data.

  Walks through the raw lines of an abstract file and groups them into
  abstracts, extracting for every labelled sentence the target label, the
  lower-cased text, the position of the sentence in its abstract and the
  index of the last sentence of that abstract.

  The expected line layout is:
    * a line starting with "###" opens a new abstract (ID line),
    * a whitespace-only line closes the current abstract,
    * any other line is "<LABEL>\\t<sentence text>".

  An abstract is also closed when the next ID line or the end of the input
  is reached, so a missing blank separator does not lose data. Repeated
  blank lines do not emit the same abstract twice.

  Args:
      data: an iterable of raw text lines, each still carrying its trailing
      newline (e.g. the list returned by get_lines()).

  Returns:
      A list of dictionaries each containing a line from an abstract,
      the line's label, the line's position in the abstract and the index of
      the last line of the abstract the line is from (both 0-based).
      For example:

      [{"target": "CONCLUSION",
        "text": "the study couldn't have gone better, turns out people are kinder than you think",
        "line_number": 8,
        "total_lines": 8}]

  Raises:
      IndexError: if a labelled line contains no tab separator.
  """
  abstract_samples = []  # one dict per labelled sentence, across all abstracts
  current_abstract = []  # raw labelled lines of the abstract being read

  for line in data:
    if line.startswith("###"):  # ID line: a new abstract begins
      # Emit anything still buffered (previous abstract had no blank separator).
      abstract_samples.extend(_abstract_to_samples(current_abstract))
      current_abstract = []
    elif line.isspace():  # blank line: the current abstract is complete
      abstract_samples.extend(_abstract_to_samples(current_abstract))
      current_abstract = []  # reset so another blank line cannot re-emit it
    else:  # labelled sentence belonging to the current abstract
      current_abstract.append(line)

  # Emit a final abstract that was not followed by a blank line.
  abstract_samples.extend(_abstract_to_samples(current_abstract))
  return abstract_samples


def _abstract_to_samples(raw_lines):
  """Converts the raw labelled lines of ONE abstract into per-sentence dicts."""
  # Join before splitting so line boundaries are found exactly as they were
  # when the abstract was accumulated in a single string.
  sentences = "".join(raw_lines).splitlines()
  last_index = len(sentences) - 1  # "total_lines" is 0-based, like "line_number"

  samples = []
  for line_number, sentence in enumerate(sentences):
    label_and_text = sentence.split("\t")  # label comes before the first tab
    samples.append({
        "target": label_and_text[0],
        "text": label_and_text[1].lower(),
        "line_number": line_number,
        "total_lines": last_index,
    })
  return samples

In [None]:
# preprocess examples
train_samples = preprocess_text(train_lines)
val_samples = preprocess_text(val_lines)
test_samples = preprocess_text(test_lines)

len(train_samples), len(val_samples), len(test_samples)

In [None]:
train_samples[:3]

Visualize the data in a pandas DataFrame

In [None]:
import pandas as pd
train_df = pd.DataFrame(train_samples)
val_df = pd.DataFrame(val_samples)
test_df = pd.DataFrame(test_samples)
train_df.head(14)

In [None]:
# Distribution of labels in training data
train_df.target.value_counts()

In [None]:
train_df.total_lines.plot.hist();

## Prepare data for RNN Model

### Convert to lists

In [None]:
# Convert abstract text lines into lists 
train_sentences = train_df["text"].tolist()
val_sentences = val_df["text"].tolist()
test_sentences = test_df["text"].tolist()
len(train_sentences), len(val_sentences), len(test_sentences)

In [None]:
train_sentences[:5]

### Create OneHotEncoding

In [None]:
# One hot encode labels
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()

train_labels_oh = ohe.fit_transform(train_df["target"].to_numpy().reshape(-1, 1))
val_labels_oh = ohe.transform(val_df["target"].to_numpy().reshape(-1, 1))
test_labels_oh = ohe.transform(test_df["target"].to_numpy().reshape(-1, 1))

# Check what training labels look like
train_labels_oh[:5].toarray()

### Create classification labels

In [None]:
# Extract labels ("target" columns) and encode them into integers 
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

train_labels_encoded = label_encoder.fit_transform(train_df["target"].to_numpy())
val_labels_encoded = label_encoder.transform(val_df["target"].to_numpy())
test_labels_encoded = label_encoder.transform(test_df["target"].to_numpy())


train_labels_encoded

### Visualize class labels

In [None]:
# Get class names and number of classes from LabelEncoder instance 

num_classes = len(label_encoder.classes_)
class_names = label_encoder.classes_
num_classes, class_names

## Naive Bayes Model

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Create a pipeline
model_0 = Pipeline([
  ("tf-idf", TfidfVectorizer()),
  ("clf", MultinomialNB())
])

# Fit the pipeline to the training data
model_0.fit(X=train_sentences, 
            y=train_labels_encoded)

In [None]:
# Evaluate baseline on validation dataset
model_0.score(X=val_sentences,
              y=val_labels_encoded)

### Predictions using NB Model

In [None]:
baseline_preds = model_0.predict(val_sentences)
baseline_preds

### Classification report

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import classification_report

# Calculate accuracy
baseline_results = accuracy_score(val_labels_encoded, baseline_preds)

report = classification_report(val_labels_encoded, baseline_preds, target_names=class_names)
print(report)

In [None]:
# Calculate precision, recall and f1 score
precision, recall, f1, _ = precision_recall_fscore_support(val_labels_encoded,
                                                           baseline_preds,
                                                           average="weighted")

precision, recall, f1

## RNN Model

### Create RNN Model

### Text vectorizer

In [None]:
from tensorflow.keras.layers import TextVectorization

max_tokens = 65000
max_length = 58

vectorizer = TextVectorization(max_tokens=max_tokens, output_sequence_length=max_length)

In [None]:
# Adapt text vectorizer to training sentences
vectorizer.adapt(train_sentences)

In [None]:
# Test out text vectorizer
import random
target_sentence = random.choice(train_sentences)

print(f"Text:\n{target_sentence}")
print(f"\nLength of text: {len(target_sentence.split())}")
print(f"\nVectorized text:\n{vectorizer([target_sentence])}")

In [None]:
# How many words in our training vocabulary?
# The slices below are labelled "most/least common"; this presumably relies on
# get_vocabulary() returning terms ordered by descending frequency — confirm
# against the TextVectorization documentation.
vocab = vectorizer.get_vocabulary()
print(f"Number of words in vocabulary: {len(vocab)}")
print(f"Most common words in the vocabulary: {vocab[:5]}")
print(f"Least common words in the vocabulary: {vocab[-5:]}")

In [None]:
# Get the config of our text vectorizer
vectorizer.get_config()

### Embeddings

#### Sentence Embeddings

##### Create a randomly initialized embedding matrix (the model learns the embeddings for the task)

In [None]:
# Create token embedding layer
from tensorflow.keras import layers

token_embed = layers.Embedding(input_dim=len(vocab), # length of vocabulary
                               output_dim=128,
                               mask_zero=True,
                               name="embeddings") 

# Show example embedding
print(f"Sentence before vectorization:\n{target_sentence}\n")
vectorized_sentence = vectorizer([target_sentence])
print(f"Sentence after vectorization (before embedding):\n{vectorized_sentence}\n")
embedded_sentence = token_embed(vectorized_sentence)
print(f"Sentence after embedding:\n{embedded_sentence}\n")
print(f"Embedded sentence shape: {embedded_sentence.shape}")

##### Use pre-trained embeddings

In [None]:
# Download pretrained TensorFlow Hub USE
import tensorflow_hub as hub
hub_embedding_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                        trainable=False,
                                        name="universal_sentence_encoder")

In [None]:
# Test out the embedding on a random sentence
rand_sent = random.choice(train_sentences)
print(f"Random training sentence:\n{rand_sent}\n")
embed_sent = hub_embedding_layer([rand_sent])
print(f"Sentence after embedding:\n{embed_sent[0][:30]} (truncated output)...\n")
print(f"Length of sentence embedding:\n{len(embed_sent[0])}")

#### Character Embeddings

In [None]:
def char_tokenizer(text):
    """Return `text` with a single space placed between consecutive characters."""
    characters = tuple(text)
    return " ".join(characters)

rand_sent = random.choice(train_sentences)
print(f"Random training sentence:\n{rand_sent}\n")
print(f"Sentence after character-level tokenization:\n{char_tokenizer(rand_sent)}")

In [None]:
train_chars = [char_tokenizer(sentence) for sentence in train_sentences]
val_chars = [char_tokenizer(sentence) for sentence in val_sentences]
test_chars = [char_tokenizer(sentence) for sentence in test_sentences]

In [None]:
train_chars[:5]

In [None]:
# What's the average character length?
import numpy as np


char_lens = [len(sentence) for sentence in train_sentences]
mean_char_len = np.mean(char_lens)
mean_char_len

In [None]:
# Check the distribution of our sequences at character-level
import matplotlib.pyplot as plt
plt.hist(char_lens, bins=7);

In [None]:
# Find what character length covers 98% of sequences (98th percentile of char_lens)
# NOTE(review): this comment previously said 95% while the code uses 98 —
# confirm which cutoff is intended.
output_seq_char_len = int(np.percentile(char_lens, 98))
output_seq_char_len

In [None]:
# Get all keyboard characters for char-level embedding
import string
alphabet = string.ascii_lowercase + string.digits + string.punctuation
alphabet

In [None]:
# Create char-level token vectorizer instance
# Vocabulary budget: every character in `alphabet` plus two extra slots.
# NOTE(review): `alphabet` includes string.punctuation, but
# standardize="lower_and_strip_punctuation" presumably removes punctuation
# before tokenizing, so the adapted vocabulary may be smaller than this cap —
# confirm against the printed vocabulary size.
CHAR_TOKENS_LEN = len(alphabet) + 2 # num characters in alphabet + space + OOV token
char_vectorizer = TextVectorization(max_tokens=CHAR_TOKENS_LEN,  
                                    output_sequence_length=output_seq_char_len,
                                    standardize="lower_and_strip_punctuation",
                                    name="char_vectorizer")

# Adapt character vectorizer to training characters
# (train_chars holds space-separated characters, so the default whitespace
# split presumably yields one token per character — confirm).
char_vectorizer.adapt(train_chars)

In [None]:
# Check character vocabulary characteristics
char_vocab = char_vectorizer.get_vocabulary()
print(f"Number of different characters in character vocab: {len(char_vocab)}")
print(f"5 most common characters: {char_vocab[:5]}")
print(f"5 least common characters: {char_vocab[-5:]}")

In [None]:
# Test out character vectorizer
random_train_chars = random.choice(train_chars)
print(f"Charified text:\n{random_train_chars}")
print(f"\nLength of chars: {len(random_train_chars.split())}")
vectorized_chars = char_vectorizer([random_train_chars])
print(f"\nVectorized chars:\n{vectorized_chars}")
print(f"\nLength of vectorized chars: {len(vectorized_chars[0])}")

In [None]:
# Create char embedding layer
char_embed = layers.Embedding(input_dim=CHAR_TOKENS_LEN, # number of different characters
                              output_dim=25, # embedding dimension of each character (same as Figure 1 in https://arxiv.org/pdf/1612.05251.pdf)
                              mask_zero=False, # don't use masks (this messes up model_5 if set to True)
                              name="char_embed")

# Test out character embedding layer
print(f"Charified text (before vectorization and embedding):\n{random_train_chars}\n")
char_embed_example = char_embed(char_vectorizer([random_train_chars]))
print(f"Embedded chars (after vectorization and embedding):\n{char_embed_example}\n")
print(f"Character embedding shape: {char_embed_example.shape}")

#### Line position embeddings

In [None]:
train_df["line_number"].value_counts()

In [None]:
# Check the distribution of "line_number" column
train_df.line_number.plot.hist()

In [None]:
# Use TensorFlow to create one-hot-encoded tensors of our "line_number" column
# depth=15 fixes the width of every encoded row at 15.
# NOTE(review): per the tf.one_hot docs, indices >= depth encode as an
# all-zero row, so sentences at position 15 or later would share one
# representation — confirm this cutoff is intended.
train_line_numbers_oh = tf.one_hot(train_df["line_number"].to_numpy(), depth=15)
val_line_numbers_oh = tf.one_hot(val_df["line_number"].to_numpy(), depth=15)
test_line_numbers_oh = tf.one_hot(test_df["line_number"].to_numpy(), depth=15)

In [None]:
# Check one-hot encoded "line_number" feature samples
train_line_numbers_oh.shape, train_line_numbers_oh[:10]

In [None]:
# Use TensorFlow to create one-hot-encoded tensors of our "line_number" column
# NOTE(review): these three assignments repeat, with identical arguments, the
# ones made two cells earlier; this cell only adds the shape printout.
train_line_numbers_oh = tf.one_hot(train_df["line_number"].to_numpy(), depth=15)
val_line_numbers_oh = tf.one_hot(val_df["line_number"].to_numpy(), depth=15)
test_line_numbers_oh = tf.one_hot(test_df["line_number"].to_numpy(), depth=15)

# Sanity-check the encoded shapes for the train and validation splits.
print(f"train_line_numbers_oh shape: {train_line_numbers_oh.shape}")
print(f"val_line_numbers_oh shape: {val_line_numbers_oh.shape}")

#### Total lines embeddings

In [None]:
train_df["total_lines"].value_counts()

In [None]:
# Check the distribution of total lines
train_df.total_lines.plot.hist();

In [None]:
# Use TensorFlow to create one-hot-encoded tensors of our "total_lines" column
# depth=20 fixes the width of every encoded row at 20.
# NOTE(review): per the tf.one_hot docs, values >= depth encode as an all-zero
# row, so abstracts whose "total_lines" is 20 or more would share one
# representation — confirm this cutoff is intended.
train_total_lines_oh = tf.one_hot(train_df["total_lines"].to_numpy(), depth=20)
val_total_lines_oh = tf.one_hot(val_df["total_lines"].to_numpy(), depth=20)
test_total_lines_oh = tf.one_hot(test_df["total_lines"].to_numpy(), depth=20)

# Sanity-check the encoded shapes for the train and validation splits.
print(f"train_total_lines_oh shape: {train_total_lines_oh.shape}")
print(f"val_total_lines_oh shape: {val_total_lines_oh.shape}")

# Check shape and samples of total lines one-hot tensor
train_total_lines_oh.shape, train_total_lines_oh[:10]

### Combine the embeddings

Configure our inputs

In [None]:

class MyLayer(layers.Layer):
    """Keras layer that forwards its inputs to the module-level `hub_embedding_layer`.

    NOTE(review): presumably a thin wrapper so the TF Hub sentence encoder can
    be called on symbolic inputs inside the functional model — confirm.
    """
    def call(self, inputs):
        # Delegate to the pretrained encoder loaded earlier in the notebook.
        return hub_embedding_layer(inputs)
    
# Token inputs
# Branch 1: raw sentence string -> pretrained sentence embedding (MyLayer) -> Dense(128).
tokens = layers.Input(shape=[], dtype="string", name="token_inputs")
token_embeddings = MyLayer(name='Sentence_Embedding')(tokens)
token_outputs = layers.Dense(128, activation="relu")(token_embeddings)
token_model = tf.keras.Model(inputs=tokens,
                             outputs=token_outputs)

# Char inputs
# Branch 2: space-separated character string -> char_vectorizer -> char_embed -> bidirectional LSTM.
char_inputs = layers.Input(shape=(1,), dtype="string", name="char_inputs")
char_vectors = char_vectorizer(char_inputs)
char_embeddings = char_embed(char_vectors)
char_bi_lstm = layers.Bidirectional(layers.LSTM(32))(char_embeddings)
char_model = tf.keras.Model(inputs=char_inputs,
                            outputs=char_bi_lstm)

# Line numbers inputs
# Branch 3: one-hot line number (width 15, matching depth=15 used for tf.one_hot) -> Dense(32).
# NOTE(review): the Input dtype is int32 while tf.one_hot returns float32 by
# default — confirm the dtype is intended.
line_number_inputs = layers.Input(shape=(15,), dtype=tf.int32, name="line_number_input")
x = layers.Dense(32, activation="relu")(line_number_inputs)
line_number_model = tf.keras.Model(inputs=line_number_inputs,
                                   outputs=x)

# Total lines inputs
# Branch 4: one-hot total lines (width 20, matching depth=20 used for tf.one_hot) -> Dense(32).
# NOTE(review): same int32-vs-float32 dtype question as the line number branch.
total_lines_inputs = layers.Input(shape=(20,), dtype=tf.int32, name="total_lines_input")
y = layers.Dense(32, activation="relu")(total_lines_inputs)
total_line_model = tf.keras.Model(inputs=total_lines_inputs,
                                  outputs=y)

Concatenate the embeddings

In [None]:
# Combine token and char embeddings into a hybrid embedding
combined_embeddings = layers.Concatenate(name="token_char_hybrid_embedding")([token_model.output,
                                                                              char_model.output])
z = layers.Dense(256, activation="relu")(combined_embeddings)
z = layers.Dropout(0.5)(z)  # regularize the text representation before adding positional features

# Combine positional embeddings with combined token and char embeddings into a tribrid embedding
z = layers.Concatenate(name="token_char_positional_embedding")([line_number_model.output,
                                                                total_line_model.output,
                                                                z])

# One softmax unit per class. The width comes from `num_classes` (derived from
# the fitted label encoder) instead of a hard-coded 5, so the output always
# matches the width of the one-hot labels.
output_layer = layers.Dense(num_classes, activation="softmax", name="output_layer")(z)

### Define our model

In [None]:
model = tf.keras.Model(inputs=[line_number_model.input,
                                 total_line_model.input,
                                 token_model.input, 
                                 char_model.input],
                         outputs=output_layer)

In [None]:
# Get a summary of our token, char and positional embedding model
model.summary()

In [None]:
from tensorflow.keras.utils import plot_model
plot_model(model, to_file='model.png', show_shapes=True)

In [None]:
for layer in model.layers:
    print(layer, layer.trainable)

### Train our model

In [None]:
# Compile token, char, positional embedding model
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.2), # add label smoothing (examples which are really confident get smoothed a little)
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [None]:
# Create training and validation datasets (all four kinds of inputs)
train_pos_char_token_data = tf.data.Dataset.from_tensor_slices((train_line_numbers_oh, # line numbers
                                                                train_total_lines_oh, # total lines
                                                                train_sentences, # train tokens
                                                                train_chars))
# Convert sparse matrix to dense numpy array
train_labels_oh_dense = train_labels_oh.toarray()   

# Create dataset
train_pos_char_token_labels = tf.data.Dataset.from_tensor_slices(train_labels_oh_dense) # train labels

train_pos_char_token_dataset = tf.data.Dataset.zip((train_pos_char_token_data, train_pos_char_token_labels)) # combine data and labels
train_pos_char_token_dataset = train_pos_char_token_dataset.batch(32).prefetch(tf.data.AUTOTUNE) # turn into batches and prefetch appropriately

In [None]:
# Validation dataset
val_pos_char_token_data = tf.data.Dataset.from_tensor_slices((val_line_numbers_oh,
                                                              val_total_lines_oh,
                                                              val_sentences,
                                                              val_chars))
val_labels_oh_dense = val_labels_oh.toarray()

val_pos_char_token_labels = tf.data.Dataset.from_tensor_slices(val_labels_oh_dense)

val_pos_char_token_dataset = tf.data.Dataset.zip((val_pos_char_token_data, val_pos_char_token_labels))
val_pos_char_token_dataset = val_pos_char_token_dataset.batch(32).prefetch(tf.data.AUTOTUNE) # turn into batches and prefetch appropriately

In [None]:
train_pos_char_token_dataset, val_pos_char_token_dataset

In [None]:
# Fit the token, char and positional embedding model
# steps_per_epoch / validation_steps are set to 10% of the number of batches in
# each (already batched) dataset, so every epoch trains and validates on only
# a fraction of the data.
history_model = model.fit(train_pos_char_token_dataset,
                              steps_per_epoch=int(0.1 * len(train_pos_char_token_dataset)),
                              epochs=3,
                              validation_data=val_pos_char_token_dataset,
                              validation_steps=int(0.1 * len(val_pos_char_token_dataset)))

### Make predictions with our Model

In [None]:
predictions = model.predict(val_pos_char_token_dataset, verbose=1)
predictions

In [None]:
# Build a table of predicted class probabilities next to the true label
# (a starting point for inspecting the most incorrect predictions; no
# filtering or sorting is done here).
# NOTE(review): naming the columns with class_names assumes the one-hot label
# columns used for training follow the same order as label_encoder.classes_ —
# confirm.
preds_df = pd.DataFrame(predictions)
preds_df.columns = class_names
# Attach the true label; assumes the prediction rows are in val_df row order — confirm.
preds_df["target"] = val_df["target"]



In [None]:
preds_df