# Milestone Project 2: SkimLit

Sequence Problem:
Many to One Classification

* Download PubMed 200k RCT dataset
* Preprocess the text data
* Set up multiple modeling experiments
* Build a multimodal model to take in different sources of data
  * Replicate the model powering https://arxiv.org/abs/1710.06071
* Find the most wrong prediction examples

## Skim Medical Literature 

A Dataset for Sequential Sentence Classification in Medical Abstracts:

[Source](https://arxiv.org/abs/1710.06071)

[Model Architecture](https://arxiv.org/abs/1612.05251)

Artificial Neural Network consisting of 3 main components:
* Token embedding layer (bi-LSTM)
* Sentence label prediction layer (bi-LSTM)
* Label sequence optimization layer (CRF)


In [None]:
#Confirm access to GPU
!nvidia-smi -L

In [None]:
# Get dataset
!git clone https://github.com/Franck-Dernoncourt/pubmed-rct.git ../Downloads/09_skimlit_milestone_project

In [None]:
# Check downloaded folders
!ls ../Downloads/09_skimlit_milestone_project

# Check the files in one of the folders
!ls ../Downloads/09_skimlit_milestone_project/PubMed_20k_RCT_numbers_replaced_with_at_sign/

In [None]:
# Use 20k RCT dataset with numbers replaced with @ sign
data_dir = "../Downloads/09_skimlit_milestone_project/PubMed_20k_RCT_numbers_replaced_with_at_sign/"

In [None]:
# Check the filenames in target dir
import os
filenames = os.listdir(data_dir)
filenames

## Preprocess Data

Visualize the data

In [None]:
# write a function to read in all the lines of a target text file
def get_lines(filepath):
    """
    Reads in a text file and returns a list of lines.

    Args:
        filepath (str): The path to the text file.

    Returns:
        list: The lines of the text file in order. Each line keeps its
            trailing newline character when it has one.
    """
    # Name the encoding explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(filepath, "r", encoding="utf-8") as f:
        return f.readlines()


# Check the first 25 lines of the train file
train_lines = get_lines(data_dir + "train.txt")
train_lines[:25]

### Structuring the data

```
[
    {
        'line_number': 0, 
        'target': 'BACKGROUND',
        'text': 'emotional eating is associated with overeating and the development of obesity .',
        'total_lines': 11, 
    }, 
    ...
]
```

In [None]:
def _abstract_to_samples(abstract_id, abstract_lines):
    """Turn the sentence lines of one abstract into per-line dictionaries."""
    last_line_number = len(abstract_lines) - 1
    samples = []
    for abstract_line_number, abstract_line in enumerate(abstract_lines):
        # Each sentence line is "<target><TAB><text>"
        line_split = abstract_line.split("\t")
        samples.append({
            "line_number": abstract_line_number,
            "target": line_split[0],
            "text": line_split[1].lower(),
            "total_lines": last_line_number,
            "abstract_id": abstract_id
        })
    return samples


# Create a function that reads a dataset file and returns a list of dictionaries
def create_dicts(filepath):
    """
    Creates a list of dictionaries of abstract line data.

    The file is read with get_lines. An abstract is an ID line starting with
    "###", followed by one TAB-separated "target, text" line per sentence,
    and is normally closed by a whitespace-only line.

    Args:
        filepath (str): The path to the text file to parse.

    Returns:
        list: One dictionary per abstract sentence with the keys
            "line_number" (position of the sentence in its abstract,
            starting at 0), "target" (the part before the tab), "text" (the
            lower-cased part after the tab), "total_lines" (number of
            sentences in the abstract minus one, i.e. the last
            "line_number") and "abstract_id" (the raw "###" line, including
            its line ending).

    Raises:
        IndexError: If a sentence line does not contain a tab.
    """
    abstract_samples = [] # Samples of every abstract seen so far
    abstract_id = None # ID line of the abstract currently being collected
    pending_lines = [] # Sentence lines of the abstract currently being collected

    for line in get_lines(filepath):
        if line.startswith("###"):
            # A new abstract starts: emit lines still pending (the separating
            # blank line may be missing) instead of silently dropping them
            abstract_samples.extend(_abstract_to_samples(abstract_id, pending_lines))
            abstract_id = line
            pending_lines = []
        elif line.isspace():
            # End of the abstract: emit it and reset, so that a repeated blank
            # line cannot emit the same abstract twice
            abstract_samples.extend(_abstract_to_samples(abstract_id, pending_lines))
            pending_lines = []
        else:
            # Sentence line: keep it without its line ending
            pending_lines.append(line.rstrip("\r\n"))

    # The file may end without a closing blank line; keep that last abstract too
    abstract_samples.extend(_abstract_to_samples(abstract_id, pending_lines))
    return abstract_samples

In [None]:
# get data from each file and preprocess it
train_samples = create_dicts(data_dir + "train.txt")
val_samples = create_dicts(data_dir + "dev.txt")
test_samples = create_dicts(data_dir + "test.txt")
len(train_samples),  len(val_samples), len(test_samples)


In [None]:
import pandas as pd
train_df = pd.DataFrame(train_samples)
val_df = pd.DataFrame(val_samples)
test_df = pd.DataFrame(test_samples)
train_df.head(14)

In [None]:
# Distribution of labels in training data
train_df.target.value_counts()

In [None]:
import matplotlib.pyplot as plt

# Check length of different lines
train_df.total_lines.plot.hist()
plt.title("Distribution of total lines in abstracts")

### Get lists of sentences

In [None]:
# Convert abstract text lines into lists
train_sentences = train_df.text.tolist()
val_sentences = val_df.text.tolist()
test_sentences = test_df.text.tolist()
len(train_sentences), len(val_sentences), len(test_sentences)

In [None]:
# View the first 10 lines of the training data
train_sentences[:10]

### Make numeric labels

In [None]:
import tensorflow as tf
# One-hot encode the labels (tf.one_hot is an alternative)
from sklearn.preprocessing import OneHotEncoder

# We want a dense (non-sparse) matrix. Newer scikit-learn releases call this
# option `sparse_output` and no longer accept `sparse`, so try the new keyword
# first and fall back to the old one for older installations.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False)

# Fit the encoder on the training labels only, then reuse it for the other splits
train_labels_one_hot = one_hot_encoder.fit_transform(train_df.target.to_numpy().reshape(-1, 1))
val_labels_one_hot = one_hot_encoder.transform(val_df.target.to_numpy().reshape(-1, 1))
test_labels_one_hot = one_hot_encoder.transform(test_df.target.to_numpy().reshape(-1, 1))
# Check what one-hot encoded labels look like (only the last expression of a cell is displayed)
tf.constant(train_labels_one_hot)

In [None]:
import sklearn
print(sklearn.__version__)

### Label encode labels

In [None]:
# Extract labels ("target" columns) and encode them into integers
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_df.target.to_numpy())
val_labels_encoded = label_encoder.transform(val_df.target.to_numpy())
test_labels_encoded = label_encoder.transform(test_df.target.to_numpy())
# Check what encoded labels look like
train_labels_encoded[:12]


In [None]:
# Get class names and number of classes from LabelEncoder instance
num_classes = len(label_encoder.classes_)
class_names = label_encoder.classes_
num_classes, class_names

## Series of Experiments

* 0 - Naive Bayes with TF-IDF encoder (baseline)
* 1 - Conv1D with token embeddings
* 2 - TF Hub Pretrained Feature Extractor
* 3 - Conv1D with character embeddings
* 4 - Combining pretrained token embeddings + character embeddings (hybrid embedding layer)
* 5 - Combining pretrained token embeddings + character embeddings + positional embeddings

[Machine Learning Testing Map](https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html)

## Model 0: Baseline Naive Bayes

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Create tokenization and modelling pipeline
model_0 = Pipeline([
    ("tfidf", TfidfVectorizer()), # convert words to numbers using tfidf
    ("clf", MultinomialNB()), # model the text using a naive bayes classifier
])

# Fit the pipeline to the training data
model_0.fit(train_sentences, 
            train_labels_encoded, 
)

In [None]:
# Evaluate our baseline model
baseline_score =  model_0.score(val_sentences, val_labels_encoded)
print(f'Baseline accuracy score: {baseline_score * 100:.2f}%')

In [None]:
# Make predictions using baseline model
baseline_preds = model_0.predict(val_sentences)
baseline_preds

### Use helper functions script

In [None]:
from _helper_functions import calculate_results
# Calculate baseline results
baseline_results = calculate_results(y_true=val_labels_encoded,
                                        y_pred=baseline_preds)

baseline_results


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Find the average number of tokens (words) in the training sentences
sent_lens = [len(sentence.split()) for sentence in train_sentences]
avg_sent_len = np.mean(sent_lens)
print(avg_sent_len)

# What's the distribution look like?
plt.hist(sent_lens, bins=20)
plt.title("Distribution of sentence length");

In [None]:
# How long of a sentence length covers 95% of the examples?
output_seq_len = int(np.percentile(sent_lens, 95))
print(output_seq_len, "tokens (words) or less covers 95% of training examples")

## Model 1: Conv1D with token embeddings

In [None]:
# Tokenize train sentences, turn it into an embedding and build a model
import numpy as np
import random
from tensorflow.keras import layers
# TextVectorization moved out of the experimental namespace in newer TensorFlow
# releases; prefer the stable location and fall back for older installations.
try:
    from tensorflow.keras.layers import TextVectorization
except ImportError:
    from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Setup text vectorization variables
max_vocab_len = 68000 # 68k value used in paper for PubMed 20k RCT, 200k is 331k

# Create text vectorizer; sequences are padded/truncated to output_seq_len tokens
text_vectorizer = TextVectorization(max_tokens=max_vocab_len,
                                    output_sequence_length=output_seq_len)

# Adapt the text vectorizer to the training texts
text_vectorizer.adapt(train_sentences)

In [None]:
# Vectorize random sentence & view
target_sentence = random.choice(train_sentences)
vectorized_sentence = text_vectorizer([target_sentence])
print(f"Text:\n{target_sentence}")
print(f"\nLength of text: {len(target_sentence.split())}")
print(f"\nVectorized text: {vectorized_sentence}")

In [None]:
# Explore: How many words are in the vocabulary?
rct_20k_text_vocab = text_vectorizer.get_vocabulary()
print(f"Number of words in vocab: {len(rct_20k_text_vocab)}")
print(f"Most common words: {rct_20k_text_vocab[:5]}")
print(f"Least common words: {rct_20k_text_vocab[-5:]}")

In [None]:
# Get the config of our text vectorizer
text_vectorizer.get_config()

In [None]:
from tensorflow.keras import layers
# Create an embedding layer
token_embed = layers.Embedding(input_dim=len(rct_20k_text_vocab), # length of vocab
                                output_dim=128, # size of embedding vectors
                                mask_zero=True, # whether or not the input value 0 is a special "padding" value which should be masked out
                                name="token_embedding")

In [None]:
# Show example embedding

print(f"Original text:\n {target_sentence}\
        \n\nVectorized version: {vectorized_sentence}")  
embedded_sentence = token_embed(vectorized_sentence)
print(f"\nEmbedded version: {embedded_sentence}")
print(f"\nEmbedded version shape: {embedded_sentence.shape}")

In [None]:
# Create datasets with the tf.data API
# Turn the data into TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((train_sentences, train_labels_one_hot))
val_dataset = tf.data.Dataset.from_tensor_slices((val_sentences, val_labels_one_hot))
test_dataset = tf.data.Dataset.from_tensor_slices((test_sentences, test_labels_one_hot))

train_dataset

In [None]:
# Take the TensorFlow datasets and turn them into prefetched batches
BATCH_SIZE = 32
train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
val_dataset = val_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

train_dataset


In [None]:
# Create a 1D conv model with token embedding to process sequences

inputs = layers.Input(shape=(1,), dtype=tf.string)
text_vectors = text_vectorizer(inputs)
token_embeddings = token_embed(text_vectors)
x = layers.Conv1D(64, 5, activation="relu")(token_embeddings)
x = layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(num_classes, activation="softmax")(x)
model_1 = tf.keras.Model(inputs, outputs, name="model_1_conv1D")

# Get a summary of our model
model_1.summary()

In [None]:
from _helper_functions import create_tensorboard_callback
#Compile model
model_1.compile(loss="categorical_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [None]:
# Fit the model
history_1 = model_1.fit(train_dataset,
                        steps_per_epoch=int(0.1*len(train_dataset)),
                        epochs=3,
                        validation_data=val_dataset,
                        validation_steps=int(0.1*len(val_dataset)), # only validates on 10% of validation data
                        callbacks=[create_tensorboard_callback(dir_name="../tensorflow_hub/skimlit",
                                                                experiment_name="model_1_conv1D")])

In [None]:
# Make predictions
model_1_pred_probs = model_1.predict(val_dataset)

In [None]:
model_1_pred_probs, model_1_pred_probs.shape

In [None]:
# Convert prediction probabilities to class labels
model_1_preds = tf.argmax(model_1_pred_probs, axis=1)
model_1_preds[:10]

In [None]:
# Calculate model results
model_1_results = calculate_results(y_true=val_labels_encoded,
                                    y_pred=model_1_preds)
model_1_results

In [None]:
baseline_results

## Model 2: TF Hub Pretrained Feature Extractor

[Universal Sentence Encoder v4 (USE)](https://tfhub.dev/google/universal-sentence-encoder/4)

The paper originally used Global Vectors for Word Representation (GloVe) embeddings.

This notebook uses the latest USE pretrained embeddings.

In [None]:
# Create a TF Hub pretrained feature extractor
import tensorflow_hub as hub

# Create Keras layer using pretrained feature extractor
sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                        input_shape=[], # can take variable length sequences
                                        dtype=tf.string, # accepts string inputs
                                        trainable=False, # freeze the pretrained weights
                                        name="USE_feature_extractor_layer")

In [None]:
# Test out the pretrained embedding on a random sentence
random_sentence = random.choice(train_sentences)
print(f"Original text:\n {random_sentence}")
# The layer is called on a batch, so wrap the single sentence in a list
use_embedded_sentence = sentence_encoder_layer([random_sentence])
# Select the single batch item first so the slice truncates the embedding
# values instead of slicing the batch axis (which has only one entry)
print(f"\nEmbedded version: {use_embedded_sentence[0][:30]}")
print(f"\nEmbedded version length: {len(use_embedded_sentence[0])}")

In [None]:
# Define feature extraction model using TF Hub layer
inputs = layers.Input(shape=[], dtype=tf.string)
pretrained_embedding = sentence_encoder_layer(inputs)
x = layers.Dense(128, activation="relu")(pretrained_embedding)
outputs = layers.Dense(num_classes, activation="softmax")(x)
model_2 = tf.keras.Model(inputs, outputs, name="model_2_USE_feature_extractor")

In [None]:
# Compile model
model_2.compile(loss="categorical_crossentropy",
                    optimizer=tf.keras.optimizers.Adam(),
                    metrics=["accuracy"])

In [None]:
# Fit model
history_2 = model_2.fit(train_dataset,
                        steps_per_epoch=int(0.1*len(train_dataset)),
                        epochs=3,
                        validation_data=val_dataset,
                        validation_steps=int(0.1*len(val_dataset)),
                        callbacks=[
                            create_tensorboard_callback(dir_name="../tensorflow_hub/skimlit",
                                experiment_name="model_2_USE")
                            ]
                        )

In [None]:
# Make predictions
model_2_pred_probs = model_2.predict(val_dataset)

In [None]:
# Convert prediction probabilities to class labels
model_2_preds = tf.argmax(model_2_pred_probs, axis=1)
model_2_preds[:10]

In [None]:
# Calculate model results
model_2_results = calculate_results(y_true=val_labels_encoded,
                                    y_pred=model_2_preds)

In [None]:
baseline_results, model_1_results, model_2_results

## Model 3: Conv1D with character embeddings

Same as model 1 but use character-level embeddings.

The paper used a combination of token and character embeddings.

In [None]:
# Create a function to split sentences
def split_chars(text):
    """Return `text` with a single space placed between consecutive characters."""
    spaced_text = " ".join(text)
    return spaced_text

# Test splitting a non-character-level sequence into characters
split_chars(random_sentence)

In [None]:
# Split sequence-level data splits into character-level data splits
train_chars = [split_chars(sentence) for sentence in train_sentences]
val_chars = [split_chars(sentence) for sentence in val_sentences]
test_chars = [split_chars(sentence) for sentence in test_sentences]


In [None]:
# Get avg char length for each sentence in train_sentences
char_lens = [len(sentence) for sentence in train_sentences]
mean_char_len = np.mean(char_lens)
mean_char_len

In [None]:
# Check distribution of character lengths
import matplotlib.pyplot as plt
plt.hist(char_lens, bins=10)
plt.xticks(range(0,1500,100))
# Render the figure (plt.plot() with no arguments draws nothing)
plt.show()

In [None]:
# Find what character length covers 95% of sequences
output_seq_char_len = int(np.percentile(char_lens, 95))
output_seq_char_len

In [None]:
import string
alphabet = string.ascii_lowercase # + string.digits + string.punctuation #not needed if using default standardizer
alphabet

In [None]:
# Create char-level token vectorizer instance
# add 2 for the vectorizer's reserved padding and OOV ([UNK]) entries
# NOTE(review): spaces only separate the characters produced by split_chars, so
# they are presumably not a vocabulary entry -- confirm with get_vocabulary()
NUM_CHAR_TOKENS = len(alphabet) + 2
char_vectorizer = TextVectorization(max_tokens=NUM_CHAR_TOKENS,
                                    output_sequence_length=output_seq_char_len,
                                    # standardize=None,
                                    name="char_vectorizer")


In [None]:
# Adapt the char_vectorizer to our training characters
char_vectorizer.adapt(train_chars)

In [None]:
# Check character vocab stats
char_vocab = char_vectorizer.get_vocabulary()
print(f"Number of different chars in char vocab: {len(char_vocab)}")
print(f"5 Most common chars in char vocab: {char_vocab[:5]}")
print(f"5 Least common chars in char vocab: {char_vocab[-5:]}")


In [None]:
# Test out character vectorizer
random_train_chars = random.choice(train_chars)
print(f"Original text:\n {random_train_chars}")
print(f"\nLength of original text: {len(random_train_chars)}")

vectorized_chars = char_vectorizer([random_train_chars])
print(f"\nVectorized version: {vectorized_chars}")
print(f"\nLength of vectorized version: {len(vectorized_chars[0])}")


In [None]:
# Create a character-level embedding
char_embed = layers.Embedding(input_dim=len(char_vocab),
                                output_dim=25,
                                mask_zero=True, # treat input value 0 as a special "padding" value and mask it out (this is the padding value, not the OOV token)
                                name="char_embedding")
                                

In [None]:
# Test out character embedding
print(f"Char split sentences:\n {random_train_chars}\n")
char_embed_example = char_embed(char_vectorizer([random_train_chars]))
print(f"Embedded version: {char_embed_example}")
print(f"Embedded version shape: {char_embed_example.shape}")

In [None]:
# Build a conv1D character-level embedding model
inputs = layers.Input(shape=(1,), dtype=tf.string)
char_vectors = char_vectorizer(inputs)
char_embeddings = char_embed(char_vectors)
x = layers.Conv1D(64, kernel_size=5, padding="same", activation="relu")(char_embeddings)
x = layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(num_classes, activation="softmax")(x)
model_3 = tf.keras.Model(inputs, outputs, name="model_3_conv1D_char_embedding")



In [None]:
model_3.compile(loss="categorical_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

model_3.summary()


In [None]:
# Fit  
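# TODO: Build character-level datasets (e.g. from train_chars / val_chars) before fitting; train_dataset holds whole sentences, not character-split text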

# history_3 = model_3.fit(train_dataset,
#                         steps_per_epoch=int(0.1*len(train_dataset)),
#                         epochs=3,
#                         validation_data=val_dataset,
#                         validation_steps=int(0.1*len(val_dataset)),
#                         callbacks=[
#                             create_tensorboard_callback(dir_name="../tensorflow_hub/skimlit",
#                                 experiment_name="model_3_char_embedding")
#                             ]
#                         )
