# Milestone Project 2: SkimLit

Sequence Problem:
Many to One Classification

* Download PubMed 200k RCT dataset
* Preprocess the text data
* Set up multiple modeling experiments
* Build a multimodal model to take in different sources of data
  * Replicate the model powering https://arxiv.org/abs/1710.06071
* Find the most wrong prediction examples

## Skim Medical Literature 

A Dataset for Sequential Sentence Classification in Medical Abstracts:

[Source](https://arxiv.org/abs/1710.06071)

[Model Architecture](https://arxiv.org/abs/1612.05251)

Artificial Neural Network consisting of 3 main components:
* Token embedding layer (bi-LSTM)
* Sentence label prediction layer (bi-LSTM)
* Label sequence optimization layer (CRF)


In [None]:
#Confirm access to GPU
!nvidia-smi -L

In [None]:
# Get dataset
!git clone https://github.com/Franck-Dernoncourt/pubmed-rct.git ../Downloads/09_skimlit_milestone_project

In [None]:
# Check downloaded folders
!ls ../Downloads/09_skimlit_milestone_project

# Check the files in one of the folders
!ls ../Downloads/09_skimlit_milestone_project/PubMed_20k_RCT_numbers_replaced_with_at_sign/

In [None]:
# Use 20k RCT dataset with numbers replaced with @ sign
data_dir = "../Downloads/09_skimlit_milestone_project/PubMed_20k_RCT_numbers_replaced_with_at_sign/"

In [None]:
# Check the filenames in target dir
import os
filenames = os.listdir(data_dir)
filenames

## Preprocess Data

Visualize the data

In [None]:
# write a function to read in all the lines of a target text file
def get_lines(filepath):
    """
    Reads in a text file and returns a list of lines.

    Args:
        filepath (str): The path to the text file.

    Returns:
        list: The lines of the text file, in file order. Each line keeps its
            trailing newline character (if it has one).
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(filepath, "r", encoding="utf-8") as f:
        return f.readlines()


# Check the first 25 lines of the train file
train_lines = get_lines(data_dir + "train.txt")
train_lines[:25]

### Structuring the data

```
[
    {
        'line_number': 0, 
        'target': 'BACKGROUND',
        'text': 'emotional eating is associated with overeating and the development of obesity .',
        'total_lines': 11, 
    }, 
    ...
]
```

In [None]:
# Create a function that reads a dataset file and returns a list of dictionaries
def create_dicts(filepath):
    """
    Creates a list of dictionaries of abstract line data.

    The file is read as a sequence of abstracts: a line starting with "###"
    is the abstract's ID line, a whitespace-only line ends the abstract, and
    every other line is a tab-separated "<target><TAB><text>" sentence line.

    Args:
        filepath (str): The path to the dataset text file.

    Returns:
        list: One dictionary per sentence line, in file order, with the keys
            "line_number" (0-based position within its abstract), "target",
            "text" (lower-cased), "total_lines" (number of sentence lines in
            the abstract minus one) and "abstract_id" (the raw "###" line,
            trailing newline included, or None when the sentence lines were
            not preceded by an ID line).

    Raises:
        IndexError: If a sentence line contains no tab character.
    """
    abstract_samples = []  # Samples collected for every abstract in the file
    abstract_id = None  # ID line of the abstract currently being read
    pending_lines = []  # Raw sentence lines of the abstract currently being read

    for line in get_lines(filepath):
        if line.startswith("###"):
            # A new abstract starts. Keep any sentences still pending from an
            # abstract that was not closed by a blank line, then start afresh.
            abstract_samples.extend(_abstract_to_samples(pending_lines, abstract_id))
            pending_lines = []
            abstract_id = line
        elif line.isspace():
            # End of the abstract: convert its lines and clear the buffer so
            # that repeated blank lines cannot emit the same abstract twice.
            abstract_samples.extend(_abstract_to_samples(pending_lines, abstract_id))
            pending_lines = []
        else:
            # Still inside the abstract: remember the sentence line
            pending_lines.append(line)

    # The file may end without a closing blank line; do not lose the last abstract.
    abstract_samples.extend(_abstract_to_samples(pending_lines, abstract_id))
    return abstract_samples


def _abstract_to_samples(raw_lines, abstract_id):
    """Builds the sample dictionaries for the sentence lines of one abstract."""
    # Joining and re-splitting drops the newline at the end of each line
    sentence_lines = "".join(raw_lines).splitlines()
    samples = []
    for line_number, sentence_line in enumerate(sentence_lines):
        line_split = sentence_line.split("\t")
        samples.append({
            "line_number": line_number,
            "target": line_split[0],
            "text": line_split[1].lower(),
            "total_lines": len(sentence_lines) - 1,
            "abstract_id": abstract_id,
        })
    return samples

In [None]:
# Parse the train / validation / test files into per-sentence dictionaries
train_samples, val_samples, test_samples = (
    create_dicts(data_dir + split_file)
    for split_file in ("train.txt", "dev.txt", "test.txt")
)
# Number of sentences in each split
len(train_samples), len(val_samples), len(test_samples)


In [None]:
import pandas as pd

# One DataFrame per split, built from the lists of sample dictionaries
train_df, val_df, test_df = map(
    pd.DataFrame, (train_samples, val_samples, test_samples)
)
# Preview the first 14 rows of the training split
train_df.head(14)

In [None]:
# Distribution of labels in training data
train_df.target.value_counts()

In [None]:
import matplotlib.pyplot as plt

# Check length of different lines
train_df.total_lines.plot.hist()
plt.title("Distribution of total lines in abstracts")

### Get lists of sentences

In [None]:
# Pull the "text" column of each split out as a plain Python list
train_sentences, val_sentences, test_sentences = (
    split_df["text"].tolist() for split_df in (train_df, val_df, test_df)
)
# Number of sentences in each split
len(train_sentences), len(val_sentences), len(test_sentences)

In [None]:
# View the first 10 lines of the training data
train_sentences[:10]

### Make numeric labels

In [None]:
import tensorflow as tf
# One-hot encode the labels
from sklearn.preprocessing import OneHotEncoder
# alternatively use tf.one_hot

# We want a dense (non-sparse) matrix. scikit-learn 1.2 renamed the `sparse`
# argument to `sparse_output` and 1.4 removed the old name, so try the new
# spelling first and fall back to the old one for older installations.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False)

# Fit the encoder on the training labels only, then reuse it for the other
# splits. The encoder expects a 2D input, hence the reshape to a single column.
train_labels_one_hot = one_hot_encoder.fit_transform(train_df.target.to_numpy().reshape(-1, 1))
val_labels_one_hot = one_hot_encoder.transform(val_df.target.to_numpy().reshape(-1, 1))
test_labels_one_hot = one_hot_encoder.transform(test_df.target.to_numpy().reshape(-1, 1))
# Check what one-hot encoded labels look like (shown as a tensor)
tf.constant(train_labels_one_hot)

In [None]:
import sklearn
print(sklearn.__version__)

### Label encode labels

In [None]:
# Integer-encode the "target" column of each split
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
# Fit on the training labels, then apply the same mapping to the other splits
train_labels_encoded = label_encoder.fit_transform(train_df["target"].to_numpy())
val_labels_encoded, test_labels_encoded = (
    label_encoder.transform(split_df["target"].to_numpy())
    for split_df in (val_df, test_df)
)
# Peek at the first 12 encoded training labels
train_labels_encoded[:12]


In [None]:
# Get class names and number of classes from LabelEncoder instance
num_classes = len(label_encoder.classes_)
class_names = label_encoder.classes_
num_classes, class_names

## Series of Experiments

* 0 - Naive Bayes with TF-IDF encoder (baseline)
* 1 - Conv1D with token embeddings
* 2 - TF Hub Pretrained Feature Extractor
* 3 - Conv1D with character embeddings
* 4 - Combining pretrained token embeddings + character embeddings (hybrid embedding layer)
* 5 - Combining pretrained token embeddings + character embeddings + positional embeddings

[Machine Learning Testing Map](https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html)

## 0 Baseline Model: Naive Bayes

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Baseline pipeline: TF-IDF turns the sentences into numeric features,
# which a multinomial naive Bayes classifier then models
model_0 = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("clf", MultinomialNB()),
    ]
)

# Train the whole pipeline on the training sentences and integer labels
model_0.fit(train_sentences, y=train_labels_encoded)

In [None]:
# Evaluate our baseline model
baseline_score =  model_0.score(val_sentences, val_labels_encoded)
print(f'Baseline accuracy score: {baseline_score * 100:.2f}%')

In [None]:
# Make predictions using baseline model
baseline_preds = model_0.predict(val_sentences)
baseline_preds

### Use helper functions script

In [None]:
from _helper_functions import calculate_results
# Calculate baseline results
baseline_results = calculate_results(y_true=val_labels_encoded,
                                        y_pred=baseline_preds)

baseline_results


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Number of whitespace-separated tokens (words) in every training sentence
sent_lens = list(map(len, map(str.split, train_sentences)))
# Mean sentence length in tokens
avg_sent_len = np.mean(sent_lens)
print(avg_sent_len)

# Histogram of the sentence lengths, grouped into 20 bins
plt.hist(sent_lens, bins=20)
plt.title("Distribution of sentence length");

In [None]:
# How long of a sentence length covers 95% of the examples?
output_seq_len = int(np.percentile(sent_lens, 95))
print(output_seq_len, "tokens (words) or less covers 95% of training examples")

In [None]:
# Tokenize train sentences, turn them into embeddings and build a model
import random

import tensorflow as tf
from tensorflow.keras import layers

# `TextVectorization` moved out of the `experimental.preprocessing` namespace
# (stable location since TensorFlow 2.6); prefer the stable import and fall
# back to the old path for older installations.
try:
    from tensorflow.keras.layers import TextVectorization
except ImportError:
    from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Setup text vectorization variables
max_vocab_length = 10000  # maximum number of tokens kept in the vocabulary
# Pad/truncate every sentence to the length that covers 95% of the training
# sentences (`output_seq_len`, computed in the sentence-length analysis cell)
max_length = output_seq_len

# Create text vectorizer
text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode="int",
                                    output_sequence_length=max_length)

# Fit the text vectorizer to the training text
text_vectorizer.adapt(train_sentences)

# Create a sample sentence and tokenize it
sample_sentence = "There are two types of tensors: scalars (0D tensors), vectors (1D tensors), matrices (2D tensors), and tensors with more axes (3D tensors and higher)."
print(text_vectorizer([sample_sentence]))

# Get the unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5]
bottom_5_words = words_in_vocab[-5:]
print(f"Number of words in vocab: {len(words_in_vocab)}")
print(f"First 5 words in vocab: {top_5_words}")
print(f"Last 5 words in vocab: {bottom_5_words}")

# Create an embedding layer
embedding = layers.Embedding(input_dim=max_vocab_length,
                             output_dim=128,
                             embeddings_initializer="uniform",
                             input_length=max_length)

# Get a random sentence from the training set and show its vectorized form
random_sentence = random.choice(train_sentences)
print(f"Original text:\n{random_sentence}\n\nVectorized version:")
print(text_vectorizer([random_sentence]))

# Embed the random sentence (you can also pass a list of sentences)
sample_embed = embedding(text_vectorizer([random_sentence]))
print(f"\nEmbedded shape: {sample_embed.shape}")

# Create a 1D conv model to process sequences:
# raw string -> token ids -> embeddings -> Conv1D -> average pool -> softmax
inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.Conv1D(64, 5, activation="relu")(x)
x = layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(num_classes, activation="softmax")(x)
model_1 = tf.keras.Model(inputs, outputs, name="model_1_conv1D")

# Get a summary of our model
model_1.summary()


In [None]:
from _helper_functions import create_tensorboard_callback

# Configure training: the labels are one-hot encoded, so use categorical
# cross-entropy; track accuracy while training
model_1.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

# Train for 5 epochs, validating on the validation split after each epoch.
# The callback comes from the project's helper module (presumably it logs
# the run for TensorBoard under the given directory/experiment name).
history_1 = model_1.fit(
    x=train_sentences,
    y=train_labels_one_hot,
    epochs=5,
    validation_data=(val_sentences, val_labels_one_hot),
    callbacks=[
        create_tensorboard_callback(
            dir_name="tensorflow_hub",
            experiment_name="model_1_conv1D",
        )
    ],
)
