In [2]:
import pandas as pd

In [3]:
train_csv = pd.read_csv(r"datasets/train.csv")

print("Training set shape", train_csv.shape)

train_csv.head()

Training set shape (100, 3)


Unnamed: 0,url,doc_id,label
0,http://elbe-elster-klinikum.de/fachbereiche/ch...,1,1
1,http://klinikum-bayreuth.de/einrichtungen/zent...,3,3
2,http://klinikum-braunschweig.de/info.php/?id_o...,4,1
3,http://klinikum-braunschweig.de/info.php/?id_o...,5,1
4,http://klinikum-braunschweig.de/zuweiser/tumor...,6,3


In [4]:
train_csv['label'].value_counts()

label
2    59
1    32
3     9
Name: count, dtype: int64

In [5]:
test_csv = pd.read_csv(r"datasets/test.csv")
print("Test set shape", test_csv.shape)
test_csv.head()

Test set shape (48, 2)


Unnamed: 0,url,doc_id
0,http://chirurgie-goettingen.de/medizinische-ve...,0
1,http://evkb.de/kliniken-zentren/chirurgie/allg...,2
2,http://krebszentrum.kreiskliniken-reutlingen.d...,7
3,http://marienhospital-buer.de/mhb-av-chirurgie...,15
4,http://marienhospital-buer.de/mhb-av-chirurgie...,16


We have 100 documents in the training set, and 48 in the test set. We have 32 documents that mention no tumor board (label = 1), 59 documents where a tumor board is mentioned, but we are not certain if it is the main focus of the page (label = 2), and 9 documents for which we are certain that they are dedicated to tumor boards.

In [6]:
tumor_keywords = pd.read_csv("datasets/keyword2tumor_type.csv")
print("Tumor keywords set shape", tumor_keywords.shape)
tumor_keywords.head()

Tumor keywords set shape (126, 2)


Unnamed: 0,keyword,tumor_type
0,senologische,Brust
1,brustzentrum,Brust
2,breast,Brust
3,thorax,Brust
4,thorakale,Brust


In [7]:
tumor_keywords['tumor_type'].value_counts()

tumor_type
Lunge                           10
Darm                            10
Gynäkologie                      8
Interdisziplinär                 7
Haut                             7
Hämatooncology                   7
Magen                            7
Brust                            6
Urologische                      6
Kopf-hals                        6
Sarkome                          6
Endokrine malignome              5
Pädiatrische                     4
Mamma carcinoma                  4
Pankreas                         3
Prostata                         3
Gallenblasen/gallengangkrebs     3
Neuroonkologie                   3
Leber                            2
Hoden, penis                     2
Knochentumoren                   2
Niere                            1
Mikroskopieren                   1
Stammzelltransplantation         1
Schwerpunkt                      1
Prätherapeutische                1
Oral                             1
Molekular                        1
Harnblase

## Loading Data

In [8]:
def read_html(doc_id: int) -> str:
    """
    Reads the HTML file at the specified path.
    Since the language of the documents is German,
    we need to specify the 'latin1' encoding, rather
    than the more common 'utf-8'. For more info about
    the encoding, see: https://en.wikipedia.org/wiki/ISO/IEC_8859-1
    """
    with open(file=f"datasets/htmls/{doc_id}.html",
              mode="r",
              encoding="latin1") as f:
        html = f.read()
    return html


# this will store the actual HTML text in the 'html' column
train_csv["html"] = train_csv["doc_id"].apply(read_html)

In [9]:
# print a sample to get familiar with the data at this point
# the random_state argument is needed to provide deterministic output
train_csv.sample(n=5, random_state=42)

Unnamed: 0,url,doc_id,label,html
83,http://www.sbk-vs.de/de/medizin/leistungen-und...,125,1,"\n\n<!DOCTYPE HTML>\n<html dir=""ltr"" lang=""de_..."
53,http://www.klinikum-esslingen.de/kliniken-und-...,85,2,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or..."
70,http://www.malteser-kliniken-rhein-ruhr.de/med...,107,2,"<!DOCTYPE html>\n<html lang=""de"">\n<head>\n\n<..."
45,http://www.klilu.de/medizin__pflege/kliniken_u...,73,2,"<!DOCTYPE html>\n<html lang=""de""><head>\n\t<me..."
44,http://www.kk-bochum.de/de/kliniken_zentren_be...,72,1,"<!DOCTYPE html PUBLIC ""-//W3C//DTD HTML 4.01 T..."


In [10]:
import warnings

from bs4 import BeautifulSoup

warnings.filterwarnings(action="ignore")


def extract_html_text(html: str) -> str:
    """
    Extracts the text from the provided HTML.
    Using the 'lxml' parser has excellent encoding detection
    and provides better results for HTMLs which do not
    declare their encoding.
    """
    bs = BeautifulSoup(markup=html, features="lxml")
    for script in bs(name=["script", "style"]):
        # remove all <script> and <style> tags from the HTML
        script.decompose()
    return bs.get_text(separator=" ")


# extract text elements from the HTML
train_csv["html_text"] = train_csv["html"].apply(extract_html_text)

In [11]:
# printing a sample to observe the data
train_csv.head()

Unnamed: 0,url,doc_id,label,html,html_text
0,http://elbe-elster-klinikum.de/fachbereiche/ch...,1,1,<!DOCTYPE html>\n<!-- jsn_reta_pro 1.0.2 -->\n...,\n \n \n \n \n \n Elbe-Elster Klinikum - Chiru...
1,http://klinikum-bayreuth.de/einrichtungen/zent...,3,3,"<!DOCTYPE html>\n<html class=""no-js"" lang=""de""...",\n \n \n \n \n \n \n Onkologisches Zentrum - K...
2,http://klinikum-braunschweig.de/info.php/?id_o...,4,1,"<!doctype html>\n<html lang=""de"">\n<head>\n\t<...",\n \n Zentrum - SozialpÃ¤diatrisches Zentrum -...
3,http://klinikum-braunschweig.de/info.php/?id_o...,5,1,"<!doctype html>\n<html lang=""de"">\n<head>\n\t<...",\n \n Leistung - Spezielle UnterstÃ¼tzung bei ...
4,http://klinikum-braunschweig.de/zuweiser/tumor...,6,3,"<!doctype html>\n<html lang=""de"">\n<head>\n\t<...",\n \n Zuweiser - Tumorkonferenzen - Tumorkonfe...


we immediately observe an issue, and that is the large number of new line symbols \n at the beginning of each document. Ideally, we would want to provide clear text, with no special characters and in a proper, human-readable format. To achieve that, we will try to utilize some of the methods in the gensim library.

In [12]:
from gensim.parsing import preprocessing


def preprocess_html_text(html_text: str) -> str:
    """
    The preprocessing consists of the following six steps:

    1. Strips all non-alphanumerical characters.
    2. Strips all multiple whitespaces.
    3. Strips all punctuation.
    4. Strips all numerical characters.
    5. Converts to lowercase and then stems the text.
    6. Removes all stop-words.
    """
    preprocessed_text = preprocessing.strip_non_alphanum(s=html_text)
    preprocessed_text = preprocessing.strip_multiple_whitespaces(s=preprocessed_text)
    preprocessed_text = preprocessing.strip_punctuation(s=preprocessed_text)
    preprocessed_text = preprocessing.strip_numeric(s=preprocessed_text)

    preprocessed_text = preprocessing.stem_text(text=preprocessed_text)
    preprocessed_text = preprocessing.remove_stopwords(s=preprocessed_text)
    return preprocessed_text


train_csv["preprocessed_html_text"] = train_csv["html_text"].apply(preprocess_html_text)

In [13]:
train_csv.head()

Unnamed: 0,url,doc_id,label,html,html_text,preprocessed_html_text
0,http://elbe-elster-klinikum.de/fachbereiche/ch...,1,1,<!DOCTYPE html>\n<!-- jsn_reta_pro 1.0.2 -->\n...,\n \n \n \n \n \n Elbe-Elster Klinikum - Chiru...,elb elster klinikum chirurgi finsterwald suche...
1,http://klinikum-bayreuth.de/einrichtungen/zent...,3,3,"<!DOCTYPE html>\n<html class=""no-js"" lang=""de""...",\n \n \n \n \n \n \n Onkologisches Zentrum - K...,onkologisch zentrum klinikum bayreuth aktuel ã...
2,http://klinikum-braunschweig.de/info.php/?id_o...,4,1,"<!doctype html>\n<html lang=""de"">\n<head>\n\t<...",\n \n Zentrum - SozialpÃ¤diatrisches Zentrum -...,zentrum sozialpã diatrisch zentrum stã dtisch ...
3,http://klinikum-braunschweig.de/info.php/?id_o...,5,1,"<!doctype html>\n<html lang=""de"">\n<head>\n\t<...",\n \n Leistung - Spezielle UnterstÃ¼tzung bei ...,leistung speziel unterstã¼tzung bei der anmeld...
4,http://klinikum-braunschweig.de/zuweiser/tumor...,6,3,"<!doctype html>\n<html lang=""de"">\n<head>\n\t<...",\n \n Zuweiser - Tumorkonferenzen - Tumorkonfe...,zuweis tumorkonferenzen tumorkonferenz gastroi...


## EDA

In [15]:
import plotly.express as px
import plotly.offline as pyo

# set notebook mode to work in offline
# should enable viewing of plotly plots in offline mode
pyo.init_notebook_mode(connected=True)

In [16]:
px.histogram(x=train_csv["preprocessed_html_text"].apply(len), title="Distribution of Text Length (Character Count)")

There is one document with 170-179K characters. Others are with < 50K character count in total.

In [17]:
px.histogram(x=train_csv["preprocessed_html_text"].apply(lambda text: text.split(" ")).apply(len),
             title="Distribution of Text Length (Word Count)")

There is one document with 27-28K words. Other documents all have < 6K words in total.

In [70]:
px.histogram(x=train_csv["preprocessed_html_text"].apply(lambda text: set(text.split(" "))).apply(len),
             title="Unique Words Count")

There is one document with 6500-7000 unique words. All others consist of < 2000 unique words.

## Predictive Modeling
We will use Tensorflow to build a neural network that will be able to consume the texts we have pre-processed and output a label for them. Tensorflow is widely used in the data science community for solving tasks that deal with non-tabular data, such as natural language processing, computer vision, audio processing, etc. It has great support and is highly optimized for creating production-ready, state-of-the-art neural network models.

To solve our task, which falls under the umbrella of natural language processing, we will use a model called the siamese network. Siamese networks can address the class imbalance and small data set sizes. They are mostly used in few shots learning tasks, like signature verification systems, face recognition, object detection, etc.

They fit our task well. We have a relatively small data set (< 100 samples), and we have a class imbalance (only 9 training instances with label = 3 compared to 59 instances with label = 2 and 32 with label = 1).

Let's get started building our neural network. First, we import some needed libraries, like Tensorflow, NumPy, and Python's random package.

In [19]:
import random
import numpy as np
import tensorflow as tf

# it is always useful to set the random seeds
# wherever possible, for reproducibility of results
np.random.seed(42)
tf.random.set_seed(seed=42)

In [71]:
class Pair(tf.keras.utils.Sequence):
    def __init__(self, dataframe: pd.DataFrame, labels: pd.Series, n_batch: int, batch_size: int):
        """Initialization"""
        self.dataframe = dataframe
        self.labels = labels
        self.n_batch = n_batch
        self.batch_size = batch_size
        self.all_classes = set(self.labels)
        self.anchor_groups = {}
        for target_class in self.all_classes:
            self.anchor_groups[target_class] = {
                "positive": self.dataframe[self.labels == target_class],
                "negative": self.dataframe[self.labels != target_class]
            }

    def __len__(self):
        return self.n_batch

    def __getitem__(self, item):
        pairs = []

        for i in range(int(self.batch_size / 2)):
            anchor_class = random.randint(1, 3)
            anchor_group = self.anchor_groups[anchor_class]["positive"]
            not_anchor_group = self.anchor_groups[anchor_class]["negative"]

            anchor = anchor_group.sample(n=1).iloc[0]
            positive = anchor_group.sample(n=1).iloc[0]
            negative = not_anchor_group.sample(n=1).iloc[0]

            pairs.append([anchor, positive, 1])
            pairs.append([anchor, negative, 0])

        random.shuffle(x=pairs)
        pairs = np.array(pairs)

        data_pairs = pairs[:, :2]
        targets = pairs[:, 2]

        return data_pairs, tf.convert_to_tensor(targets, dtype=np.float32)

    def get_support_set(self, sample_size: int = 1):
        """Returns sample sets of certain size of each target class"""
        support_set = {}
        for target_class in self.all_classes:
            support_set[target_class] = self.anchor_groups[target_class]["positive"].sample(n=sample_size)
        return support_set

In [72]:
class SiameseNetwork(tf.keras.Model):
    def __init__(self, corpora: pd.Series):
        super(SiameseNetwork, self).__init__()
        self.vectorizer_layer: tf.keras.layers.TextVectorization = tf.keras.layers.TextVectorization(
            max_tokens=2000,  # empirically chosen as best, higher number overfits (see the unique words count plot)
            output_mode="int",
            output_sequence_length=512
        )
        self.vectorizer_layer.adapt(corpora.values)
        self.encoder = tf.keras.Sequential(layers=[
            self.vectorizer_layer,
            tf.keras.layers.Dense(units=256, activation=tf.keras.activations.relu),
            tf.keras.layers.Dropout(rate=0.3),
            tf.keras.layers.Dense(units=128, activation=tf.keras.activations.relu),
            tf.keras.layers.Dropout(rate=0.3),
            tf.keras.layers.Dense(units=64, activation=tf.keras.activations.relu),
            tf.keras.layers.Lambda(function=lambda x: tf.math.l2_normalize(x, axis=1))
        ])
        self.encoding_distance = tf.keras.layers.Dot(axes=1)

    def __call__(self, inputs, *args, **kwargs):
        anchors, supports = inputs[:, 0], inputs[:, 1]
        anchors_encoded = self.encoder(anchors)
        supports_encoded = self.encoder(supports)
        return self.encoding_distance((anchors_encoded, supports_encoded))

    def predict_with_support_set(self, entry, support_set: dict):
        """
        Custom method that wraps around the __call__ method.
        It is used to pass the entry (input text) multiple times
        through the model to average out the losses and provide more
        stable estimate.
        """
        scores = {}
        for instance_class, texts in support_set.items():
            class_scores = ([self(np.array([entry, text]).reshape((-1, 2))) for text in texts])
            scores[instance_class] = tf.math.reduce_mean(class_scores)
        return max(scores, key=scores.get)

In [73]:
model = SiameseNetwork(corpora=train_csv["preprocessed_html_text"])

In [81]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])


At this point, we have our model, our data, and the data generator. We are ready to commence training.

But, before we do that, let's split the data in train_csv into training and validation sets. We'll use 20% of the documents as validation, and the remainder as training. Sklearn's train_test_split method is very convenient for doing that. Furthermore, notice that we stratify the split by the label. This is important because it prevents the case where the split is done only on a single class, or the splits have unrepresentative class distribution.

In [75]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(train_csv["preprocessed_html_text"], train_csv["label"],
                                                      test_size=0.2,
                                                      random_state=42, stratify=train_csv["label"])

In [76]:
# training params
BATCH_SIZE = 64
N_BATCH = 100
# we instantiate training and validation data / pair generators
TRAIN_PAIR_GENERATOR = Pair(dataframe=X_train, labels=y_train, n_batch=N_BATCH, batch_size=BATCH_SIZE)
VALID_PAIR_GENERATOR = Pair(dataframe=X_valid, labels=y_valid, n_batch=N_BATCH, batch_size=BATCH_SIZE)

Finally, we put in an early stopping callback method that will stop the training prematurely if the validation loss does not decrease for 3 straight epochs.

In [77]:
early_stopping_callback = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=3)

In [78]:
history = model.fit(
    x=TRAIN_PAIR_GENERATOR,
    validation_data=VALID_PAIR_GENERATOR,
    epochs=10,
    callbacks=[early_stopping_callback],
    verbose=1
)

ValueError: Invalid dtype: str5613664

In [79]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(train_csv["preprocessed_html_text"], train_csv["label"],
                                                      test_size=0.2, random_state=42, stratify=train_csv["label"])

# Training params
BATCH_SIZE = 64
N_BATCH = 100

# Instantiate training and validation data / pair generators
TRAIN_PAIR_GENERATOR = Pair(dataframe=X_train, labels=y_train, n_batch=N_BATCH, batch_size=BATCH_SIZE)
VALID_PAIR_GENERATOR = Pair(dataframe=X_valid, labels=y_valid, n_batch=N_BATCH, batch_size=BATCH_SIZE)

early_stopping_callback = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=3)

model = SiameseNetwork(corpora=train_csv["preprocessed_html_text"])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])

history = model.fit(
    x=TRAIN_PAIR_GENERATOR,
    validation_data=VALID_PAIR_GENERATOR,
    epochs=10,
    callbacks=[early_stopping_callback],
    verbose=1
)


ValueError: Invalid dtype: str1335392