In [1]:
import pandas as pd

In [5]:
train_csv = pd.read_csv(r"datasets/train.csv")

print("Training set shape", train_csv.shape)

train_csv.head()

Training set shape (100, 3)


Unnamed: 0,url,doc_id,label
0,http://elbe-elster-klinikum.de/fachbereiche/ch...,1,1
1,http://klinikum-bayreuth.de/einrichtungen/zent...,3,3
2,http://klinikum-braunschweig.de/info.php/?id_o...,4,1
3,http://klinikum-braunschweig.de/info.php/?id_o...,5,1
4,http://klinikum-braunschweig.de/zuweiser/tumor...,6,3


In [9]:
train_csv['label'].value_counts()

label
2    59
1    32
3     9
Name: count, dtype: int64

In [6]:
test_csv = pd.read_csv(r"datasets/test.csv")
print("Test set shape", test_csv.shape)
test_csv.head()

Test set shape (48, 2)


Unnamed: 0,url,doc_id
0,http://chirurgie-goettingen.de/medizinische-ve...,0
1,http://evkb.de/kliniken-zentren/chirurgie/allg...,2
2,http://krebszentrum.kreiskliniken-reutlingen.d...,7
3,http://marienhospital-buer.de/mhb-av-chirurgie...,15
4,http://marienhospital-buer.de/mhb-av-chirurgie...,16


We have 100 documents in the training set, and 48 in the test set. We have 32 documents that mention no tumor board (label = 1), 59 documents where a tumor board is mentioned, but we are not certain if it is the main focus of the page (label = 2), and 9 documents for which we are certain that they are dedicated to tumor boards.

In [7]:
tumor_keywords = pd.read_csv("datasets/keyword2tumor_type.csv")
print("Tumor keywords set shape", tumor_keywords.shape)
tumor_keywords.head()

Tumor keywords set shape (126, 2)


Unnamed: 0,keyword,tumor_type
0,senologische,Brust
1,brustzentrum,Brust
2,breast,Brust
3,thorax,Brust
4,thorakale,Brust


In [8]:
tumor_keywords['tumor_type'].value_counts()

tumor_type
Lunge                           10
Darm                            10
Gynäkologie                      8
Interdisziplinär                 7
Haut                             7
Hämatooncology                   7
Magen                            7
Brust                            6
Urologische                      6
Kopf-hals                        6
Sarkome                          6
Endokrine malignome              5
Pädiatrische                     4
Mamma carcinoma                  4
Pankreas                         3
Prostata                         3
Gallenblasen/gallengangkrebs     3
Neuroonkologie                   3
Leber                            2
Hoden, penis                     2
Knochentumoren                   2
Niere                            1
Mikroskopieren                   1
Stammzelltransplantation         1
Schwerpunkt                      1
Prätherapeutische                1
Oral                             1
Molekular                        1
Harnblase

## Loading Data

In [11]:
def read_html(doc_id: int) -> str:
    """
    Reads the HTML file at the specified path.
    Since the language of the documents is German,
    we need to specify the 'latin1' encoding, rather
    than the more common 'utf-8'. For more info about
    the encoding, see: https://en.wikipedia.org/wiki/ISO/IEC_8859-1
    """
    with open(file=f"datasets/htmls/{doc_id}.html",
              mode="r",
              encoding="latin1") as f:
        html = f.read()
    return html


# this will store the actual HTML text in the 'html' column
train_csv["html"] = train_csv["doc_id"].apply(read_html)

In [12]:
# print a sample to get familiar with the data at this point
# the random_state argument is needed to provide deterministic output
train_csv.sample(n=5, random_state=42)

Unnamed: 0,url,doc_id,label,html
83,http://www.sbk-vs.de/de/medizin/leistungen-und...,125,1,"\n\n<!DOCTYPE HTML>\n<html dir=""ltr"" lang=""de_..."
53,http://www.klinikum-esslingen.de/kliniken-und-...,85,2,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or..."
70,http://www.malteser-kliniken-rhein-ruhr.de/med...,107,2,"<!DOCTYPE html>\n<html lang=""de"">\n<head>\n\n<..."
45,http://www.klilu.de/medizin__pflege/kliniken_u...,73,2,"<!DOCTYPE html>\n<html lang=""de""><head>\n\t<me..."
44,http://www.kk-bochum.de/de/kliniken_zentren_be...,72,1,"<!DOCTYPE html PUBLIC ""-//W3C//DTD HTML 4.01 T..."


In [14]:
import warnings

from bs4 import BeautifulSoup

warnings.filterwarnings(action="ignore")


def extract_html_text(html: str) -> str:
    """
    Extracts the text from the provided HTML.
    Using the 'lxml' parser has excellent encoding detection
    and provides better results for HTMLs which do not
    declare their encoding.
    """
    bs = BeautifulSoup(markup=html, features="lxml")
    for script in bs(name=["script", "style"]):
        # remove all <script> and <style> tags from the HTML
        script.decompose()
    return bs.get_text(separator=" ")


# extract text elements from the HTML
train_csv["html_text"] = train_csv["html"].apply(extract_html_text)

In [16]:
# printing a sample to observe the data
train_csv.head()

Unnamed: 0,url,doc_id,label,html,html_text
0,http://elbe-elster-klinikum.de/fachbereiche/ch...,1,1,<!DOCTYPE html>\n<!-- jsn_reta_pro 1.0.2 -->\n...,\n \n \n \n \n \n Elbe-Elster Klinikum - Chiru...
1,http://klinikum-bayreuth.de/einrichtungen/zent...,3,3,"<!DOCTYPE html>\n<html class=""no-js"" lang=""de""...",\n \n \n \n \n \n \n Onkologisches Zentrum - K...
2,http://klinikum-braunschweig.de/info.php/?id_o...,4,1,"<!doctype html>\n<html lang=""de"">\n<head>\n\t<...",\n \n Zentrum - SozialpÃ¤diatrisches Zentrum -...
3,http://klinikum-braunschweig.de/info.php/?id_o...,5,1,"<!doctype html>\n<html lang=""de"">\n<head>\n\t<...",\n \n Leistung - Spezielle UnterstÃ¼tzung bei ...
4,http://klinikum-braunschweig.de/zuweiser/tumor...,6,3,"<!doctype html>\n<html lang=""de"">\n<head>\n\t<...",\n \n Zuweiser - Tumorkonferenzen - Tumorkonfe...


we immediately observe an issue, and that is the large number of new line symbols \n at the beginning of each document. Ideally, we would want to provide clear text, with no special characters and in a proper, human-readable format. To achieve that, we will try to utilize some of the methods in the gensim library.

In [17]:
from gensim.parsing import preprocessing


def preprocess_html_text(html_text: str) -> str:
    """
    The preprocessing consists of the following six steps:

    1. Strips all non-alphanumerical characters.
    2. Strips all multiple whitespaces.
    3. Strips all punctuation.
    4. Strips all numerical characters.
    5. Converts to lowercase and then stems the text.
    6. Removes all stop-words.
    """
    preprocessed_text = preprocessing.strip_non_alphanum(s=html_text)
    preprocessed_text = preprocessing.strip_multiple_whitespaces(s=preprocessed_text)
    preprocessed_text = preprocessing.strip_punctuation(s=preprocessed_text)
    preprocessed_text = preprocessing.strip_numeric(s=preprocessed_text)

    preprocessed_text = preprocessing.stem_text(text=preprocessed_text)
    preprocessed_text = preprocessing.remove_stopwords(s=preprocessed_text)
    return preprocessed_text


train_csv["preprocessed_html_text"] = train_csv["html_text"].apply(preprocess_html_text)

In [18]:
train_csv.head()

Unnamed: 0,url,doc_id,label,html,html_text,preprocessed_html_text
0,http://elbe-elster-klinikum.de/fachbereiche/ch...,1,1,<!DOCTYPE html>\n<!-- jsn_reta_pro 1.0.2 -->\n...,\n \n \n \n \n \n Elbe-Elster Klinikum - Chiru...,elb elster klinikum chirurgi finsterwald suche...
1,http://klinikum-bayreuth.de/einrichtungen/zent...,3,3,"<!DOCTYPE html>\n<html class=""no-js"" lang=""de""...",\n \n \n \n \n \n \n Onkologisches Zentrum - K...,onkologisch zentrum klinikum bayreuth aktuel ã...
2,http://klinikum-braunschweig.de/info.php/?id_o...,4,1,"<!doctype html>\n<html lang=""de"">\n<head>\n\t<...",\n \n Zentrum - SozialpÃ¤diatrisches Zentrum -...,zentrum sozialpã diatrisch zentrum stã dtisch ...
3,http://klinikum-braunschweig.de/info.php/?id_o...,5,1,"<!doctype html>\n<html lang=""de"">\n<head>\n\t<...",\n \n Leistung - Spezielle UnterstÃ¼tzung bei ...,leistung speziel unterstã¼tzung bei der anmeld...
4,http://klinikum-braunschweig.de/zuweiser/tumor...,6,3,"<!doctype html>\n<html lang=""de"">\n<head>\n\t<...",\n \n Zuweiser - Tumorkonferenzen - Tumorkonfe...,zuweis tumorkonferenzen tumorkonferenz gastroi...


## EDA

In [19]:
import plotly.express as px
import plotly.offline as pyo

# set notebook mode to work in offline
# should enable viewing of plotly plots in offline mode
pyo.init_notebook_mode(connected=True)

In [22]:
px.histogram(x=train_csv["preprocessed_html_text"].apply(len), title="Distribution of Text Length (Character Count)")

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed