# Keyword Detection on Websites
Source: https://platform.stratascratch.com/data-projects/keyword-detection-websites


In [85]:
import pandas as pd
import numpy as np
import chardet
import torch
from bs4 import BeautifulSoup
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from datasets import Dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
import spacy
from xgboost import XGBClassifier


### (1) Load and examine data

In [2]:
# Labelled training pages and the unlabelled test pages; both are keyed by doc_id.
traindt = pd.read_csv(filepath_or_buffer="./datasets/train.csv")
testdt = pd.read_csv(filepath_or_buffer="./datasets/test.csv")

In [3]:
# Peek at the first rows of the test index (one url and doc_id per page).
testdt.head()

Unnamed: 0,url,doc_id
0,http://chirurgie-goettingen.de/medizinische-ve...,0
1,http://evkb.de/kliniken-zentren/chirurgie/allg...,2
2,http://krebszentrum.kreiskliniken-reutlingen.d...,7
3,http://marienhospital-buer.de/mhb-av-chirurgie...,15
4,http://marienhospital-buer.de/mhb-av-chirurgie...,16


In [4]:
N_DOCS = 147  # pages are stored as ./datasets/htmls/0.html ... ./datasets/htmls/146.html
htmldt_0 = []

for i in range(N_DOCS):
    # Read the raw bytes once; they feed both charset detection and decoding.
    with open(f"./datasets/htmls/{i}.html", "rb") as f:
        raw = f.read()

    # chardet reports None as the encoding when it cannot make a guess; fall back to
    # UTF-8 instead of silently depending on the platform's locale default.
    encoding = chardet.detect(raw)["encoding"] or "utf-8"

    # errors="replace" keeps one mis-detected byte from aborting the whole load.
    # Newlines are normalised the same way a text-mode open() would do it.
    html = raw.decode(encoding, errors="replace").replace("\r\n", "\n").replace("\r", "\n")

    # Strip all markup and keep only the visible text, separated by single spaces.
    soup = BeautifulSoup(html, "html.parser")
    body = soup.get_text(" ", strip=True)
    htmldt_0.append({"doc_id": i, "html": body})


  k = self.parse_starttag(i)


In [5]:
# Attach url/label from the training CSV to every parsed page (left join, so pages
# without a training row are kept). Labels are shifted to start at 0; pages with no
# label end up as -1. label_pred is an empty placeholder column.
htmldt = pd.merge(
    pd.DataFrame(htmldt_0),
    traindt,
    how="left",
    on="doc_id",
    validate="1:1",
).assign(
    label_pred=None,
    label=lambda frame: frame["label"].fillna(0).astype(int).sub(1),
)

In [6]:
# Inspect the merged frame; label == -1 marks pages that have no training label.
htmldt

Unnamed: 0,doc_id,html,url,label,label_pred
0,0,"Bauchspeicheldrüse | Klinik für Allgemein-, Vi...",,-1,
1,1,Elbe-Elster Klinikum - Chirurgie Finsterwalde ...,http://elbe-elster-klinikum.de/fachbereiche/ch...,0,
2,2,Chirurgie der Bauchspeicheldrüse (Pankreaschir...,,-1,
3,3,Onkologisches Zentrum - Klinikum Bayreuth Aktu...,http://klinikum-bayreuth.de/einrichtungen/zent...,2,
4,4,Zentrum - Sozialpädiatrisches Zentrum - Städti...,http://klinikum-braunschweig.de/info.php/?id_o...,0,
...,...,...,...,...,...
142,142,Tumorboard des Lungenkrebszentrums Möglicherwe...,,-1,
143,143,Oberarzt (m/w/d) für die HNO-Klinik A- A A+ No...,,-1,
144,144,Für Ärzte | Vivantes JavaScript scheint in Ihr...,http://www.vivantes.de/fuer-sie-vor-ort/klinik...,1,
145,145,"Innere Medizin – Hämatologie, Onkologie und Pa...",http://www.vivantes.de/fuer-sie-vor-ort/klinik...,1,


In [24]:
# Summary statistics of the whitespace-separated word count per page.
htmldt.html.str.split().apply(len).describe()

count      147.000000
mean      1883.755102
std       3867.911788
min        206.000000
25%        691.500000
50%        966.000000
75%       1832.500000
max      36214.000000
Name: html, dtype: float64

### (2) LLM based Classification

In [7]:
# Uncomment to use the model
# pipe = pipeline("text-generation", model="meta-llama/Llama-3.2-1B-Instruct")

In [8]:
max_new_tokens = 10

# Only the first page is sent through the model here (smoke test); loop over
# range(htmldt.shape[0]) instead to label every page.
# NOTE(review): `pipe` must already exist in the kernel. The cell that builds it is
# commented out, so a fresh Restart & Run All stops here with a NameError.
for i in tqdm(range(1)):
    context = htmldt["html"].iloc[i]

    # Single-turn chat prompt: task description followed by the page text.
    messages = [
        {
            "role": "user",
            "content": f"""
            You will review text extracted from German HTML webpages and respond in English.
            Tumor Board is a consilium of doctors (usually from different disciplines) discussing cancer cases in their departments.
            Task: Determine if the text refers to tumor boards and assign one of the following labels:
            - '1': No mention of tumor boards.
            - '2': Mentions tumor boards, but the page is not fully dedicated to their description.
            - '3': The page is fully dedicated to describing tumor board types and dates.

            Respond strictly with one of the numbers ('1', '2', or '3'). Do not include any additional text. 
            Limit your response to {max_new_tokens} tokens.

            Text for review: {context}
            """,
        },
    ]

    generation = pipe(
        messages,
        temperature=0.001,
        pad_token_id=128001,
        max_new_tokens=max_new_tokens,
    )

    # The pipeline returns the whole conversation; take the first assistant turn.
    assistant_replies = (
        turn["content"]
        for turn in generation[0]["generated_text"]
        if turn["role"] == "assistant"
    )
    label = next(assistant_replies)
    print(f"{i}: {label}")

    # Store the raw model answer; it is not parsed or validated here.
    htmldt.loc[i, "label_pred"] = label

  0%|          | 0/1 [00:00<?, ?it/s]

NameError: name 'pipe' is not defined

In [45]:
# Tally the raw strings stored in label_pred to see how consistent the answers are.
htmldt.label_pred.value_counts()

label_pred
1. No Evidence                                        62
1. No Evidence\n2. Medium confidence\n                31
1                                                      9
'3'                                                    7
1. No                                                  7
Tumor Board is a consilium of doctors                  5
Here are the three labels for the text:\n\n1           4
1.                                                     4
3                                                      3
1. '3'                                                 2
The text is about the Sana Klinikum                    1
This text appears to be a list of albums by            1
No Evidence                                            1
Here are the three labels for the given text:\n\n      1
Tumor Board: '3'                                       1
Tumor Board: '3' (High confidence                      1
Here is the review of the text in English:\n\n         1
Tumor Board ist eine

The results of LLM-based prompt engineering were not optimal, for several reasons:
1. Due to cost and hardware constraints, the most powerful models were not available for this exercise.
2. The concept of a 'tumor board' may be unclear to the LLM, and the training data was not fully utilized to influence the output.

For future work, OpenAI GPT models or more powerful open LLMs from the Hugging Face Hub could be tested for better performance.

### (3) Supervised training with transformer

#### Initiate the tokenizer and model

In [7]:
# German uncased BERT checkpoint with a 3-way sequence-classification head on top.
model_name = "dbmdz/bert-base-german-uncased"
hfmodel = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Print the module tree of the classification model.
print(hfmodel)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31102, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [8]:
# Freeze every parameter of the base model except those whose name contains "pooler",
# which stay trainable. Parameters outside base_model are not touched by this loop.
for param_name, weights in hfmodel.base_model.named_parameters():
    weights.requires_grad = "pooler" in param_name

In [9]:
# List every parameter of the full model together with its trainable flag.
for layer_name, weights in hfmodel.named_parameters():
    print(layer_name, weights.requires_grad)

bert.embeddings.word_embeddings.weight False
bert.embeddings.position_embeddings.weight False
bert.embeddings.token_type_embeddings.weight False
bert.embeddings.LayerNorm.weight False
bert.embeddings.LayerNorm.bias False
bert.encoder.layer.0.attention.self.query.weight False
bert.encoder.layer.0.attention.self.query.bias False
bert.encoder.layer.0.attention.self.key.weight False
bert.encoder.layer.0.attention.self.key.bias False
bert.encoder.layer.0.attention.self.value.weight False
bert.encoder.layer.0.attention.self.value.bias False
bert.encoder.layer.0.attention.output.dense.weight False
bert.encoder.layer.0.attention.output.dense.bias False
bert.encoder.layer.0.attention.output.LayerNorm.weight False
bert.encoder.layer.0.attention.output.LayerNorm.bias False
bert.encoder.layer.0.intermediate.dense.weight False
bert.encoder.layer.0.intermediate.dense.bias False
bert.encoder.layer.0.output.dense.weight False
bert.encoder.layer.0.output.dense.bias False
bert.encoder.layer.0.output.Lay

#### Preprocessing text

In [10]:
# Pages with label == -1 have no training label and form the holdout set; the rest
# are the labelled examples. Note that this rebinds `traindt`, which until now held
# the raw train.csv frame.
holdoutdt = htmldt.loc[htmldt["label"].eq(-1)].drop(columns=["label", "url", "label_pred"])
traindt = htmldt.loc[htmldt["label"].gt(-1)].drop(columns=["url", "label_pred"])

# Stratified 75/25 split of the labelled pages into train and validation.
train_df, val_df = train_test_split(
    traindt,
    stratify=traindt["label"],
    test_size=0.25,
    random_state=30,
)

# Report the shape of every frame, one per line.
print(
    f"{holdoutdt.shape=}",
    f"{traindt.shape=}",
    f"{htmldt.shape=}",
    f"{train_df.shape=}",
    f"{val_df.shape=}",
    sep="\n",
)

holdoutdt.shape=(47, 2)
traindt.shape=(100, 3)
htmldt.shape=(147, 5)
train_df.shape=(75, 3)
val_df.shape=(25, 3)


In [11]:
# Wrap each pandas split in a Dataset object for tokenisation and training.
train_ds, val_ds, holdout_ds = map(
    Dataset.from_pandas, (train_df, val_df, holdoutdt)
)

train_ds

Dataset({
    features: ['doc_id', 'html', 'label', '__index_level_0__'],
    num_rows: 75
})

In [12]:
def tokenize_function(row):
    """Tokenise the "html" text of `row` with the module-level `tokenizer`.

    Every sequence is truncated and padded to exactly 512 tokens.
    """
    encode_options = {"truncation": True, "padding": "max_length", "max_length": 512}
    return tokenizer(row["html"], **encode_options)


# Tokenise all three splits in batches (same order as before: train, val, holdout).
train_ds, val_ds, holdout_ds = (
    split.map(tokenize_function, batched=True)
    for split in (train_ds, val_ds, holdout_ds)
)

# The labelled splits expose only the model inputs and the label as torch tensors;
# the holdout split has no label column, so all of its columns are converted.
columns = ["input_ids", "attention_mask", "label"]
for labelled_split in (train_ds, val_ds):
    labelled_split.set_format(type="torch", columns=columns)
holdout_ds.set_format(type="torch")

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Map:   0%|          | 0/47 [00:00<?, ? examples/s]

#### Training

In [13]:
def compute_metrics(eval_pred):
    """Compute validation metrics from a (logits, labels) pair.

    The predicted class is the index of the largest logit along the last axis.
    Returns a dict with "accuracy" and the weighted-average "f1".
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, predicted_classes),
        "f1": f1_score(labels, predicted_classes, average="weighted"),  # or macro
    }


# NOTE(review): the recorded training log reports learning_rate 5e-05 halfway through
# the run, which does not look consistent with learning_rate=1e-5 below - the saved
# output may come from an earlier configuration; confirm by re-running.
training_args = TrainingArguments(
    # --- where checkpoints go and when they are written / evaluated ---
    output_dir="model_output",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # --- optimisation (tunable) ---
    learning_rate=1e-5,
    num_train_epochs=10,
    # --- batch sizes; adjust to the available GPU memory and data size ---
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # --- logging: report the training loss every 50 steps ---
    logging_steps=50,
)

# Trainer wiring: model, splits, tokenizer (for data processing) and metric function.
trainer = Trainer(
    args=training_args,
    model=hfmodel,
    processing_class=tokenizer,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)

In [14]:
# Run training, then evaluate the resulting model on the validation split.
trainer.train()
eval_results = trainer.evaluate()
print("evaluation results:", eval_results)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.8498169183731079, 'eval_accuracy': 0.6, 'eval_f1': 0.45, 'eval_runtime': 10.6884, 'eval_samples_per_second': 2.339, 'eval_steps_per_second': 0.374, 'epoch': 1.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.8529292941093445, 'eval_accuracy': 0.6, 'eval_f1': 0.45, 'eval_runtime': 11.0768, 'eval_samples_per_second': 2.257, 'eval_steps_per_second': 0.361, 'epoch': 2.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.8623470067977905, 'eval_accuracy': 0.6, 'eval_f1': 0.45, 'eval_runtime': 10.7587, 'eval_samples_per_second': 2.324, 'eval_steps_per_second': 0.372, 'epoch': 3.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.8593291640281677, 'eval_accuracy': 0.6, 'eval_f1': 0.45, 'eval_runtime': 10.7271, 'eval_samples_per_second': 2.331, 'eval_steps_per_second': 0.373, 'epoch': 4.0}
{'loss': 0.8927, 'grad_norm': 4.2992401123046875, 'learning_rate': 5e-05, 'epoch': 5.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.8693068027496338, 'eval_accuracy': 0.6, 'eval_f1': 0.45, 'eval_runtime': 10.3811, 'eval_samples_per_second': 2.408, 'eval_steps_per_second': 0.385, 'epoch': 5.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.8807011246681213, 'eval_accuracy': 0.6, 'eval_f1': 0.45, 'eval_runtime': 10.3934, 'eval_samples_per_second': 2.405, 'eval_steps_per_second': 0.385, 'epoch': 6.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.8860752582550049, 'eval_accuracy': 0.64, 'eval_f1': 0.5326495726495726, 'eval_runtime': 11.1775, 'eval_samples_per_second': 2.237, 'eval_steps_per_second': 0.358, 'epoch': 7.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.8930975198745728, 'eval_accuracy': 0.6, 'eval_f1': 0.5061052631578947, 'eval_runtime': 10.4553, 'eval_samples_per_second': 2.391, 'eval_steps_per_second': 0.383, 'epoch': 8.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.894818902015686, 'eval_accuracy': 0.64, 'eval_f1': 0.5326495726495726, 'eval_runtime': 10.3032, 'eval_samples_per_second': 2.426, 'eval_steps_per_second': 0.388, 'epoch': 9.0}
{'loss': 0.766, 'grad_norm': 6.987075328826904, 'learning_rate': 0.0, 'epoch': 10.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.8941504955291748, 'eval_accuracy': 0.64, 'eval_f1': 0.5326495726495726, 'eval_runtime': 11.4463, 'eval_samples_per_second': 2.184, 'eval_steps_per_second': 0.349, 'epoch': 10.0}
{'train_runtime': 540.6263, 'train_samples_per_second': 1.387, 'train_steps_per_second': 0.185, 'train_loss': 0.8293684005737305, 'epoch': 10.0}


  0%|          | 0/4 [00:00<?, ?it/s]

evaluation results: {'eval_loss': 0.8498169183731079, 'eval_accuracy': 0.6, 'eval_f1': 0.45, 'eval_runtime': 10.3811, 'eval_samples_per_second': 2.408, 'eval_steps_per_second': 0.385, 'epoch': 10.0}


#### Inference on holdout set

In [15]:
# NOTE(review): despite the variable name, these are predictions for the validation
# split (val_ds), not for the unlabelled holdout pages.
holdout_preds = trainer.predict(test_dataset=val_ds)


  0%|          | 0/4 [00:00<?, ?it/s]

In [16]:
# Predicted class = index of the largest logit; rows are true labels, columns predictions.
prediction = holdout_preds.predictions.argmax(axis=-1)
pd.crosstab(index=val_ds["label"], columns=prediction)

col_0,1
row_0,Unnamed: 1_level_1
0,8
1,15
2,2


In [17]:
# Shift the 0-based class indices back to 1-based labels.
prediction = holdout_preds.predictions.argmax(axis=-1) + 1
prediction

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2], dtype=int64)

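The transformer model did not perform well on the validation set: it predicted the same class for every page. A main reason is that the BERT model accepts at most 512 tokens per input, while the HTML pages contain between roughly 200 and 36K words, so for many observations the model sees only a small part of the content. In addition, only the pooler and the classification head were trained (the encoder was frozen) on just 75 pages, which likely limits what the model can learn.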

### (4) TF-IDF + SVM / RF / XGB

##### Text pre-processing

In [28]:
# Small German spaCy pipeline; supplies tokens, stop-word/punctuation flags and lemmas.
nlp = spacy.load("de_core_news_sm")


def preprocess(text):
    """Return `text` as a space-joined string of lower-cased lemmas.

    Stop-word and punctuation tokens are dropped; every other token is kept.
    """
    kept_lemmas = [
        token.lemma_.lower()
        for token in nlp(text)
        if not (token.is_stop or token.is_punct)
    ]
    return " ".join(kept_lemmas)


# Store the cleaned text next to the raw text as a new column on htmldt.
htmldt["html_clean"] = htmldt["html"].apply(preprocess)


In [36]:
# Word-count statistics before ("html") and after ("html_clean") preprocessing.
for text_column, pcts in (("html", [0.95]), ("html_clean", [0.75, 0.95])):
    print(htmldt[text_column].str.split().apply(len).describe(percentiles=pcts))

count      147.000000
mean      1883.755102
std       3867.911788
min        206.000000
50%        966.000000
95%       4655.500000
max      36214.000000
Name: html, dtype: float64
count      147.000000
mean      1223.755102
std       2097.218388
min        161.000000
50%        666.000000
75%       1332.000000
95%       2955.000000
max      19069.000000
Name: html_clean, dtype: float64


In [53]:
# TF-IDF features over the lemmatised text, limited to 1500 vocabulary terms.
# NOTE(review): the vectoriser is fitted on every page (training, validation and
# unlabelled), so the IDF weights see validation text; consider fitting on the
# training split only.
tfidf = TfidfVectorizer(max_features=1500)
Xarray = tfidf.fit_transform(htmldt["html_clean"]).toarray()

# Reuse htmldt's index so the column-wise concat below aligns rows explicitly instead
# of relying on both frames happening to share a default index.
Xdf = pd.DataFrame(Xarray, columns=tfidf.get_feature_names_out(), index=htmldt.index)

# A vocabulary term equal to a metadata column name (e.g. "url", "html", "label")
# would create duplicate column names, and later label filters / column drops would
# then silently act on the wrong or on both columns. Rename only such terms.
metadata_columns = set(htmldt.columns)
Xdf = Xdf.rename(
    columns=lambda term: f"tfidf__{term}" if term in metadata_columns else term
)

dt_tfidf = pd.concat([htmldt, Xdf], axis=1)

In [74]:
# Labelled rows (label > -1): feature matrix without the metadata/text columns, and
# the matching target vector.
X = dt_tfidf.loc[dt_tfidf["label"].gt(-1)].drop(
    columns=["doc_id", "url", "label_pred", "html", "html_clean", "label"]
)
y = dt_tfidf.loc[dt_tfidf["label"].gt(-1), "label"]

# Unlabelled rows. NOTE(review): unlike X, this frame still contains the metadata and
# text columns, so they have to be dropped before it is passed to a fitted model.
holdout_X = dt_tfidf.loc[dt_tfidf["label"].eq(-1)]

# Stratified 70/30 split of the labelled rows.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=30,
)

#### SVC

In [75]:
# Support-vector classifier with a sigmoid kernel, fitted on the training split.
svc = SVC(gamma=1.0, kernel="sigmoid").fit(X_train, y_train)

# Validation scores and confusion table (rows: true label, columns: prediction).
y_pred = svc.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))
print("F1:", f1_score(y_val, y_pred, average="weighted"))
pd.crosstab(index=y_val, columns=y_pred)


Accuracy: 0.6333333333333333
F1: 0.5533333333333333


col_0,0,1
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2,7
1,1,17
2,0,3


#### Random Forest

In [99]:
# Random forest: 700 shallow trees (depth <= 5), each drawn from 90% of the samples.
rfc = RandomForestClassifier(
    random_state=30,
    n_estimators=700,
    max_depth=5,
    max_samples=0.9,
).fit(X_train, y_train)

# Validation scores and confusion table (rows: true label, columns: prediction).
y_pred = rfc.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))
print("F1:", f1_score(y_val, y_pred, average="weighted"))
pd.crosstab(index=y_val, columns=y_pred)

Accuracy: 0.6666666666666666
F1: 0.5786561264822134


col_0,0,1
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2,7
1,0,18
2,0,3


#### XGBoost

In [98]:
# Gradient-boosted trees: 600 boosting rounds with trees of depth <= 4.
xgbc = XGBClassifier(random_state=30, n_estimators=600, max_depth=4)
xgbc.fit(X_train, y_train)

# Validation scores and confusion table (rows: true label, columns: prediction).
y_pred = xgbc.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))
print("F1:", f1_score(y_val, y_pred, average="weighted"))
pd.crosstab(index=y_val, columns=y_pred)

Accuracy: 0.7666666666666667
F1: 0.7369409660107334


col_0,0,1,2
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,4,5,0
1,0,18,0
2,0,2,1
