# Keyword Detection on Websites
Source: https://platform.stratascratch.com/data-projects/keyword-detection-websites


In [6]:
from bs4 import BeautifulSoup
import pandas as pd
import chardet
from transformers import pipeline
from IPython.display import Markdown, display
import json
from tqdm.auto import tqdm

### (1) Load and examine data

In [7]:
traindt = pd.read_csv("./datasets/train.csv")
testdt = pd.read_csv("./datasets/test.csv")

In [8]:
testdt.head()

Unnamed: 0,url,doc_id
0,http://chirurgie-goettingen.de/medizinische-ve...,0
1,http://evkb.de/kliniken-zentren/chirurgie/allg...,2
2,http://krebszentrum.kreiskliniken-reutlingen.d...,7
3,http://marienhospital-buer.de/mhb-av-chirurgie...,15
4,http://marienhospital-buer.de/mhb-av-chirurgie...,16


In [9]:
htmldt_0 = []

for i in range(147):
    with open(f"./datasets/htmls/{i}.html", "rb") as f:
        initread = chardet.detect(f.read())
        encoding = initread["encoding"]

    with open(f"./datasets/htmls/{i}.html", "r", encoding=encoding) as f:
        html = f.read()

    soup = BeautifulSoup(html, "html.parser")
    body = soup.get_text(" ", strip=True)
    htmldt_0.append({"doc_id": i, "html": body})


  k = self.parse_starttag(i)


In [10]:
htmldt = (
    pd.DataFrame(htmldt_0)
    .merge(traindt, on="doc_id", how="left", validate="1:1")
    .assign(label_pred=None)
)

In [11]:
htmldt

Unnamed: 0,doc_id,html,url,label,label_pred
0,0,"Bauchspeicheldrüse | Klinik für Allgemein-, Vi...",,,
1,1,Elbe-Elster Klinikum - Chirurgie Finsterwalde ...,http://elbe-elster-klinikum.de/fachbereiche/ch...,1.0,
2,2,Chirurgie der Bauchspeicheldrüse (Pankreaschir...,,,
3,3,Onkologisches Zentrum - Klinikum Bayreuth Aktu...,http://klinikum-bayreuth.de/einrichtungen/zent...,3.0,
4,4,Zentrum - Sozialpädiatrisches Zentrum - Städti...,http://klinikum-braunschweig.de/info.php/?id_o...,1.0,
...,...,...,...,...,...
142,142,Tumorboard des Lungenkrebszentrums Möglicherwe...,,,
143,143,Oberarzt (m/w/d) für die HNO-Klinik A- A A+ No...,,,
144,144,Für Ärzte | Vivantes JavaScript scheint in Ihr...,http://www.vivantes.de/fuer-sie-vor-ort/klinik...,2.0,
145,145,"Innere Medizin – Hämatologie, Onkologie und Pa...",http://www.vivantes.de/fuer-sie-vor-ort/klinik...,2.0,


### (2) LLM based Classification

In [12]:
pipe = pipeline("text-generation", model="meta-llama/Llama-3.2-1B-Instruct")

Device set to use cpu


In [42]:
max_new_tokens = 10

for i in tqdm(range(htmldt.shape[0])):
    # for i in tqdm(range(1)):
    context = htmldt.iloc[i]["html"]

    QA_Input = [
        {
            "role": "user",
            "content": f"""
            You review text extracted from html webpages. The text are in German. Your response should be in English.
            Tumor Board is a consilium of doctors (usually from different disciplines) discussing cancer cases in their departments.
            Determine if the text is a tumor boards and return one of the 3 labels:
            '1' (No Evidence): tumor boards are not mentioned on the page.
            '2' (Medium confidence): tumor boards are mentioned, but the page is not completely dedicated to tumor board description.
            '3' (High confidence): page is completely dedicated to the description of tumor board types and dates.
            Your response is strictly limited to the three numbers: '1', '2', and '3'. Do not include any other text.
            Only one label is expected.
            You responses are strictly limited to {max_new_tokens} tokens.
            The text extracted from the html webpage is: {context}
            """,
        },
    ]

    res = pipe(
        QA_Input, max_new_tokens=max_new_tokens, pad_token_id=128001, temperature=0.001
    )
    label = next(
        item["content"]
        for item in res[0]["generated_text"]
        if item["role"] == "assistant"
    )

    htmldt.loc[i, "label_pred"] = label

  0%|          | 0/147 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

In [41]:
htmldt

Unnamed: 0,doc_id,html,url,label,label_pred
0,0,"Bauchspeicheldrüse | Klinik für Allgemein-, Vi...",,,1. No Evidence
1,1,Elbe-Elster Klinikum - Chirurgie Finsterwalde ...,http://elbe-elster-klinikum.de/fachbereiche/ch...,1.0,1. No Evidence
2,2,Chirurgie der Bauchspeicheldrüse (Pankreaschir...,,,1. No Evidence
3,3,Onkologisches Zentrum - Klinikum Bayreuth Aktu...,http://klinikum-bayreuth.de/einrichtungen/zent...,3.0,
4,4,Zentrum - Sozialpädiatrisches Zentrum - Städti...,http://klinikum-braunschweig.de/info.php/?id_o...,1.0,
...,...,...,...,...,...
142,142,Tumorboard des Lungenkrebszentrums Möglicherwe...,,,
143,143,Oberarzt (m/w/d) für die HNO-Klinik A- A A+ No...,,,
144,144,Für Ärzte | Vivantes JavaScript scheint in Ihr...,http://www.vivantes.de/fuer-sie-vor-ort/klinik...,2.0,
145,145,"Innere Medizin – Hämatologie, Onkologie und Pa...",http://www.vivantes.de/fuer-sie-vor-ort/klinik...,2.0,


In [23]:
[item["content"] for item in res[0]["generated_text"] if item["role"] == "assistant"]

["'"]

In [153]:
# print(res[0]["generated_text"][2]["content"])


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [154]:
res

[{'generated_text': [{'role': 'user',
    'content': "\n            You review text extracted from html webpages. The text are in German. Your response should be in English.\n            Tumor Board is a consilium of doctors (usually from different disciplines) discussing cancer cases in their departments.\n            Determine if the text is a tumor boards and return one of the 3 labels:\n            '1' (No Evidence): tumor boards are not mentioned on the page.\n            '2' (Medium confidence): tumor boards are mentioned, but the page is not completely dedicated to tumor board description.\n            '3' (High confidence): page is completely dedicated to the description of tumor board types and dates.\n            Your response is strictly limit to the three numbers '1', '2', and '3'. Do not include any other information.\n            You responses are strictly limited to 10 tokens.\n            The text extracted from the html webpage is: Bauchspeicheldrüse | Klinik für Allge