In [None]:
import requests
from bs4 import BeautifulSoup
import json

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}

def crawl_halodoc(url):
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")

    container = soup.find("div", id="articleContent")

    if container:
        text_elements = container.find_all(["p", "li", "h2", "strong"])
        return " ".join(t.get_text(strip=True) for t in text_elements)

    paragraphs = soup.find_all("p")
    return " ".join(p.get_text(strip=True) for p in paragraphs) if paragraphs else ""


def crawl_alodokter(url):
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    container = soup.find("div", id="postContent")

    if container:
        elements = container.find_all(["p", "li", "h2", "strong"])
        return " ".join(el.get_text(strip=True) for el in elements)

    article = soup.find("article")
    if article:
        elements = article.find_all(["p", "li", "h2"])
        return " ".join(el.get_text(strip=True) for el in elements)

    return ""


all_urls = [
    "https://www.halodoc.com/artikel/anak-demam-baiknya-kompres-air-hangat-atau-dingin",
    "https://www.halodoc.com/artikel/mudah-ini-cara-mengatasi-cegukan-pada-orang-dewasa",
    "https://www.halodoc.com/artikel/ini-5-rekomendasi-vitamin-daya-tahan-tubuh-untuk-perkuat-imun",
    "https://www.halodoc.com/artikel/sakit-perut-bagian-atas-ini-penyebab-dan-cara-mengatasinya",
    "https://www.halodoc.com/artikel/cara-mengenali-radang-usus-buntu-sejak-dini-dan-langkah-penanganannya",
    "https://www.halodoc.com/artikel/ini-5-minuman-untuk-bantu-melancarkan-siklus-menstruasi",
    "https://www.halodoc.com/artikel/kenapa-pengidap-rabies-takut-air-ini-penjelasannya",
    "https://www.halodoc.com/artikel/kanker-darah-ini-jenis-gejala-dan-cara-mengobatinya-1",
    "https://www.halodoc.com/artikel/ini-gejala-dbd-pada-anak-yang-perlu-diwaspadai",
    "https://www.halodoc.com/kesehatan/kanker-payudara#h-apa-itu-kanker-payudara",
    "https://www.alodokter.com/pneumonia-apakah-menular-ini-jawaban-dan-cara-mencegahnya",
    "https://www.alodokter.com/pemanasan-sebelum-berenang-ini-manfaat-contoh-gerakan-dan-tips-aman-melakukannya",
    "https://www.alodokter.com/9-ciri-peredaran-darah-tidak-lancar-yang-perlu-diketahui",
    "https://www.alodokter.com/mata-bengkak-saat-bangun-tidur-inilah-penyebab-dan-cara-mengatasinya",
    "https://www.alodokter.com/sakit-perut-sebelah-kiri-saat-hamil-trimester-2-ini-penyebab-yang-perlu-diketahui",
    "https://www.alodokter.com/cara-mengatasi-batuk-pilek-pada-bayi-dengan-aman-dan-efektif-di-rumah",
    "https://www.alodokter.com/nyeri-pangkal-paha-kiri-pada-wanita-ketahui-penyebab-dan-penanganannya",
    "https://www.alodokter.com/apakah-moisturizer-boleh-dipakai-malam-hari-ini-penjelasannya",
    "https://www.alodokter.com/apakah-epilepsi-menular-ketahui-faktanya-di-sini",
    "https://www.alodokter.com/berbagai-tanda-luka-sunat-infeksi-yang-perlu-diwaspadai-dan-cara-mencegahnya"
    ]

data_json = []

print("\n Hasil Scraping : \n")

for i, url in enumerate(all_urls):
    try:
        if "halodoc" in url:
            text = crawl_halodoc(url)
        else:
            text = crawl_alodokter(url)

        data_json.append({
            "judul": f"Artikel_{i+1}",
            "url": url,
            "konten": text
        })

        print(f"[BERHASIL] {url}")
        print(f"— Jumlah karakter: {len(text)}")
        print(f"— Cuplikan: {text[:200]}...\n")

    except Exception as e:
        print(f"[GAGAL] {url}")
        print("Error:", e)

with open("hasil_scraping_artikel_kesehatan.json", "w", encoding="utf-8") as f:
    json.dump(data_json, f, indent=4, ensure_ascii=False)

print("\nFile berhasil disimpan sebagai: hasil_scraping_artikel_kesehatan.json\n")


 Hasil Scraping : 

[BERHASIL] https://www.halodoc.com/artikel/anak-demam-baiknya-kompres-air-hangat-atau-dingin
— Jumlah karakter: 14945
— Cuplikan: DAFTAR ISI: DAFTAR ISI: Kompres yang Tepat untuk Anak Demam Cara Lain Mengatasi Demam pada Anak Rekomendasi Obat Demam AnakTempra Sirup Rasa Anggur 60 mlParacetamol Sirup 60 mlTermorex Patch 6 Sachet ...

[BERHASIL] https://www.halodoc.com/artikel/mudah-ini-cara-mengatasi-cegukan-pada-orang-dewasa
— Jumlah karakter: 6801
— Cuplikan: DAFTAR ISI DAFTAR ISI Apa Itu Cegukan? Penyebab Cegukan pada Orang Dewasa Cara Mengatasi Cegukan pada Orang Dewasa FAQ  Cegukan memang bukan kondisi berbahaya, tapi bisa sangat mengganggu, apalagi jik...

[BERHASIL] https://www.halodoc.com/artikel/ini-5-rekomendasi-vitamin-daya-tahan-tubuh-untuk-perkuat-imun
— Jumlah karakter: 6321
— Cuplikan: DAFTAR ISI DAFTAR ISI Rekomendasi Vitamin Daya Tahan Tubuh Imboost Force 10 Kaplet Blackmores Daily Immune C 500 30 Tablet Astria 4 mg 4 Kapsul Mevit Vitamin C 1000 mg 

# Preprocessing

In [13]:
import json
import re
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

with open("hasil_scraping_artikel_kesehatan.json", "r", encoding="utf-8") as f:
    data = json.load(f)

trash_phrases = [
    "daftar isi",
    "faq",
    "chat dokter",
    "hubung dokter",
    "mulai rp",
    "booking",
    "promo",
    "halodoc homecare",
    "homecare by halodoc",
    "konsultasi sekarang",
    "tanya dokter",
    "produk sehat",
    "artikel terkait",
    "dokter spesialis",
    "chat dr",
    "kamu dapat",
    "harga mulai",
    "pesan medical check up",
    "medical check up",
    "konsultasi dokter",
    "baca juga",
    "resep dokter",
]

stop_factory = StopWordRemoverFactory()
stopwords = set(stop_factory.get_stop_words())
stem_factory = StemmerFactory()
stemmer = stem_factory.create_stemmer()

def remove_trash(text):
    text_clean = text.lower()
    for phrase in trash_phrases:
        text_clean = text_clean.replace(phrase, " ")
    return text_clean


def preprocess_text(text):
    text = remove_trash(text)
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", " ", text)
    tokens = text.split()
    tokens = [t for t in tokens if t not in stopwords]
    tokens = [stemmer.stem(t) for t in tokens]
    return " ".join(tokens)

preprocessed_docs = []

for item in data:
    clean_text = preprocess_text(item["konten"])
    preprocessed_docs.append({
        "judul": item["judul"],
        "url": item["url"],
        "clean_text": clean_text
    })

with open("preprocessed_docs.json", "w", encoding="utf-8") as f:
    json.dump(preprocessed_docs, f, indent=4, ensure_ascii=False)

print("Preprocessing selesai")


Preprocessing selesai


# TF-IDF

In [14]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer

titles = [item["judul"] for item in data]
urls = [item["url"] for item in data]

with open("preprocessed_docs.json", "r", encoding="utf-8") as f:
    data = json.load(f)

documents = [item["clean_text"] for item in data]

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

print("Jumlah dokumen:", len(documents))
print("Jumlah fitur (kata unik):", len(vectorizer.get_feature_names_out()))


Jumlah dokumen: 20
Jumlah fitur (kata unik): 1882


In [18]:
from sklearn.metrics.pairwise import cosine_similarity

def search_engine(query):
    query = preprocess_text(query)
    query_vec = vectorizer.transform([query])
    scores = cosine_similarity(query_vec, tfidf_matrix).flatten()

    ranked = scores.argsort()[::-1]

    print("\nHasil Pencarian: \n")
    for idx in ranked[:5]:  # ambil 5 teratas
        print(f"Skor: {scores[idx]:.4f}")
        print(f"Judul: {titles[idx]}")
        print(f"URL: {urls[idx]}\n")

        
search_engine("Kanker Payudara")



Hasil Pencarian: 

Skor: 0.9071
Judul: Artikel_10
URL: https://www.halodoc.com/kesehatan/kanker-payudara#h-apa-itu-kanker-payudara

Skor: 0.4316
Judul: Artikel_8
URL: https://www.halodoc.com/artikel/kanker-darah-ini-jenis-gejala-dan-cara-mengobatinya-1

Skor: 0.0000
Judul: Artikel_20
URL: https://www.alodokter.com/berbagai-tanda-luka-sunat-infeksi-yang-perlu-diwaspadai-dan-cara-mencegahnya

Skor: 0.0000
Judul: Artikel_19
URL: https://www.alodokter.com/apakah-epilepsi-menular-ketahui-faktanya-di-sini

Skor: 0.0000
Judul: Artikel_17
URL: https://www.alodokter.com/nyeri-pangkal-paha-kiri-pada-wanita-ketahui-penyebab-dan-penanganannya

