# **Implementasi Studi dan Analisis Kinerja Chatbot: Studi Kasus Delbot**

# **Data Collection**

## Web Scraping

### del.ac.id

In [None]:
import requests
from bs4 import BeautifulSoup
import csv

# Pages of www.del.ac.id to scrape, each addressed by its ?page_id= query.
urls = [
    "https://www.del.ac.id/?page_id=1006",
    "https://www.del.ac.id/?page_id=602",
    "https://www.del.ac.id/?page_id=30",
    "https://www.del.ac.id/?page_id=719",
    "https://www.del.ac.id/?page_id=33",
    "https://www.del.ac.id/?page_id=216",
    "https://www.del.ac.id/?page_id=583",
    "https://www.del.ac.id/?page_id=6021",
    "https://www.del.ac.id/?page_id=109",
    "https://www.del.ac.id/?page_id=3348",
    "https://www.del.ac.id/?page_id=13440",
    "https://www.del.ac.id/?page_id=1790",
    "https://www.del.ac.id/?page_id=1789",
    "https://www.del.ac.id/?page_id=14564",
    "https://www.del.ac.id/?page_id=48",
    "https://www.del.ac.id/?page_id=46",
    "https://www.del.ac.id/?page_id=896",
    "https://www.del.ac.id/?page_id=140",
    "https://www.del.ac.id/?page_id=128",
    "https://www.del.ac.id/?page_id=11117",
    "https://www.del.ac.id/?page_id=11121",
    "https://www.del.ac.id/?page_id=130",
    "https://www.del.ac.id/?page_id=9929",
    "https://www.del.ac.id/?page_id=142",
    "https://www.del.ac.id/?page_id=144",
    "https://www.del.ac.id/?page_id=146",
    "https://www.del.ac.id/?page_id=616",
    "https://www.del.ac.id/?page_id=618",
    "https://www.del.ac.id/?page_id=622"
]

# Per-request timeout (seconds) so a single stalled page cannot hang the run.
REQUEST_TIMEOUT_SECONDS = 10

# One [title, url, content] row per URL. Failed fetches are kept as rows too,
# so the CSV records which pages could not be retrieved.
data = []

# A Session reuses the underlying connection for the repeated requests to one host.
with requests.Session() as session:
    for url in urls:
        print(f"Mengambil data dari: {url}")
        try:
            response = session.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')

                # `soup.title.string` is None when <title> is empty or contains
                # nested markup; get_text() avoids the resulting AttributeError,
                # which previously discarded the whole page as an "Error" row.
                title_text = soup.title.get_text(strip=True) if soup.title else ""
                title = title_text or "Tidak ada judul"

                # The page body is expected inside <div class="post-content">.
                content_div = soup.find("div", class_="post-content")
                if content_div:
                    text_content = content_div.get_text(separator="\n", strip=True)
                else:
                    text_content = "Konten tidak ditemukan"

                data.append([title, url, text_content])
            else:
                print(f"Gagal mengambil halaman: {url}, Status code: {response.status_code}")
                data.append(["Gagal mengambil halaman", url, ""])
        except Exception as e:
            # Best-effort scraping: log the failure and continue with the next URL.
            print(f"Error saat mengambil {url}: {e}")
            data.append(["Error", url, str(e)])

# Write all rows to CSV (newline='' lets the csv module control line endings).
with open("web_del.csv", mode='w', encoding='utf-8', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Judul Halaman", "URL", "Konten Halaman"])
    writer.writerows(data)

print("\n✅ Semua data berhasil disimpan ke web_del.csv")


Mengambil data dari: https://www.del.ac.id/?page_id=1006
Mengambil data dari: https://www.del.ac.id/?page_id=602
Mengambil data dari: https://www.del.ac.id/?page_id=30
Mengambil data dari: https://www.del.ac.id/?page_id=719
Mengambil data dari: https://www.del.ac.id/?page_id=33
Mengambil data dari: https://www.del.ac.id/?page_id=216
Mengambil data dari: https://www.del.ac.id/?page_id=583
Mengambil data dari: https://www.del.ac.id/?page_id=6021
Mengambil data dari: https://www.del.ac.id/?page_id=109
Mengambil data dari: https://www.del.ac.id/?page_id=3348
Mengambil data dari: https://www.del.ac.id/?page_id=13440
Mengambil data dari: https://www.del.ac.id/?page_id=1790
Mengambil data dari: https://www.del.ac.id/?page_id=1789
Mengambil data dari: https://www.del.ac.id/?page_id=14564
Mengambil data dari: https://www.del.ac.id/?page_id=48
Mengambil data dari: https://www.del.ac.id/?page_id=46
Mengambil data dari: https://www.del.ac.id/?page_id=896
Mengambil data dari: https://www.del.ac.id/

### Wikipedia

In [None]:
import requests
from bs4 import BeautifulSoup
import csv

# Wikipedia article to scrape
url = "https://id.wikipedia.org/wiki/Institut_Teknologi_Del"

# Identify the client explicitly and bound the request time.
# NOTE(review): Wikimedia asks automated clients for a descriptive User-Agent;
# replace this placeholder with one that includes project contact details.
headers = {"User-Agent": "DelbotDatasetBuilder/1.0 (educational research notebook)"}
response = requests.get(url, headers=headers, timeout=10)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')

    # Main article title; fall back to a placeholder instead of crashing on None.
    title_tag = soup.find("h1", {"id": "firstHeading"})
    main_title = title_tag.text.strip() if title_tag else "Tidak ada judul"

    # Locate the rendered article body.
    content_div = soup.find("div", {"id": "mw-content-text"})
    content_body = content_div.find("div", class_="mw-parser-output") if content_div else None
    if content_body is None:
        raise ValueError("Konten artikel Wikipedia tidak ditemukan (struktur halaman berubah?)")

    # Drop tables (infoboxes etc.), inline styles/scripts and reference markers.
    for elem in content_body.find_all(["table", "style", "script"]):
        elem.decompose()
    for sup in content_body.find_all("sup", class_="reference"):
        sup.decompose()
    # Remove the "[sunting | sunting sumber]" edit links as elements. The old
    # string replace never matched, because get_text(separator=" ") inserts
    # spaces between the bracket, link and pipe text nodes.
    for edit_link in content_body.find_all("span", class_="mw-editsection"):
        edit_link.decompose()

    # Group the article text by heading: one row per section.
    data = []
    current_heading = "Pendahuluan"  # text that appears before the first heading
    current_content = []

    for tag in content_body.find_all(["h2", "h3", "p", "ul", "ol"]):
        if tag.name in ["h2", "h3"]:
            # A new heading closes the previous section.
            if current_content:
                data.append([main_title, current_heading, "\n".join(current_content)])
                current_content = []

            current_heading = tag.get_text(separator=" ", strip=True)

        elif tag.name in ["p", "ul", "ol"]:
            # A nested list is already included in its parent list's text;
            # collecting it again would duplicate content.
            if tag.name in ["ul", "ol"] and tag.find_parent(["ul", "ol"]) is not None:
                continue
            text = tag.get_text(separator=" ", strip=True)
            if text:  # skip empty paragraphs
                current_content.append(text)

    # Flush the final section.
    if current_content:
        data.append([main_title, current_heading, "\n".join(current_content)])

    # Write one row per section to CSV.
    with open("wiki_it_del_full.csv", mode="w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Judul Artikel", "Bagian", "Konten"])
        writer.writerows(data)

    print("✅ Semua konten berhasil disimpan ke wiki_it_del_full.csv")

else:
    print(f"❌ Gagal mengakses halaman. Status code: {response.status_code}")


✅ Semua konten berhasil disimpan ke wiki_it_del_full.csv


### Semat

In [None]:
import requests
from bs4 import BeautifulSoup
import csv

# Pages of semat.del.ac.id to scrape
urls = [
    "https://semat.del.ac.id/fasilitas",
    "https://semat.del.ac.id/fakultas/1",
    "https://semat.del.ac.id/fakultas/2",
    "https://semat.del.ac.id/fakultas/3",
    "https://semat.del.ac.id/fakultas/4"
]

# One [title, url, content] row per URL; failures are recorded as "ERROR" rows.
data = []

for url in urls:
    print(f"⏳ Memproses: {url}")
    try:
        # Bound the request time so a stalled page cannot hang the notebook
        # (same 10 s limit as the del.ac.id scraper).
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # non-2xx responses go to the except branch
        soup = BeautifulSoup(response.text, "html.parser")

        # Page title from the <title> tag
        title_tag = soup.find("title")
        title = title_tag.text.strip() if title_tag else "Tidak ada judul"

        # Main content: prefer <main>, then <div class="container">, then <body>.
        content_div = soup.find("main") or soup.find("div", class_="container") or soup.find("body")

        if content_div:
            # Remove non-content elements before extracting text.
            for tag in content_div(["script", "style"]):
                tag.decompose()

            # Visible text, one line per text node.
            text_content = content_div.get_text(separator="\n", strip=True)
        else:
            text_content = "Konten tidak ditemukan"

        data.append([title, url, text_content])

    except Exception as e:
        # Best-effort scraping: log the failure and continue with the next URL.
        print(f"⚠️ Gagal memproses {url}: {e}")
        data.append(["ERROR", url, str(e)])

# Write all rows to CSV.
with open("semat_del_pages.csv", mode="w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Judul Halaman", "URL", "Konten"])
    writer.writerows(data)

print("✅ Semua data berhasil disimpan ke semat_del_pages.csv")


⏳ Memproses: https://semat.del.ac.id/fasilitas
⏳ Memproses: https://semat.del.ac.id/fakultas/1
⏳ Memproses: https://semat.del.ac.id/fakultas/2
⏳ Memproses: https://semat.del.ac.id/fakultas/3
⏳ Memproses: https://semat.del.ac.id/fakultas/4
✅ Semua data berhasil disimpan ke semat_del_pages.csv


In [None]:
import pandas as pd

# CSV files written by the three scraping cells
csv_files = ["web_del.csv", "wiki_it_del_full.csv", "semat_del_pages.csv"]

# The scrapers use different header names for the same kind of data
# ("Konten Halaman" vs "Konten", "Judul Artikel" vs "Judul Halaman").
# Without aligning them, pd.concat keeps them as separate columns padded
# with NaN, splitting the page text across two columns.
COLUMN_ALIASES = {
    "Konten Halaman": "Konten",
    "Judul Artikel": "Judul Halaman",
}

# Read each file, map its headers onto the shared schema, then stack the rows.
# Columns that only some sources have ("URL", "Bagian") stay NaN elsewhere.
frames = [pd.read_csv(file).rename(columns=COLUMN_ALIASES) for file in csv_files]
gabungan_df = pd.concat(frames, ignore_index=True)

# Save the merged dataset
gabungan_df.to_csv("dataset_del.csv", index=False, encoding="utf-8")

print("✅ Semua file berhasil digabung ke dalam dataset_del.csv")


✅ Semua file berhasil digabung ke dalam dataset_del.csv


# **Generate QA Pairs**

### **Dataset**

In [None]:
# Load the section-level Wikipedia scrape
df = pd.read_csv("wiki_it_del_full.csv")

# Take the 'Konten' text of the row labelled 0 as the generation context
first_section = df.loc[0]
context = first_section['Konten']

### **Prompt Engineering**

In [None]:
# Instruction, then the context, then the "QA:" marker the model continues from
prompt = (
    "Buatlah pasangan pertanyaan dan jawaban dari konteks berikut:\n\n"
    f"{context}"
    "\n\nQA:"
)

### **Model Inference**

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, set_seed

# Seed for the random number generators; sampling is enabled below, so without
# a fixed seed every run produces different QA pairs.
SEED = 42

model_id = "deepseek-ai/deepseek-coder-6.7b-instruct"

# Download (or load from cache) the tokenizer and model weights;
# device_map="auto" lets the library decide where to place the model.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

qa_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Seed right before generation so the sampled output is reproducible.
set_seed(SEED)
generations = qa_pipeline(prompt, max_new_tokens=256, do_sample=True, top_p=0.9, temperature=0.7)

# The pipeline returns a list with one dict per generated sequence.
output = generations[0]['generated_text']
print(output)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Device set to use cpu


In [None]:
# Keep only the model's continuation. When the generated text starts with the
# prompt, cut that exact prefix off. Splitting on "QA:" and taking the last
# piece would drop most of the answer if the model emits "QA:" again.
if output.startswith(prompt):
    qa_section = output[len(prompt):].strip()
else:
    # Prompt not echoed verbatim: fall back to the marker-based split.
    qa_section = output.split("QA:")[-1].strip()
print("Hasil QA:", qa_section)

# **ROUGE-L SCORE**

In [None]:
pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=b729e32d3cc6fb4a6c73714972176fd9351dfd8e2d846348182a99c1d1e1a7ea
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
import pandas as pd
from rouge_score import rouge_scorer

# Load the generated QA pairs to evaluate
df = pd.read_csv('Hasil Generate QA - Website Kampus Del.csv')

# Accepted names for the reference-answer and model-prediction columns.
# The original code read 'answer' (which raised KeyError on this file) while
# its comment described the column as 'jawaban', so both are tried.
# If the file has no prediction column yet, add it before running this cell.
REFERENCE_COLUMNS = ("answer", "jawaban")
PREDICTION_COLUMNS = ("prediksi",)


def _pick_column(frame, candidates, role):
    """Return the first name in `candidates` that is a column of `frame`.

    Raises:
        KeyError: if none of the candidates exists; the message lists the
            columns that are actually available.
    """
    for name in candidates:
        if name in frame.columns:
            return name
    raise KeyError(
        f"No {role} column found; tried {list(candidates)}, "
        f"available columns: {list(frame.columns)}"
    )


reference_col = _pick_column(df, REFERENCE_COLUMNS, "reference")
prediction_col = _pick_column(df, PREDICTION_COLUMNS, "prediction")

# Rows with a missing reference or prediction cannot be scored (NaN is a
# float, not text), so they are excluded from the average.
df_scored = df.dropna(subset=[reference_col, prediction_col])
if df_scored.empty:
    raise ValueError("No rows with both a reference answer and a prediction to score")

# Initialise the ROUGE-L scorer.
# NOTE(review): use_stemmer enables rouge-score's Porter stemmer, which targets
# English; confirm it is appropriate for Indonesian answers.
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

# F-measure of ROUGE-L for every (reference, prediction) pair
scores = [scorer.score(str(ref), str(pred))['rougeL'].fmeasure
          for ref, pred in zip(df_scored[reference_col], df_scored[prediction_col])]

# Average ROUGE-L over all scored pairs
average_rougeL = sum(scores) / len(scores)
print(f'Average ROUGE-L Score: {average_rougeL:.4f}')


KeyError: 'answer'