<a href="https://colab.research.google.com/github/KayyThania/Tugas-1_Scraping_Website_Kayla-Nethania-Said/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import time

# Index (listing) URL of CNN Indonesia; get_articles() appends the page number to it.
BASE_URL = "https://www.cnnindonesia.com/indeks/"
# HTTP headers sent with every request made by get_articles(); only a
# browser-like User-Agent is set.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}

def _text_or_none(tag):
    """Return the whitespace-stripped text of ``tag``, or ``None`` when ``tag`` is ``None``."""
    return None if tag is None else tag.get_text(strip=True)


def get_articles(page_num=1, timeout=10):
    """Scrape one CNN Indonesia index page plus the detail page of each article.

    Args:
        page_num: Page number appended to ``BASE_URL`` to build the index URL.
        timeout: Seconds to wait for each HTTP request before giving up, so a
            stalled connection cannot hang the whole run.

    Returns:
        A list of dicts, one per article whose detail page was fetched
        successfully, with the keys ``title``, ``slug``, ``url``, ``category``,
        ``sub_category``, ``time_posted``, ``last_updated``, ``author``,
        ``editor``, ``summary``, ``content``, ``tags``, ``image_url`` and
        ``source``. A field whose element is not found is ``None`` (``tags`` is
        an empty list). An empty list is returned when the index page itself
        cannot be fetched.
    """
    url = BASE_URL + str(page_num)
    try:
        res = requests.get(url, headers=headers, timeout=timeout)
        # Do not parse an HTTP error page as if it were the article index.
        res.raise_for_status()
    except requests.RequestException as e:
        print(f"Error: {e}")
        return []
    soup = BeautifulSoup(res.text, "html.parser")

    articles_data = []

    # Collect the article entries of the index page.
    # NOTE(review): the run recorded with this notebook collected 0 articles,
    # so this selector may not match the site's current markup - verify.
    for art in soup.find_all("article", class_="list-content"):
        link_tag = art.find("a")
        link = link_tag.get("href") if link_tag is not None else None
        if not link:
            # Without a URL there is no detail page to fetch.
            print("Error: article without a link, skipped")
            continue

        # Fetch the article detail page; a failed request skips this article only.
        try:
            article_res = requests.get(link, headers=headers, timeout=timeout)
            article_res.raise_for_status()
        except requests.RequestException as e:
            print(f"Error: {e}")
            continue
        article_soup = BeautifulSoup(article_res.text, "html.parser")

        # .get() keeps the article when the <img> carries no src attribute.
        image_tag = art.find("img")
        image = image_tag.get("src") if image_tag is not None else None

        # Article body text.
        paragraphs = article_soup.find_all("div", class_="detail-text")
        content = " ".join(p.get_text(strip=True) for p in paragraphs) if paragraphs else None

        # Sub category is the last breadcrumb link; the breadcrumb may hold none.
        breadcrumb = article_soup.find("div", class_="breadcrumb")
        breadcrumb_links = breadcrumb.find_all("a") if breadcrumb is not None else []
        sub_category = breadcrumb_links[-1].get_text(strip=True) if breadcrumb_links else None

        articles_data.append({
            "title": _text_or_none(art.find("h2", class_="title")),
            # rstrip keeps the slug non-empty when the URL ends with a slash.
            "slug": link.rstrip("/").split("/")[-1],
            "url": link,
            "category": _text_or_none(art.find("a", class_="kanal")),
            "sub_category": sub_category,
            "time_posted": _text_or_none(article_soup.find("div", class_="date")),
            "last_updated": _text_or_none(article_soup.find("div", class_="update")),
            "author": _text_or_none(article_soup.find("div", class_="author")),
            "editor": None,  # CNN articles usually list no editor
            "summary": _text_or_none(art.find("div", class_="text")),
            "content": content,
            "tags": [tag.get_text(strip=True) for tag in article_soup.find_all("a", class_="tag")],
            "image_url": image,
            "source": "CNN Indonesia"
        })

        time.sleep(1)  # pause between article requests

    return articles_data


if __name__ == "__main__":
    # Number of index pages to walk; raise it (e.g. to 50) for a larger crawl.
    total_pages = 5
    all_data = []

    for page_number in range(1, total_pages + 1):
        print(f"Scraping halaman {page_number} ...")
        all_data += get_articles(page_number)
        time.sleep(2)  # pause between index pages

    # JSON export: non-ASCII characters are written as-is, indented by 4 spaces.
    with open("cnn_news_multi.json", mode="w", encoding="utf-8") as json_file:
        json_file.write(json.dumps(all_data, ensure_ascii=False, indent=4))

    # CSV and Excel export through a DataFrame holding one row per article.
    df = pd.DataFrame(all_data)
    df.to_csv("cnn_news_multi.csv", index=False, encoding="utf-8-sig")
    df.to_excel("cnn_news_multi.xlsx", index=False)

    article_count = len(all_data)
    print(f"Total artikel terkumpul: {article_count}")
    print("Data berhasil diexport ke JSON, CSV, dan XLSX")


Scraping halaman 1 ...
Scraping halaman 2 ...
Scraping halaman 3 ...
Scraping halaman 4 ...
Scraping halaman 5 ...
Total artikel terkumpul: 0
Data berhasil diexport ke JSON, CSV, dan XLSX


In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import time

# Index (listing) URL of Detik news; get_articles() appends the page number to it.
BASE_URL = "https://news.detik.com/indeks/"
# HTTP headers sent with every request made by get_articles(); only a
# browser-like User-Agent is set.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}

def _text_or_none(tag):
    """Return the whitespace-stripped text of ``tag``, or ``None`` when ``tag`` is ``None``."""
    return None if tag is None else tag.get_text(strip=True)


def get_articles(page_num=1, timeout=10):
    """Scrape one Detik index page plus the detail page of each article.

    Args:
        page_num: Page number appended to ``BASE_URL`` to build the index URL.
        timeout: Seconds to wait for each HTTP request before giving up, so a
            stalled connection cannot hang the whole run.

    Returns:
        A list of dicts, one per article whose detail page was fetched
        successfully, with the keys ``title``, ``slug``, ``url``, ``category``,
        ``sub_category``, ``time_posted``, ``last_updated``, ``author``,
        ``editor``, ``summary``, ``content``, ``tags``, ``image_url`` and
        ``source``. A field whose element is not found is ``None`` (``tags`` is
        an empty list). An empty list is returned when the index page itself
        cannot be fetched.
    """
    url = BASE_URL + str(page_num)
    try:
        res = requests.get(url, headers=headers, timeout=timeout)
        # Do not parse an HTTP error page as if it were the article index.
        res.raise_for_status()
    except requests.RequestException as e:
        print(f"Error: {e}")
        return []
    soup = BeautifulSoup(res.text, "html.parser")

    articles_data = []

    # Collect the article entries of the index page.
    # NOTE(review): the run recorded with this notebook collected 0 articles,
    # so this selector may not match the site's current markup - verify.
    for art in soup.find_all("article"):
        link_tag = art.find("a")
        link = link_tag.get("href") if link_tag is not None else None
        if not link:
            # Without a URL there is no detail page to fetch.
            print("Error: article without a link, skipped")
            continue

        # Fetch the article detail page; a failed request skips this article only.
        try:
            article_res = requests.get(link, headers=headers, timeout=timeout)
            article_res.raise_for_status()
        except requests.RequestException as e:
            print(f"Error: {e}")
            continue
        article_soup = BeautifulSoup(article_res.text, "html.parser")

        # The listing label is split on " - ": first part is used as the
        # category, last part as the listing-level publication time.
        labdate_tag = art.find("span", class_="labdate")
        if labdate_tag is not None:
            labdate_parts = labdate_tag.get_text(strip=True).split(" - ")
            category, time_posted = labdate_parts[0], labdate_parts[-1]
        else:
            category = time_posted = None

        # .get() keeps the article when the <img> carries no src attribute.
        image_tag = art.find("img")
        image = image_tag.get("src") if image_tag is not None else None

        # Article body text; fall back to every <p> when the body container is absent.
        paragraphs = article_soup.find_all("div", class_="detail__body-text itp_bodycontent")
        if not paragraphs:
            paragraphs = article_soup.find_all("p")
        content = " ".join(p.get_text(strip=True) for p in paragraphs) if paragraphs else None

        # Publication date: prefer the detail page, else the value from the listing.
        date_tag = article_soup.find("div", class_="detail__date")
        time_posted_detail = date_tag.get_text(strip=True) if date_tag is not None else time_posted

        # Sub category is the last breadcrumb link; the breadcrumb may hold none.
        breadcrumb = article_soup.find("div", class_="breadcrumb")
        breadcrumb_links = breadcrumb.find_all("a") if breadcrumb is not None else []
        sub_category = breadcrumb_links[-1].get_text(strip=True) if breadcrumb_links else None

        articles_data.append({
            "title": _text_or_none(art.find("h3")),
            # rstrip keeps the slug non-empty when the URL ends with a slash.
            "slug": link.rstrip("/").split("/")[-1],
            "url": link,
            "category": category,
            "sub_category": sub_category,
            "time_posted": time_posted_detail,
            "last_updated": _text_or_none(article_soup.find("div", class_="read__update")),
            "author": _text_or_none(article_soup.find("div", class_="detail__author")),
            "editor": _text_or_none(article_soup.find("div", class_="editor")),
            "summary": _text_or_none(art.find("p")),
            "content": content,
            "tags": [tag.get_text(strip=True) for tag in article_soup.find_all("a", class_="detail__tag")],
            "image_url": image,
            "source": "Detik.com"
        })

        time.sleep(1)  # pause between article requests

    return articles_data


if __name__ == "__main__":
    # Number of index pages to walk; raise it (e.g. to 50) for a larger crawl.
    total_pages = 5
    all_data = []

    for page_number in range(1, total_pages + 1):
        print(f"Scraping halaman {page_number} ...")
        all_data += get_articles(page_number)
        time.sleep(2)  # pause between index pages

    # JSON export: non-ASCII characters are written as-is, indented by 4 spaces.
    with open("detik_news_multi.json", mode="w", encoding="utf-8") as json_file:
        json_file.write(json.dumps(all_data, ensure_ascii=False, indent=4))

    # CSV and Excel export through a DataFrame holding one row per article.
    df = pd.DataFrame(all_data)
    df.to_csv("detik_news_multi.csv", index=False, encoding="utf-8-sig")
    df.to_excel("detik_news_multi.xlsx", index=False)

    article_count = len(all_data)
    print(f"Total artikel terkumpul: {article_count}")
    print("Data berhasil diexport ke JSON, CSV, dan XLSX")


Scraping halaman 1 ...
Scraping halaman 2 ...
Scraping halaman 3 ...
Scraping halaman 4 ...
Scraping halaman 5 ...
Total artikel terkumpul: 0
Data berhasil diexport ke JSON, CSV, dan XLSX
