<a href="https://colab.research.google.com/github/DarmaCahya/CrawlerNews/blob/main/test_Crawling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install requests beautifulsoup4 pandas



### Crawling data secara umum tanpa menggunakan topik

In [2]:
import json
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
import time

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Load JSON file
with open('news_websites.json') as json_file:
    websites = json.load(json_file)['websites']

def get_soup(url):
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

def parse_article(link, website):
    try:
        soup = get_soup(link)
        content_div = soup.find(class_=website['content_class'])
        content = ' '.join(p.get_text(strip=True) for p in content_div.find_all('p')) if content_div else 'No content found'
        return content
    except Exception as e:
        print(f"Error parsing article {link}: {e}")
        return 'No content found'

def parse_page(url, website):
    soup = get_soup(url)
    news_data = []
    articles = soup.find_all(website['article_tag'], class_=website['article_class'])
    print(f"Found {len(articles)} articles on page: {url}")

    for article in articles:
        title_tag = article.find(class_=website.get('title_class', None))
        if title_tag:
            title = title_tag.get_text(strip=True)
        else:
            title_tag = article.find('a')
            title = title_tag['title'] if 'title' in title_tag.attrs else title_tag.get_text(strip=True)

        link_tag = article.find('a')
        link = link_tag['href'] if link_tag else 'No link found'

        date_tag = article.find(class_=website['date_class'])
        date = date_tag.get_text(strip=True) if date_tag else 'No date found'

        content = parse_article(link, website) if link != 'No link found' else 'No content found'

        news_data.append({
            'title': title,
            'link': link,
            'date': date,
            'content': content,
            'is_fake': 0,
            'media_bias': website['platform']
        })
        print(f"Appended article: {title}")

    return news_data

def get_all_articles(base_url, website, max_pages=2):
    articles = []
    next_page = base_url
    current_page = 1

    while next_page and current_page <= max_pages:
        print(f"Crawling page {current_page}")
        articles.extend(parse_page(next_page, website))
        soup = get_soup(next_page)
        if website['name'] == 'Antara':
            next_page = f"{base_url}/{current_page + 1}"
        elif website['name'] == 'Suara':
            next_page = f"{base_url}?page={current_page + 1}"
        elif website['name'] == 'Detik':
            next_page = f"{base_url}/{current_page + 1}"
        else:
            next_button = soup.find(class_=website['next_page'])
            next_page = next_button["href"] if next_button else None
        current_page += 1
        time.sleep(2)
    return articles

def main():
    all_news = []
    for website in websites:
        try:
            base_url = website['url']
            scraped_news = get_all_articles(base_url, website)
            print(f"Scraped {len(scraped_news)} articles from {website['name']}")
            all_news.extend(scraped_news)
            time.sleep(2)  # Respectful delay to avoid overwhelming the server
        except requests.HTTPError as e:
            print(f"Failed to scrape {website['name']}: {e}")

    df = pd.DataFrame(all_news)
    print(f"Total articles collected: {len(all_news)}")

    # Save to CSV
    df.to_csv('scraped_news.csv', index=False)

if __name__ == "__main__":
    main()


Crawling page 1
Found 10 articles on page: https://www.cnnindonesia.com/politik/indeks/4
Appended article: Paripurna DPR Resmi Sahkan RUU Sumber Daya Alam Hayati jadi UU
Appended article: Airlangga Ungkap Rencana Kaesang Akan Bertamu ke Golkar Kamis Esok
Appended article: DPR Berencana Ubah Wantimpres Kembali Jadi DPA
Appended article: Guyon Kaesang: Pak Presiden PKS Harusnya Jadi Calon Gubernur Jakarta
Appended article: PDIP Sindir IKN Molor: Sebelumnya Bilang Sangat Siap, Ternyata Belum
Appended article: DPR Segera Jadwalkan Rapat Bahas Putusan MA soal Syarat Usia Cagub
Appended article: Menag Yaqut Buka Suara soal DPR Bentuk Pansus Angket Haji
Appended article: PBNU Sebut Izin Kelola Tambang Belum Keluar: Masih Proses
Appended article: Pansus Angket Pengawasan Haji 2024 Diteken 35 Anggota DPR
Appended article: DPR Resmi Bentuk Pansus Angket Haji Beranggotakan 30 Orang
Crawling page 2
Found 10 articles on page: https://www.cnnindonesia.com/politik/indeks/4/2
Appended article: Jokowi 

### Crawling data menggunakan inputan dari user atau berdasarkan topik

In [9]:
import json
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
import time

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Load JSON file
with open('news_websites.json') as json_file:
    websites = json.load(json_file)['websites']

def get_soup(url):
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

def parse_article(link, website):
    try:
        soup = get_soup(link)
        content_div = soup.find(class_=website['content_class'])
        content = ' '.join(p.get_text(strip=True) for p in content_div.find_all('p')) if content_div else 'No content found'
        return content
    except Exception as e:
        print(f"Error parsing article {link}: {e}")
        return 'No content found'

def parse_page(url, website):
    soup = get_soup(url)
    news_data = []
    articles = soup.find_all(website['article_tag'], class_=website['article_class'])
    print(f"Found {len(articles)} articles on page: {url}")

    for article in articles:
        title_tag = article.find(class_=website.get('title_class', None))
        if title_tag:
            title = title_tag.get_text(strip=True)
        else:
            title_tag = article.find('a')
            if title_tag:
                title = title_tag['title'] if 'title' in title_tag.attrs else title_tag.get_text(strip=True)
            else:
                title = 'No title found'
        link_tag = article.find('a')
        link = link_tag['href'] if link_tag else 'No link found'

        date_tag = article.find(class_=website['date_class'])
        date = date_tag.get_text(strip=True) if date_tag else 'No date found'

        content = parse_article(link, website) if link != 'No link found' else 'No content found'

        news_data.append({
            'title': title,
            'link': link,
            'date': date,
            'content': content,
            'is_fake': 0,
            'media_bias': website['platform']
        })
        print(f"Appended article: {title}")

    return news_data

def get_all_articles(base_url, website, max_pages=2):
    articles = []
    next_page = base_url
    current_page = 1

    while next_page and current_page <= max_pages:
        print(f"Crawling page {current_page}")
        articles.extend(parse_page(next_page, website))
        soup = get_soup(next_page)
        if website['name'] == 'Antara':
            next_page = f"{base_url}&page={current_page + 1}"
        elif website['name'] == 'Suara':
            next_page = f"{base_url}?page={current_page + 1}"
        elif website['name'] == 'Detik':
            next_page = f"{base_url}&page={current_page + 1}"
        else:
            next_button = soup.find(class_=website['next_page'])
            next_page = next_button["href"] if next_button else None
        current_page += 1
        time.sleep(2)
    return articles

def main():
    all_news = []
    topik = input("Masukkan Topik: ")
    for website in websites:
        try:
            base_url = website['url'] + topik
            scraped_news = get_all_articles(base_url, website)
            print(f"Scraped {len(scraped_news)} articles from {website['name']}")
            all_news.extend(scraped_news)
            time.sleep(2)  # Respectful delay to avoid overwhelming the server
        except requests.HTTPError as e:
            print(f"Failed to scrape {website['name']}: {e}")

    df = pd.DataFrame(all_news)
    print(f"Total articles collected: {len(all_news)}")

    # Save to CSV
    df.to_csv('scraped_news.csv', index=False)

if __name__ == "__main__":
    main()


Masukkan Topik: jakarta
Crawling page 1
Found 0 articles on page: https://www.suara.com/search?q=jakarta
Crawling page 2
Found 0 articles on page: https://www.suara.com/search?q=jakarta?page=2
Scraped 0 articles from Suara
Crawling page 1
Found 15 articles on page: https://www.antaranews.com/search?q=jakarta
Appended article: Estimasi kebutuhan baja 331 ribu ton dalam pembangunan IKN 2023-2024


KeyboardInterrupt: 