<a href="https://colab.research.google.com/github/Mfys212/Generasi-Information-Teks-from-Kompas/blob/main/Kompas_Scrapper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import logging
import multiprocessing
import re
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime

import pandas as pd
import pytz
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

class ScrapKompas:
  """Scrape the articles listed under a Kompas.com tag.

  Listing pages (``https://www.kompas.com/tag/<tag>?page=N``) are walked
  sequentially; every article linked from them is downloaded and parsed in a
  thread pool. Results accumulate in four parallel lists (``tanggal``,
  ``judul``, ``konten``, ``link``) where index ``i`` of each list describes
  the same article.

  Args:
    tag: Tag slug appended to the Kompas tag URL.
    max_page: Maximum number of listing pages to visit. ``None`` means
      "keep going until the site answers with its 404 page".
    timeout: Seconds to wait for each HTTP request before giving up.
  """

  # Timestamp as printed in the article header, e.g. "28/03/2024, 17:08 WIB".
  # Groups: day, month, year, HH:MM. The trailing \w{3} is the zone label.
  _READ_TIME_RE = re.compile(r'(\d{2})/(\d{2})/(\d{4}), (\d{2}:\d{2}) \w{3}')

  def __init__(self, tag, max_page=5, timeout=30):
    self.page = f"https://www.kompas.com/tag/{tag}?page="
    self.link, self.tanggal, self.judul, self.konten = [], [], [], []
    self.max_page = max_page
    self.max_workers = multiprocessing.cpu_count()
    self.timeout = timeout

  def get_link(self, soup):
    """Return every ``<a class="article__link">`` element found in *soup*."""
    return soup.find_all('a', class_='article__link')

  def get_tanggal(self, soup):
    """Return the publication time as ``"YYYY-MM-DD HH:MM:SS"``, or ``""``.

    Two page layouts are tried in order: the ``read__time`` header and then
    the ``videoKG-date`` element. The clock time is copied as printed on the
    page; no time-zone conversion is applied.
    """
    header = soup.find('div', class_='read__time')
    if header is not None:
      match = self._READ_TIME_RE.search(header.get_text())
      if match:
        day, month, year, clock = match.groups()
        # The page only shows minutes, so seconds are fixed to "00".
        return f"{year}-{month}-{day} {clock}:00"

    video_date = soup.find('div', class_='videoKG-date')
    if video_date is None:
      return ""
    try:
      # NOTE(review): %B is locale dependent; month names that do not match
      # the active locale fall through to "" here - confirm against live pages.
      parsed = datetime.strptime(video_date.get_text().strip(), "%d %B %Y, %H:%M WIB")
    except ValueError:
      return ""
    return parsed.strftime("%Y-%m-%d %H:%M:%S")

  def get_judul(self, soup):
    """Return the text of ``<h1 class="read__title">``, or ``""`` if absent."""
    heading = soup.find('h1', class_='read__title')
    return heading.get_text() if heading is not None else ""

  def get_konten(self, soup):
    """Return the stripped text of ``<div class="read__content">``, or ``""``."""
    body = soup.find('div', class_='read__content')
    return body.get_text(strip=True) if body is not None else ""

  def _scrape_article(self, url):
    """Download *url* and return ``(tanggal, judul, konten, url)`` for it."""
    response = requests.get(url, timeout=self.timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    return (self.get_tanggal(soup), self.get_judul(soup), self.get_konten(soup), url)

  def _to_frame(self):
    """Build a DataFrame from the rows collected so far."""
    return pd.DataFrame({
      "Tanggal": self.tanggal,
      "Judul": self.judul,
      "Konten": self.konten,
      "Link": self.link,
    })

  def get_data(self, save=True, output="csv"):
    """Scrape the tag's articles and return them as a DataFrame.

    Args:
      save: When true, also write the rows to disk.
      output: ``"csv"`` writes ``data.csv``, ``"json"`` writes ``data.json``;
        any other value writes nothing.

    Returns:
      A DataFrame with the columns ``Tanggal``, ``Judul``, ``Konten`` and
      ``Link``. Rows from earlier calls on the same instance are kept.

    Articles that fail to download or parse are skipped with a logged
    warning. A failing listing-page request propagates to the caller.
    """
    def process_link(link):
      # Runs in a worker thread. It returns the whole record instead of
      # appending to the shared lists: four separate appends from concurrent
      # threads interleave and leave the columns out of step with each other.
      try:
        return self._scrape_article(link['href'])
      except Exception as exc:
        # Best effort: one broken article must not abort the whole scrape.
        logging.getLogger(__name__).warning(
          "Skipping article %s: %s", link.get('href'), exc)
        return None

    with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
      futures = []
      progress = tqdm(total=self.max_page, ncols=80,
                      bar_format='scraping |{bar}{r_bar} pages')
      # The context manager closes the bar even if a listing request raises.
      with progress as pbar:
        page_no = 1
        while True:
          listing = requests.get(self.page + str(page_no), timeout=self.timeout)
          soup = BeautifulSoup(listing.content, 'html.parser')
          if soup.find('h1', class_='p404__bigtitle'):
            print("\nSudah sampai halaman terakhir")
            break
          futures.extend(
            executor.submit(process_link, link) for link in self.get_link(soup))
          pbar.update(1)
          if self.max_page is not None and page_no >= self.max_page:
            break
          page_no += 1

      # Collect in submission order on this thread only, one record per row,
      # so the four lists always stay aligned.
      for future in futures:
        record = future.result()
        if record is None:
          continue
        tanggal, judul, konten, href = record
        self.tanggal.append(tanggal)
        self.judul.append(judul)
        self.konten.append(konten)
        self.link.append(href)

    if save:
      if output == "csv":
        self.save_to_csv()
      elif output == "json":
        self.save_to_json()

    return self._to_frame()

  def save_to_csv(self, path='data.csv'):
    """Write the collected rows to *path* as CSV without the index column."""
    self._to_frame().to_csv(path, index=False)

  def save_to_json(self, path='data.json'):
    """Write the collected rows to *path* as a UTF-8 JSON list of objects."""
    rows = [
      {'Tanggal': tanggal, 'Judul': judul, 'Konten': konten, 'Link': href}
      for tanggal, judul, konten, href
      in zip(self.tanggal, self.judul, self.konten, self.link)
    ]
    with open(path, 'w', encoding='utf-8') as jsonfile:
      json.dump(rows, jsonfile, ensure_ascii=False, indent=4)

# Scrape at most 20 listing pages of the "indosat" tag; save=True with
# output="csv" also writes the collected rows to data.csv.
scrapper = ScrapKompas('indosat', max_page=20)
data = scrapper.get_data(save=True, output="csv")
# Bare expression: as the last line of a notebook cell it displays the DataFrame.
data

scraping |████████████████████████████████| 20/20 [00:48<00:00,  2.45s/it] pages


Unnamed: 0,Tanggal,Judul,Konten,Link
0,2024-03-28 17:08:00,"Pasar Ramadan IM3, Tempat Ngabuburit Seru Biar...",KOMPAS.com– Ngabuburit dapat menjadi salah sat...,http://www.kompas.com/hype/read/2024/03/28/170...
1,2024-03-11 14:00:00,"3 Cara Transfer Pulsa Indosat Terbaru 2024, Bi...",KOMPAS.com-Transfer pulsa Indosatadalah layana...,http://www.kompas.com/tren/read/2024/03/11/140...
2,2024-04-23 17:04:00,"Telkomsel, XL, Indosat Catatkan Kenaikan Trafi...",KOMPAS.com- Tiga operator seluler di Indonesia...,http://www.kompas.com/tren/read/2024/02/11/150...
3,2024-02-11 15:00:00,"Cara Cek Umur Kartu Telepon dari Telkomsel, In...",KOMPAS.com- Setiap hari kita akan menggunakan ...,http://www.kompas.com/tren/read/2024/03/23/143...
4,2024-03-23 14:30:00,Ramai soal Pengaktifan Kembali Nomor Ponsel ya...,KOMPAS.com- Unggahan terkait pengaktifan kemba...,http://tekno.kompas.com/read/2024/04/23/170400...
...,...,...,...,...
395,2020-04-20 08:16:00,Indosat Rilis Paket Freedom Kuota Harian 1 GB ...,KOMPAS.com- Sejumlahoperator telekomunikasidi ...,http://tekno.kompas.com/read/2020/04/20/081600...
396,2020-03-27 09:00:00,Operator Telekomunikasi Bantu Dukung Pembelaja...,"KOMPAS.com -Menyambut momen Ramadan tahun ini,...",http://www.kompas.com/edu/read/2020/03/27/0900...
397,2020-07-16 10:13:00,Cara Berhenti Langganan RBT Indosat Ooredoo,KOMPAS.com- Beberapa pengguna Indosat masih me...,http://tekno.kompas.com/read/2020/07/16/101300...
398,2020-04-06 09:30:00,"Berikut Cara Dapatkan Internet Gratis dari XL,...",KOMPAS.com- Presiden Joko Widodo (Jokowi) mene...,http://www.kompas.com/tren/read/2020/04/06/093...


In [None]:
# Export the rows already held by `scrapper` to data.json; nothing is re-scraped.
scrapper.save_to_json()