<a href="https://colab.research.google.com/github/Mfys212/Generasi-Information-Teks-from-Kompas/blob/main/Kompas_Scrapper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import multiprocessing
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import re
from tqdm import tqdm
from datetime import datetime
import pytz

class ScrapKompas:
  """Scrape the articles listed under a kompas.com tag.

  Listing pages ``https://www.kompas.com/tag/<tag>?page=N`` are walked one by
  one; every article link found on them is downloaded on a thread pool and its
  date, title and body text are extracted.

  Collected rows are kept in four parallel lists (``tanggal``, ``judul``,
  ``konten``, ``link``); index ``i`` of each list describes the same article.
  The lists are only ever appended to, so calling ``get_data`` more than once
  on the same instance accumulates rows.
  """

  # Indonesian month names -> English names understood by strptime's %B.
  _MONTHS_ID_TO_EN = {
    'Januari': 'January',
    'Februari': 'February',
    'Maret': 'March',
    'April': 'April',
    'Mei': 'May',
    'Juni': 'June',
    'Juli': 'July',
    'Agustus': 'August',
    'September': 'September',
    'Oktober': 'October',
    'November': 'November',
    'Desember': 'December',
  }

  # "DD/MM/YYYY, HH:MM TZ" as searched for in the 'read__time' element.
  _READ_TIME_RE = re.compile(r'(\d{2})/(\d{2})/(\d{4}), (\d{2}:\d{2}) \w{3}')

  def __init__(self, tag, max_page=5, timeout=30):
    """Configure a scraper for one tag.

    Args:
      tag: Tag slug inserted into the listing URL.
      max_page: Number of listing pages to visit, or ``None`` to continue
        until the site serves its 404 page.
      timeout: Seconds passed as ``timeout`` to every ``requests.get`` call so
        a stalled connection cannot block the scrape indefinitely.
    """
    self.page = f"https://www.kompas.com/tag/{tag}?page="
    self.link, self.tanggal, self.judul, self.konten = [], [], [], []
    self.max_page = max_page
    self.max_workers = multiprocessing.cpu_count()
    self.timeout = timeout

  @staticmethod
  def _node_text(soup, name, css_class, **get_text_kwargs):
    """Return the text of the first matching element, or "" if there is none."""
    node = soup.find(name, class_=css_class)
    if node is None:
      return ""
    return node.get_text(**get_text_kwargs)

  def get_link(self, soup):
    """Return every ``<a class="article__link">`` element of a listing page."""
    return soup.find_all('a', class_='article__link')

  def get_tanggal(self, soup):
    """Return the article date as ``"YYYY-MM-DD HH:MM:SS"``, or "" if unknown.

    Two layouts are recognised:
      * a ``div.read__time`` containing ``DD/MM/YYYY, HH:MM TZ``;
      * a ``div.videoKG-date`` containing e.g. ``10 Februari 2024, 11:23 WIB``.
    The second layout is tried whenever the first is absent or does not match.
    """
    read_time = soup.find('div', class_='read__time')
    if read_time is not None:
      found = self._READ_TIME_RE.search(read_time.get_text())
      if found:
        day, month, year, clock = found.groups()
        return f"{year}-{month}-{day} {clock}:00"

    video_date = soup.find('div', class_='videoKG-date')
    if video_date is None:
      return ""
    # Strip first: strptime rejects leading/trailing whitespace from the markup.
    text = video_date.get_text().strip()
    for ind_month, eng_month in self._MONTHS_ID_TO_EN.items():
      text = text.replace(ind_month, eng_month)
    try:
      parsed = datetime.strptime(text, "%d %B %Y, %H:%M WIB")
    except ValueError:
      return ""
    # The output format carries no UTC offset, so attaching a timezone to the
    # parsed wall-clock value would not change the resulting string.
    return parsed.strftime("%Y-%m-%d %H:%M:%S")

  def get_judul(self, soup):
    """Return the text of ``h1.read__title``, or "" when the page has none."""
    return self._node_text(soup, 'h1', 'read__title')

  def get_konten(self, soup):
    """Return the stripped text of ``div.read__content``, or "" when missing."""
    return self._node_text(soup, 'div', 'read__content', strip=True)

  def _fetch_article(self, link):
    """Download one article; return ``(url, tanggal, judul, konten)`` or None.

    Runs on a worker thread. It deliberately returns its result instead of
    appending to the shared lists: four separate appends from concurrent
    workers can interleave and leave the parallel lists misaligned.
    """
    url = link.get('href')
    if not url:
      return None
    try:
      response = requests.get(url, timeout=self.timeout)
      soup = BeautifulSoup(response.content, 'html.parser')
      judul = self.get_judul(soup)
      konten = self.get_konten(soup)
      if not judul or not konten:
        return None
      return url, self.get_tanggal(soup), judul, konten
    except Exception as exc:
      # Best effort: one broken article must not abort the whole scrape, but
      # the failure is reported instead of being silently discarded.
      tqdm.write(f"Skipped {url}: {exc}")
      return None

  def _to_frame(self):
    """Build a DataFrame from the rows collected so far."""
    return pd.DataFrame({
      "Tanggal": self.tanggal,
      "Judul": self.judul,
      "Konten": self.konten,
      "Link": self.link,
    })

  def get_data(self, save=True, output="csv"):
    """Scrape the tag and return the collected articles as a DataFrame.

    Args:
      save: When true, also write the rows to disk.
      output: ``"csv"`` writes ``data.csv``, ``"json"`` writes ``data.json``;
        any other value writes nothing.

    Returns:
      DataFrame with columns ``Tanggal``, ``Judul``, ``Konten``, ``Link``.
      Articles without a title or without body text are left out.

    Raises:
      requests.RequestException: if a listing page cannot be fetched.
    """
    with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
      futures = []
      page_no = 1
      progress = tqdm(total=self.max_page, ncols=80,
                      bar_format='scraping |{bar}{r_bar} pages')
      with progress as pbar:
        while True:
          listing = requests.get(self.page + str(page_no), timeout=self.timeout)
          soup = BeautifulSoup(listing.content, 'html.parser')
          if soup.find('h1', class_='p404__bigtitle'):
            print("\nSudah sampai halaman terakhir")
            break
          futures.extend(
            executor.submit(self._fetch_article, anchor)
            for anchor in self.get_link(soup)
          )
          pbar.update(1)
          # ">=" rather than "==" so a max_page below 1 still terminates.
          if self.max_page is not None and page_no >= self.max_page:
            break
          page_no += 1

      # Gather in submission order, on this thread only, so every row's four
      # fields land at the same index and the output order is deterministic.
      for future in futures:
        row = future.result()
        if row is None:
          continue
        url, tanggal, judul, konten = row
        self.link.append(url)
        self.tanggal.append(tanggal)
        self.judul.append(judul)
        self.konten.append(konten)

    if save:
      if output == "csv":
        self.save_to_csv()
      elif output == "json":
        self.save_to_json()

    return self._to_frame()

  def save_to_csv(self):
    """Write the collected rows to ``data.csv`` without the index column."""
    self._to_frame().to_csv('data.csv', index=False)

  def save_to_json(self):
    """Write the collected rows to ``data.json`` as a list of objects."""
    rows = [
      {'Tanggal': tanggal, 'Judul': judul, 'Konten': konten, 'Link': link}
      for tanggal, judul, konten, link
      in zip(self.tanggal, self.judul, self.konten, self.link)
    ]
    with open('data.json', 'w', encoding='utf-8') as jsonfile:
      json.dump(rows, jsonfile, ensure_ascii=False, indent=4)

# Scrape up to 43 listing pages of the "indosat" tag, write the rows to
# data.csv, and leave the DataFrame as the last expression so the notebook
# displays it.
scrapper = ScrapKompas(tag='indosat', max_page=43)
data = scrapper.get_data(output="csv", save=True)
data

scraping |████████████████████████████████| 43/43 [01:50<00:00,  2.57s/it] pages


Unnamed: 0,Tanggal,Judul,Konten,Link
0,2024-01-27 17:59:00,"Perkuat Infrastruktur Digital, Telkom dan Indo...",KOMPAS.com- PT Telkom Indonesia (Persero) Tbk ...,http://money.kompas.com/read/2024/01/27/175922...
1,2024-02-10 11:23:00,Cara Cek Umur Kartu Indosat dengan Mudah,KOMPAS.com- Tiap kartu atau nomor telepon dari...,http://tekno.kompas.com/read/2022/08/02/103000...
2,2024-01-24 17:40:00,Oknum Karyawan Vendor Curi Genset Tower di Ped...,"NUNUKAN, KOMPAS.com –Unit Reskrim Polsek Semba...",http://regional.kompas.com/read/2024/01/24/174...
3,2024-02-04 23:55:00,Cara Kirim Pulsa Indosat melalui SMS dan USSD,"JAKARTA, KOMPAS.com-Cara kirim pulsa Indosatke...",http://money.kompas.com/read/2024/02/04/235508...
4,2024-02-29 18:06:00,Ericsson dan Indosat Berkolaborasi untuk Doron...,"BARCELONA, KOMPAS.com-EricssondanIndosatOoredo...",http://tekno.kompas.com/read/2024/02/29/180600...
...,...,...,...,...
840,2013-06-12 15:50:00,"Mau Ambil Indosat dan Telkomsel, Dahlan Bingung","JAKARTA, KOMPAS.com— Menteri BUMN Dahlan Iskan...",http://bisniskeuangan.kompas.com/read/2013/06/...
841,2013-07-09 16:28:00,"Kasus Indosat-IM2, Mastel Akan Lapor ke Komisi...","JAKARTA, KOMPAS.com- Masyarakat Telekomunikasi...",http://bisniskeuangan.kompas.com/read/2013/07/...
842,2013-05-17 14:09:00,Transfer Rupiah Kini Bisa lewat SMS,Tabita Diela/Kompas.comMengetik pesan singkat ...,http://tekno.kompas.com/read/2013/05/17/140924...
843,2013-06-26 15:01:00,"Terungkap, Pencurian Kabel Bawah Laut Indosat",emerginggrowth.comKOMPAS.com— Kepolisian Daera...,http://tekno.kompas.com/read/2013/06/26/150122...


In [None]:
# Export the rows already held by `scrapper` to data.json as well; this reuses
# the in-memory lists and does not scrape again.
scrapper.save_to_json()