<a href="https://colab.research.google.com/github/Mfys212/Generasi-Information-Teks-from-Kompas/blob/main/Kompas_Scrapper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import multiprocessing
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import re
from tqdm import tqdm
from datetime import datetime
import pytz

class ScrapKompas:
  """Scrape the articles listed under a kompas.com tag.

  Listing pages (``https://www.kompas.com/tag/<tag>?page=N``) are walked
  sequentially; every article link found on them is downloaded and parsed on
  a thread pool.  Parsed rows are accumulated in four parallel lists
  (``link``, ``tanggal``, ``judul``, ``konten``) that always have the same
  length and the same ordering.
  """

  # Seconds to wait for any single HTTP response; without a timeout a stalled
  # connection would block the scraper forever.
  REQUEST_TIMEOUT = 30

  # "dd/mm/yyyy, HH:MM TZ" as it appears inside the article's read__time div.
  _READ_TIME_RE = re.compile(r'(\d{2})/(\d{2})/(\d{4}), (\d{2}:\d{2}) \w{3}')

  # Indonesian -> English month names so that strptime's %B can parse them.
  _MONTHS_ID_TO_EN = {
    'Januari': 'January',
    'Februari': 'February',
    'Maret': 'March',
    'April': 'April',
    'Mei': 'May',
    'Juni': 'June',
    'Juli': 'July',
    'Agustus': 'August',
    'September': 'September',
    'Oktober': 'October',
    'November': 'November',
    'Desember': 'December'
  }

  def __init__(self, tag, max_page=5):
    """Prepare a scraper for one tag.

    Args:
      tag: Tag slug inserted into the listing URL.
      max_page: Highest listing page number to fetch, or ``None`` to continue
        until the site answers with its 404 page.
    """
    self.page = f"https://www.kompas.com/tag/{tag}?page="
    self.link, self.tanggal, self.judul, self.konten = [], [], [], []
    self.max_page = max_page
    self.max_workers = multiprocessing.cpu_count()

  def get_link(self, soup):
    """Return every ``<a class="article__link">`` element of a listing page."""
    return soup.find_all('a', class_='article__link')

  def get_tanggal(self, soup):
    """Extract the publication time as ``"YYYY-MM-DD HH:MM:SS"``.

    Two layouts are tried in order: the ``read__time`` div (numeric
    ``dd/mm/yyyy, HH:MM TZ``) and the ``videoKG-date`` div (Indonesian month
    name, ``WIB`` suffix).  The wall-clock time is returned as written on the
    page; no timezone conversion is applied.

    Returns:
      The formatted timestamp, or ``""`` when neither layout can be parsed.
    """
    node = soup.find('div', class_='read__time')
    if node is not None:
      match = self._READ_TIME_RE.search(node.get_text())
      if match:
        day, month, year, clock = match.groups()
        return f"{year}-{month}-{day} {clock}:00"

    node = soup.find('div', class_='videoKG-date')
    if node is None:
      return ""
    # Surrounding whitespace would make strptime reject an otherwise valid date.
    text = node.get_text().strip()
    for ind_month, eng_month in self._MONTHS_ID_TO_EN.items():
      text = text.replace(ind_month, eng_month)
    try:
      # NOTE: %B follows the process locale; English month names are assumed.
      parsed = datetime.strptime(text, "%d %B %Y, %H:%M WIB")
    except ValueError:
      return ""
    return parsed.strftime("%Y-%m-%d %H:%M:%S")

  def get_judul(self, soup):
    """Return the text of ``<h1 class="read__title">``, or ``""`` if absent."""
    node = soup.find('h1', class_='read__title')
    return node.get_text() if node is not None else ""

  def get_konten(self, soup):
    """Return the stripped text of ``<div class="read__content">``, or ``""``."""
    node = soup.find('div', class_='read__content')
    return node.get_text(strip=True) if node is not None else ""

  def _fetch_article(self, link):
    """Download and parse one article; runs on a worker thread.

    Returns:
      ``(url, tanggal, judul, konten)``, or ``None`` when the download or
      parsing fails or when the title or the content is empty.
    """
    try:
      url = link['href']
      response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
      soup = BeautifulSoup(response.content, 'html.parser')
      tanggal = self.get_tanggal(soup)
      judul = self.get_judul(soup)
      konten = self.get_konten(soup)
    except Exception:
      # Best effort: one broken article must not abort the whole crawl.
      return None
    if judul and konten:
      return url, tanggal, judul, konten
    return None

  def _to_frame(self):
    """Build a DataFrame from the rows collected so far."""
    return pd.DataFrame({"Tanggal":self.tanggal, "Judul":self.judul, "Konten":self.konten, "Link":self.link})

  def get_data(self, save=True, output="csv", filename="data.csv"):
    """Crawl the tag, collect all articles and optionally save them.

    Listing pages are fetched one after another starting at page 1 until the
    site's 404 page is reached or ``max_page`` pages have been fetched (at
    least one page is always requested).  Rows are appended to the instance
    lists, so calling this method again accumulates on top of earlier results.

    Args:
      save: Write the collected rows to ``filename`` when true.
      output: ``"csv"`` or ``"json"``; any other value writes no file.
      filename: Destination path for the saved file.

    Returns:
      A DataFrame with the columns ``Tanggal``, ``Judul``, ``Konten``, ``Link``.
    """
    with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
      futures = []
      page_no = 1
      # The context manager closes the bar even if a listing request raises.
      with tqdm(total=self.max_page, ncols=80, bar_format='scraping |{bar}{r_bar} pages') as pbar:
        while True:
          response = requests.get(self.page + str(page_no), timeout=self.REQUEST_TIMEOUT)
          soup = BeautifulSoup(response.content, 'html.parser')
          if soup.find('h1', class_='p404__bigtitle'):
            print("\nSudah sampai halaman terakhir")
            break
          futures.extend(
            executor.submit(self._fetch_article, link) for link in self.get_link(soup)
          )
          pbar.update(1)
          if self.max_page is not None and page_no >= self.max_page:
            break
          page_no += 1

      print("Menyelesaikan....")
      # Rows are appended here, on the calling thread only, so the four
      # parallel lists can never get out of step with each other.
      for future in as_completed(futures):
        row = future.result()
        if row is None:
          continue
        url, tanggal, judul, konten = row
        self.link.append(url)
        self.tanggal.append(tanggal)
        self.judul.append(judul)
        self.konten.append(konten)

    if save:
      if output == "csv":
        self.save_to_csv(filename=filename)
      elif output == "json":
        self.save_to_json(filename=filename)

    print("Sukses!")
    return self._to_frame()

  def save_to_csv(self, filename="data.csv"):
    """Write the collected rows to ``filename`` as CSV without an index column."""
    self._to_frame().to_csv(filename, index=False)

  def save_to_json(self, filename="data.json"):
    """Write the collected rows to ``filename`` as a UTF-8 JSON array of objects."""
    data = [
      {'Tanggal': tanggal, 'Judul': judul, 'Konten': konten, 'Link': link}
      for tanggal, judul, konten, link in zip(self.tanggal, self.judul, self.konten, self.link)
    ]
    with open(filename, 'w', encoding='utf-8') as jsonfile:
      json.dump(data, jsonfile, ensure_ascii=False, indent=4)

# Scrape the 'teknologi' tag (at most 1000 listing pages) and save the rows to teknologi.csv.
scrapper = ScrapKompas('teknologi', max_page=1000)
teknologi = scrapper.get_data(save=True, output="csv", filename="teknologi.csv")

scraping |█████████▍                   | 324/1000 [14:16<29:46,  2.64s/it] pages


Sudah sampai halaman terakhir





Menyelesaikan....
Sukses!


In [None]:
# Scrape the 'ai' tag (at most 1000 listing pages) and save the rows to ai.csv.
scrapper2 = ScrapKompas('ai', max_page=1000)
ai = scrapper2.get_data(save=True, output="csv", filename="ai.csv")

scraping |██▍                           | 81/1000 [03:36<40:59,  2.68s/it] pages


Sudah sampai halaman terakhir
Menyelesaikan....





Sukses!


In [None]:
# Scrape the 'komputer' tag (at most 1000 listing pages) and save the rows to komputer.csv.
scrapper3 = ScrapKompas('komputer', max_page=1000)
komputer = scrapper3.get_data(save=True, output="csv", filename="komputer.csv")

scraping |█▊                            | 59/1000 [02:37<41:50,  2.67s/it] pages


Sudah sampai halaman terakhir
Menyelesaikan....
Sukses!





In [None]:
# Scrape the 'sains' tag (at most 1000 listing pages) and save the rows to sains.csv.
scrapper4 = ScrapKompas('sains', max_page=1000)
sains = scrapper4.get_data(save=True, output="csv", filename="sains.csv")

scraping |██                            | 70/1000 [03:09<42:02,  2.71s/it] pages


Sudah sampai halaman terakhir
Menyelesaikan....





Sukses!


In [None]:
# Scrape the 'teknik' tag (at most 1000 listing pages) and save the rows to teknik.csv.
scrapper5 = ScrapKompas('teknik', max_page=1000)
teknik = scrapper5.get_data(save=True, output="csv", filename="teknik.csv")

scraping |███▋                         | 125/1000 [05:39<39:35,  2.71s/it] pages


Sudah sampai halaman terakhir
Menyelesaikan....





Sukses!


In [None]:
# Scrape the 'matematika' tag (at most 1000 listing pages) and save the rows to matematika.csv.
scrapper6 = ScrapKompas('matematika', max_page=1000)
matematika = scrapper6.get_data(save=True, output="csv", filename="matematika.csv")

scraping |██▍                           | 83/1000 [03:47<41:56,  2.74s/it] pages


Sudah sampai halaman terakhir
Menyelesaikan....





Sukses!


In [None]:
# Scrape the 'statistika' tag (at most 1000 listing pages) and save the rows to statistika.csv.
scrapper7 = ScrapKompas('statistika', max_page=1000)
statistika = scrapper7.get_data(save=True, output="csv", filename="statistika.csv")

scraping |                               | 3/1000 [00:07<43:06,  2.59s/it] pages


Sudah sampai halaman terakhir
Menyelesaikan....





Sukses!


In [None]:
# Scrape the 'pemrograman' tag (at most 1000 listing pages) and save the rows to pemrograman.csv.
scrapper8 = ScrapKompas('pemrograman', max_page=1000)
pemrograman = scrapper8.get_data(save=True, output="csv", filename="pemrograman.csv")

scraping |                               | 4/1000 [00:09<39:43,  2.39s/it] pages


Sudah sampai halaman terakhir
Menyelesaikan....





Sukses!


In [None]:
# Scrape the 'data' tag (at most 1000 listing pages) and save the rows to data.csv.
scrapper9 = ScrapKompas('data', max_page=1000)
data = scrapper9.get_data(save=True, output="csv", filename="data.csv")

scraping |██████████████▌              | 500/1000 [23:37<23:37,  2.84s/it] pages


Sudah sampai halaman terakhir
Menyelesaikan....





Sukses!


In [None]:
# Scrape the 'kimia' tag (at most 1000 listing pages) and save the rows to kimia.csv.
scrapper10 = ScrapKompas('kimia', max_page=1000)
kimia = scrapper10.get_data(save=True, output="csv", filename="kimia.csv")

scraping |███▏                         | 110/1000 [05:01<40:37,  2.74s/it] pages


Sudah sampai halaman terakhir
Menyelesaikan....





Sukses!


In [None]:
# Scrape the 'fisika' tag (at most 1000 listing pages) and save the rows to fisika.csv.
scrapper11 = ScrapKompas('fisika', max_page=1000)
fisika = scrapper11.get_data(save=True, output="csv", filename="fisika.csv")

scraping |█                             | 36/1000 [01:38<43:53,  2.73s/it] pages


Sudah sampai halaman terakhir





Menyelesaikan....
Sukses!


In [None]:
# Scrape the 'biologi' tag (at most 1000 listing pages) and save the rows to biologi.csv.
scrapper12 = ScrapKompas('biologi', max_page=1000)
biologi = scrapper12.get_data(save=True, output="csv", filename="biologi.csv")

scraping |█▊                            | 62/1000 [02:53<43:44,  2.80s/it] pages


Sudah sampai halaman terakhir
Menyelesaikan....





Sukses!


In [None]:
# Scrape the 'algoritma' tag (at most 1000 listing pages) and save the rows to algoritma.csv.
scrapper13 = ScrapKompas('algoritma', max_page=1000)
algoritma = scrapper13.get_data(save=True, output="csv", filename="algoritma.csv")

scraping |▏                              | 6/1000 [00:15<42:19,  2.55s/it] pages


Sudah sampai halaman terakhir
Menyelesaikan....





Sukses!
