# Overview

This notebook contains the web scraping process for the three news portals used in the study:
1. EmitenNews
2. Bisnis.com
3. Detikfinance

<br>

The scraping process follows these steps:
1. Collect all the news links indexed on each news portal and save them to a CSV file
2. Visit each news link to extract the information from the news article
3. The information extracted from each article is the news headline and the news content itself

# EmitenNews

## Scraping news links from EmitenNews

In [None]:
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

# Pagination window of the EmitenNews "emiten" category index. The number appended
# to base_url is advanced by 9 per request (presumably an item offset with 9
# articles per page — TODO confirm against the site).
start_index = 19017
end_index = 29538
page_step = 9
request_timeout = 30  # seconds; keeps one stalled connection from hanging the whole crawl

# Every offset the loop visits, end_index included when it lands exactly on a step.
page_offsets = range(start_index, end_index + 1, page_step)

# Integer count of the pages actually requested, so the progress bar total is exact
# instead of a float that is one page short.
n_iter = len(page_offsets)

progress_bar = tqdm(total=n_iter, desc="Processing items", unit="task")

base_url = 'https://emitennews.com/category/emiten/'

# Article URLs collected from every index page, in crawl order.
link_container = []

for offset in page_offsets:
    url = base_url + str(offset)

    try:
        response = requests.get(url, timeout=request_timeout)
    except requests.RequestException as exc:
        # A transient network failure costs this one page, not the links gathered so far.
        print(f"Failed to retrieve the page {url}: {exc}")
        response = None

    if response is None:
        pass
    elif response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')

        search_result_wrapper = soup.find('div', class_='search-result-wrapper')

        if search_result_wrapper:
            # Only direct <a> children of the wrapper are taken (recursive=False).
            for anchor in search_result_wrapper.find_all('a', recursive=False):
                href = anchor.get('href')
                if href:
                    link_container.append(href)
        else:
            print("Search result wrapper not found on the page.")
    else:
        print("Failed to retrieve the page. Status code:", response.status_code)

    progress_bar.update(1)

progress_bar.close()

Processing items:   0%|          | 0/1671.0 [00:00<?, ?task/s]

In [None]:
import csv
from google.colab import files

# Destination of the exported link list on the Colab filesystem
csv_filename = '/content/link_emitennews_2016_2022.csv'

# Dump the collected links, one URL per single-column row
with open(csv_filename, mode='w', newline='') as csv_file:
    csv.writer(csv_file).writerows([url] for url in link_container)

# Hand the exported file to the browser as a download
files.download(csv_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Report how many article links were collected
link_count = len(link_container)
print(f"Number of links: {link_count}")

15048

## Scraping the news content and news headline from EmitenNews

In [None]:
# Visit every collected EmitenNews article and record its timestamp and body text.
# Each successfully fetched page adds one dict with the keys 'time_posted' and
# 'text_content' (None when the corresponding element is missing).
news_container = []
request_timeout = 30  # seconds; keeps one stalled connection from hanging the whole crawl

for link in tqdm(link_container, desc="scrapping links"):
  try:
    response = requests.get(link, timeout=request_timeout)
  except requests.RequestException as exc:
    # A transient network failure skips this article instead of aborting the
    # whole run and losing everything scraped so far.
    print(f"Failed to retrieve the page {link}: {exc}")
    continue

  if response.status_code != 200:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")
    continue

  soup = BeautifulSoup(response.text, 'html.parser')
  news = {}

  # get time element
  time_posted_element = soup.find('span', class_='time-posted')

  if time_posted_element:
    news['time_posted'] = time_posted_element.get_text(strip=True)
  else:
    print("Time posted element not found on the page.")
    news['time_posted'] = None

  # get the news content
  div_content = soup.find('div', class_='article-body')

  if div_content:
    news['text_content'] = div_content.get_text(strip=True)
  else:
    print("Div not found on the page.")
    news['text_content'] = None

  news_container.append(news)

scrapping links:   0%|          | 0/15048 [00:00<?, ?it/s]

In [None]:
import pandas as pd
from google.colab import files

# Tabulate the scraped EmitenNews articles and export them without the index column
emiten_csv_path = '/content/stock_news_emiten.csv'

df = pd.DataFrame(news_container)
df.to_csv(emiten_csv_path, index=False)

# Hand the exported file to the browser as a download
files.download(emiten_csv_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Bisnis.com

## Scraping news links from Bisnis.com

In [None]:
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
from datetime import datetime, timedelta

# Daily index page of Bisnis.com; the date (YYYY-MM-DD) is appended to this URL.
base_link = "https://www.bisnis.com/index?categoryId=194&type=indeks&date="
request_timeout = 30  # seconds; keeps one stalled connection from hanging the whole crawl

# Define start and end dates (both days are included in the crawl)
start_date = datetime.strptime('2016-01-01', '%Y-%m-%d')
end_date = datetime.strptime('2022-12-31', '%Y-%m-%d')

# One dict per article found, with the keys 'type' and 'link'.
news_link_container = []

# One request per calendar day; + 1 because end_date itself is visited too.
n_iter = (end_date - start_date).days + 1

print(n_iter)

progress_bar = tqdm(range(n_iter), desc="Processing items", unit="task")

for day_offset in progress_bar:
  current_date = start_date + timedelta(days=day_offset)
  date_label = current_date.strftime('%Y-%m-%d')

  # Show the current date on the progress bar instead of printing one line per day.
  progress_bar.set_postfix_str(date_label)

  link = base_link + date_label

  try:
    response = requests.get(link, timeout=request_timeout)
    response.raise_for_status()
  except requests.RequestException as exc:
    # Skip this day only; links collected for other days are kept.
    print(f"{date_label}: failed to retrieve the index page: {exc}")
    continue

  soup = BeautifulSoup(response.text, 'html.parser')
  list_view = soup.find(id='indeksListView')

  if list_view is None:
    # Without this guard a page lacking the list container raises AttributeError.
    print(f"{date_label}: index list not found on the page.")
    continue

  for section in list_view.find_all(class_='artContent'):
    link_element = section.find(class_='artLink')
    if link_element is None or not link_element.get('href'):
      # An entry without a link cannot be visited later, so it is dropped.
      continue

    channel_element = section.find(class_='artChannel')

    news_link_container.append({
      'type': channel_element.get_text(strip=True) if channel_element else None,
      'link': link_element['href'],
    })

progress_bar.close()



2556


Processing items:   0%|          | 0/2556 [00:00<?, ?task/s]

2016-01-01
2016-01-02
2016-01-03
2016-01-04
2016-01-05
2016-01-06
2016-01-07
2016-01-08
2016-01-09
2016-01-10
2016-01-11
2016-01-12
2016-01-13
2016-01-14
2016-01-15
2016-01-16
2016-01-17
2016-01-18
2016-01-19
2016-01-20
2016-01-21
2016-01-22
2016-01-23
2016-01-24
2016-01-25
2016-01-26
2016-01-27
2016-01-28
2016-01-29
2016-01-30
2016-01-31
2016-02-01
2016-02-02
2016-02-03
2016-02-04
2016-02-05
2016-02-06
2016-02-07
2016-02-08
2016-02-09
2016-02-10
2016-02-11
2016-02-12
2016-02-13
2016-02-14
2016-02-15
2016-02-16
2016-02-17
2016-02-18
2016-02-19
2016-02-20
2016-02-21
2016-02-22
2016-02-23
2016-02-24
2016-02-25
2016-02-26
2016-02-27
2016-02-28
2016-02-29
2016-03-01
2016-03-02
2016-03-03
2016-03-04
2016-03-05
2016-03-06
2016-03-07
2016-03-08
2016-03-09
2016-03-10
2016-03-11
2016-03-12
2016-03-13
2016-03-14
2016-03-15
2016-03-16
2016-03-17
2016-03-18
2016-03-19
2016-03-20
2016-03-21
2016-03-22
2016-03-23
2016-03-24
2016-03-25
2016-03-26
2016-03-27
2016-03-28
2016-03-29
2016-03-30
2016-03-31

In [None]:
import pandas as pd
from google.colab import files

# Persist the Bisnis.com index links, one row per collected article.
# The file is named "link_bisniscom_2016_2022.csv" because that is the name the
# content-scraping cell reads back; writing it under a different name forces a
# manual rename before the notebook can run top to bottom.
# NOTE(review): the reader uses a relative path, so this assumes the working
# directory is /content (the Colab default) — confirm.
bisniscom_link_csv = '/content/link_bisniscom_2016_2022.csv'

df = pd.DataFrame(news_link_container)
df.to_csv(bisniscom_link_csv, index=False)

# Hand the exported file to the browser as a download
files.download(bisniscom_link_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Scraping the news content and news headline from Bisnis.com

In [None]:
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
import pandas as pd

# Scrape timestamp, body text and headline for every Bisnis.com link listed in
# the link CSV. Each successfully fetched page adds one dict to news_container
# with the keys 'link', 'type', 'text_content', 'time_posted' and 'headline'.
FILE_NAME_BISNISCOM = "link_bisniscom_2016_2022.csv"
request_timeout = 30  # seconds; keeps one stalled connection from hanging the whole crawl

news_link_df = pd.read_csv(FILE_NAME_BISNISCOM)
news_link_container = news_link_df.to_dict('records')

news_container = []

# Tag / class pairs used to locate each part of the article page.
class_name_news_content = "detailsContent"
news_content_tag = 'article'

class_name_time_posted = "detailsAttributeDates"
time_posted_tag = 'div'

class_name_headline = "detailsTitleCaption"
headline_tag = 'h1'

for news_link in tqdm(news_link_container, desc="Scrapping news content and news headline from bisnis.com"):
  try:
    response = requests.get(news_link['link'], timeout=request_timeout)
    news = {}
    news['link'] = news_link['link']
    news['type'] = news_link['type']

    news['text_content'] = ''

    if response.status_code == 200:
      soup = BeautifulSoup(response.text, 'html.parser')

      # get time element
      time_posted_element = soup.find(time_posted_tag, class_=class_name_time_posted)

      if time_posted_element:
        news['time_posted'] = time_posted_element.get_text(strip=True)
      else:
        print("Time posted element not found on the page.")
        news['time_posted'] = None

      # get the news content element
      div_content = soup.find(news_content_tag, class_=class_name_news_content)

      if div_content:
        # Only direct <p> children are used; each paragraph ends with a newline.
        news['text_content'] = ''.join(
          p.get_text(strip=True) + '\n'
          for p in div_content.find_all('p', recursive=False)
        )
      else:
        print("Div not found on the page.")
        news['text_content'] = None

      # get the news headline element
      headline_element = soup.find(headline_tag, class_=class_name_headline)

      if headline_element:
        news['headline'] = headline_element.get_text(strip=True)
      else:
        print("Headline element not found on the page.")
        news['headline'] = None

      news_container.append(news)
    else:
      print(f"Failed to retrieve the page. Status code: {response.status_code}")
  except Exception as exc:
    # Still best-effort (one bad article must not stop the crawl), but unlike a
    # bare `except:` this lets KeyboardInterrupt through so the run can be
    # stopped, and it reports which link failed and why.
    print(f"Error while scraping {news_link.get('link')}: {exc!r}")

Scrapping news content and news headline from bisnis.com:   0%|          | 0/51484 [00:00<?, ?it/s]

Error
Failed to retrieve the page. Status code: 404
Error
Failed to retrieve the page. Status code: 502
Failed to retrieve the page. Status code: 404


In [None]:
import pandas as pd
from google.colab import files

# Tabulate the scraped Bisnis.com articles and export them without the index column
bisniscom_csv_path = '/content/stock_news_bisniscom_2016_2022.csv'

df = pd.DataFrame(news_container)
df.to_csv(bisniscom_csv_path, index=False)

# Hand the exported file to the browser as a download
files.download(bisniscom_csv_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# DetikFinance

## Scraping news links from Detikfinance

In [None]:
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
import pandas as pd

# Daily index page of Detikfinance; the date is appended to this URL as
# mm/dd/yyyy with each "/" percent-encoded as %2F.
base_url = "https://finance.detik.com/bursa-dan-valas/indeks?date="

id_name = "indeks-container"
class_name = "media__text"
request_timeout = 30  # seconds; keeps one stalled connection from hanging the whole crawl

# Article URLs collected from every daily index page, in crawl order.
link_container = []

start_date = "2016-01-01"
end_date = "2022-12-31"

date_range = pd.date_range(start=start_date, end=end_date, freq='D')

all_dates = date_range.tolist()

# Progress is reported by tqdm only: printing every date and every link
# produced so much output that Colab truncated it.
for date in tqdm(all_dates, desc="scrapping links"):
  day = date.strftime('%d')
  month = date.strftime('%m')
  year = date.strftime('%Y')

  formatted_date = f"{month}%2F{day}%2F{year}"

  url = base_url + formatted_date

  try:
    response = requests.get(url, timeout=request_timeout)
    response.raise_for_status()
  except requests.RequestException as exc:
    # Skip this day only; links collected for other days are kept.
    print(f"{year}-{month}-{day}: failed to retrieve the index page: {exc}")
    continue

  soup = BeautifulSoup(response.text, 'html.parser')

  div_content = soup.find(id=id_name)

  if div_content is None:
    # Without this guard a page lacking the container raises AttributeError.
    print(f"{year}-{month}-{day}: index container not found on the page.")
    continue

  for section in div_content.find_all(class_=class_name):
    for a_tag in section.find_all('a', href=True):
      link_container.append(a_tag['href'])

scrapping links:   0%|          | 0/2557 [00:00<?, ?it/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
https://finance.detik.com/bursa-dan-valas/d-5228650/awal-pekan-dolar-as-berada-di-level-rp-14-650
https://finance.detik.com/bursa-dan-valas/d-5228632/sesuai-prediksi-ihsg-awal-pekan-dibuka-hijau
https://finance.detik.com/bursa-dan-valas/d-5228535/aigo-sepekan-debut-di-lantai-bursa-saham-agensi-bts-turun-tajam
https://finance.detik.com/bursa-dan-valas/d-5228483/kresna-sekuritas-dibekukan-ojk-bagaimana-nasib-uang-nasabah
27 10 2020
https://finance.detik.com/bursa-dan-valas/d-5231529/rolls-royce-incar-rp-38-t-lewat-rights-issue
https://finance.detik.com/bursa-dan-valas/d-5231436/dolar-as-tertekan-di-antara-ketidakpastian-pemilu-as-dan-covid-19
https://finance.detik.com/bursa-dan-valas/d-5231426/wall-street-dibuka-bergairah-jelang-pemilu-as
https://finance.detik.com/bursa-dan-valas/d-5231197/waskita-menanti-pembentukan-dana-abadi-kenapa
https://finance.detik.com/bursa-dan-valas/d-5231032/ant-group-jack-ma-tutup-penawaran-saha

In [None]:
import pandas as pd
from google.colab import files

# Persist the Detikfinance article links, one row per link.
# The content-scraping cell reads "link_detikfinance_2016_2022.csv" and selects
# the column 'link_berita', so the file is written with that name and that
# column header; a bare list would otherwise be saved under the column name "0".
# NOTE(review): the reader uses a relative path, so this assumes the working
# directory is /content (the Colab default) — confirm.
detikfinance_link_csv = '/content/link_detikfinance_2016_2022.csv'

df = pd.DataFrame({'link_berita': link_container})
df.to_csv(detikfinance_link_csv, index=False)

# Hand the exported file to the browser as a download
files.download(detikfinance_link_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Scraping the news content and news headline from Detikfinance

In [None]:
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
import pandas as pd

# Scrape timestamp, body text and headline for every Detikfinance link listed
# in the link CSV. Each successfully fetched page adds one dict to
# news_container with the keys 'link', 'text_content', 'time_posted' and 'headline'.
link_container = pd.read_csv("link_detikfinance_2016_2022.csv")
link_container = link_container['link_berita'].tolist()
news_container = []
request_timeout = 30  # seconds; keeps one stalled connection from hanging the whole crawl

# Tag / class pairs used to locate each part of the article page.
class_name_content = "detail__body-text itp_bodycontent"
content_tags = 'div'

class_name_time_posted = "detail__date"
time_posted_tags = 'div'

class_name_headline = 'detail__title'
headline_tag = 'h1'

for news_link in tqdm(link_container, desc="Scrapping news content and news headline from detikfinance"):
  try:
    response = requests.get(news_link, timeout=request_timeout)
    news = {}
    news['link'] = news_link
    news['text_content'] = ''

    if response.status_code == 200:
      soup = BeautifulSoup(response.text, 'html.parser')

      # get time element
      time_posted_element = soup.find(time_posted_tags, class_=class_name_time_posted)

      if time_posted_element:
        news['time_posted'] = time_posted_element.get_text(strip=True)
      else:
        print("Time posted element not found on the page.")
        news['time_posted'] = None

      # get the news content element
      div_content = soup.find(content_tags, class_=class_name_content)

      if div_content:
        # Text fragments of the body are joined with single spaces.
        news['text_content'] = div_content.get_text(separator=" ", strip=True)
      else:
        print("Div not found on the page.")
        news['text_content'] = None

      # get the news headline element
      headline_element = soup.find(headline_tag, class_=class_name_headline)

      if headline_element:
        news['headline'] = headline_element.get_text(strip=True)
      else:
        print("Headline element not found on the page.")
        news['headline'] = None

      news_container.append(news)
    else:
      print(f"Failed to retrieve the page. Status code: {response.status_code}")
  except Exception as exc:
    # Still best-effort (one bad article must not stop the crawl), but unlike a
    # bare `except:` this lets KeyboardInterrupt through so the run can be
    # stopped, and it reports which link failed and why.
    print(f"Error while scraping {news_link}: {exc!r}")

Scrapping news content and news headline from detikfinance:   0%|          | 0/15146 [00:00<?, ?it/s]

Error


In [None]:
import pandas as pd
# Reload the Detikfinance link list from disk and report how many links it holds
detik_links_df = pd.read_csv("link_detikfinance_2016_2022.csv")
link_container = detik_links_df['link_berita'].tolist()
print(len(link_container))

15146


In [None]:
import pandas as pd
from google.colab import files

# Tabulate the scraped Detikfinance articles and export them without the index column
detikfinance_csv_path = '/content/stock_news_detikfinance_2016_2022.csv'

df = pd.DataFrame(news_container)
df.to_csv(detikfinance_csv_path, index=False)

# Hand the exported file to the browser as a download
files.download(detikfinance_csv_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>