In [None]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
import json
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def _parse_news_item(item):
    """Return ``(date_time, title, link)`` for one parsed news ``<li>`` element.

    Missing pieces fall back to placeholders: ``"No Date"`` when there is no
    ``<b>`` tag, ``"No Title"`` when there is no ``<a>`` tag, and ``"#"`` when
    there is no ``<a>`` tag or the tag carries no ``href`` attribute.
    """
    date_tag = item.find("b")
    anchor = item.find("a")

    date_time = date_tag.text.strip() if date_tag is not None else "No Date"
    title = anchor.text.strip() if anchor is not None else "No Title"
    # .get() keeps an anchor without an href from raising KeyError.
    link = anchor.get("href", "#") if anchor is not None else "#"
    return date_time, title, link


def scrape_news(emiten_list, output_file):
    """Search the IQPlus news page for each ticker and dump the matches to JSON.

    For every entry of ``emiten_list`` the search form is submitted in a
    Chrome session, the resulting page is parsed, and every list item whose
    title contains ``"<emiten>:"`` is recorded.

    Args:
        emiten_list: Iterable of ticker strings typed into the search box.
        output_file: Path of the JSON file to write. It receives a list of
            dicts with the keys ``Emiten``, ``Date``, ``Title`` and ``Link``.

    Tickers whose search box cannot be located or used are reported and
    skipped. Any other error propagates after the browser has been closed;
    in that case ``output_file`` is not written.
    """
    # Imported here so the block does not depend on a new top-of-file import.
    from selenium.common.exceptions import WebDriverException

    search_url = "http://www.iqplus.info/news/search/"
    news_data = []

    # Selenium 4 expects the driver path wrapped in a Service object; a bare
    # positional string is interpreted as ``options`` by current releases.
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

    try:
        for emiten in emiten_list:
            print(f"Searching news for {emiten}...")
            driver.get(search_url)

            try:
                search_input = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.NAME, "search"))
                )
                search_input.send_keys(emiten)
                search_input.submit()
            except WebDriverException:
                # Covers the wait timing out as well as failures while typing
                # or submitting; Ctrl-C is deliberately no longer swallowed.
                print(f"Search input element not found for {emiten}. Moving to next ticker.")
                continue

            # Fixed pause to give the result page time to render.
            time.sleep(3)

            soup = BeautifulSoup(driver.page_source, "html.parser")
            news_list = soup.find_all("li", style="text-transform:capitalize;")

            if not news_list:
                print(f"No news found for {emiten}.")
                continue

            print(f"Found {len(news_list)} news items for {emiten}")
            for news in news_list:
                date_time, title, link = _parse_news_item(news)

                # Keep only headlines tagged with this ticker ("TICKER: ...").
                if f"{emiten}:" in title:
                    news_data.append({
                        "Emiten": emiten,
                        "Date": date_time,
                        "Title": title,
                        "Link": link,
                    })
                else:
                    print(f"Skipping news item as title does not contain '{emiten}:'")
    finally:
        # Always release the browser, even when a ticker fails unexpectedly.
        driver.quit()

    with open(output_file, "w", encoding="utf-8") as json_file:
        json.dump(news_data, json_file, indent=4, ensure_ascii=False)

    print(f"News data saved to {output_file}")


In [None]:
# Scrape news for each of the five ticker lists emiten_list_pt1.json .. pt5.json.
for i in range(1, 6):
    input_file = f"emiten_list_pt{i}.json"
    # NOTE(review): this name puts the part number after the ".json" extension;
    # kept unchanged in case other code reads these files - confirm intent.
    output_file = f"stock_news.json_pt{i}"

    # Only the load is guarded, so an error raised while scraping is never
    # misreported as a missing or malformed input file.
    try:
        with open(input_file, "r", encoding="utf-8") as file:
            emiten_list = json.load(file)
    except FileNotFoundError:
        print(f"Error: {input_file} not found. Skipping...")
        continue
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in {input_file}. Skipping...")
        continue

    print(f"Successfully loaded {len(emiten_list)} emiten from {input_file}")

    # Scrape news for the current emiten list
    scrape_news(emiten_list, output_file)