In [None]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
import json
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def scrape_news(emiten_list, output_file):
    """Scrape iqplus.info search results for each ticker and save them as JSON.

    For every ticker in ``emiten_list`` the search page is opened in Chrome,
    the ticker is submitted through the input named ``search``, and the
    resulting page is parsed for ``<li style="text-transform:capitalize;">``
    items. An item is kept only when its title contains ``"<ticker>:"``.

    Args:
        emiten_list: Iterable of ticker strings to search for.
        output_file: Path of the JSON file to write. It receives a list of
            dicts with the keys ``Emiten``, ``Date``, ``Title`` and ``Link``.

    Returns:
        None. Results are written to ``output_file`` and progress is printed.
    """
    # Collected news entries across all tickers
    news_data = []

    # The search page is the same for every ticker
    search_url = "http://www.iqplus.info/news/search/"

    # Selenium 4 expects the chromedriver path to be wrapped in a Service and
    # passed by keyword; a bare positional path is interpreted as `options`.
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

    try:
        for emiten in emiten_list:
            print(f"Searching news for {emiten}...")
            driver.get(search_url)

            # Wait for the search box, then submit the ticker through it
            try:
                search_input = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.NAME, "search"))
                )
                search_input.send_keys(emiten)
                search_input.submit()
            except Exception:
                # Best-effort: one failing ticker must not abort the run, but
                # KeyboardInterrupt / SystemExit are no longer swallowed.
                print(f"Search input element not found for {emiten}. Moving to next ticker.")
                continue

            time.sleep(3)  # Give the result page time to load

            soup = BeautifulSoup(driver.page_source, "html.parser")
            news_list = soup.find_all("li", style="text-transform:capitalize;")

            if not news_list:
                print(f"No news found for {emiten}.")
                continue

            print(f"Found {len(news_list)} news items for {emiten}")
            for news in news_list:
                # Look each child element up once instead of once per field
                date_tag = news.find("b")
                anchor = news.find("a")

                date_time = date_tag.text.strip() if date_tag is not None else "No Date"
                title = anchor.text.strip() if anchor is not None else "No Title"
                # .get() avoids a KeyError for an anchor without an href attribute
                link = anchor.get("href", "#") if anchor is not None else "#"

                # Keep only items whose title names this ticker followed by a colon
                if f"{emiten}:" in title:
                    news_data.append({
                        "Emiten": emiten,
                        "Date": date_time,
                        "Title": title,
                        "Link": link
                    })
                else:
                    print(f"Skipping news item as title does not contain '{emiten}:'")
    finally:
        # Always release the browser, even when scraping fails part-way
        driver.quit()

    # Save the news data to a JSON file
    with open(output_file, "w", encoding="utf-8") as json_file:
        json.dump(news_data, json_file, indent=4, ensure_ascii=False)

    print(f"News data saved to {output_file}")


In [None]:
# Scrape each of the five ticker-list parts (pt1 through pt5) into its own output file
for i in range(1, 6):
    input_file = f"emiten_list_pt{i}.json"
    output_file = f"stock_news.json_pt{i}"

    try:
        # Load the ticker list for this part
        with open(input_file, "r", encoding="utf-8") as list_handle:
            emiten_list = json.load(list_handle)
        print(f"Successfully loaded {len(emiten_list)} emiten from {input_file}")

        # Run the scraper; it writes its results to output_file
        scrape_news(emiten_list, output_file)
    except FileNotFoundError:
        # A missing part is reported and the loop moves on to the next one
        print(f"Error: {input_file} not found. Skipping...")
    except json.JSONDecodeError:
        # A malformed part is reported and the loop moves on to the next one
        print(f"Error: Invalid JSON format in {input_file}. Skipping...")

### Ingestion into MongoDB

In [None]:
import json
import os
import time

import pymongo

# Start timing
start_time = time.time()

# Load data from all JSON files produced by the scraping step (pt1 .. pt5)
print("Loading data from JSON files...")
all_news_data = []

for i in range(1, 6):
    file_name = f"stock_news.json_pt{i}"
    try:
        with open(file_name, "r", encoding="utf-8") as f:
            news_data = json.load(f)
    except FileNotFoundError:
        print(f"Warning: {file_name} not found. Skipping...")
        continue
    except json.JSONDecodeError:
        print(f"Warning: Invalid JSON format in {file_name}. Skipping...")
        continue

    all_news_data.extend(news_data)
    print(f"Loaded {len(news_data)} records from {file_name}")

print(f"Loaded total of {len(all_news_data)} news records from all files")

# Connect to MongoDB Atlas.
# The connection string carries the database username and password, so it is
# read from the environment instead of being stored in the notebook, e.g.
#   MONGODB_URI="mongodb+srv://<user>:<password>@<cluster-host>/?retryWrites=true&w=majority"
# NOTE(review): the credentials that used to be embedded in this cell were
# stored in plain text and should be rotated.
connection_string = os.environ.get("MONGODB_URI")
if not connection_string:
    raise RuntimeError(
        "MONGODB_URI is not set. Export the MongoDB Atlas connection string "
        "in the environment before running this cell."
    )

client = pymongo.MongoClient(connection_string,
                           maxPoolSize=100,  # Increase connection pool
                           retryWrites=True)

# Select database and collection
db = client["Big_Data_kel_5"]  # Database name
collection = db["Stock_News"]   # Collection name

# Unique compound index: enforces one document per (Emiten, Date, Title) and
# speeds up lookups on those fields. create_index is a no-op if it already exists.
collection.create_index([("Emiten", 1), ("Date", 1), ("Title", 1)], unique=True, background=True)

# Build an in-memory lookup of what is already stored, using a single query.
# Shape: existing_records[emiten][date] -> set of titles
print("Fetching existing records...")
existing_records = {}
key_projection = {"Emiten": 1, "Date": 1, "Title": 1, "_id": 0}
for stored in collection.find({}, key_projection):
    # Read all three fields first so a malformed document fails before any update
    emiten, date, title = stored["Emiten"], stored["Date"], stored["Title"]

    titles_for_date = existing_records.setdefault(emiten, {}).setdefault(date, set())
    titles_for_date.add(title)

print(f"Found existing records for {len(existing_records)} emitens")

# Prepare bulk operations
bulk_ops = []
new_record_count = 0  # Documents actually inserted during this run
batch_size = 1000  # Process in batches
queued_keys = set()  # (Emiten, Date, Title) triples already queued in this run


def _insert_batch(target, operations):
    """Insert one batch and return how many documents were actually written.

    Duplicate-key rejections (error code 11000) coming from the unique
    (Emiten, Date, Title) index are tolerated, so a document that appeared in
    the collection after the lookup was built does not abort the run. Any
    other write error, or a write-concern error, is re-raised.
    """
    try:
        return target.bulk_write(operations, ordered=False).inserted_count
    except pymongo.errors.BulkWriteError as err:
        details = err.details
        other_errors = [
            write_error
            for write_error in details.get("writeErrors", [])
            if write_error.get("code") != 11000
        ]
        if other_errors or details.get("writeConcernErrors"):
            raise
        return details.get("nInserted", 0)


print("Preparing bulk operations...")
for record in all_news_data:
    emiten = record["Emiten"]
    date = record["Date"]
    title = record["Title"]
    record_key = (emiten, date, title)

    # Skip if this record is already stored in the collection
    if title in existing_records.get(emiten, {}).get(date, ()):
        continue

    # Skip repeats inside the input files themselves: queuing the same key
    # twice would violate the unique index and make bulk_write raise.
    if record_key in queued_keys:
        continue
    queued_keys.add(record_key)

    # Add to bulk operations
    bulk_ops.append(pymongo.InsertOne(record))

    # Execute batch if reached batch size
    if len(bulk_ops) >= batch_size:
        inserted = _insert_batch(collection, bulk_ops)
        new_record_count += inserted
        print(f"Inserted batch of {inserted} records")
        bulk_ops = []

# Insert any remaining operations
if bulk_ops:
    inserted = _insert_batch(collection, bulk_ops)
    new_record_count += inserted
    print(f"Inserted final batch of {inserted} records")

elapsed_time = time.time() - start_time
print(f"Completed MongoDB ingestion process. Inserted {new_record_count} new records in {elapsed_time:.2f} seconds")