### Import library

In [None]:
# Library untuk mengambil data tabular dari yfinance
import yfinance as yf

# Library untuk HTTP requests dan parsing HTML
import requests
from bs4 import BeautifulSoup

# Library untuk otomatisasi web scraping dengan Selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

# Library untuk parsing file XML
import xml.etree.ElementTree as ET

# Library untuk mengekstrak file ZIP (misalnya instance.zip)
import zipfile

# Library untuk koneksi ke MongoDB
import pymongo

# Library tambahan (opsional) seperti untuk pengolahan data dan logging
import pandas as pd
import os
import logging

import json
from datetime import datetime

### Scraping Data

Setelah library diimpor, dilakukan proses pengambilan data menggunakan yfinance; hasilnya kemudian disimpan dalam bentuk JSON.

In [None]:
# Read the ticker symbols (emiten) to download from the local JSON file
with open('emiten_list.json', 'r') as emiten_file:
    emiten_list = json.load(emiten_file)

# Accumulator for the price rows of every ticker (one dict per row)
all_data = []


def _format_timestamp(ts):
    """Render a timestamp as 'dd/mm/YYYY - HH:MM'."""
    return ts.strftime("%d/%m/%Y - %H:%M")


# Download each ticker in turn; a failure on one ticker is reported and
# does not stop the remaining tickers from being processed.
for emiten in emiten_list:
    print(f"Processing {emiten}...")
    try:
        # Request one year of price history for this ticker
        stock = yf.Ticker(emiten)
        data = stock.history(period="1y")

        # Move the Date index into an ordinary column
        data = data.reset_index()

        # Store the date as a formatted string
        data["Date"] = data["Date"].apply(_format_timestamp)

        # Round-trip through JSON to obtain plain-Python row dicts,
        # each tagged with the ticker it belongs to
        rows = json.loads(data.to_json(orient="records"))
        records = [{**row, "emiten": emiten} for row in rows]

        all_data += records

        print(f"✅ Added {len(records)} records for {emiten}")
    except Exception as e:
        print(f"❌ Error processing {emiten}: {e!s}")

# Write every collected row to a single JSON file
output_file = "yfinancescrape.json"
with open(output_file, 'w') as out:
    json.dump(all_data, out, indent=2)

print(f"Successfully saved {len(all_data)} records to {output_file}")


### Ingestion ke MongoDB
Setelah file JSON dibuat, langkah selanjutnya adalah memasukkan datanya ke MongoDB.

In [None]:
# Ingest the scraped yfinance records into MongoDB, skipping every
# (emiten, Date) pair that is already stored or already queued.

# Start timing the whole ingestion run
start_time = time.time()

# Load the records produced by the scraping step
print("Loading data from JSON file...")
with open("yfinancescrape.json", "r") as f:
    all_data = json.load(f)

print(f"Loaded {len(all_data)} records from JSON file")

# Connect to MongoDB Atlas.
# The connection string contains credentials, so it is read from the
# MONGODB_URI environment variable instead of being committed in source.
# NOTE(review): the password that used to be hard-coded here has been
# exposed in version history and should be rotated.
connection_string = os.environ.get("MONGODB_URI")
if not connection_string:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB "
        "connection string before running the ingestion step."
    )
client = pymongo.MongoClient(
    connection_string,
    maxPoolSize=100,  # Increase connection pool
    retryWrites=True,
)

# Select database and collection
db = client["Big_Data_kel_5"]  # Database name
collection = db["Test_yfinance"]     # Collection name

# Unique compound index: speeds up lookups and makes the server reject
# duplicate (emiten, Date) pairs. Creating it again is a no-op.
collection.create_index([("emiten", 1), ("Date", 1)], unique=True, background=True)

# Fetch all stored emiten-date pairs in one query (much faster than one
# query per record). Maps emiten -> set of dates already known.
print("Fetching existing records...")
existing_records = {}
for doc in collection.find({}, {"emiten": 1, "Date": 1, "_id": 0}):
    # .get() tolerates stored documents that lack one of the two fields
    existing_records.setdefault(doc.get("emiten"), set()).add(doc.get("Date"))

print(f"Found existing records for {len(existing_records)} emitens")

# Prepare bulk operations
bulk_ops = []
new_record_count = 0  # Number of documents the server confirmed as inserted
batch_size = 1000  # Process in batches

print("Preparing bulk operations...")
for record in all_data:
    emiten = record["emiten"]
    date = record["Date"]

    # Skip records that are already stored or already queued in this run.
    # Remembering queued pairs also removes duplicates inside the JSON file,
    # which would otherwise violate the unique index and abort bulk_write.
    known_dates = existing_records.setdefault(emiten, set())
    if date in known_dates:
        continue
    known_dates.add(date)

    bulk_ops.append(pymongo.InsertOne(record))

    # Flush the batch once it is full
    if len(bulk_ops) >= batch_size:
        result = collection.bulk_write(bulk_ops, ordered=False)
        new_record_count += result.inserted_count
        print(f"Inserted batch of {result.inserted_count} records")
        bulk_ops = []

# Insert any remaining operations
if bulk_ops:
    result = collection.bulk_write(bulk_ops, ordered=False)
    new_record_count += result.inserted_count
    print(f"Inserted final batch of {result.inserted_count} records")

elapsed_time = time.time() - start_time
print(f"Completed MongoDB ingestion process. Inserted {new_record_count} new records in {elapsed_time:.2f} seconds")