# Ringkasan berita menggunakan LLM Lokal

Program notebook ini bertujuan untuk meringkas file berita yang sudah di-scraping dalam bentuk JSON menggunakan LLM lokal.

## 1. Import Library yang Diperlukan

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import time
from urllib.parse import urlparse, parse_qs
import pandas as pd
from IPython.display import HTML, display
from datetime import datetime
import json
import os
from dotenv import load_dotenv

# Pull settings from the local .env file into the process environment
load_dotenv()

# Debug aid: echo BASE_DOWNLOAD_DIR so a missing or misplaced .env is noticed early
print("BASE_DOWNLOAD_DIR from .env: " + os.getenv('BASE_DOWNLOAD_DIR', 'Not found'))

## 2. Fungsi Integrasi dengan LLM Lokal

In [None]:
def summarize_with_local_llm(title, content, prompt_template=None):
    """
    Summarize a news article by calling the locally hosted chat-completions endpoint.

    Args:
        title (str): Headline of the article
        content (str): Body text of the article
        prompt_template (str, optional): Prompt with ``{title}`` and ``{content}``
            placeholders. When None, the built-in Indonesian prompt is used.

    Returns:
        str: The summary text from the first returned choice. When the request
            or the response handling fails, the error message is printed and
            returned in place of a summary.
    """
    endpoint = "http://127.0.0.1:1234/v1/chat/completions"
    request_headers = {"Content-Type": "application/json"}

    # Fall back to the standard prompt when the caller did not supply one
    template = prompt_template
    if template is None:
        template = (
            "Ringkas berita berikut menjadi 1 buah paragraf yang terdiri dari 3-4 kalimat. "
            "Pastikan tidak ada kesalahan dalam penulisan data. "
            "Output hanya terdiri dari ringkasan berita tanpa kalimat pembuka atau penjelas tambahan"
            "\n\nJudul: {title}\n\nIsi Berita: {content}"
        )

    payload = {
        "model": "gemma-3-12b-it-qat",
        "messages": [
            {"role": "user", "content": template.format(title=title, content=content)}
        ]
    }

    try:
        reply = requests.post(endpoint, headers=request_headers, data=json.dumps(payload), timeout=120)
        reply.raise_for_status()  # HTTP error statuses become exceptions
        body = reply.json()

        # Guard: a response without any choice carries no summary
        if "choices" not in body or len(body["choices"]) == 0:
            return "Gagal mendapatkan ringkasan dari LLM."
        return body["choices"][0]["message"]["content"]

    except requests.exceptions.RequestException as exc:
        failure = f"Error saat mengakses LLM lokal: {str(exc)}"
    except json.JSONDecodeError as exc:
        failure = f"Error saat memproses respons JSON dari LLM: {str(exc)}"
    except Exception as exc:
        failure = f"Error tak terduga saat meringkas dengan LLM: {str(exc)}"

    # Only reached from one of the handlers above: report and hand back the message
    print(failure)
    return failure

## 3. Meringkas berita dari file JSON

### Meringkas berita menggunakan LLM Lokal

In [None]:
def save_checkpoint(results, part, batch_num):
    """
    Write the summaries collected so far to a JSON checkpoint file.

    Args:
        results (list): Summary records to persist
        part (int): Part number of the input file being processed
        batch_num (int): Sequence number of this checkpoint

    Returns:
        str: Path of the checkpoint file that was written, or None when
            ``results`` is empty and nothing was saved
    """
    if not results:
        print("Tidak ada data untuk disimpan dalam checkpoint!")
        return None

    # "__file__" is a string literal here, so it is resolved against the working
    # directory: checkpoints land in <working directory>/Batch
    working_dir = os.path.dirname(os.path.abspath("__file__"))
    batch_dir = os.path.join(working_dir, 'Batch')
    os.makedirs(batch_dir, exist_ok=True)

    target_path = os.path.join(batch_dir, f"checkpoint_pt{part}_batch{batch_num}.json")
    with open(target_path, 'w', encoding='utf-8') as handle:
        json.dump(results, handle, indent=4, ensure_ascii=False)

    print(f"Checkpoint {batch_num} disimpan ke {target_path}")
    return target_path

def _locate_news_file(part, input_filepath=None):
    """Return the path of the input JSON for ``part``, or None when no file can be found."""
    if input_filepath and os.path.exists(input_filepath):
        print(f"Menggunakan file input yang disediakan: {input_filepath}")
        return input_filepath

    filename = f"stock_news_pt{part}.json"
    # "__file__" is a string literal, so this resolves to the working directory
    current_dir = os.path.dirname(os.path.abspath("__file__"))

    # Candidate locations, probed in order: working directory, sibling
    # Scrapping_Berita folder, then BASE_DOWNLOAD_DIR from the environment
    candidates = [
        (os.path.join(current_dir, filename),
         "File ditemukan di direktori lokal"),
        (os.path.join(os.path.dirname(current_dir), "Scrapping_Berita", filename),
         "File ditemukan di direktori Scrapping_Berita"),
        (os.path.join(os.environ.get('BASE_DOWNLOAD_DIR', ''), filename),
         "File ditemukan di direktori .env"),
    ]
    for candidate, message in candidates:
        if os.path.exists(candidate):
            print(f"{message}: {candidate}")
            return candidate

    # Nothing found automatically: ask the user for the path
    print(f"File stock_news_pt{part}.json tidak ditemukan di lokasi umum.")
    user_filepath = input(f"Masukkan path lengkap ke file stock_news_pt{part}.json: ")
    if os.path.exists(user_filepath):
        return user_filepath

    print("Error: File yang dimasukkan tidak ditemukan!")
    return None


def _extract_news_items(news_data):
    """Normalize the decoded JSON payload to the sequence of news items."""
    if isinstance(news_data, dict):
        # A dict either wraps the items under "items" or is itself a single item
        return news_data["items"] if "items" in news_data else [news_data]
    return news_data


def process_json_news_by_part(part=3, checkpoint_interval=100, start_from=0, input_filepath=None):
    """
    Read the scraped-news JSON for one part and summarize every item with the local LLM.

    Each input item is read through the keys "Emiten", "Date", "Title", "Link"
    and "Content"; each output record has the keys "Emiten", "Date", "Judul",
    "Link" and "Ringkasan". A checkpoint with all records produced so far in
    this run is written every ``checkpoint_interval`` items and once more at
    the end if items remain since the last checkpoint.

    Args:
        part (int): Part number of the JSON file to process (default: 3)
        checkpoint_interval (int): Number of items between checkpoints (default: 100)
        start_from (int): Zero-based index of the first item to process
            (default: 0). Must not be negative.
        input_filepath (str): Explicit input file path (optional). When missing
            or nonexistent, standard locations are probed and finally the user
            is prompted.

    Returns:
        list: One dictionary per processed item. An empty list is returned when
            the input file cannot be found, ``start_from`` is out of range, or
            an error occurs while processing.
    """
    json_filepath = _locate_news_file(part, input_filepath)
    if json_filepath is None:
        return []

    try:
        print(f"Membaca file JSON dari: {json_filepath}")
        with open(json_filepath, 'r', encoding='utf-8') as f:
            news_items = _extract_news_items(json.load(f))

        total_items = len(news_items)
        print(f"Berhasil membaca {total_items} item berita dari file JSON.")

        # A negative index would slice from the end of the list and produce
        # negative item numbers, so reject it explicitly
        if start_from < 0:
            print(f"Error: start_from ({start_from}) tidak boleh negatif.")
            return []
        if start_from > 0:
            if start_from >= total_items:
                print(f"Error: start_from ({start_from}) melebihi jumlah item yang tersedia ({total_items}).")
                return []
            print(f"Memulai proses dari indeks {start_from} (item ke-{start_from+1}).")

        results = []

        # Checkpoint bookkeeping.
        # NOTE(review): batch numbering restarts at 1 on every call, so a run
        # resumed with start_from > 0 writes to the same checkpoint file names
        # as the earlier run — confirm this is intended.
        batch_num = 1
        items_in_current_batch = 0

        for idx, item in enumerate(news_items[start_from:], start=start_from):
            print(f"\nMemproses berita {idx+1}/{total_items}: {item.get('Title', 'Untitled')}")

            # The article body is only needed for summarizing; it is not stored
            content = item.get("Content", "")
            news_item = {
                "Emiten": item.get("Emiten", ""),
                "Date": item.get("Date", ""),
                "Judul": item.get("Title", ""),
                "Link": item.get("Link", ""),
            }

            if content:
                print("Meringkas berita dengan LLM lokal...")
                news_item["Ringkasan"] = summarize_with_local_llm(news_item["Judul"], content)

                # display_news_results may not be defined yet when this runs;
                # fall back to printing just the summary in that case
                try:
                    display_news_results({
                        "title": news_item["Judul"],
                        "date": news_item["Date"],
                        "content": content,
                        "url": news_item["Link"],
                        "emiten": news_item["Emiten"],
                        "summary": news_item["Ringkasan"],
                    })
                except NameError:
                    print(f"Ringkasan: {news_item['Ringkasan']}")
            else:
                news_item["Ringkasan"] = "Tidak ada konten berita untuk diringkas."
                print("Tidak ada konten berita untuk diringkas.")

            results.append(news_item)
            items_in_current_batch += 1

            # Write a checkpoint once the interval has been reached
            if items_in_current_batch >= checkpoint_interval:
                save_checkpoint(results, part, batch_num)
                batch_num += 1
                items_in_current_batch = 0

            print("-" * 50)

        # Final checkpoint for items processed since the last one
        if items_in_current_batch > 0:
            save_checkpoint(results, part, batch_num)

        return results

    except FileNotFoundError:
        print(f"Error: File {json_filepath} tidak ditemukan!")
        print("Pastikan Anda memiliki file dengan nama tersebut.")
        return []
    except Exception as e:
        # NOTE(review): records accumulated in memory are dropped here; only
        # checkpoints already written to disk survive an error mid-run
        print(f"Error saat memproses file JSON: {str(e)}")
        return []

### Menyimpan ke dalam file json

In [None]:
def save_json_results_by_part(results, part=3):
    """
    Write the summary records for one part to ``ringkasan_news_pt<part>.json``
    in the working directory and show a short preview of the data.

    Args:
        results (list): Summary records to save
        part (int): Part number used in the output file name

    Returns:
        str: Path of the JSON file written, or None when ``results`` is empty
    """
    if not results:
        print("Tidak ada data untuk disimpan!")
        return None

    # "__file__" is a string literal, so this resolves to the working directory
    output_dir = os.path.dirname(os.path.abspath("__file__"))
    json_filename = os.path.join(output_dir, f"ringkasan_news_pt{part}.json")

    with open(json_filename, 'w', encoding='utf-8') as out:
        json.dump(results, out, indent=4, ensure_ascii=False)
    print(f"Data berhasil disimpan ke {json_filename}")

    # The preview is best-effort: any failure is reported as a warning and the
    # saved file path is still returned
    print("\nRingkasan data yang diproses:")
    try:
        frame = pd.DataFrame(results)
        columns_present = frame.columns.tolist()
        print(f"Kolom yang tersedia: {columns_present}")

        # Show only the identifying columns when at least one of them exists
        preferred = [name for name in ('Emiten', 'Date', 'Judul') if name in columns_present]
        preview = frame[preferred] if preferred else frame
        display(preview.head())
    except Exception as e:
        print(f"Warning: Tidak dapat menampilkan data: {str(e)}")

    return json_filename

def find_latest_checkpoint(part):
    """
    Find the checkpoint file with the highest batch number for a given part.

    Checkpoints are looked up in the ``Batch`` folder of the working directory
    under the name ``checkpoint_pt<part>_batch<N>.json``.

    Args:
        part (int): Part number to look for

    Returns:
        tuple: (batch number, item count, file path) of the latest checkpoint,
            or (None, 0, None) when no checkpoint is found. The item count is 0
            when the latest file cannot be read or decoded.
    """
    import glob

    # "__file__" is a string literal, so this resolves to the working directory
    current_dir = os.path.dirname(os.path.abspath("__file__"))
    base_dir = os.path.join(current_dir, 'Batch')
    pattern = os.path.join(base_dir, f"checkpoint_pt{part}_batch*.json")

    # Parse the batch number from the file NAME only: splitting the full path on
    # "batch" breaks as soon as any parent directory contains that word
    name_re = re.compile(rf"checkpoint_pt{re.escape(str(part))}_batch(\d+)\.json")
    batch_numbers = []
    for path in glob.glob(pattern):
        match = name_re.fullmatch(os.path.basename(path))
        if match:
            batch_numbers.append((int(match.group(1)), path))

    if not batch_numbers:
        return None, 0, None

    latest_batch, latest_file = max(batch_numbers, key=lambda entry: entry[0])

    # Count the items in the latest checkpoint; unreadable or malformed files
    # count as empty (JSONDecodeError and UnicodeDecodeError are ValueErrors)
    try:
        with open(latest_file, 'r', encoding='utf-8') as f:
            item_count = len(json.load(f))
    except (OSError, ValueError, TypeError):
        item_count = 0

    return latest_batch, item_count, latest_file

### Proses Meringkas

In [None]:
# Usage examples:

# 1. Start summarizing from the beginning, writing a checkpoint every 100 items
# hasil_ringkasan = process_json_news_by_part(part=3, checkpoint_interval=100)

# 2. Resume from the most recent checkpoint (if any)
# hasil_ringkasan = continue_from_checkpoint(part=3) # Note: continue_from_checkpoint function might need to be defined or implemented

# 3. Start processing from a specific item index
# hasil_ringkasan = process_json_news_by_part(part=3, start_from=150)

# 4. Use a specific input file
# hasil_ringkasan = process_json_news_by_part(5, checkpoint_interval=150, input_filepath=file_path)

# Run the summarization for part 5, writing a checkpoint every 150 items
hasil_ringkasan = process_json_news_by_part(part=5, checkpoint_interval=150)

In [None]:
def display_news_results(result):
    """
    Print one summarized news item in a readable layout.

    Args:
        result (dict): Summary record read through the keys 'title', 'date',
            'emiten', 'url' and 'summary'; a placeholder text is printed for
            any key that is missing
    """
    # (label, key, placeholder) for the one-line header fields, in print order
    header_fields = (
        ("Judul", "title", "Tidak ada judul"),
        ("Tanggal", "date", "Tidak ada tanggal"),
        ("Emiten", "emiten", "Tidak ada emiten"),
        ("URL", "url", "Tidak ada URL"),
    )
    for label, key, placeholder in header_fields:
        print(f"{label}: {result.get(key, placeholder)}")

    summary_text = result.get('summary', 'Tidak ada ringkasan')
    print(f"\nRingkasan:\n{summary_text}")
    print("\n---")

## 4. Menyimpan ke MongoDB

Bagian ini berisi fungsi untuk mengupload data ringkasan berita ke MongoDB.

In [None]:
import pymongo
import glob
import os
import json
from dotenv import load_dotenv
from datetime import datetime

def load_ringkasan_files(base_dir=None, start_part=1, end_part=5):
    """
    Load and concatenate the ``ringkasan_news_pt<N>.json`` files for a range of parts.

    Files that are missing, cannot be read, or do not contain a JSON list are
    reported and skipped; loading continues with the next part.

    Args:
        base_dir (str): Directory containing the files. When None, the
            BASE_DOWNLOAD_DIR environment variable is used, falling back to
            the current working directory.
        start_part (int): First part number to load (inclusive)
        end_part (int): Last part number to load (inclusive)

    Returns:
        list: Records from all files that were loaded, in part order
    """
    all_data = []

    # Make sure environment variables are loaded
    load_dotenv()

    if base_dir is None:
        # Prefer BASE_DOWNLOAD_DIR; fall back to the current directory
        base_dir = os.environ.get('BASE_DOWNLOAD_DIR') or os.getcwd()

    # Normalize the path (existence is checked per file below)
    base_dir = os.path.normpath(base_dir)

    # Debug output: report the directory and the part range actually requested
    print(f"Using base directory: {base_dir}")
    print(f"Searching for JSON files with pattern: ringkasan_news_pt[{start_part}-{end_part}].json")

    for part in range(start_part, end_part + 1):
        filepath = os.path.join(base_dir, f"ringkasan_news_pt{part}.json")
        print(f"Looking for file: {filepath}")

        if not os.path.exists(filepath):
            print(f"❌ File not found: {filepath}")
            continue

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except Exception as e:
            # Best effort: one bad file must not stop the remaining parts
            print(f"❌ Error loading {filepath}: {e}")
            continue

        if isinstance(data, list):
            all_data.extend(data)
            print(f"✓ Loaded {len(data)} records from {filepath}")
        else:
            print(f"⚠ Warning: {filepath} does not contain a list of records")

    print(f"Total records loaded: {len(all_data)}")
    return all_data

In [None]:
def connect_to_mongodb():
    """
    Connect to MongoDB using settings from the environment.

    Reads MONGODB_CONNECTION_STRING and MONGODB_DATABASE_NAME (after loading
    the .env file) and verifies the connection with a ``server_info()`` call.

    Returns:
        tuple: (client, database) on success, or (None, None) when the
            connection attempt fails

    Raises:
        ValueError: If the connection string or the database name is not set
    """
    # Make sure environment variables are loaded
    load_dotenv()

    connection_string = os.environ.get('MONGODB_CONNECTION_STRING')
    database_name = os.environ.get('MONGODB_DATABASE_NAME')

    if not connection_string or not database_name:
        raise ValueError("MongoDB connection string or database name not found in .env file")

    client = None
    try:
        client = pymongo.MongoClient(connection_string)
        # Force a round trip so connection problems surface here
        client.server_info()
        print("Connected to MongoDB successfully")

        return client, client[database_name]

    except pymongo.errors.ConnectionFailure as e:
        print(f"Could not connect to MongoDB: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")

    # Failure path: the caller only receives (None, None) and cannot close the
    # client itself, so release it here
    if client is not None:
        client.close()
    return None, None

In [None]:
def upload_to_mongodb(data, replace_existing=False):
    """
    Insert summary records into the MongoDB news-summary collection.

    The collection name is read from COLLECTION_NEWS_SUMMARY_DATA, defaulting
    to "Data_Berita_Ringkasan". Each inserted document is a copy of the input
    record with an added 'uploaded_at' timestamp; the records in ``data`` are
    left untouched.

    Args:
        data (list): List of dictionaries to upload
        replace_existing (bool): Drop the existing collection before inserting
            (default: False)

    Returns:
        tuple: (success, count) — success flag and number of inserted
            documents; (False, 0) when there is no data, the connection fails,
            or the upload raises an error
    """
    if not data:
        print("No data to upload")
        return False, 0

    client, db = connect_to_mongodb()
    # Compare with None explicitly rather than relying on object truthiness
    if client is None or db is None:
        return False, 0

    try:
        collection_name = os.environ.get('COLLECTION_NEWS_SUMMARY_DATA')
        if not collection_name:
            print("Collection name not found in .env file, using default name")
            collection_name = "Data_Berita_Ringkasan"

        collection = db[collection_name]

        # If replacing existing data, drop the collection first
        if replace_existing and collection_name in db.list_collection_names():
            collection.drop()
            print(f"Dropped existing collection: {collection_name}")

        # One timestamp for the whole batch.
        # NOTE(review): datetime.now() is naive local time; PyMongo's docs say
        # naive datetimes are stored as if they were UTC — confirm intended.
        uploaded_at = datetime.now()

        # Insert copies so the caller's dictionaries are not modified by the
        # timestamp added here or by anything insert_many adds to the documents
        documents = [{**item, 'uploaded_at': uploaded_at} for item in data]

        result = collection.insert_many(documents)
        inserted_count = len(result.inserted_ids)
        print(f"Successfully inserted {inserted_count} documents into {collection_name}")

        return True, inserted_count

    except Exception as e:
        print(f"Error uploading data to MongoDB: {e}")
        return False, 0

    finally:
        # client is known to be non-None here; always release the connection
        client.close()
        print("MongoDB connection closed")

In [None]:
def upload_all_ringkasan_files(base_dir=None, start_part=1, end_part=5, replace_existing=False):
    """
    Load the summary JSON files for a range of parts and upload them to MongoDB.

    Progress and the final outcome are reported with print statements; the
    function itself returns nothing.

    Args:
        base_dir (str): Directory containing the JSON files. When None, the
            BASE_DOWNLOAD_DIR environment variable is used, falling back to
            the current working directory.
        start_part (int): First part number to upload (inclusive)
        end_part (int): Last part number to upload (inclusive)
        replace_existing (bool): Whether to replace the existing collection
    """
    print(f"Beginning upload process for ringkasan_news_pt{start_part} through pt{end_part}...")

    # Make sure the .env settings are available before reading the environment
    load_dotenv()

    # Resolve the source directory when the caller did not provide one
    if base_dir is None:
        env_dir = os.environ.get('BASE_DOWNLOAD_DIR')
        if not env_dir:
            base_dir = os.getcwd()
            print(f"BASE_DOWNLOAD_DIR not found in .env, using current directory: {base_dir}")
        else:
            base_dir = env_dir
            print(f"Using BASE_DOWNLOAD_DIR from .env: {base_dir}")

    source_dir = os.path.normpath(base_dir)

    records = load_ringkasan_files(source_dir, start_part, end_part)
    if not records:
        print("No data was loaded. Upload aborted.")
        return

    uploaded, inserted = upload_to_mongodb(records, replace_existing)
    if not uploaded:
        print("Upload failed. Please check the error messages above.")
    else:
        print(f"Upload completed successfully. {inserted} documents were inserted.")

In [None]:
# Debug aid: show which settings are visible to this process
print("Checking environment variables:")
print(f"BASE_DOWNLOAD_DIR = {os.getenv('BASE_DOWNLOAD_DIR', 'Not set')}")
# Report only whether the connection string is present, never its value
print(f"MONGODB_CONNECTION_STRING = {'Set (hidden for security)' if os.getenv('MONGODB_CONNECTION_STRING') else 'Not set'}")
print(f"MONGODB_DATABASE_NAME = {os.getenv('MONGODB_DATABASE_NAME', 'Not set')}")
print(f"COLLECTION_NEWS_SUMMARY_DATA = {os.getenv('COLLECTION_NEWS_SUMMARY_DATA', 'Not set')}")
print("\n")

# Upload parts 1 through 5.
# Pass replace_existing=True to drop the existing collection before inserting.
upload_all_ringkasan_files(start_part=1, end_part=5, replace_existing=False)