In [9]:
# Import Libraries
import pandas as pd
import requests
import os
import pymongo
from pymongo import MongoClient
from dotenv import load_dotenv

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_community.document_loaders import CSVLoader

In [10]:
# Load Environment Variables
# Read secrets from a local .env file (if present) into the process environment.
load_dotenv()
MONGODB_URI = os.getenv("MONGODB_URI")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Fail fast with a clear message: os.getenv() returns None for an unset
# variable, and a None URI would only surface much later as an unrelated
# connection error instead of pointing at the missing configuration.
_missing_env = [
    name
    for name, value in (
        ("MONGODB_URI", MONGODB_URI),
        ("OPENAI_API_KEY", OPENAI_API_KEY),
    )
    if not value
]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env)
        + ". Define them in the environment or in a .env file."
    )

In [11]:
# Load CSV Dataset
# Document content is drawn from the "answer" column, while the matching
# "question" column is carried along as metadata.
faq_loader_options = {
    "file_path": "faq_allobank_clean.csv",
    "content_columns": ["answer"],      # main column used for the content
    "metadata_columns": ["question"],   # question stored as metadata
    "csv_args": {"delimiter": ","},
}
loader = CSVLoader(**faq_loader_options)
documents = loader.load()

In [12]:
# Split Into Chunks
# Break the loaded documents into smaller overlapping pieces; sizes are
# measured with len(), so chunk_size and chunk_overlap count characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200, chunk_overlap=50, length_function=len
)
chunks = text_splitter.split_documents(documents)
chunk_total = len(chunks)
print(f"Chunks dari faq menjadi {chunk_total} sub-documents.")

Chunks dari faq menjadi 704 sub-documents.


In [13]:
# Initialize Embeddings using the text-embedding-3-small model
embedding_settings = {
    "model": "text-embedding-3-small",
    "openai_api_key": OPENAI_API_KEY,
}
embeddings = OpenAIEmbeddings(**embedding_settings)

In [14]:
# MongoDB Connection
# Open a client for the configured URI, then select the database and the
# collection that will hold the FAQ chunks.
client = MongoClient(MONGODB_URI)
faq_database = client["finalproject_db"]
collection = faq_database["faq"]

In [15]:
# Vector Store Setup
# from_documents() embeds every chunk and inserts it into the collection, so
# re-running this cell (e.g. Restart & Run All) would store the same FAQ chunks
# again and pay for the embeddings a second time. Upload only when the
# collection is empty; otherwise attach to the data that is already there.
existing_count = collection.count_documents({})
if existing_count == 0:
    vector_store = MongoDBAtlasVectorSearch.from_documents(
        documents=chunks,
        embedding=embeddings,
        collection=collection,
        index_name="vector_index"
    )
    print("Data berhasil diunggah ke MongoDB Atlas!")
else:
    # Reuse the existing collection without inserting anything new.
    vector_store = MongoDBAtlasVectorSearch(
        collection=collection,
        embedding=embeddings,
        index_name="vector_index",
    )
    print(f"Koleksi sudah berisi {existing_count} dokumen; unggahan dilewati.")

print(f"Jumlah dokumen: {len(documents)}")
print(f"Jumlah chunks: {len(chunks)}")

Data berhasil diunggah ke MongoDB Atlas!
Jumlah dokumen: 352
Jumlah chunks: 704
