# Cleaning, Store, dan Experiment

In [1]:
# %pip install --upgrade --force-reinstall "sqlalchemy>=2.0.0" langchain-postgres

In [2]:
import os
import uuid

import pandas as pd
from dotenv import load_dotenv

In [3]:
# Merge variables from a local .env file (if one exists) into the process environment.
load_dotenv()
db_url = os.getenv("DATABASE_URL")
# os.getenv returns None when the variable is missing. Stop here with a clear
# message instead of letting a later string operation on db_url fail with an
# unrelated-looking AttributeError.
if not db_url:
    raise RuntimeError(
        "DATABASE_URL is not set; export it or add it to the .env file before running this notebook."
    )

In [4]:
# Read the assignment dataset from CSV, then preview its first ten rows.
df = pd.read_csv(filepath_or_buffer="../data/dataset_assignment.csv")
df.head(n=10)

Unnamed: 0,prompt,response
0,are you llama?,"*ahem* I'm not a real llama, but I can certain..."
1,hello from fareed,Hello Fareed! It's nice to meet you. Is there ...
2,hi from hugg,"Hi from me too! It's great to meet you, Hugg! ..."
3,Algunos de estos articulos respalda la creació...,Después de analizar los artículos proporcionad...
4,leyes respalda la creación de una biblioteca v...,"Una excelente pregunta!\n\nEn general, la crea..."
5,leyes en venezuela que respalda la creación de...,"Excelente pregunta!\n\nEn Venezuela, existen v..."
6,en la Ley de Acceso a la Información (2005) no...,"Tienes razón, me disculpo por el error. La Ley..."
7,wassup bro,"What's good?! Just an AI, no bro vibes here, b..."
8,this is pretty cool,"I'm glad you think so! However, I'm not sure w..."
9,well,"It seems like you're excited about something, ..."


In [5]:
# Keep only rows where both text columns are present: a row is discarded as
# soon as either 'prompt' or 'response' is missing.
required_columns = ['prompt', 'response']
df = df.dropna(axis="index", how="any", subset=required_columns)
df.head(n=10)

Unnamed: 0,prompt,response
0,are you llama?,"*ahem* I'm not a real llama, but I can certain..."
1,hello from fareed,Hello Fareed! It's nice to meet you. Is there ...
2,hi from hugg,"Hi from me too! It's great to meet you, Hugg! ..."
3,Algunos de estos articulos respalda la creació...,Después de analizar los artículos proporcionad...
4,leyes respalda la creación de una biblioteca v...,"Una excelente pregunta!\n\nEn general, la crea..."
5,leyes en venezuela que respalda la creación de...,"Excelente pregunta!\n\nEn Venezuela, existen v..."
6,en la Ley de Acceso a la Información (2005) no...,"Tienes razón, me disculpo por el error. La Ley..."
7,wassup bro,"What's good?! Just an AI, no bro vibes here, b..."
8,this is pretty cool,"I'm glad you think so! However, I'm not sure w..."
9,well,"It seems like you're excited about something, ..."


In [6]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_postgres import PGVector
from langchain_core.documents import Document

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Turn every remaining row into one Document: the prompt and response are
# merged into a single "Question/Answer" text, and the originating file plus
# the DataFrame index label are kept as metadata for traceability.
documents = [
    Document(
        page_content=f"Question: {prompt}\nAnswer: {response}",
        metadata={"source": "../data/dataset_assignment.csv", "row_id": row_id},
    )
    for row_id, prompt, response in zip(df.index, df["prompt"], df["response"])
]

len(documents)

9106

In [8]:
# Sentence-transformers checkpoint used to embed every document.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding = HuggingFaceEmbeddings(model_name=embedding_model_name)

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 176.15it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [12]:
import time

# Swap the psycopg2 driver prefix for the psycopg one; URLs that do not start
# with "postgresql+psycopg2://" pass through unchanged.
# NOTE(review): presumably needed because langchain-postgres expects the
# psycopg (v3) driver -- confirm against the installed version.
connection_string = db_url.replace("postgresql+psycopg2://", "postgresql+psycopg://")

# Vector store backed by Postgres. The engine is limited to a single pooled
# connection with no overflow, waits up to 60 s for that connection, and gives
# up on connecting to the server after 10 s.
vector_store = PGVector(
    connection=connection_string,
    collection_name="customer_support_vector_db",
    embeddings=embedding,
    use_jsonb=True,
    engine_args=dict(
        pool_size=1,
        max_overflow=0,
        pool_timeout=60,
        connect_args=dict(connect_timeout=10),
    ),
)

def _stable_document_id(doc):
    """Return a deterministic string id for ``doc``.

    An id already set on the document wins. Otherwise the id is a UUIDv5
    derived from the ``source``/``row_id`` metadata pair, or from the page
    content when no ``row_id`` is available, so the same document always maps
    to the same id across runs.
    """
    existing = getattr(doc, "id", None)
    if existing:
        return str(existing)
    metadata = getattr(doc, "metadata", None) or {}
    if "row_id" in metadata:
        key = f"{metadata.get('source', '')}#{metadata['row_id']}"
    else:
        key = doc.page_content
    return str(uuid.uuid5(uuid.NAMESPACE_URL, key))


def batch_add_documents(documents, batch_size=32, *, store=None, pause_seconds=1):
    """Upload ``documents`` to a vector store in fixed-size batches.

    Each batch is sent with deterministic ids so that re-running the cell (or
    retrying after a partial failure) targets the same ids instead of
    inserting a fresh copy of every document.
    NOTE(review): this relies on the store upserting on an existing id, which
    langchain-postgres is expected to do -- confirm for the installed version.

    Args:
        documents: Sequence of documents exposing ``page_content`` and
            ``metadata``.
        batch_size: Number of documents per ``add_documents`` call; must be >= 1.
        store: Object with an ``add_documents(docs, ids=...)`` method. Defaults
            to the notebook-level ``vector_store``.
        pause_seconds: Pause after each successful batch; ``0`` disables it.

    Returns:
        A list of ``(start, end)`` index ranges (end exclusive) for the batches
        that failed, empty when everything was stored. Failures are reported
        and skipped rather than aborting the whole upload.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently upload nothing.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    target = vector_store if store is None else store
    total_docs = len(documents)
    print(f"Total ada {total_docs} dokumen")

    failed_batches = []
    for start in range(0, total_docs, batch_size):
        batch = documents[start : start + batch_size]
        end = start + len(batch)
        print(f"Mengirim batch {start} sampai {end}...")

        batch_ids = [_stable_document_id(doc) for doc in batch]
        try:
            target.add_documents(batch, ids=batch_ids)
        except Exception as e:
            # Best effort: keep going, but remember the range so it can be retried.
            print(f"XXXX Gagal di batch {start}: {e}")
            failed_batches.append((start, end))
            continue

        print("VVVV Sukses masuk!")
        if pause_seconds:
            time.sleep(pause_seconds)

    return failed_batches

# Upload every prepared document, relying on the function's default batch size.
batch_add_documents(documents=documents)

Total ada 9106 dokumen
Mengirim batch 0 sampai 32...
VVVV Sukses masuk!
Mengirim batch 32 sampai 64...
VVVV Sukses masuk!
Mengirim batch 64 sampai 96...
VVVV Sukses masuk!
Mengirim batch 96 sampai 128...
VVVV Sukses masuk!
Mengirim batch 128 sampai 160...
VVVV Sukses masuk!
Mengirim batch 160 sampai 192...
VVVV Sukses masuk!
Mengirim batch 192 sampai 224...
VVVV Sukses masuk!
Mengirim batch 224 sampai 256...
VVVV Sukses masuk!
Mengirim batch 256 sampai 288...
VVVV Sukses masuk!
Mengirim batch 288 sampai 320...
VVVV Sukses masuk!
Mengirim batch 320 sampai 352...
VVVV Sukses masuk!
Mengirim batch 352 sampai 384...
VVVV Sukses masuk!
Mengirim batch 384 sampai 416...
VVVV Sukses masuk!
Mengirim batch 416 sampai 448...
VVVV Sukses masuk!
Mengirim batch 448 sampai 480...
VVVV Sukses masuk!
Mengirim batch 480 sampai 512...
VVVV Sukses masuk!
Mengirim batch 512 sampai 544...
VVVV Sukses masuk!
Mengirim batch 544 sampai 576...
VVVV Sukses masuk!
Mengirim batch 576 sampai 608...
VVVV Sukses masu