In [None]:
!pip -q install langchain huggingface_hub tiktoken chromadb lark gdown

In [1]:
from langchain.schema import Document
from langchain_community.vectorstores import Chroma
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.embeddings import HuggingFaceBgeEmbeddings
import torch

In [2]:
import pandas as pd
import os

In [2]:
# Location of the raw CSV dataset on local disk.
RAW_DATA_PATH = "./train_gpt-4_raw_RAG.csv"
# Directory in which the vector database is persisted.
VECTORIZE_DB_PERSIST_DIRECTORY = "SE2024/vdb"

In [None]:
if os.path.exists(RAW_DATA_PATH):
    print('Data file already exists')
else:
    print("Data doesn't exist, start download from the google drive...")
    !gdown 15VK8MaOEg2gF8iwmI4bummXt8whZF9Bq -O $RAW_DATA_PATH

# Load Documents

In [3]:
# Configure the CSV loader: the "ID" column is passed both as the source column
# and as the only metadata column.
doc_loader = CSVLoader(
    file_path=RAW_DATA_PATH, source_column="ID", metadata_columns=["ID"]
)

In [4]:
# Load the CSV through the configured loader; `docs` is what gets indexed into the vector store.
docs = doc_loader.load()

# Get Embeddings

In [None]:
# Embedding model used for indexing; it runs on CUDA when available and on CPU otherwise.
embedding = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-large-en-v1.5",
    model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
    # Request normalized embedding vectors from the encoder.
    encode_kwargs={"normalize_embeddings": True},
)

# Prepare and Persist DB

In [None]:
# Create the persistence directory (including parents); exist_ok=True makes re-running this cell safe.
os.makedirs(VECTORIZE_DB_PERSIST_DIRECTORY, exist_ok=True)

In [None]:
# Build the Chroma collection from the loaded documents and store it under the
# persistence directory. The `Chroma(...)` constructor does not accept `embedding` /
# `documents` keyword arguments (it takes `embedding_function`), so the
# `from_documents` factory is used to embed and index `docs` in one step.
# NOTE(review): re-running this cell against an existing directory presumably adds the
# same documents again — clear the directory first if a fresh index is wanted.
vector_store = Chroma.from_documents(
    documents=docs,
    embedding=embedding,
    persist_directory=VECTORIZE_DB_PERSIST_DIRECTORY,
)

In [None]:
# Explicitly flush the vector store to its persistence directory.
# NOTE(review): newer chromadb releases (0.4+) appear to persist automatically and
# deprecate this call — confirm against the installed version.
vector_store.persist()