In [None]:
import os
os.environ['KAGGLE_USERNAME'] = "<username>"
os.environ['KAGGLE_KEY'] = "<token>"

# Download the zip file to local storage and then extract the desired contents directly to the GKE GCS CSI mounted bucket. The bucket is mounted at the "/persist-data" path in the jupyter pod.
!kaggle datasets download -d shivamb/netflix-shows -p ~/data --force
!mkdir /data/netflix-shows -p
!unzip -o ~/data/netflix-shows.zip -d /data/netflix-shows

In [None]:
!pip install langchain-google-cloud-sql-pg

In [None]:
import os
import uuid

from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings

from langchain_google_cloud_sql_pg import PostgresEngine, PostgresVectorStore
from google.cloud.sql.connector import IPTypes

# initialize parameters
# The connection name is split on ":" and its first three parts are used as the
# default project, region and instance ("<project>:<region>:<instance>").
INSTANCE_CONNECTION_NAME = os.environ.get("CLOUDSQL_INSTANCE_CONNECTION_NAME", "")
print(f"Your instance connection name is: {INSTANCE_CONNECTION_NAME}")
cloud_variables = INSTANCE_CONNECTION_NAME.split(":")
# Pad to at least three entries: the defaults below are evaluated eagerly, so a
# missing or short connection name would otherwise raise IndexError even when the
# individual override variables are set.
cloud_variables += [""] * (3 - len(cloud_variables))

GCP_PROJECT_ID = os.environ.get("GCP_PROJECT_ID", cloud_variables[0])
GCP_CLOUD_SQL_REGION = os.environ.get("CLOUDSQL_INSTANCE_REGION", cloud_variables[1])
GCP_CLOUD_SQL_INSTANCE = os.environ.get("CLOUDSQL_INSTANCE", cloud_variables[2])

# Surface incomplete configuration here instead of failing later with a less
# obvious connection error.
_missing_settings = [
    setting_name
    for setting_name, setting_value in (
        ("GCP_PROJECT_ID", GCP_PROJECT_ID),
        ("CLOUDSQL_INSTANCE_REGION", GCP_CLOUD_SQL_REGION),
        ("CLOUDSQL_INSTANCE", GCP_CLOUD_SQL_INSTANCE),
    )
    if not setting_value
]
if _missing_settings:
    print(
        "Warning: no value found for "
        + ", ".join(_missing_settings)
        + "; set CLOUDSQL_INSTANCE_CONNECTION_NAME or the individual variables."
    )

# NOTE(review): the database name is read from the "INSTANCE_CONNECTION_NAME"
# environment variable rather than a database-specific one -- confirm this is the
# intended variable name.
DB_NAME = os.environ.get("INSTANCE_CONNECTION_NAME", "pgvector-database")
VECTOR_EMBEDDINGS_TABLE_NAME = os.environ.get("EMBEDDINGS_TABLE_NAME", "netflix_reviews_db")
CHAT_HISTORY_TABLE_NAME = os.environ.get("CHAT_HISTORY_TABLE_NAME", "message_store")

# Environment values are always strings; cast so the dimension is an int whether
# it comes from the environment or from the default.
# Presumably this is the embedding width of the model below -- confirm if the model changes.
VECTOR_DIMENSION = int(os.environ.get("VECTOR_DIMENSION", 384))
SENTENCE_TRANSFORMER_MODEL = 'intfloat/multilingual-e5-small'

SHARED_DATASET_BASE_PATH="/data/netflix-shows/"
REVIEWS_FILE_NAME="netflix_titles.csv"

# NOTE(review): BATCH_SIZE and TABLE_NAME are not referenced in the visible cells;
# TABLE_NAME has the same value as the VECTOR_EMBEDDINGS_TABLE_NAME default.
BATCH_SIZE = 100
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 10
TABLE_NAME = 'netflix_reviews_db'

# Database credentials: read them from the files under /etc/secret-volume; if those
# cannot be read, fall back to the DB_USERNAME / DB_PASS environment variables
# (both defaulting to "postgres").
try:
    # Context managers close each file even if read() raises.
    with open("/etc/secret-volume/username", "r") as db_username_file:
        DB_USER = db_username_file.read()

    with open("/etc/secret-volume/password", "r") as db_password_file:
        DB_PASS = db_password_file.read()
except (OSError, UnicodeDecodeError):
    # Only file-access/decoding failures trigger the fallback, so interrupts and
    # programming errors are no longer swallowed. Both values are replaced together,
    # so a username from the file is never paired with a password from the environment.
    DB_USER = os.environ.get("DB_USERNAME", "postgres")
    DB_PASS = os.environ.get("DB_PASS", "postgres")

# Connection settings for the Cloud SQL instance, gathered in one place and
# unpacked into the engine factory. IPTypes.PRIVATE is passed as the IP type.
connection_settings = {
    "project_id": GCP_PROJECT_ID,
    "region": GCP_CLOUD_SQL_REGION,
    "instance": GCP_CLOUD_SQL_INSTANCE,
    "database": DB_NAME,
    "user": DB_USER,
    "password": DB_PASS,
    "ip_type": IPTypes.PRIVATE,
}
engine = PostgresEngine.from_instance(**connection_settings)

# Initialise the embeddings table with overwrite_existing=True. A failure is
# reported with print() and the cell keeps running (best-effort setup).
try:
    engine.init_vectorstore_table(
        VECTOR_EMBEDDINGS_TABLE_NAME,
        overwrite_existing=True,
        vector_size=VECTOR_DIMENSION,
    )
except Exception as init_error:
    print(f"Error: {init_error}")


# Embedding model and the vector store that writes into the embeddings table.
embeddings_service = HuggingFaceEmbeddings(model_name=SENTENCE_TRANSFORMER_MODEL)
vector_store = PostgresVectorStore.create_sync(
    engine=engine,
    embedding_service=embeddings_service,
    table_name=VECTOR_EMBEDDINGS_TABLE_NAME,
)

# Splitter that measures chunk size and overlap with len().
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, length_function=len
)

# Build the path with os.path.join: the base path already ends with "/", so string
# formatting produced a doubled separator in the file path handed to the loader.
loader = CSVLoader(file_path=os.path.join(SHARED_DATASET_BASE_PATH, REVIEWS_FILE_NAME))
documents = loader.load()

# Taking a sample for test purposes; raise or remove the cap to ingest more documents.
SAMPLE_SIZE = 1000
documents = documents[:SAMPLE_SIZE]

# Split the sampled documents, give every chunk a fresh random UUID, and store them.
splits = splitter.split_documents(documents)
ids = [str(uuid.uuid4()) for _ in splits]
vector_store.add_documents(splits, ids)