# RAG-on-GKE Application

This is a Python notebook for generating the vector embeddings used by the RAG on GKE application. For full information, please check out the GitHub documentation [here](https://github.com/GoogleCloudPlatform/ai-on-gke/blob/main/applications/rag/README.md).


In [None]:
# Replace these with your settings.
# Navigate to https://www.kaggle.com/settings/account and generate an API token; its
# username and key are exported as environment variables in a later cell.
# See https://www.kaggle.com/docs/api#authentication for how to create a token.
# NOTE: these values are credentials - avoid saving/committing the notebook with real values filled in.
KAGGLE_USERNAME = "<username>"
KAGGLE_KEY = "<token>"

In [None]:
# Install the notebook-side dependencies: Ray (pinned to 2.9.3), the Kaggle CLI (pinned
# to 1.6.6) used below to download the dataset, and langchain-google-cloud-sql-pg.
# NOTE(review): langchain-google-cloud-sql-pg is not version-pinned, unlike the other two.
!pip install ray[default]==2.9.3 kaggle==1.6.6 langchain-google-cloud-sql-pg

In [None]:
import os
# Expose the Kaggle credentials from the settings cell as environment variables for the
# `kaggle` command invoked below.
os.environ['KAGGLE_USERNAME'] = KAGGLE_USERNAME
os.environ['KAGGLE_KEY'] = KAGGLE_KEY

# Download the zip file to local storage (~/data) and then extract its contents to
# /data/netflix-shows, the directory the Ray job (job.py) reads the CSV from.
# NOTE(review): this comment previously stated that the GKE GCS CSI bucket is mounted at
# "/persist-data" in the Jupyter pod, but the commands below write to "/data" - confirm
# which path is the actual mount point.
!kaggle datasets download -d shivamb/netflix-shows -p ~/data --force
!mkdir /data/netflix-shows -p
!unzip -o ~/data/netflix-shows.zip -d /data/netflix-shows

In [None]:
# Create the directory that holds the job files (job.py is written into it by the next
# cell); it is later uploaded to the Ray workers as the job's working directory.
! mkdir -p rag-app

In [None]:
%%writefile rag-app/job.py
# Comment out the line above if you want to run this cell in the notebook and see its printed output. The line is required for the actual Ray job, because it writes this cell to job.py, which is then downloaded by the Ray workers.

import os
import uuid

import ray
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings

from langchain_google_cloud_sql_pg import PostgresEngine, PostgresVectorStore
from google.cloud.sql.connector import IPTypes

# Initialize parameters.
# The connection name is split on ":" into <project>:<region>:<instance>. Default to ""
# so that an unset variable does not crash with `None.split(...)`; the individual
# GCP_PROJECT_ID / CLOUDSQL_INSTANCE_REGION / CLOUDSQL_INSTANCE variables can still
# supply every value on their own.
INSTANCE_CONNECTION_NAME = os.environ.get("CLOUDSQL_INSTANCE_CONNECTION_NAME", "")
print(f"Your instance connection name is: {INSTANCE_CONNECTION_NAME}")
cloud_variables = INSTANCE_CONNECTION_NAME.split(":")
# Pad to at least three entries so the positional defaults below never raise IndexError.
cloud_variables += [""] * (3 - len(cloud_variables))

GCP_PROJECT_ID = os.environ.get("GCP_PROJECT_ID", cloud_variables[0])
GCP_CLOUD_SQL_REGION = os.environ.get("CLOUDSQL_INSTANCE_REGION", cloud_variables[1])
GCP_CLOUD_SQL_INSTANCE = os.environ.get("CLOUDSQL_INSTANCE", cloud_variables[2])

if not (GCP_PROJECT_ID and GCP_CLOUD_SQL_REGION and GCP_CLOUD_SQL_INSTANCE):
    # Surface the misconfiguration early with an actionable message instead of an
    # opaque failure when the database engine is used.
    print(
        "Warning: Cloud SQL project/region/instance could not be fully resolved. "
        "Set CLOUDSQL_INSTANCE_CONNECTION_NAME as '<project>:<region>:<instance>' "
        "or set GCP_PROJECT_ID, CLOUDSQL_INSTANCE_REGION and CLOUDSQL_INSTANCE."
    )

# NOTE(review): the database name is read from a variable called
# "INSTANCE_CONNECTION_NAME", which looks like it was meant to be a dedicated
# database-name variable - confirm the intended variable name before changing it.
DB_NAME = os.environ.get("INSTANCE_CONNECTION_NAME", "pgvector-database")
VECTOR_EMBEDDINGS_TABLE_NAME = os.environ.get("EMBEDDINGS_TABLE_NAME", "netflix_reviews_db")
CHAT_HISTORY_TABLE_NAME = os.environ.get("CHAT_HISTORY_TABLE_NAME", "message_store")

# Environment values are always strings; convert so the dimension is an int whether it
# comes from the environment or from the default.
VECTOR_DIMENSION = int(os.environ.get("VECTOR_DIMENSION", 384))

# Prefer the database credentials stored in the files under /etc/secret-volume; when
# those files cannot be read, fall back to environment variables with "postgres"
# as the default for both values.
try:
    # `with` guarantees the handles are closed even if read() raises.
    with open("/etc/secret-volume/username", "r") as db_username_file:
        DB_USER = db_username_file.read()
    with open("/etc/secret-volume/password", "r") as db_password_file:
        DB_PASS = db_password_file.read()
except OSError:
    # Only I/O failures (missing file, permissions, ...) trigger the fallback; a bare
    # `except` would also swallow unrelated errors such as KeyboardInterrupt.
    DB_USER = os.environ.get("DB_USERNAME", "postgres")
    DB_PASS = os.environ.get("DB_PASS", "postgres")

# Build the Cloud SQL engine from the project/region/instance, database name and
# credentials resolved above. IPTypes.PRIVATE is passed as the ip_type.
engine = PostgresEngine.from_instance(
        project_id=GCP_PROJECT_ID,
        region=GCP_CLOUD_SQL_REGION,
        instance=GCP_CLOUD_SQL_INSTANCE,
        database=DB_NAME,
        user=DB_USER,
        password=DB_PASS,
        ip_type=IPTypes.PRIVATE,
)

# Initialize the embeddings table with the configured vector size.
# NOTE(review): overwrite_existing=True is passed - presumably an existing table with
# this name is replaced on every run of the job; confirm against the library docs.
# Any exception raised here is only printed, so the script continues past a failure.
try:
    engine.init_vectorstore_table(
        VECTOR_EMBEDDINGS_TABLE_NAME,
        vector_size=VECTOR_DIMENSION,
        overwrite_existing=True,
    )
except Exception as err:
    print(f"Error: {err}")


SENTENCE_TRANSFORMER_MODEL = 'intfloat/multilingual-e5-small' # Transformer to use for converting text chunks to vector embeddings
# The dataset has been pre-downloaded to the GCS bucket as part of the notebook in the cell above. Ray workers will find the dataset readily mounted.
SHARED_DATASET_BASE_PATH="/data/netflix-shows/"
REVIEWS_FILE_NAME="netflix_titles.csv"

BATCH_SIZE = 100  # batch_size passed to map_batches for the Splitter
CHUNK_SIZE = 1000 # text chunk sizes which will be converted to vector embeddings
CHUNK_OVERLAP = 10  # chunk_overlap passed to the text splitter
# NOTE(review): TABLE_NAME and DIMENSION are not referenced anywhere else in the visible
# script; VECTOR_EMBEDDINGS_TABLE_NAME and VECTOR_DIMENSION are the values actually used.
TABLE_NAME = 'netflix_reviews_db'  # CloudSQL table name
DIMENSION = 384  # Embeddings size
ACTOR_POOL_SIZE = 1 # number of actors for the distributed map_batches function

class Splitter:
    """Callable that breaks every text of a batch into smaller chunks.

    The instance owns one RecursiveCharacterTextSplitter configured with the
    module-level CHUNK_SIZE and CHUNK_OVERLAP, measuring length with ``len``.
    """

    def __init__(self):
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP,
            length_function=len,
        )

    def __call__(self, text_batch):
        """Split each text under ``text_batch["item"]``.

        Returns a dict whose ``'results'`` entry is one flat list holding the
        chunks of all texts, in input order.
        """
        return {
            'results': [
                piece
                for document in text_batch["item"]
                for piece in self.splitter.split_text(document)
            ]
        }

# Process the dataset first: wrap the CSV file contents into a Ray dataset.
ray_ds = ray.data.read_csv(SHARED_DATASET_BASE_PATH + REVIEWS_FILE_NAME)
# `schema` is a method: call it so the schema itself is printed rather than the
# bound-method object.
print(ray_ds.schema())


def _describe_title(row):
    """Render one CSV row as a single descriptive sentence.

    Every field is converted to its string form. The result is a one-element list
    containing a dict with the sentence under 'item', which is the shape handed back
    to flat_map.
    """
    return [{
        'item': (
            f"This is a {row['type']} in {row['country']} called {row['title']}"
            f" added at {row['date_added']} whose director is {row['director']}"
            f" and with cast: {row['cast']} released at {row['release_year']}"
            f". Its rating is: {row['rating']}. Its duration is {row['duration']}"
            f". Its description is {row['description']}."
        )
    }]


# Distributed flat map to extract the raw text fields.
ds_batch = ray_ds.flat_map(_describe_title)
print(ds_batch.schema())

# Distributed map batches to create chunks out of each row, using a pool of
# ACTOR_POOL_SIZE Splitter actors.
# NOTE(review): Splitter only performs text splitting, yet each actor requests a GPU
# via num_gpus=1 - confirm whether this is intentional (e.g. to place the actors on GPU
# worker nodes) or whether the commented-out num_cpus request was meant instead.
ds_splitted = ds_batch.map_batches(
    Splitter,
    compute=ray.data.ActorPoolStrategy(size=ACTOR_POOL_SIZE),
    batch_size=BATCH_SIZE,  # Number of rows handed to each Splitter call.
    num_gpus=1,  # 1 GPU for each actor.
    # num_cpus=1,
)

# torch is referenced below but is not imported at the top of job.py; import it here so
# the device check does not fail with NameError.
import torch

print("torch cuda version", torch.version.cuda)
# Use CUDA for the embedding model when it is available, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
    print("device cuda found")

# Embedding model used both to embed the chunks on insert and to embed queries.
embeddings_service = HuggingFaceEmbeddings(
    model_name=SENTENCE_TRANSFORMER_MODEL,
    model_kwargs=dict(device=device),
)
# Vector store bound to the embeddings table through the engine created earlier.
vector_store = PostgresVectorStore.create_sync(
    engine=engine,
    embedding_service=embeddings_service,
    table_name=VECTOR_EMBEDDINGS_TABLE_NAME,
)

# Write the chunks to the vector store in batches.
# Splitter returns its chunks as the "results" column, so each dataset row holds a
# single chunk string; iterate in batches to hand add_texts a list of texts at a time.
# The second positional parameter of add_texts is `metadatas`, so the ids are passed by
# keyword, as a list with one string id per text.
for chunk_batch in ds_splitted.iter_batches(batch_size=BATCH_SIZE):
    splits = [str(chunk) for chunk in chunk_batch["results"]]
    if not splits:
        continue
    chunk_ids = [str(uuid.uuid4()) for _ in splits]
    vector_store.add_texts(splits, ids=chunk_ids)


# Validate results: embed a sample question, fetch the 4 closest stored chunks and
# print them, each followed by a separator line.
query = "List the cast of squid game"
query_vector = embeddings_service.embed_query(query)
docs = vector_store.similarity_search_by_vector(query_vector, k=4)

separator = "-" * 100
for position, document in enumerate(docs, start=1):
    print(f"Result #{position}")
    print(document.page_content)
    print(separator)

print("end job")

In [None]:
import ray, time
from ray.job_submission import JobSubmissionClient
# Job submission client used by the next cell; it is pointed at the
# ray-cluster-kuberay-head-svc service through a "ray://" address on port 10001.
client = JobSubmissionClient("ray://ray-cluster-kuberay-head-svc:10001")

In [None]:
# Port forward to the Ray dashboard and go to `localhost:8265` in a browser to see job status: kubectl port-forward -n <namespace> service/ray-cluster-kuberay-head-svc 8265:8265
import time

# Wall-clock start, used to report the total duration at the end of the cell.
start_time = time.time()
# Submit job.py as a Ray job, with the rag-app directory as the job's working directory.
job_id = client.submit_job(
    entrypoint="python job.py",
    # Path to the local directory that contains the entrypoint file.
    runtime_env={
        "working_dir": "/home/jovyan/rag-app", # upload the local working directory to ray workers
    }
)

# The Ray job typically takes 5m-10m to complete.
print("Job submitted with ID:", job_id)
# Poll every 30 seconds, printing the status and info message, until the status is terminal.
# NOTE(review): the loop has no timeout, and the summary printed after it says
# "completed" whatever the terminal status is - check the last printed status to see
# whether the job actually succeeded.
while True:
    status = client.get_job_status(job_id)
    print("Job status:", status)
    print("Job info:", client.get_job_info(job_id).message)
    if status.is_terminal():
        break
    time.sleep(30)

end_time = time.time()
job_duration = end_time - start_time
print(f"Job  completed in {job_duration} seconds.")