In [None]:
import json
import pandas as pd

# Load the notebook settings from the local JSON config file.
# JSON text is UTF-8 by specification, so the encoding is pinned instead of
# relying on the platform default (which is not UTF-8 everywhere).
with open("../.env/config.json", "r", encoding="utf-8") as f:
    CONFIG = json.load(f)

# Settings referenced by the later cells; a missing key raises KeyError here,
# before any cloud call is attempted.
PROJECT_ID = CONFIG["project_id"]
BUCKET_NAME = CONFIG["bucket_name"]
MODEL_NAME = CONFIG["model_name"]
LOCATION = CONFIG["location"]

In [None]:
#authentication to GCP
from google.cloud import storage
import os

# Expose the service-account key file through the environment variable set below.
os.environ.update({"GOOGLE_APPLICATION_CREDENTIALS": "../.env/service-account.json"})

In [60]:
#get the file from the bucket
import io

def read_from_bucket(filename):
    """Download a CSV object from the configured bucket and parse it.

    Args:
        filename: Name of the object inside ``BUCKET_NAME``.

    Returns:
        The object's content parsed by ``pd.read_csv``.

    Raises:
        Exception: Any error raised while downloading or parsing is printed
            and then re-raised unchanged.
    """
    try:
        raw_bytes = (
            storage.Client(project=PROJECT_ID)
            .bucket(BUCKET_NAME)
            .blob(filename)
            .download_as_bytes()
        )
        # Wrap the payload in an in-memory buffer so pandas can read it like a file.
        buffer = io.BytesIO(raw_bytes)
        return pd.read_csv(buffer)
    except Exception as err:
        print(f"Error reading file from bucket: {err}")
        raise

# Fetch the ticket export from the bucket into a DataFrame.
df_raw = read_from_bucket(filename="all_tickets.csv")

# Show (rows, columns) as a quick check on what was downloaded.
print(df_raw.shape)

(48549, 9)


In [None]:
#preprocessing
# Build the single-column frame of texts to embed. Every step returns a new
# DataFrame, so no column is ever assigned on a slice of df_raw (the pattern
# that triggers pandas' SettingWithCopyWarning) and re-running the cell always
# starts again from df_raw.
df = (
    df_raw[["title", "body"]]  # we keep only the columns we need
    .dropna()  # dropping rows where the title or the body is missing
    # concatenating the title and body into one "title:...;body:..." string
    .assign(Text=lambda frame: "title:" + frame["title"] + ";body:" + frame["body"])
    .drop(columns=["title", "body"])
)
print(df.shape)
print(df.head())


(47837, 1)
                                                Text
1  title:connection with icon;body:icon dear plea...
2  title:work experience user;body:work experienc...
3  title:requesting for meeting;body:requesting m...
4  title:reset passwords for external accounts;bo...


In [None]:
# Size the embedding job: total characters over every text, priced at
# 0.0002 dollars per 1000 characters.
total_char = df["Text"].str.len().sum()
print(
    f"{total_char} characters in the dataset",
    f"{total_char * 0.0002 / 1000} dollars expected to embed this dataset",
    sep="\n",
)


14488931 characters in the dataset
2.8977862 dollars expected to embed this dataset


In [None]:
#text embedding functions
from google.api_core import retry
import google.generativeai as genai
from tqdm.auto import tqdm

# Registers tqdm's pandas integration (presumably to enable `progress_apply`).
# NOTE(review): no visible cell uses it — confirm it is still needed.
tqdm.pandas()

@retry.Retry(timeout=300.0)
def embed_fn_batch(texts: list[str]) -> list[list[float]]:
    """Request embeddings for one batch of texts.

    Sends ``texts`` to ``genai.embed_content`` using the model
    ``models/{MODEL_NAME}`` and the ``"clustering"`` task type. The call is
    wrapped in ``retry.Retry`` with a 300-second timeout.
    NOTE(review): which errors get retried depends on the decorator's default
    predicate — confirm it covers the failures expected here.

    Args:
        texts: The strings to embed.

    Returns:
        The ``"embedding"`` entry of the API response.
    """
    result = genai.embed_content(
        content=texts,
        model=f"models/{MODEL_NAME}",
        task_type="clustering",
    )
    return result["embedding"]

#function to process the texts in batches
def process_in_batches(texts: list[str], batch_size: int = 200) -> list[list[float]]:
    """Embed ``texts`` in consecutive slices and concatenate the results.

    Args:
        texts: Strings to embed, in the order the results should come back.
        batch_size: Maximum number of texts passed to ``embed_fn_batch`` per
            call. Must be a positive integer.

    Returns:
        The values returned by ``embed_fn_batch`` for each slice, joined into
        a single list in input order. An empty ``texts`` yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        # A negative step makes range() empty, so the loop would be skipped and
        # an empty list returned without any error; fail loudly instead.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    embeddings = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
        batch = texts[start:start + batch_size]
        embeddings.extend(embed_fn_batch(batch))
    return embeddings

In [None]:
#embedding
# Create the output directory before the long embedding run: to_csv does not
# create missing parent directories, and failing only after every batch has
# been embedded would waste the whole run.
os.makedirs("../embeddings", exist_ok=True)

texts = df["Text"].tolist()
df["Embeddings"] = process_in_batches(texts, batch_size=200)

#saving the embeddings
df.to_csv("../embeddings/all_tickets_embeddings.csv", index=False)

Processing batches: 100%|██████████| 240/240 [15:02<00:00,  3.76s/it]


In [86]:
# Rename the columns to the "id" / "embedding" field names used in the JSON export.
df = df.rename(columns={"Text": "id", "Embeddings": "embedding"})

#saving to json
# Serialise the frame as a list of row records and write it to a local file.
json_data = df.to_json(orient="records")
with open("../embeddings/all_tickets_embeddings.json", mode="w") as json_file:
    json_file.write(json_data)

In [87]:
#sending the embeddings to a bucket
# Write JSON Lines: one {"id": ..., "embedding": [...]} object per row, which
# is the per-line record layout Vertex AI Vector Search reads from .json files.
# orient="records" also leaves the DataFrame index out of the file, so no
# index flag is needed (index=False is rejected or ignored by pandas when
# combined with the default column orientation).
df.to_json(
    f"gs://{BUCKET_NAME}/all_tickets_embeddings.json",
    orient="records",
    lines=True,
)

In [92]:
#sending to a vector database
from google.cloud import aiplatform

# Identifier under which the index is deployed on the endpoint.
DEPLOYED_INDEX_ID = "support_tickets_index_deployed"

aiplatform.init(location=LOCATION, project=PROJECT_ID)

# Step 1: create a Tree-AH index from the embeddings uploaded to the bucket.
# NOTE(review): contents_delta_uri is documented as a Cloud Storage directory,
# but a single file path is passed here — confirm the index actually ingests it.
# NOTE(review): dimensions=768 presumably matches the vector size produced by
# MODEL_NAME — verify against the model in use.
my_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    distance_measure_type="DOT_PRODUCT_DISTANCE",
    approximate_neighbors_count=5,
    dimensions=768,
    contents_delta_uri=f"gs://{BUCKET_NAME}/all_tickets_embeddings.json",
    display_name="support-tickets-index",
)

# Step 2: create the endpoint (with a public endpoint enabled) that will serve the index.
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    public_endpoint_enabled=True,
    display_name="support-tickets-index-endpoint",
)

# Step 3: deploy the index to that endpoint under DEPLOYED_INDEX_ID.
my_index_endpoint.deploy_index(deployed_index_id=DEPLOYED_INDEX_ID, index=my_index)



Creating MatchingEngineIndex
Create MatchingEngineIndex backing LRO: projects/802291245749/locations/us-central1/indexes/1491425950426988544/operations/3493933939423182848
MatchingEngineIndex created. Resource name: projects/802291245749/locations/us-central1/indexes/1491425950426988544
To use this MatchingEngineIndex in another session:
index = aiplatform.MatchingEngineIndex('projects/802291245749/locations/us-central1/indexes/1491425950426988544')
Creating MatchingEngineIndexEndpoint
Create MatchingEngineIndexEndpoint backing LRO: projects/802291245749/locations/us-central1/indexEndpoints/3788631196292874240/operations/5832005833470246912
MatchingEngineIndexEndpoint created. Resource name: projects/802291245749/locations/us-central1/indexEndpoints/3788631196292874240
To use this MatchingEngineIndexEndpoint in another session:
index_endpoint = aiplatform.MatchingEngineIndexEndpoint('projects/802291245749/locations/us-central1/indexEndpoints/3788631196292874240')
Deploying index Matchi

<google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint.MatchingEngineIndexEndpoint object at 0x0000019E98ECF9D0> 
resource name: projects/802291245749/locations/us-central1/indexEndpoints/3788631196292874240