In [None]:
# Install the google-cloud-aiplatform package (Vertex AI SDK for Python) into the runtime.
!pip install google-cloud-aiplatform

In [None]:
import google.auth
from google.colab import auth
import vertexai
import numpy as np
%matplotlib inline
# Plotting libraries: matplotlib draws the 2D scatter plot, mplcursors adds hover annotations.
import matplotlib.pyplot as plt
!pip install mplcursors
import mplcursors
from sklearn.decomposition import PCA
import pandas as pd
import time
import pickle
from sklearn.cluster import KMeans

In [None]:

# Authenticate the Colab user to Google Cloud.
auth.authenticate_user()

# Fetch the default credentials together with the project the environment
# reports. Per the google-auth documentation the project may be None when no
# default project can be determined.
credentials, project_id = google.auth.default()

# Manually configured project. A non-empty value takes precedence; set it to
# '' to rely on the automatically detected project instead.
# For example: PROJECT_ID = 'your-gcp-project-id'
PROJECT_ID = 'osrprocessor20251029'

if not PROJECT_ID:
  # No manual override: fall back to whatever google.auth.default() detected
  # rather than discarding it.
  PROJECT_ID = project_id

if PROJECT_ID:
  print(f"Successfully authenticated to Google Cloud Project: {PROJECT_ID}")
else:
  # Neither a manual value nor an auto-detected project is available.
  print("Warning: PROJECT_ID was not automatically detected. Please set it manually if needed.")
  # Example of how to set it manually:
  # PROJECT_ID = 'your-gcp-project-id'

In [None]:
from vertexai.language_models import TextEmbeddingModel

# Initialise the Vertex AI SDK with the project, region and credentials
# established in the authentication cell.
REGION = 'us-central1'
vertexai.init(project=PROJECT_ID, location=REGION, credentials=credentials)

# Pretrained text-embedding model shared by the remaining cells.
embedding_model = TextEmbeddingModel.from_pretrained("text-embedding-005")

In [None]:
# Embed a single word first and then a full sentence. For each input, report
# the length of the returned vector and preview its first 10 values.
for sample_text in ("life", "What is the meaning of life?"):
    embedding = embedding_model.get_embeddings([sample_text])
    vector = embedding[0].values
    print(f"Length = {len(vector)}")
    print(vector[:10])

In [None]:
# Embed three sentences whose pairwise cosine similarities are computed in the
# next cell. Each sentence is sent in its own request, in the order listed.
emb_1, emb_2, emb_3 = (
    embedding_model.get_embeddings([text])
    for text in (
        "What is the meaning of life?",
        "How does one spend their time well on Earth?",
        "Would you like a salad?",
    )
)

# `.values` is a flat list of floats. cosine_similarity expects a 2D array
# (here: a list of lists), so each vector is wrapped in an outer list to form
# a single-row matrix.
vec_1, vec_2, vec_3 = ([emb[0].values] for emb in (emb_1, emb_2, emb_3))

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Print the similarity matrix (1x1 here) for each sentence pair, in the order
# (1, 2), (2, 3), (1, 3).
for left_vec, right_vec in ((vec_1, vec_2), (vec_2, vec_3), (vec_1, vec_3)):
    print(cosine_similarity(left_vec, right_vec))

In [None]:
# Demonstration that sentence embeddings capture meaning.
# Start with the sentences below; in the final plot, sentences with similar
# meaning should end up near each other.
# NOTE: sentence_1 previously read "Misssentenceg flamsentencego discovered at
# swimmsentenceg pool" (a stray find/replace of "in" -> "sentence"); restored here
# so the embedded text and the hover label are the intended sentence.
sentence_1 = "Missing flamingo discovered at swimming pool"
sentence_2 = "Sea otter spotted on surfboard by beach"
sentence_3 = "Baby panda enjoys boat ride"
sentence_4 = "Breakfast themed food truck beloved by all!"
sentence_5 = "New curry restaurant aims to please!"
sentence_6 = "Python developers are wonderful people"
sentence_7 = "TypeScript, C++ or Java? All are great!"
sentences = [sentence_1, sentence_2, sentence_3, sentence_4, sentence_5, sentence_6, sentence_7]

# Embed every sentence in a single request instead of one request per
# sentence; the results are expected back one per input, in input order.
embeddings = [result.values for result in embedding_model.get_embeddings(sentences)]

# One row per sentence, one column per embedding dimension.
embeddings_array = np.array(embeddings)

In [None]:
def plot_2D(x_values, y_values, labels):
    """Draw a 2D scatter plot whose points reveal a text label on hover.

    Args:
        x_values: Sequence of x coordinates, one per point.
        y_values: Sequence of y coordinates, one per point.
        labels: Sequence of label strings indexed like the points; the label
            of the hovered point is used as the annotation text.

    Returns:
        None. The figure is displayed via ``plt.show()``.

    Note:
        Hover annotations presumably need an interactive Matplotlib backend;
        with a static inline backend only the plain scatter may be visible
        (verify in your environment).
    """
    # Create scatter plot
    fig, ax = plt.subplots()
    scatter = ax.scatter(x_values,
                         y_values,
                         alpha = 0.5,
                         edgecolors='k',
                         s = 40)
    # Create a mplcursors object to manage the data point interaction
    cursor = mplcursors.cursor(scatter, hover=True)
    # Axes decoration
    ax.set_title('Embedding visualization in 2D')  # Add a title
    ax.set_xlabel('X_1')  # Add x-axis label
    ax.set_ylabel('X_2')  # Add y-axis label
    # Define how each annotation should look
    @cursor.connect("add")
    def on_add(sel):
        # Use Selection.index (the documented accessor) rather than the
        # deprecated sel.target.index to find which point was picked.
        sel.annotation.set_text(labels[sel.index])
        sel.annotation.get_bbox_patch().set(facecolor='white', alpha=0.5) # Set annotation's background color
        sel.annotation.set_fontsize(12)
    plt.show()

In [None]:
# Use PCA (Principal Component Analysis) to flatten the embeddings from 768
# dimensions down to 2 so they can be drawn on a plane.
PCA_model = PCA(n_components=2).fit(embeddings_array)
flattened_values = PCA_model.transform(embeddings_array)
print("values:", flattened_values)
print(f"Shape: {flattened_values.shape}")
# Plot the two components against each other; the transposed array unpacks
# into its first and second columns, and each point is labelled with its sentence.
plot_2D(*flattened_values.T, sentences)