In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# LLM text embeddings visualization

This notebook demonstrates how vector similarity is relevant to LLM-generated embeddings. It embeds a collection of labelled documents and then performs a clustering analysis.

## Getting started

### Install libraries

In [None]:
!pip install langchain==0.0.315
!pip install google-cloud-aiplatform==1.35.0
!pip install scikit-learn==1.3.1

### Authenticating your notebook environment

* If you are using **Colab** to run this notebook, uncomment the cell below and continue
* If you are using **Vertex AI Workbench**, check out the setup instructions [here](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/setup-env)

In [None]:
# from google.colab import auth as google_auth
# google_auth.authenticate_user()

# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

### Import libraries

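***Colab only***: To restart the kernel, uncomment the restart lines in the authentication cell above or use the restart button; then uncomment the following cell and fill in your project ID to initialize the Vertex AI SDK. For Vertex AI Workbench you can restart the kernel using the button on top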

In [None]:
# from google.cloud import aiplatform
# PROJECT_ID = '[YOUR PROJECT ID]'
# aiplatform.init(project=PROJECT_ID, location='us-central1')

In [None]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from google.api_core import retry
from sklearn.datasets import fetch_20newsgroups
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
from vertexai.language_models import TextEmbeddingModel

# Register tqdm's pandas integration, which provides the `progress_apply`
# method used later when creating the embeddings.
tqdm.pandas()

## Fetch and clean the data

In this example we are using the open source [20 Newsgroups](http://qwone.com/~jason/20Newsgroups/) dataset, a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across 20 different newsgroups

In [None]:
# The four newsgroups this notebook works with.
categories = [
    "comp.graphics",
    "sci.space",
    "sci.med",
    "rec.sport.hockey",
]

# Fetch only the posts belonging to the selected newsgroups.
newsgroups = fetch_20newsgroups(categories=categories)

In [None]:
# One row per post: the raw text and its newsgroup name (the integer target
# is translated to a readable label through `target_names`).
raw_data = pd.DataFrame(
    {
        "text": newsgroups.data,
        "target": [newsgroups.target_names[label_id] for label_id in newsgroups.target],
    }
)

Because of the 8k token limit, in this example we exclude all documents longer than 8,000 characters. Don't confuse tokens with characters: a token usually spans several characters, so the character cutoff could be higher; 8,000 is just a conservative choice to be safe

In [None]:
# Keep only the posts whose text is at most 8,000 characters long.
is_short_enough = raw_data["text"].str.len() <= 8000
filtered = raw_data.loc[is_short_enough]

Subsample the dataset into 500 data points, stratified on the label

In [None]:
# Draw a 500-document sample from the length-filtered posts (not the raw
# frame, which would let over-long documents back in), preserving the label
# proportions via stratification. The fixed seed makes the sample repeatable
# across Restart & Run All.
x_subsample, _, y_subsample, _ = train_test_split(
    filtered["text"],
    filtered["target"],
    stratify=filtered["target"],
    train_size=500,
    random_state=0,
)

Clean out the text removing emails, names, etc. Even with Gen AI, garbage in means garbage out

In [None]:
def _strip_boilerplate(post):
    """Return ``post`` with e-mail addresses, parenthesised text and header labels removed."""
    post = re.sub(r"[\w\.-]+@[\w\.-]+", "", post)  # e-mail addresses
    post = re.sub(r"\([^()]*\)", "", post)  # innermost parenthesised text (e.g. names)
    post = post.replace("From: ", "")  # "From: " label
    return post.replace("\nSubject: ", "")  # "\nSubject: " label


x_subsample = [_strip_boilerplate(post) for post in x_subsample]

In [None]:
# Assemble the cleaned texts and their labels into the working frame.
# Both are passed as plain lists, so the frame gets a fresh 0..n-1 index.
df = pd.DataFrame(
    {
        "text": list(x_subsample),
        "target": list(y_subsample),
    }
)

We now have 500 data points roughly evenly distributed

In [None]:
# Count how many sampled documents carry each newsgroup label.
df["target"].value_counts()

## Create and visualize the embeddings

Load the model

In [None]:
# Identifier of the pretrained text-embedding model to load.
EMBEDDING_MODEL_NAME = "textembedding-gecko@001"
model = TextEmbeddingModel.from_pretrained(EMBEDDING_MODEL_NAME)

In [None]:
def make_embed_text_fn(model):
    """Build a retrying single-text embedding function bound to ``model``.

    The returned callable takes one text, sends it to
    ``model.get_embeddings`` as a one-element list and returns the
    ``values`` of the first result. The call is wrapped in
    ``retry.Retry`` with a 300-second timeout.
    """
    with_retry = retry.Retry(timeout=300.0)

    def embed_fn(text):
        results = model.get_embeddings([text])
        return results[0].values

    return with_retry(embed_fn)

Create the embeddings

In [None]:
# Embed every document one at a time, showing a tqdm progress bar.
embed_text = make_embed_text_fn(model)
df["embeddings"] = df["text"].progress_apply(embed_text)

In [None]:
# Preview the first rows, now including the "embeddings" column.
df.head()

The vectors generated by our model have 768 dimensions, so we're not able to visualize them in their raw form. Use [t-SNE](https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding) to reduce them to 2 dimensions for us humans

In [None]:
# Stack the per-document embedding lists into one float32 matrix
# (one row per document).
embedding_rows = df["embeddings"].to_list()
embeddings_array = np.array(embedding_rows, dtype=np.float32)

# Fit t-SNE with a fixed seed and project every document into the
# low-dimensional space.
tsne = TSNE(n_iter=1000, random_state=0)
tsne_results = tsne.fit_transform(embeddings_array)

In [None]:
# Wrap the two t-SNE coordinates in a frame with named columns.
tsne_columns = ["TSNE1", "TSNE2"]
df_tsne = pd.DataFrame(tsne_results, columns=tsne_columns)

# Carry over each document's newsgroup label from df (rows align by index).
df_tsne["target"] = df["target"]

In [None]:
# Preview the first rows of the 2-D projection together with their labels.
df_tsne.head()

Plot the data points. It's clear that documents from the same newsgroup embed close to each other in the vector space. This will be useful in the next lab, when we use embeddings to find similar documents

In [None]:
# Apply the seaborn style BEFORE creating the figure: style settings only
# affect axes created afterwards, so setting it later would leave this plot
# unstyled on a fresh kernel (and styled only when the cell is re-run).
sns.set_style("darkgrid", {"grid.color": ".6", "grid.linestyle": ":"})
fig, ax = plt.subplots(figsize=(8, 6))

# Draw on the explicit axes instead of relying on pyplot's "current axes".
sns.scatterplot(
    data=df_tsne, x="TSNE1", y="TSNE2", hue="target", palette="hls", ax=ax
)
# Place the legend outside the plotting area, to the right of the axes.
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))

ax.set_title("Scatter plot of news using t-SNE")
ax.set_xlabel("TSNE1")
ax.set_ylabel("TSNE2")
ax.axis("equal")  # same scale on both axes so distances are not distorted

# Render the figure explicitly so the cell does not echo a return value.
plt.show()