## **Vector Embeddings using OpenAIEmbeddings (Closed Source Auto-Encoder) and Visualising Vectors**

In [48]:
import os
import glob
from dotenv import load_dotenv
import gradio as gr

In [49]:
# imports from langchain, chroma and plotly

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import numpy as np
from sklearn.manifold import TSNE  
import plotly.graph_objects as go     

In [50]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI API key used by OpenAIEmbeddings comes from — confirm.
load_dotenv(override=True)

# Notebook-wide settings: the chat model name and the directory the Chroma vector store is persisted in.
MODEL = 'gpt-4.1-nano'
db_name = "vector_db"

In [51]:
# Discover the category folders inside knowledge-base/ and load every markdown file of each
# folder, one folder at a time, into the `documents` list. Each document is tagged with a
# custom `doc_type` metadata field taken from its folder name (employees, products,
# contracts or company).

# Keep only directories (a stray file such as .DS_Store would make DirectoryLoader raise)
# and sort them so the documents are loaded in the same order on every run.
folders = sorted(path for path in glob.glob("knowledge-base/*") if os.path.isdir(path))

# Read the markdown files as UTF-8 regardless of the platform's default encoding.
text_loader_kwargs = {'encoding': 'utf-8'}

documents = []
for folder in folders:
    # The folder name doubles as the document type label.
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
    folder_docs = loader.load()
    for doc in folder_docs:
        doc.metadata['doc_type'] = doc_type
        documents.append(doc)

In [52]:
# Number of Document objects loaded across all knowledge-base folders
len(documents)

31

In [53]:
# Break the loaded documents into smaller chunks for embedding. The splitter targets
# a chunk size of 1000 with an overlap of 200 between neighbouring chunks
# (NOTE(review): presumably measured in characters — confirm against the splitter's length function).
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
chunks = text_splitter.split_documents(documents)



In [54]:
# Number of chunks produced by the text splitter
len(chunks)

123

#### **A sidenote on Embeddings, and "Auto-Encoding LLMs"**
We will be mapping each chunk of text into a Vector that represents the meaning of the text, known as an embedding.

OpenAI offers a model to do this, which we will use by calling their API with some LangChain code.

This model is an example of an "Auto-Encoding LLM" which generates an output given a complete input. It's different to all the other LLMs we've discussed today, which are known as "Auto-Regressive LLMs", and generate future tokens based only on past context.

Another example of an Auto-Encoding LLM is BERT from Google. In addition to embedding, Auto-Encoding LLMs are often used for classification.

In [55]:
# Gather the distinct doc_type labels carried by the chunks and report them.
doc_types = {chunk.metadata['doc_type'] for chunk in chunks}
print("Document Types found: " + ', '.join(doc_types))

Document Types found: employees, contracts, company, products


In [56]:
# Instance of OpenAIEmbeddings, created with its default settings.
# It is passed to Chroma below as the embedding function for the vector store.

embeddings = OpenAIEmbeddings()

In [57]:
# Check whether a Chroma DB directory already exists at db_name.
# If so, open it and delete its collection to start from scratch.

if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

In [58]:
# Create our Chroma Vector Store from the chunks, using `embeddings` to embed them
# and persisting the store under db_name.

vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
# `_collection` is a private attribute of the vector store; count() reports how many entries it holds.
print(f"Vectorstore created with {vectorstore._collection.count()} documents.")

Vectorstore created with 123 documents.


In [59]:
# Get one vector and find out how many dimensions it has.
# NOTE: 'vectorstore._collection' is a Collection object (a private attribute of the vector store).

collection = vectorstore._collection
# Fetch a single record, requesting only its embedding, and take that one vector.
sample_embedding = collection.get(limit=1, include=["embeddings"])['embeddings'][0]
dimensions = len(sample_embedding)
# ':,' adds the thousands separator only. A ': ,' spec would also apply the space sign
# flag, which pads positive numbers with a stray leading blank.
print(f"The vectors have {dimensions:,} dimensions.")


The vectors have  1,536 dimensions.


In [60]:
# Fetch one record again, this time keeping the whole result dict returned by get().
# NOTE: this rebinds `sample_embedding` from a single vector to the full result dict.
sample_embedding = collection.get(limit=1, include=["embeddings"])

In [61]:
# The first (and, with limit=1, only) embedding vector in the result
sample_embedding['embeddings'][0]

array([-0.01782627,  0.00708584, -0.02671911, ..., -0.01271662,
       -0.00528223, -0.02747709], shape=(1536,))

In [62]:
# The full result dict; only the embeddings were requested via include=["embeddings"]
sample_embedding

{'ids': ['225c012b-ca5f-4190-a863-d22771d83312'],
 'embeddings': array([[-0.01782627,  0.00708584, -0.02671911, ..., -0.01271662,
         -0.00528223, -0.02747709]], shape=(1, 1536)),
 'documents': None,
 'uris': None,
 'included': ['embeddings'],
 'data': None,
 'metadatas': None}

In [63]:
# Pull every stored vector together with its chunk text and metadata, ready for plotting.
result = collection.get(include=["embeddings", "documents", "metadatas"])
vectors = np.array(result['embeddings'])
# NOTE: this rebinds `documents` (previously the loaded Document objects) to the stored chunk texts.
documents = result['documents']
doc_types = [metadata['doc_type'] for metadata in result['metadatas']]

# One marker colour per document type. A lookup table replaces the paired-list .index()
# trick, and a doc_type outside the four known ones falls back to gray instead of
# raising ValueError (folders are discovered dynamically, so new types can appear).
doc_type_colors = {'products': 'blue', 'employees': 'green', 'contracts': 'red', 'company': 'orange'}
colors = [doc_type_colors.get(t, 'gray') for t in doc_types]

In [64]:
# We humans find it easier to visualize things in 2D!
# Reduce the dimensionality of the vectors to 2D using t-SNE
# (t-distributed stochastic neighbor embedding).
# random_state is fixed so the projection is reproducible between runs.

tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot: one marker per chunk, coloured by document type,
# with the type and the first 100 characters of the chunk text shown on hover.
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

# Axis titles for a 2D figure live directly on the layout (xaxis/yaxis);
# `scene` only configures 3D plots, so titles placed there are not shown here.
fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [65]:
# Let's try 3D!
# Project the vectors down to three components with t-SNE (seeded for reproducibility).

tsne = TSNE(random_state=42, n_components=3)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 3D scatter plot: one marker per chunk, coloured by document type,
# with the type and the first 100 characters of the chunk text shown on hover.
fig = go.Figure(
    data=[
        go.Scatter3d(
            x=reduced_vectors[:, 0],
            y=reduced_vectors[:, 1],
            z=reduced_vectors[:, 2],
            mode='markers',
            marker={'size': 5, 'color': colors, 'opacity': 0.8},
            text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
            hoverinfo='text',
        )
    ]
)

# For a 3D figure the axis titles belong to the scene.
fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene={'xaxis_title': 'x', 'yaxis_title': 'y', 'zaxis_title': 'z'},
    width=900,
    height=700,
    margin={'r': 20, 'b': 10, 'l': 10, 't': 40},
)

fig.show()