# 2.3 Vectorstores and Embeddings - part 2

## Using other embedding models

In [None]:
%pip install python-dotenv langchain langchain-openai chromadb docarray --upgrade --quiet

In [1]:
from dotenv import find_dotenv, load_dotenv

# Read the local .env file so settings such as OPENAI_API_KEY are available
# through the process environment instead of being hard-coded in the notebook.
# The return value is bound to `_` so the cell does not echo it.
dotenv_path = find_dotenv()
_ = load_dotenv(dotenv_path)

In [2]:
# All third-party imports for this notebook. The commented `pip install` hints
# name the package each import needs.
#
# OpenAI and HuggingFaceEmbeddings are imported from their integration
# packages rather than the deprecated `langchain.llms` / `langchain.embeddings`
# re-export paths.

# !pip install -U langchain-openai
from langchain_openai import OpenAI

# !pip install -U langchain-community
# !pip install -U gpt4all
from langchain_community.embeddings import GPT4AllEmbeddings
# !pip install -U chromadb
from langchain_community.vectorstores import Chroma

# !pip install -U sentence-transformers
from langchain_community.embeddings import HuggingFaceEmbeddings

# !pip install -U bs4
# from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import CharacterTextSplitter

# !pip install -U unstructured
from langchain_community.document_loaders import UnstructuredMarkdownLoader

In [9]:
# Build the open-source embedding function used for both indexing and querying.
# Alternatives that were tried here: GPT4AllEmbeddings() (already imported) and
# SentenceTransformerEmbeddings(model_name=...) (would need its own import).
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
embedding_function = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [16]:
# Markdown listing files to index, loaded in this order.
listing_paths = (
    "../data/listing1.md",
    "../data/listing2.md",
    "../data/listing3.md",
)

# One loader per file; each loader's documents are flattened into a single
# list, preserving file order.
loaders = [UnstructuredMarkdownLoader(path) for path in listing_paths]
documents = [doc for loader in loaders for doc in loader.load()]


In [17]:
# Split the loaded documents into chunks before embedding them.
# CharacterTextSplitter is already imported in the imports cell (from
# langchain_text_splitters), so it is not re-imported here from the deprecated
# `langchain.text_splitter` path.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=32)
splitDocs = text_splitter.split_documents(documents)

print(f"splitDocs count: {len(splitDocs)}")

splitDocs count: 10


In [18]:
# Embed the chunks and index them in a Chroma vector store.
# persist_directory is intentionally not passed here; to keep the index on
# disk, pass e.g. persist_directory='../db/chroma-hugging-1/' (and clear that
# directory first if a fresh index is wanted).
print('Loading the vector store...')
vectorstore = Chroma.from_documents(
    documents=splitDocs,
    embedding=embedding_function,
)

Loading the vector store...


In [28]:
# Query to run against the vector store. Other queries to try:
#   "I'm looking for an apartment with a stackable washer."
#   "I'm looking for an electric car with autopilot"
question = "I'm looking for a 2-bedroom apartment"

print("Similarity search...")
# Echo the query so the saved output records which question produced it;
# otherwise output left over from a previous question is indistinguishable
# from a genuinely poor match.
print(f"Question: {question}")

# k=1: return only the single closest chunk.
docs = vectorstore.similarity_search(question, k=1)

length = len(docs)
print(f"Result: {length}")

for d in docs:
    print(d)

Similarity search...
Result: 1
page_content="2022 Tesla Model S Plaid - A Marvel of Electric Performance\n\nPrice: $129,900\n\nMake: Tesla\n\nModel: Model S Plaid\n\nYear: 2022\n\nMileage: 5,000 miles\n\nExterior Color: Midnight Silver Metallic\n\nInterior Color: Black with Carbon Fiber Decor\n\nEngine: Triple Electric Motor AWD\n\nBattery Range: Up to 396 miles on a single charge\n\nCharging: Supercharger capable, includes home charging kit\n\nOverview\n\nExperience the pinnacle of electric performance with the 2022 Tesla Model S Plaid. This vehicle not only redefines what an electric car can be, but it also sets a new standard for luxury sedans. With just 5,000 miles on the odometer, it's as close to new as you can get without driving it off the showroom floor. The Model S Plaid's breathtaking acceleration, unparalleled range, and advanced technology make it a standout in the electric vehicle market.\n\nPerformance\n\n0 to 60 mph in 1.99 seconds thanks to its tri-motor all-wheel driv