In [104]:
# 1. Embedding
# 2. Vector DB
# 3. Similarity

from openai import OpenAI
import chromadb
import os

In [132]:
# Seed the placeholder only when no key is configured, so that a real
# OPENAI_API_KEY already exported in the environment is not overwritten.
# Replace "yourkey" (or export the variable) before running, and never
# commit a real key to the notebook.
os.environ.setdefault("OPENAI_API_KEY", "yourkey")

# Persistent (on-disk) Chroma client.
# NOTE(review): the path below is an absolute, machine-specific Windows path;
# adjust it for your environment.
chroma_Client = chromadb.PersistentClient(
    path="D:/workplace/OpenAI-Playground/vector-db"
)
# Reuse the collection if it already exists, otherwise create it. The metadata
# requests "cosine" as the HNSW distance space for this collection.
collection = chroma_Client.get_or_create_collection(
    name="open_ai_collection", metadata={"hnsw:space": "cosine"}
)

# Constructed without arguments, so the client relies on the environment
# variable prepared above for its credentials.
openAI_Client = OpenAI()

In [114]:
def callEmbedding(val):
    """Request an embedding for ``val`` from the OpenAI embeddings endpoint.

    The call goes through the module-level ``openAI_Client`` using the
    ``text-embedding-3-small`` model and asks for float-encoded vectors.

    Args:
        val: Value forwarded as ``input`` to ``embeddings.create``.

    Returns:
        The ``embedding`` attribute of the first item in the response's
        ``data`` list.
    """
    request = {
        # An alternative model previously tried here: "text-embedding-ada-002".
        "model": "text-embedding-3-small",
        "input": val,
        "encoding_format": "float",
    }
    first_item = openAI_Client.embeddings.create(**request).data[0]
    return first_item.embedding

def chromaStore(key, value):
    """Add one embedding to the module-level Chroma ``collection``.

    Args:
        key: Forwarded to ``collection.add`` as ``ids``.
        value: Forwarded to ``collection.add`` as ``embeddings``.
    """
    record = {"ids": key, "embeddings": value}
    collection.add(**record)

# List the available models (useful for checking which embedding models can be used).
def checkModelList():
    """Print each entry of ``openAI_Client.models.list().data``, one per line."""
    listing = openAI_Client.models.list()
    for model in listing.data:
        print(model)

def storeEmbeddingsChroma(array):
    """Embed every item of ``array`` and store each vector in Chroma.

    For each item the vector is obtained via ``callEmbedding``, printed in
    full, and then saved through ``chromaStore`` using the item itself as the
    id. A confirmation line is printed after each store.

    Args:
        array: Iterable of values to embed; each value doubles as its id.
    """
    for item in array:
        vector = callEmbedding(item)
        print(item, "'s Vector is", vector)
        chromaStore(item, vector)
        print(item, "'s Vector successfully stored in Chroma")
        
# checkModelList()

In [133]:
# Embed each sample word and store its vector in the Chroma collection.
# This calls the OpenAI API once per word and prints every full vector.
storeEmbeddingsChroma(["Dog","Cat","Tiger","Monkey","apache","websphere"])

Dog 's Vector is [0.020139037, 0.0038084495, -0.004792253, 0.08152617, 0.006950007, -0.04210789, 0.011926895, 0.00066172495, 0.022508983, 0.015994385, 0.019929599, -0.025088368, -0.020293359, -0.01344807, 0.013977175, -0.012246562, 0.015432212, -0.01843047, -0.0070877946, 0.029541662, 0.023787651, 0.06922449, -0.073854156, -0.014329911, 0.051014483, 0.051984508, 0.025705656, 0.0010182504, 0.025264734, -0.060802914, -0.012301677, -0.041248098, 0.006415391, -0.024493124, -0.0055638636, -0.013822853, 0.011072612, 0.0128858965, -0.00031277788, 0.043584976, -0.016545536, -0.034678385, 0.0127315745, 0.015079475, 0.014682647, -0.024669493, -0.011397791, -0.041027635, -0.02429471, 0.037412092, 0.054321386, 0.032032862, -0.0066523855, 0.0807766, 0.011287561, -0.033840634, -0.0022032238, -0.004012375, -0.012004056, -0.012334746, -0.012136332, -0.027292969, 0.031239206, -0.019367427, 0.0071649556, -0.00927035, -0.033906773, 0.024382895, -0.009942753, -0.010190771, 0.033399716, -0.030842377, -0.01

In [135]:
# Chroma operations: uncomment the one you want to use.

# chroma_Client.list_collections()
# collection.get() 
# collection.count()
# collection.peek()
# collection.delete(ids=["Tiger"])
# chroma_Client.delete_collection(name="open_ai_collection")

In [None]:

def findMostSimilarChroma(val, n_results=10):
    """Query the Chroma ``collection`` for the entries nearest to ``val``.

    ``val`` is embedded with ``callEmbedding`` and the resulting vector is
    used as the query embedding.

    Args:
        val: Value to embed and search for.
        n_results: Maximum number of neighbours to request. Defaults to 10,
            which is the value Chroma's ``query`` uses when the argument is
            omitted, so existing one-argument calls behave as before.

    Returns:
        Whatever ``collection.query`` returns for the query embedding.
    """
    query_vector = callEmbedding(val)
    return collection.query(query_embeddings=query_vector, n_results=n_results)

# Look up the stored entries nearest to "cougar"; as the last expression in
# the cell, the query result is what the notebook displays.
findMostSimilarChroma("cougar")

In [None]:

def findMostSimilarChroma(val, n_results=5):
    """Query the Chroma ``collection`` for the entries nearest to ``val``.

    NOTE: this notebook defines ``findMostSimilarChroma`` in an earlier cell as
    well; once this cell runs, this definition replaces the earlier one.

    Args:
        val: Value to embed (via ``callEmbedding``) and search for.
        n_results: Maximum number of neighbours to request. Defaults to 5,
            the value that was previously hard-coded here.

    Returns:
        Whatever ``collection.query`` returns for the query embedding.
    """
    query_vector = callEmbedding(val)
    return collection.query(query_embeddings=query_vector, n_results=n_results)

# Alternative probe word, currently disabled:
# findMostSimilarChroma("cougar")
# Look up the stored entries nearest to "Tomcat"; the result is displayed as
# the cell's last expression.
findMostSimilarChroma("Tomcat")


In [141]:
# Requires a MongoDB server running locally on port 27017.
from pymongo import MongoClient

# Connect to the MongoDB instance.
client = MongoClient(host="localhost", port=27017)

# Select the database and the collection used to hold the vectors.
db = client["mango-db"]
mongoCollection = db["vector_collection"]

In [None]:
def mongoStore(key, value):
    """Store (or refresh) the vector for ``key`` in ``mongoCollection``.

    The document has the shape ``{"key": key, "vector": value}``.

    Args:
        key: Identifier saved under the ``"key"`` field.
        value: Embedding saved under the ``"vector"`` field.
    """
    document = {"key": key, "vector": value}
    # Upsert instead of a plain insert: re-running the notebook would otherwise
    # add another document for the same key each time, and those duplicates
    # would then show up repeatedly in the similarity results. Duplicates that
    # already exist from earlier runs are not removed by this call.
    mongoCollection.replace_one({"key": key}, document, upsert=True)
    
def storeEmbeddingsMongo(array):
    """Embed every item of ``array`` and store each vector in MongoDB.

    For each item the vector is obtained via ``callEmbedding``, printed in
    full, and then saved through ``mongoStore`` under the item itself as key.
    A confirmation line is printed after each store.

    Args:
        array: Iterable of values to embed; each value doubles as its key.
    """
    for item in array:
        vector = callEmbedding(item)
        print(item, "'s Vector is", vector)
        mongoStore(item, vector)
        print(item, "'s Vector successfully stored in Mongo")
    
# Embed each sample word and store its vector in the MongoDB collection.
# This calls the OpenAI API once per word and prints every full vector.
storeEmbeddingsMongo(["Dog","Cat","Tiger","Monkey","apache","websphere"])

In [None]:
import numpy as np

# # Compute the Euclidean distance
# euclidean_distance = np.linalg.norm(a - b)

# # Compute the Manhattan distance
# manhattan_distance = np.sum(np.abs(a - b))

# # Compute the cosine similarity
# cosine_similarity = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def cosine_similarity(vector_a, vector_b):
    """Return the cosine similarity of two vectors.

    Computed as ``dot(a, b) / (||a|| * ||b||)``.

    Args:
        vector_a: First vector (any array-like accepted by NumPy).
        vector_b: Second vector, with the same length as ``vector_a``.

    Returns:
        The cosine similarity as a NumPy float. If either vector has zero
        norm the similarity is undefined; ``0.0`` is returned in that case
        instead of NaN, so results stay safely sortable.

    Raises:
        ValueError: Propagated from ``np.dot`` when the shapes do not align.
    """
    a = np.asarray(vector_a)
    b = np.asarray(vector_b)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    # Guard the 0/0 case: dividing would yield NaN (plus a RuntimeWarning),
    # and NaN keys make sort order unreliable for callers ranking results.
    if denominator == 0:
        return np.float64(0.0)
    return np.dot(a, b) / denominator


def computeSimilarity(val):
    """Rank every document in ``mongoCollection`` by similarity to ``val``.

    ``val`` is embedded with ``callEmbedding``; each stored document is
    printed and its ``"vector"`` is compared to the query embedding with
    ``cosine_similarity``.

    Args:
        val: Value to embed and compare against the stored vectors.

    Returns:
        A list of ``{"key": ..., "similarity": ...}`` dicts sorted by
        similarity in ascending order (the closest match comes last).
    """
    query_vector = callEmbedding(val)
    scored = []
    for doc in mongoCollection.find():
        print(doc)
        key = doc["key"]
        score = cosine_similarity(query_vector, doc["vector"])
        scored.append({"key": key, "similarity": score})
    scored.sort(key=lambda entry: entry["similarity"])
    return scored


# Alternative probe word, currently disabled:
# computeSimilarity("cougar")
# Rank the vectors stored in MongoDB against "Tomcat"; the sorted list is
# displayed as the cell's last expression.
computeSimilarity("Tomcat")