Skip to content

Commit

Permalink
search/embeddings: speed up similarity search by more than 50%
Browse files Browse the repository at this point in the history
Exploit normalization of embeddings. Document this assumption and fulfill it in the mock embedding model.
  • Loading branch information
LinqLover committed Aug 29, 2023
1 parent bf76bb1 commit 541ae49
Show file tree
Hide file tree
Showing 8 changed files with 15 additions and 27 deletions.
@@ -1,6 +1,6 @@
service
getEmbeddingsForAll: strings config: aConfigOrNil
"Answer a collection with one embedding for each string. Each embedding vector is a Float32Array of numbers."
"Answer a collection with one embedding for each string. Each embedding vector is a Float32Array of numbers. Each vector is normalized, i.e., has a length very close to 1."

| embeddingResult |
embeddingResult := self
Expand Down
Expand Up @@ -12,7 +12,7 @@
"getEmbeddingFor:" : "ct 8/20/2023 12:49",
"getEmbeddingFor:config:" : "ct 8/20/2023 13:03",
"getEmbeddingsForAll:" : "ct 8/20/2023 22:55",
"getEmbeddingsForAll:config:" : "ct 8/20/2023 19:43",
"getEmbeddingsForAll:config:" : "ct 8/29/2023 17:16",
"name:" : "ct 8/19/2023 22:15",
"pathToEndpoint" : "ct 8/17/2023 18:07",
"priceFor:" : "ct 8/27/2023 16:55",
Expand Down
@@ -1,21 +1,7 @@
private
distanceBetween: embedding and: anotherEmbedding
"cosine distance"
"Answer the cosine distance between both embeddings. Both embeddings are assumed to be normalized: their lengths are not divided out here, so the answer is a true cosine distance only for vectors of length 1, and senders must take care not to compare distances across pairs of vectors whose lengths differ."

| abs otherAbs |
anotherEmbedding ifNil: [^ Float infinity].

abs := embedding squaredLength.
abs = 0 ifTrue: [^ Float infinity].
otherAbs := anotherEmbedding squaredLength.
otherAbs = 0 ifTrue: [^ Float infinity].
^ 1.0 -
(
(embedding dot: anotherEmbedding)
/
(
abs
*
otherAbs
) sqrt
)
^ 1.0 - (embedding dot: anotherEmbedding)
@@ -1,5 +1,6 @@
search-embeddings
findAllDocuments: number nearEmbedding: embedding
"Note: The ranking of results does not depend on the length of the embedding argument."

| relatednesses |
relatednesses := Array new: self documents size streamContents: [:stream |
Expand Down
Expand Up @@ -10,7 +10,7 @@ findDocuments: number similarTo: documents collect: collectBlock thenSelect: sel
average := classifiedDocuments first embedding shallowCopy.
2 to: classifiedDocuments size do: [:i |
average += (classifiedDocuments at: i) embedding].
"average /= documents size." "NOT required because cosine distance only depends on vector direction"
"average /= documents size." "NOT required because ranking of results only depends on vector direction"

^ self
findDocuments: number
Expand Down
Expand Up @@ -12,7 +12,7 @@
"addFragmentDocumentsFrom:title:content:as:" : "ct 8/14/2023 16:15",
"addFragmentDocumentsFromTitle:content:collect:" : "ct 8/14/2023 19:47",
"allDocumentsForObject:" : "ct 8/16/2023 20:07",
"distanceBetween:and:" : "ct 8/19/2023 22:09",
"distanceBetween:and:" : "ct 8/29/2023 17:32",
"documentClass" : "ct 8/14/2023 15:23",
"documents" : "ct 7/2/2023 17:41",
"documentsForObject:" : "ct 8/14/2023 16:30",
Expand All @@ -25,15 +25,15 @@
"estimateTimeToInitializeEmbeddingsFor:" : "ct 8/20/2023 21:00",
"estimateTokens" : "ct 8/20/2023 20:57",
"estimateTokensFor:" : "ct 8/27/2023 16:57",
"findAllDocuments:nearEmbedding:" : "ct 8/16/2023 19:03",
"findAllDocuments:nearEmbedding:" : "ct 8/29/2023 17:30",
"findClusteredDocuments:nearEmbedding:" : "ct 8/16/2023 19:12",
"findClusters:nearEmbedding:" : "ct 8/19/2023 22:09",
"findDocuments:nearEmbedding:" : "ct 8/16/2023 19:03",
"findDocuments:nearEmbedding:collect:" : "ct 8/17/2023 12:02",
"findDocuments:nearEmbedding:collect:thenSelect:" : "ct 8/19/2023 22:07",
"findDocuments:nearEmbedding:useClusters:" : "ct 8/16/2023 19:03",
"findDocuments:similarTo:collect:" : "ct 8/17/2023 12:03",
"findDocuments:similarTo:collect:thenSelect:" : "ct 8/19/2023 22:02",
"findDocuments:similarTo:collect:thenSelect:" : "ct 8/29/2023 19:51",
"findDocuments:similarToObject:collect:" : "ct 8/16/2023 20:02",
"findDocuments:similarToObjects:collect:" : "ct 8/16/2023 20:03",
"findDocuments:similarToQuery:collect:" : "ct 8/17/2023 20:35",
Expand Down
@@ -1,15 +1,16 @@
service
getEmbeddingsForAll: strings config: aConfigOrNil
"Answer a collection with one embedding for each string. Each embedding vector is an array of numbers, commonly represented as a Float32Array."
"Answer a collection with one embedding for each string. Each embedding vector is an array of numbers, commonly represented as a Float32Array. Each vector is normalized, i.e., has a length very close to 1."

| config |
config := self baseConfig.
aConfigOrNil ifNotNil:
[config := config updatedWith: aConfigOrNil].

^ strings collect: [:string |
| words |
| words vector |
words := string substrings collect: [:word | word asLowercaseAlphabetic] as: Bag.
self keywords
vector := self keywords
collect: [:keyword | (words occurrencesOf: keyword) / words size]
as: Float32Array]
as: Float32Array.
vector /= vector length]
Expand Up @@ -8,7 +8,7 @@
"getEmbeddingFor:" : "ct 8/20/2023 12:48",
"getEmbeddingFor:config:" : "ct 8/20/2023 12:52",
"getEmbeddingsForAll:" : "ct 8/20/2023 12:47",
"getEmbeddingsForAll:config:" : "ct 8/20/2023 13:03",
"getEmbeddingsForAll:config:" : "ct 8/29/2023 17:16",
"keywords" : "ct 8/17/2023 20:34",
"truncateString:minusString:minusWords:to:" : "ct 8/20/2023 13:11",
"truncateString:to:" : "ct 8/20/2023 13:11" } }

0 comments on commit 541ae49

Please sign in to comment.