Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
126 changes: 126 additions & 0 deletions resources.js
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,132 @@ export class MemoryCount extends Resource {
}
}

// ---------------------------------------------------------------------------
// MemoryStore - Dedup-aware storage (SHA-256 hash + vector similarity)
// ---------------------------------------------------------------------------

export class MemoryStore extends Resource {
  /**
   * Store a memory with optional two-phase deduplication.
   *
   * Phase 1 (always on): a SHA-256 hash of the normalized (trimmed,
   * lowercased) text is matched against the indexed `contentHash`
   * attribute for fast exact-duplicate detection.
   *
   * Phase 2 (only when `dedupThreshold` is a positive number): a vector
   * search on the `embedding` HNSW index; cosine distance in [0, 2] is
   * normalized to a [0, 1] similarity and compared to the threshold.
   *
   * @param {object} data
   * @param {string} data.text - Required, non-empty memory text.
   * @param {number} [data.dedupThreshold] - Similarity in (0, 1]; fuzzy dedup is skipped when absent or non-positive.
   * @param {string} [data.agentId] - Scopes the fuzzy-dedup search and is stored on the record.
   * @param {string} [data.channelId] - Stored on the record ('' when absent).
   * @param {string} [data.authorId] - Stored on the record ('' when absent).
   * @param {string} [data.sourceType] - Stored on the record ('direct' when absent).
   * @param {string} [data.threadTs] - Stored on the record (null when absent).
   * @param {string} [data.supersedes] - Stored on the record (null when absent).
   * @returns {Promise<object>} `{ error }` on invalid input;
   *   `{ stored: false, deduplicated: true, action: 'exact_match'|'fuzzy_match', ... }`
   *   when a duplicate is found; `{ stored: true, deduplicated: false, id, summary }`
   *   after a successful write.
   */
  async post(data) {
    const { text, dedupThreshold, agentId, channelId, authorId, sourceType, threadTs, supersedes } = data || {};

    if (!text || typeof text !== 'string' || text.trim().length === 0) {
      return { error: 'text is required and must be a non-empty string' };
    }

    log('info', 'Memory store requested', { dedupThreshold, hasDedup: !!dedupThreshold });

    // Normalize (trim + lowercase) so trivial formatting differences still
    // hash to the same digest.
    const contentHash = createHash('sha256').update(text.trim().toLowerCase()).digest('hex');

    // Phase 1 — fast path: exact content-hash match via the indexed attribute.
    for await (const existing of Memory.search({
      select: ['id', 'summary', 'rawText'],
      conditions: { attribute: 'contentHash', comparator: 'equals', value: contentHash },
      limit: 1,
    })) {
      log('info', 'Exact duplicate detected via content hash', { existingId: existing.id });
      return {
        stored: false,
        deduplicated: true,
        action: 'exact_match',
        id: existing.id,
        summary: existing.summary,
      };
    }

    // The embedding is needed both for fuzzy dedup and for the stored record.
    const embedding = await generateEmbedding(text);

    // Phase 2 — fuzzy dedup. The `typeof` check alone covers the original
    // truthiness test (NaN fails `> 0`) and rejects non-numeric input such
    // as the string '0.9'.
    if (typeof dedupThreshold === 'number' && dedupThreshold > 0) {
      const searchParams = {
        select: ['id', 'rawText', 'summary', '$distance'],
        sort: {
          attribute: 'embedding',
          target: embedding,
        },
        limit: 5,
      };

      // Optionally scope the dedup search to a single agent.
      if (agentId) {
        searchParams.conditions = { attribute: 'agentId', comparator: 'equals', value: agentId };
      }

      const potentialDupes = [];
      for await (const record of Memory.search(searchParams)) {
        // Cosine distance lies in [0, 2]; map it to a similarity in [0, 1].
        // `|| 0` treats a missing $distance as a perfect match.
        const similarity = Math.max(0, 1 - (record.$distance || 0) / 2);
        if (similarity >= dedupThreshold) {
          potentialDupes.push({ ...record, similarity });
        }
      }

      if (potentialDupes.length > 0) {
        // Sort explicitly rather than relying on the index's result order
        // so the best candidate is always reported.
        potentialDupes.sort((a, b) => b.similarity - a.similarity);
        const duplicate = potentialDupes[0];
        log('info', 'Memory deduplicated', {
          dedupId: duplicate.id,
          similarity: duplicate.similarity,
          threshold: dedupThreshold,
        });
        return {
          stored: false,
          deduplicated: true,
          action: 'fuzzy_match',
          id: duplicate.id,
          summary: duplicate.summary,
          similarity: duplicate.similarity,
          // NOTE(review): the incoming `supersedes` value is intentionally not
          // applied on the dedup path — confirm this matches caller expectations.
          supersedes: null,
        };
      }
    }

    // No duplicate found (or dedup disabled): classify, then persist.
    // (The original wrapped this single call in Promise.all for no benefit.)
    const classification = await classifyMessage(text);

    const memoryRecord = {
      rawText: text,
      contentHash,
      source: 'api',
      sourceType: sourceType || 'direct',
      channelId: channelId || '',
      channelName: '',
      authorId: authorId || '',
      authorName: '',
      agentId: agentId || null,
      classification: classification.category,
      entities: classification.entities,
      embedding,
      summary: classification.summary,
      timestamp: new Date(),
      threadTs: threadTs || null,
      supersedes: supersedes || null,
      metadata: {
        embedding_model: EMBEDDING_MODEL,
        stored_via: 'memory_store',
        dedup_threshold: dedupThreshold || null,
      },
    };

    await Memory.put(memoryRecord);

    log('info', 'Memory stored', {
      classification: classification.category,
      dedupThreshold,
      contentHash,
    });

    return {
      stored: true,
      deduplicated: false,
      // Assumes Memory.put assigns the generated id onto the record in place
      // — TODO confirm against the table API; otherwise this is always 'generated'.
      id: memoryRecord.id || 'generated',
      summary: memoryRecord.summary,
    };
  }
}

// ---------------------------------------------------------------------------
// 3. Memory Table Extension - Strip embeddings from GET responses
// ---------------------------------------------------------------------------
Expand Down
2 changes: 2 additions & 0 deletions schema.graphql
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ type Memory @table {
classification: String @indexed
entities: Any
embedding: [Float] @indexed(type: "HNSW", distance: "cosine")
contentHash: String @indexed
supersedes: String @indexed
summary: String
timestamp: Date @indexed
threadTs: String
Expand Down
235 changes: 235 additions & 0 deletions test/dedup-store.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
import assert from 'node:assert/strict';
import { describe, it, mock } from 'node:test';

// Shared search mock: default implementation is an empty generator, so every
// Memory.search() yields no records until a test overrides it.
const mockSearch = mock.fn(function*() {});

// Stand-in for the HarperDB Memory table; tests inspect `put` calls and
// control `search` results through `mockSearch`.
class MockMemory {
  static put = mock.fn();
  static search = mockSearch;
  static get = mock.fn();
}

// Must be registered BEFORE the dynamic import of resources.js below, so the
// module under test binds to the mocked 'harperdb' exports.
mock.module('harperdb', {
  namedExports: {
    Resource: class Resource {},
    tables: { Memory: MockMemory, SynapseEntry: class {} },
  },
});

// Tests reassign this to control Anthropic `messages.create` responses.
// NOTE(review): the value is captured when an Anthropic client is
// CONSTRUCTED (`mockClassifyFn || mock.fn()`); if resources.js builds its
// client once at import time, later reassignments here have no effect —
// confirm the client is created per request.
let mockClassifyFn;
mock.module('@anthropic-ai/sdk', {
  defaultExport: class Anthropic {
    constructor() {
      this.messages = {
        create: mockClassifyFn || mock.fn(),
      };
    }
  },
});

// Embedding pipeline mock: `pipeline()` resolves to `mockExtractor`, which
// tests configure to return a fixed Float32Array.
const mockExtractor = mock.fn();
mock.module('@xenova/transformers', {
  namedExports: {
    pipeline: mock.fn(async () => mockExtractor),
  },
});

// resources.js presumably requires this at import time — TODO confirm.
process.env.ANTHROPIC_API_KEY = 'test-key';

// Dynamic import so the mock.module registrations above take effect first.
const { MemoryStore } = await import('../resources.js');

describe('MemoryStore with Deduplication', () => {
  // `mockClassifyFn` is wired directly as Anthropic `messages.create`, so it
  // must return the RAW API response shape ({ content: [{ text }] }) — the
  // original tests wrongly returned a nested { messages: { create } } object.
  const classifyResponse = (category, summary) => async () => ({
    content: [
      {
        text: JSON.stringify({
          category,
          entities: { people: [], projects: [], technologies: [], topics: [] },
          summary,
        }),
      },
    ],
  });

  // MemoryStore.post runs TWO searches: first the contentHash exact-match
  // lookup (conditions only), then the vector dedup search (has `sort`).
  // Yield the record only for the vector search; otherwise the exact-match
  // phase intercepts and the fuzzy-dedup path under test never executes.
  const vectorSearchYields = (record) =>
    function* (params) {
      if (params?.sort) yield record;
    };

  it('returns error for missing text', async () => {
    const store = new MemoryStore();
    const result = await store.post({});

    assert.ok(result.error);
    assert.ok(result.error.includes('text is required'));
  });

  it('returns error for empty text', async () => {
    const store = new MemoryStore();
    const result = await store.post({ text: '' });

    assert.ok(result.error);
  });

  it('stores memory without dedup threshold', async () => {
    mockExtractor.mock.mockImplementation(async () => ({
      data: new Float32Array(384).fill(0.5),
    }));

    mockSearch.mock.mockImplementation(function*() {});

    mockClassifyFn = mock.fn(classifyResponse('decision', 'Test decision'));

    const store = new MemoryStore();
    const result = await store.post({ text: 'This is a new memory' });

    assert.equal(result.stored, true);
    assert.equal(result.deduplicated, false);
    assert.ok(result.summary);
  });

  it('deduplicates when similarity exceeds threshold', async () => {
    mockExtractor.mock.mockImplementation(async () => ({
      data: new Float32Array(384).fill(0.5),
    }));

    const existingRecord = {
      id: 'existing-1',
      rawText: 'Similar memory',
      summary: 'Similar decision',
      $distance: 0.1, // High similarity: 1 - 0.1/2 = 0.95
    };

    // Only the vector search yields; the contentHash lookup must miss.
    mockSearch.mock.mockImplementation(vectorSearchYields(existingRecord));

    const store = new MemoryStore();
    const result = await store.post({
      text: 'Very similar memory',
      dedupThreshold: 0.9,
    });

    assert.equal(result.stored, false);
    assert.equal(result.deduplicated, true);
    assert.equal(result.id, 'existing-1');
    assert.ok(result.similarity >= 0.9);
  });

  it('stores when similarity is below threshold', async () => {
    mockExtractor.mock.mockImplementation(async () => ({
      data: new Float32Array(384).fill(0.5),
    }));

    const dissimilarRecord = {
      id: 'different-1',
      rawText: 'Different memory',
      summary: 'Completely different',
      $distance: 1.5, // Low similarity: 1 - 1.5/2 = 0.25
    };

    mockSearch.mock.mockImplementation(vectorSearchYields(dissimilarRecord));

    mockClassifyFn = mock.fn(classifyResponse('question', 'New question'));

    const store = new MemoryStore();
    const result = await store.post({
      text: 'Unrelated memory',
      dedupThreshold: 0.9,
    });

    assert.equal(result.stored, true);
    assert.equal(result.deduplicated, false);
  });

  it('filters dedup search by agentId when provided', async () => {
    mockExtractor.mock.mockImplementation(async () => ({
      data: new Float32Array(384).fill(0.5),
    }));

    // Capture only the vector search's params; the contentHash lookup also
    // passes `conditions` and would otherwise be inspected by mistake.
    let capturedParams;
    mockSearch.mock.mockImplementation(function*(params) {
      if (params?.sort) capturedParams = params;
    });

    // Set a classify mock so the storage path does not depend on leftover
    // state from a previous test.
    mockClassifyFn = mock.fn(classifyResponse('decision', 'Scoped memory'));

    const store = new MemoryStore();
    await store.post({
      text: 'Test memory',
      dedupThreshold: 0.9,
      agentId: 'agent-xyz',
    });

    assert.ok(capturedParams);
    assert.equal(capturedParams.conditions.attribute, 'agentId');
    assert.equal(capturedParams.conditions.value, 'agent-xyz');
  });

  it('stores metadata including dedup threshold', async () => {
    mockExtractor.mock.mockImplementation(async () => ({
      data: new Float32Array(384).fill(0.5),
    }));

    mockSearch.mock.mockImplementation(function*() {});

    mockClassifyFn = mock.fn(classifyResponse('knowledge', 'Stored knowledge'));

    const store = new MemoryStore();
    await store.post({
      text: 'Knowledge to store',
      dedupThreshold: 0.95,
      agentId: 'agent-123',
    });

    // Earlier tests also call put; inspect the LAST call, not calls[0].
    const callArgs = MockMemory.put.mock.calls.at(-1)?.arguments?.[0];
    assert.ok(callArgs);
    assert.equal(callArgs.metadata.dedup_threshold, 0.95);
    assert.equal(callArgs.agentId, 'agent-123');
    assert.equal(callArgs.metadata.stored_via, 'memory_store');
  });

  it('respects dedup search limit of 5', async () => {
    mockExtractor.mock.mockImplementation(async () => ({
      data: new Float32Array(384).fill(0.5),
    }));

    let capturedParams;
    mockSearch.mock.mockImplementation(function*(params) {
      if (params?.sort) capturedParams = params;
    });

    mockClassifyFn = mock.fn(classifyResponse('decision', 'Limited search'));

    const store = new MemoryStore();
    await store.post({
      text: 'Test memory',
      dedupThreshold: 0.5,
    });

    assert.equal(capturedParams.limit, 5);
  });
});
Loading