In [1]:
from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline
from transformers import BitsAndBytesConfig

# Load the model weights in 4-bit NF4 form with double quantization; compute runs in float16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=True,
)

# Build a text-generation pipeline around the Gemma instruction-tuned checkpoint.
llm = HuggingFacePipeline.from_model_id(
    model_id="google/gemma-3-1b-it",
    task="text-generation",
    pipeline_kwargs=dict(
        # cap on the number of tokens generated per call
        max_new_tokens=512,
        # presumably makes the pipeline return only the newly generated text, not the prompt — confirm
        return_full_text=False,
    ),
    model_kwargs={
        "quantization_config": quantization_config,
    },
    # NOTE(review): the recorded output shows this argument being reset to None because the
    # quantized model is already placed on the GPU when it is loaded.
    device="cuda"
)

# Chat-style wrapper around the pipeline; every later cell talks to the model through this object.
chat_model = ChatHuggingFace(llm=llm)

`low_cpu_mem_usage` was None, now default to True since model is quantized.
Setting the `device` argument to None from cuda to avoid the error caused by attempting to move the model that was already loaded on the GPU using the Accelerate module to the same or another device.
Device set to use cuda:0


# Working Memory

In [2]:
from langchain_core.messages import HumanMessage, SystemMessage

system_prompt = SystemMessage("You are a helpful AI Assistant. Answer the User's queries succinctly in one sentence.")

# Working memory: the full transcript, seeded with the system prompt, is resent on every turn.
messages = [system_prompt]

while True:
    user_message = HumanMessage(input("\nUser: "))

    # Typing "exit" (in any casing) ends the session without recording that message.
    if user_message.content.lower() == "exit":
        break

    messages.append(user_message)
    print("User Message: ", user_message.content)

    # The model sees the whole history, which is what lets it refer back to earlier turns.
    response = chat_model.invoke(messages)
    print("\nLLM Response: ", response.content)

    # Keep the reply so the next turn can build on it.
    messages.append(response)

User Message:  Hello!

LLM Response:  Hello there! How can I help you today?
User Message:  what's my name? 

LLM Response:  Your name is Alex!
User Message:  my name is Ahmed

LLM Response:  Your name is Ahmed!
User Message:  do you know my name now? what is it ?

LLM Response:  Yes, I do! Your name is Ahmed.


In [3]:
# Dump the stored history, numbering messages from 1 and labelling each with its role.
# enumerate replaces the index loop; the old `i += 1` had no effect because the for
# statement rebinds `i` on every iteration.
for i, message in enumerate(messages, start=1):
    print(f"\nMessage {i} - {message.type.upper()}: ", message.content)


Message 1 - SYSTEM:  You are a helpful AI Assistant. Answer the User's queries succinctly in one sentence.

Message 2 - HUMAN:  Hello!

Message 3 - AI:  Hello there! How can I help you today?

Message 4 - HUMAN:  what's my name? 

Message 5 - AI:  Your name is Alex!

Message 6 - HUMAN:  my name is Ahmed

Message 7 - AI:  Your name is Ahmed!

Message 8 - HUMAN:  do you know my name now? what is it ?

Message 9 - AI:  Yes, I do! Your name is Ahmed.


# Episodic Memory

In [4]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser

# Prompt for the reflection step: asks the model to condense a finished conversation into a
# JSON object with context_tags, conversation_summary, what_worked and what_to_avoid.
# The doubled braces are presumably escapes that render as literal braces, leaving
# {conversation} as the only template variable — confirm against the template format in use.
# NOTE(review): the wording is framed around research-paper discussions, while the sample
# conversation in this notebook is small talk; the recorded reflection output mentions a
# "research context" that never came up.
reflection_prompt_template = """
You are analyzing conversations about research papers to create memories that will help guide future interactions. Your task is to extract key elements that would be most helpful when encountering similar academic discussions in the future.

Review the conversation and create a memory reflection following these rules:

1. For any field where you don't have enough information or the field isn't relevant, use "N/A"
2. Be extremely concise - each string should be one clear, actionable sentence
3. Focus only on information that would be useful for handling similar future conversations
4. Context_tags should be specific enough to match similar situations but general enough to be reusable

Output valid JSON in exactly this format:
{{
    "context_tags": [              // 2-4 keywords that would help identify similar future conversations
        string,                    // Use field-specific terms like "deep_learning", "methodology_question", "results_interpretation"
        ...
    ],
    "conversation_summary": string, // One sentence describing what the conversation accomplished
    "what_worked": string,         // Most effective approach or strategy used in this conversation
    "what_to_avoid": string        // Most important pitfall or ineffective approach to avoid
}}

Examples:
- Good context_tags: ["transformer_architecture", "attention_mechanism", "methodology_comparison"]
- Bad context_tags: ["machine_learning", "paper_discussion", "questions"]

- Good conversation_summary: "Explained how the attention mechanism in the BERT paper differs from traditional transformer architectures"
- Bad conversation_summary: "Discussed a machine learning paper"

- Good what_worked: "Using analogies from matrix multiplication to explain attention score calculations"
- Bad what_worked: "Explained the technical concepts well"

- Good what_to_avoid: "Diving into mathematical formulas before establishing user's familiarity with linear algebra fundamentals"
- Bad what_to_avoid: "Used complicated language"

Additional examples for different research scenarios:

Context tags examples:
- ["experimental_design", "control_groups", "methodology_critique"]
- ["statistical_significance", "p_value_interpretation", "sample_size"]
- ["research_limitations", "future_work", "methodology_gaps"]

Conversation summary examples:
- "Clarified why the paper's cross-validation approach was more robust than traditional hold-out methods"
- "Helped identify potential confounding variables in the study's experimental design"

What worked examples:
- "Breaking down complex statistical concepts using visual analogies and real-world examples"
- "Connecting the paper's methodology to similar approaches in related seminal papers"

What to avoid examples:
- "Assuming familiarity with domain-specific jargon without first checking understanding"
- "Over-focusing on mathematical proofs when the user needed intuitive understanding"

Do not include any text outside the JSON object in your response.

Here is the prior conversation:

{conversation}
"""

# Wrap the raw template string in a chat prompt object.
reflection_prompt = ChatPromptTemplate.from_template(reflection_prompt_template)

# Reflection chain: fill the prompt -> query the chat model -> parse the reply as JSON.
reflect = reflection_prompt | chat_model | JsonOutputParser()

In [5]:
def format_conversation(messages):
    """Render a message history as a plain-text transcript without system prompts.

    Args:
        messages: iterable of message objects exposing ``type`` and ``content``.

    Returns:
        One ``"TYPE: content"`` line per non-system message, joined with newlines
        (an empty string when nothing is left).
    """
    # Filter by role instead of slicing off index 0: a positional slice silently drops a
    # real turn whenever the history does not start with a system prompt.
    return "\n".join(
        f"{message.type.upper()}: {message.content}"
        for message in messages
        if message.type != "system"
    )

# Build the plain-text transcript of the chat above; it is the input to the reflection chain.
conversation = format_conversation(messages)

print(conversation)

HUMAN: Hello!
AI: Hello there! How can I help you today?
HUMAN: what's my name? 
AI: Your name is Alex!
HUMAN: my name is Ahmed
AI: Your name is Ahmed!
HUMAN: do you know my name now? what is it ?
AI: Yes, I do! Your name is Ahmed.


In [6]:
# Run the reflection chain on the transcript. The chain ends in JsonOutputParser, so the
# result is the parsed JSON object rather than raw model text.
reflection = reflect.invoke({"conversation": conversation})

print(reflection)

{'context_tags': ['name', 'user_identification'], 'conversation_summary': "The system confirmed the user's name as Alex and then identified the user as Ahmed.", 'what_worked': 'Using a simple confirmation and a direct name identification.', 'what_to_avoid': 'Assuming prior knowledge of the research context.'}


## Setting Up our Database

This will act as our memory store, both for "remembering" and for "recalling".

We will be using Weaviate with Ollama embeddings, both running in Docker containers. See docker-compose.yml for additional details.


In [7]:
import weaviate

# Connect to the Weaviate instance using the client's local-connection defaults and report readiness.
vdb_client = weaviate.connect_to_local()
print("Connected to Weviate: ", vdb_client.is_ready())

# One-time setup, run from a shell rather than this notebook: pull the embedding model
# inside the Ollama container.
# docker exec -it agentic-memory-ollama-1 ollama pull nomic-embed-text

            We encourage you to update your code to use the async client instead when running inside async def functions!


Connected to Weviate:  True


In [8]:
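# Uncomment and run to drop the existing "episodic_memory" collection, e.g. before re-running the creation cell below.
# vdb_client.collections.delete("episodic_memory")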

In [9]:
from weaviate.classes.config import Property, DataType, Configure, Tokenization

# Create the collection that holds one reflection record per stored conversation.
vdb_client.collections.create(
    name="episodic_memory",
    description="Collection containing historical chat interactions and takeaways.",
    vectorizer_config=[
        Configure.NamedVectors.text2vec_ollama(
            # Name kept as-is so anything already targeting this vector keeps working.
            name="title_vector",
            # Embed fields that exist on the stored objects. The previous value, "title",
            # is not among the properties declared below, so the vector never reflected
            # the conversation content.
            source_properties=["conversation", "conversation_summary"],
            api_endpoint="http://ollama:11434",  # Use the Docker service name instead of host.docker.internal
            model="nomic-embed-text",
        )
    ],
    properties=[
        Property(name="conversation", data_type=DataType.TEXT),
        Property(name="context_tags", data_type=DataType.TEXT_ARRAY),
        Property(name="conversation_summary", data_type=DataType.TEXT),
        Property(name="what_worked", data_type=DataType.TEXT),
        Property(name="what_to_avoid", data_type=DataType.TEXT),
    ]
)

<weaviate.collections.collection.sync.Collection at 0x7bb269c157e0>

In [11]:
def add_episodic_memory(messages, vdb_client):
    """Reflect on a conversation and store the result in the "episodic_memory" collection.

    Args:
        messages: chat history; passed to ``format_conversation`` to build the transcript.
        vdb_client: Weaviate client used to look up the collection.
    """
    conversation = format_conversation(messages)
    reflection = reflect.invoke({"conversation": conversation})

    # The reflection is model-generated JSON, so a field can be missing, null, or a bare
    # string where a list is expected. Normalise instead of failing with KeyError or
    # sending a wrong shape for the TEXT_ARRAY property; "N/A" mirrors the placeholder
    # the reflection prompt itself asks for.
    context_tags = reflection.get("context_tags") or []
    if isinstance(context_tags, str):
        context_tags = [context_tags]

    episodic_memory = vdb_client.collections.get("episodic_memory")

    episodic_memory.data.insert({
        "conversation": conversation,
        "context_tags": [str(tag) for tag in context_tags],
        "conversation_summary": reflection.get("conversation_summary") or "N/A",
        "what_worked": reflection.get("what_worked") or "N/A",
        "what_to_avoid": reflection.get("what_to_avoid") or "N/A",
    })

# Store the working-memory chat from earlier in the notebook as the first episodic memory.
add_episodic_memory(messages, vdb_client)

In [12]:
def episodic_recall(query, vdb_client, alpha=0.5, limit=1):
    """Look up stored conversations in the "episodic_memory" collection with a hybrid query.

    Args:
        query: text to search for.
        vdb_client: client whose ``collections.get`` returns the collection.
        alpha: weighting passed to the hybrid query (default 0.5, the value this
            function previously hard-coded).
        limit: maximum number of results requested (default 1, as before).

    Returns:
        The response object returned by the collection's ``query.hybrid`` call, unchanged.
    """
    # load db collection
    episodic_memory = vdb_client.collections.get("episodic_memory")

    # Hybrid Semantic/BM25 Retrieval
    return episodic_memory.query.hybrid(
        query=query,
        alpha=alpha,
        limit=limit,
    )

# Example lookup against the memory stored above.
query = "Talking about my name"

memory = episodic_recall(query, vdb_client)

# Show the stored fields of the best-matching memory.
memory.objects[0].properties

{'what_worked': 'Using a simple, direct confirmation and reiteration of the identified name.',
 'conversation_summary': "The system confirmed the user's name as 'Ahmed', providing a clear identification.",
 'context_tags': ['name', 'user_input', 'research_topic'],
 'conversation': "HUMAN: Hello!\nAI: Hello there! How can I help you today?\nHUMAN: what's my name? \nAI: Your name is Alex!\nHUMAN: my name is Ahmed\nAI: Your name is Ahmed!\nHUMAN: do you know my name now? what is it ?\nAI: Yes, I do! Your name is Ahmed.",
 'what_to_avoid': 'Assuming the user already understands the specific research context.'}

In [13]:
def episodic_system_prompt(query, vdb_client, conversations, what_worked, what_to_avoid):
    """Build a system prompt enriched with the stored memory that best matches ``query``.

    Args:
        query: text used to look up the most similar stored conversation.
        vdb_client: client handed through to ``episodic_recall``.
        conversations: list of transcripts recalled so far; extended in place.
        what_worked: set of "what worked" takeaways gathered so far; updated in place.
        what_to_avoid: set of "what to avoid" takeaways gathered so far; updated in place.

    Returns:
        Tuple ``(SystemMessage, conversations, what_worked, what_to_avoid)``.
    """
    memory = episodic_recall(query, vdb_client)

    # The store may hold no memories yet; indexing objects[0] would then raise IndexError.
    # In that case the prompt is built from whatever has been accumulated so far.
    if memory.objects:
        recalled = memory.objects[0].properties
        current_conversation = recalled['conversation']

        # remember each recalled transcript only once
        if current_conversation not in conversations:
            conversations.append(current_conversation)

        # takeaways are stored as prose; split into sentences so duplicates collapse in the sets
        what_worked.update(recalled['what_worked'].split('. '))
        what_to_avoid.update(recalled['what_to_avoid'].split('. '))
    else:
        current_conversation = ""

    # Up to three earlier recalled transcripts, leaving out the one matched right now
    previous_convos = [conv for conv in conversations[-4:] if conv != current_conversation][-3:]

    # Create prompt with accumulated history
    episodic_prompt = f"""You are a helpful AI Assistant. Answer the user's questions to the best of your ability.
    You recall similar conversations with the user, here are the details:
    
    Current Conversation Match: {current_conversation}
    Previous Conversations: {' | '.join(previous_convos)}
    What has worked well: {' '.join(what_worked)}
    What to avoid: {' '.join(what_to_avoid)}
    
    Use these memories as context for your response to the user."""

    return SystemMessage(content=episodic_prompt), conversations, what_worked, what_to_avoid

In [14]:
# storage for accumulated memories
conversations = []
what_worked = set()
what_to_avoid = set()

# storage for historical message history
messages = []

while True:
    user_input = input("\nUser: ")
    user_message = HumanMessage(content=user_input)
    
    # generate new system prompt
    system_prompt, conversations, what_worked, what_to_avoid = episodic_system_prompt(user_input, vdb_client, conversations, what_worked, what_to_avoid)
    
    # reconstruct messages list with new system prompt first
    messages = [
        system_prompt,  # new system prompt always first
        *[msg for msg in messages if not isinstance(msg, SystemMessage)]  # old messages except system
    ]

    if user_input.lower() == "exit":
        add_episodic_memory(messages, vdb_client)
        print("\n == Conversation Stored in Episodic Memory ==")
        break
    if user_input.lower() == "exit_quiet":
        print("\n == Conversation Exited ==")
        break

    print("User Message: ", user_input)
    
    # add new user message
    messages.append(user_message)

    # pass entire message sequence to LLM to generate response
    response = chat_model.invoke(messages)
    print("\nAI Message: ", response.content)
    
    # add LLM's response to message list
    messages.append(response)


AI Message:  Hello there! How can I help you today?

AI Message:  I don’t know your name. I’m just Alex! 😊

AI Message:  You are absolutely right! My apologies – I completely missed that. You’re right, I should remember that. 

You told me your name was Ahmed. 😊



You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



AI Message:  As a large language model, I don’t have access to personal information like your football club preferences. I’m sorry I can’t help with that! 

To tell me which club you're a fan of, you could tell me:

*   **Your favorite team:** (e.g., "I’m a fan of Manchester United")
*   **A specific club:** (e.g., "I’m a fan of Liverpool")



AI Message:  Okay, great! Glad to hear it. 😊

AI Message:  Arsenal are based in London, specifically at Highbury Road. They play their home matches at the Emirates Stadium.

AI Message:  You’re very welcome! Glad I could help. Is there anything else I can do for you today?


RuntimeError: p.attn_bias_ptr is not correctly aligned

In [15]:
# Dump the final history, numbering messages from 1 and labelling each with its role.
# enumerate replaces the index loop; the old `i += 1` had no effect because the for
# statement rebinds `i` on every iteration.
for i, message in enumerate(messages, start=1):
    print(f"\nMessage {i} - {message.type.upper()}: ", message.content)


Message 1 - SYSTEM:  You are a helpful AI Assistant. Answer the user's questions to the best of your ability.
    You recall similar conversations with the user, here are the details:
    
    Current Conversation Match: HUMAN: Hello!
AI: Hello there! How can I help you today?
HUMAN: what's my name? 
AI: Your name is Alex!
HUMAN: my name is Ahmed
AI: Your name is Ahmed!
HUMAN: do you know my name now? what is it ?
AI: Yes, I do! Your name is Ahmed.
    Previous Conversations: 
    What has worked well: Using a simple, direct confirmation and reiteration of the identified name.
    What to avoid: Assuming the user already understands the specific research context.
    
    Use these memories as context for your response to the user.

Message 2 - HUMAN:  hello

Message 3 - AI:  Hello there! How can I help you today?

Message 4 - HUMAN:  do you know my name? 

Message 5 - AI:  I don’t know your name. I’m just Alex! 😊

Message 6 - HUMAN:  i told you about my name in an earlier conversatio

# Semantic Memory

Semantic memory represents our structured knowledge of facts, concepts, and their relationships - essentially what we "know" rather than what we "remember experiencing." This type of memory allows us to understand and interact with the world by accessing our accumulated knowledge. For a chatbot, semantic memory would consist of its knowledge base and retrieval system, containing documentation, technical information, and general knowledge that can be accessed to provide accurate and informed responses.

# Procedural Memory

Procedural memory differs from working, semantic, and episodic memory in that it covers how we actually remember to perform tasks or follow a familiar routine, e.g. riding a bike or typing on a keyboard. It's the "how to do things" type of memory, distinct from factual knowledge (semantic) or specific experiences (episodic). This memory system enables us to execute complex sequences of actions without conscious recall of each individual step.

You can refer to Adam's repo for the code for Semantic and Procedural Memories, but they are very similar to Episodic Memory!
https://github.com/ALucek/agentic-memory