In [None]:
# Install/upgrade (quietly) LangChain, its community integrations, and the
# langchain-classic package that the legacy chain/memory imports below come from.
!pip install -qU langchain langchain-community langchain-classic
# Model runtime and tooling: transformers/accelerate for loading the LLM,
# bitsandbytes (presumably for the 4-bit load used below), sentencepiece,
# the Chroma vector store, and the Gradio UI.
!pip install -qU transformers accelerate bitsandbytes sentencepiece chromadb gradio

# Fix PyTorch/Torchvision compatibility issue
# Upgrades torch, torchvision and torchaudio together from the CUDA 12.4 wheel index.
!pip install torch torchvision torchaudio --upgrade --index-url https://download.pytorch.org/whl/cu124

[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m59.1/59.1 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import torch
import gradio as gr
import os
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# ======================================================
# CORRECTED IMPORTS
# ======================================================
# 1. Legacy Chains & Memory (from langchain_classic)
from langchain_classic.chains import ConversationChain
from langchain_classic.memory import ConversationSummaryMemory

# 2. Integrations (from langchain_community)
from langchain_community.llms import HuggingFacePipeline
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# ======================================================
# 1️⃣ Load LLM (Mistral 7B)
# ======================================================
# Hugging Face Hub id of the instruction-tuned checkpoint served by this app.
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2"

print("üîÑ Loading Tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

print("üîÑ Loading Model (4-bit)...")
# Imported here so this section carries its own dependency.
from transformers import BitsAndBytesConfig

# Passing `load_in_4bit=True` straight to `from_pretrained` is the deprecated
# path; quantization options belong in an explicit BitsAndBytesConfig.
# The compute dtype is set to float16 so the 4-bit layers compute in the same
# precision as the fp16 activations requested below, instead of falling back
# to the float32 default.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=quantization_config,
    device_map="auto",  # let accelerate decide where the weights are placed
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)

# Inference only: put the model in evaluation mode.
model.eval()
print("‚úÖ LLM loaded")

# ======================================================
# 2️⃣ HuggingFace Pipeline
# ======================================================
# Generation settings applied to every call made through the pipeline.
_generation_settings = {
    "max_new_tokens": 256,
    "do_sample": True,
    "temperature": 0.7,
    "top_p": 0.9,
    "repetition_penalty": 1.1,
    # Return only the newly generated text, not the prompt echoed back.
    "return_full_text": False,
}

# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    **_generation_settings,
)

# Expose the pipeline to LangChain as an LLM object.
llm = HuggingFacePipeline(pipeline=generation_pipeline)

# ======================================================
# 3️⃣ Persistent Vector Memory (Chroma)
# ======================================================
# Directory handed to Chroma as its persistence location.
PERSIST_DIR = "./chroma_memory"

print("üîÑ Loading Embeddings...")
# Sentence-transformers model used to embed stored and queried text.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Vector store backing the long-term conversation memory.
vectorstore = Chroma(embedding_function=embedding_model, persist_directory=PERSIST_DIR)

print("‚úÖ Persistent vector memory ready")

# ======================================================
# 4️⃣ Summary Memory (LangChain)
# ======================================================
# Summary memory for the dialogue, driven by the same LLM.
summary_memory = ConversationSummaryMemory(llm=llm)

# Conversation chain that answers each turn using the summary memory.
conversation = ConversationChain(verbose=False, memory=summary_memory, llm=llm)

# ======================================================
# 5️⃣ Chat Logic (Hybrid Memory)
# ======================================================
def chat(message, history):
    """Answer one chat turn using summary memory plus retrieved long-term memory.

    The three stored exchanges most similar to ``message`` are fetched from
    the vector store and prepended to the prompt. The conversation chain
    (with its summary memory) generates the reply, and the new exchange is
    written back to the vector store.

    Args:
        message: The user's latest message text.
        history: Chat history passed by the UI callback. Unused here, because
            context comes from the summary memory and the vector store.

    Returns:
        The assistant's reply, or a short prompt asking for input when
        ``message`` is empty or whitespace-only.
    """
    # A blank message has nothing to search for or answer. Bail out before
    # calling the LLM so an empty exchange is not stored in persistent memory.
    if not message or not message.strip():
        return "Please enter a message."

    # Retrieve relevant long-term memories.
    docs = vectorstore.similarity_search(message, k=3)
    retrieved_memory = "\n".join(d.page_content for d in docs)

    augmented_input = f"""
Relevant past memories:
{retrieved_memory}

Current message:
{message}
"""

    # Generate the response. The chain's memory records this turn as a side
    # effect of predict().
    # NOTE(review): the chain receives the augmented prompt, so retrieved
    # memories presumably get folded into the running summary too - confirm
    # that this is intended.
    response = conversation.predict(input=augmented_input)

    # Save the raw exchange (not the augmented prompt) to vector memory.
    vectorstore.add_texts(
        texts=[f"User: {message}\nAssistant: {response}"]
    )

    # .persist() is not strictly needed in new Chroma versions but is safe to
    # keep; only call it when the installed wrapper still provides it.
    if hasattr(vectorstore, "persist"):
        vectorstore.persist()

    return response

# ======================================================
# 6️⃣ Gradio UI
# ======================================================
# Build the chat UI around the `chat` callback defined above.
demo = gr.ChatInterface(
    chat,
    description="LangChain ConversationSummaryMemory + Chroma (FREE Colab GPU)",
    title="üß† LLM with Summary + Persistent Memory (Mistral 7B)",
)

# Start the app with share=True.
demo.launch(share=True)