# LangChain: Q&A over Documents

## Q&A over Documents with OllamaEmbeddings and FAISS

In [None]:
%pip install -q docarray==0.41.0

In [5]:
from langchain.document_loaders import CSVLoader
from langchain_ollama import OllamaEmbeddings
from langchain.vectorstores import FAISS


In [6]:
# 1️⃣ Load the CSV file
# The utf-8-sig codec decodes UTF-8 and drops a leading byte-order mark if the
# file starts with one.
loader = CSVLoader(file_path="OutdoorClothingCatalog_1000.csv", encoding="utf-8-sig")
docs = loader.load()
# Print how many documents the loader produced, as a quick sanity check.
total_docs = len(docs)
print(f"总文档数: {total_docs}")

总文档数: 4


In [None]:
%pip install -qU faiss-cpu 

In [10]:
# 2️⃣ Define the embedding model
# NOTE(review): "phi3:mini" is the same tag the chat model below uses; a
# dedicated embedding model may retrieve better — confirm before relying on it.
embeddings = OllamaEmbeddings(model="phi3:mini")

# 3️⃣ Build the FAISS vector store from the loaded documents
vectorstore = FAISS.from_documents(docs, embeddings)

# 4️⃣ Persist the index to local disk (the "faiss_index" path is reloaded in the next cell)
vectorstore.save_local("faiss_index")


In [11]:
# Reload the index that the previous cell saved under "faiss_index", using the
# same embeddings object that built it.
# NOTE(review): allow_dangerous_deserialization=True presumably enables
# pickle-based loading of the stored docstore — acceptable only because this
# notebook wrote the files itself; never enable it for an index from an
# untrusted source.
vectorstore = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

In [12]:
from langchain_ollama import ChatOllama
from langchain.chains import RetrievalQA

# Chat model that writes the final answer; temperature 0.0 is passed explicitly.
llm = ChatOllama(model="phi3:mini", temperature=0.0)

# search_kwargs is handed to the retriever; k=3 presumably caps the number of
# retrieved documents per query — confirm against the LangChain docs.
qa = RetrievalQA.from_chain_type(
    llm=llm,  # this must be an actual LLM instance
    retriever=vectorstore.as_retriever(search_kwargs={"k": 3})
)

query = "Tell me about ultradurable recycled Waterhog dog mat"
answer = qa.invoke(query)
# Prints the whole returned dict; the recorded output shows 'query' and 'result' keys.
print(answer)


{'query': 'Tell me about ultradurable recycled Waterhog dog mat', 'result': "The Ultradurable Recycled Waterhog Dog Mat is an eco-friendly product designed to protect your floors from spills and splashes caused by pets. Made right here in the USA, this rugged mat boasts a unique chevron weave design that not only adds style but also enhances its functionality.\n\nThe Waterhog dog mat is constructed using 24 oz of polyester fabric made from recycled materials - specifically, it's composed of 94% post-consumer plastic waste which helps reduce landfill contributions and ocean pollution while promoting a circular economy approach to product manufacturing.\n\nThe rugged mat features thick and thin fibers that work together effectively; the thicker ones scrape dirt off your pet's paws, whereas the finer strands absorb water quickly - this dual-purpose design ensures cleanliness while also keeping muddy footprints at bay.\n\nThe Waterhog dog mat is not only durable but it’s designed to withst

## Similarity with Sentence Transformers


In [18]:
%pip install -Uq transformers sentence-transformers torch

In [1]:
# Report the installed versions of the libraries this section depends on.
# importlib.metadata is used instead of piping `pip list` through `findstr`,
# because `findstr` only exists on Windows and the cell failed elsewhere.
from importlib.metadata import PackageNotFoundError, version

for pkg in ("sentence-transformers", "torch", "transformers"):
    try:
        print(f"{pkg:<25} {version(pkg)}")
    except PackageNotFoundError:
        # Keep reporting the remaining packages instead of aborting the cell.
        print(f"{pkg:<25} (not installed)")

sentence-transformers     5.1.0
torch                     2.8.0
transformers              4.55.4


In [1]:
%pip install -Uq hf_xet

# Report the installed hf-xet version in a platform-independent way
# (`findstr` is a Windows-only command).
from importlib.metadata import PackageNotFoundError, version

try:
    print(f"{'hf-xet':<25} {version('hf-xet')}")
except PackageNotFoundError:
    print(f"{'hf-xet':<25} (not installed)")

hf-xet                    1.1.9


In [3]:
from sentence_transformers import SentenceTransformer

# Load the sentence-transformers model used for the similarity demo.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

sentences = [
    "That is a happy person",
    "That is a happy dog",
    "That is a very happy person",
    "Today is a sunny day"
]
# Stored under its own name so the LangChain `embeddings` object that the FAISS
# cells rely on is not silently replaced by a raw array of vectors.
sentence_embeddings = model.encode(sentences)

# Compare every sentence with every other sentence (and with itself).
similarities = model.similarity(sentence_embeddings, sentence_embeddings)
print(similarities.shape)
# Expected: torch.Size([4, 4]) — one row and one column per sentence

torch.Size([4, 4])


In [4]:
# Show the full matrix: entry [i][j] compares sentences[i] with sentences[j].
print(similarities)

tensor([[1.0000, 0.6946, 0.9429, 0.2569],
        [0.6946, 1.0000, 0.6211, 0.2491],
        [0.9429, 0.6211, 1.0000, 0.2106],
        [0.2569, 0.2491, 0.2106, 1.0000]])


## Q&A with Sentence Transformers

In [None]:
from langchain.document_loaders import CSVLoader

# Same file and codec as the first loading cell; utf-8-sig also drops a leading
# byte-order mark, which plain utf-8 would leave attached to the first column name.
loader = CSVLoader(file_path="OutdoorClothingCatalog_1000.csv", encoding="utf-8-sig")
# Load here so this section works on a fresh kernel instead of relying on a
# `docs` variable left behind by an earlier section.
docs = loader.load()


In [10]:
# The original OpenAI-based version of this cell (OpenAIEmbeddings + ChatOpenAI)
# used to be kept here inside a bare string literal. It was dead code, so it has
# been removed; the code below is its local equivalent, built on
# SentenceTransformer embeddings and an Ollama-served phi3 model.
 
from sentence_transformers import SentenceTransformer
import ollama
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.embeddings.base import Embeddings
from IPython.display import display, Markdown

# Custom Embeddings subclass that wraps a SentenceTransformer model
class SentenceTransformerEmbeddings(Embeddings):
    """Embeddings implementation backed by a local SentenceTransformer model."""

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        # Load the model once; both embed methods reuse this instance.
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        """Encode `texts` and return the result converted with `.tolist()`."""
        vectors = self.model.encode(texts)
        return vectors.tolist()

    def embed_query(self, text):
        """Encode a single `text` and return its vector converted with `.tolist()`."""
        # Encode as a one-element batch, then unwrap the only row.
        batch = self.model.encode([text])
        first_row = batch[0]
        return first_row.tolist()

# Use SentenceTransformer in place of OpenAIEmbeddings
embeddings = SentenceTransformerEmbeddings()
# Sanity check: embed one sentence, then print the vector length and its first five components.
embed = embeddings.embed_query("Hi my name is Harrison")
print(len(embed))
print(embed[:5])

# NOTE(review): `docs` is not created in this cell — it must already exist in
# the kernel (the loader cell of this section only builds `loader`).
db = DocArrayInMemorySearch.from_documents(
    docs, 
    embeddings
)
query = "Please suggest a shirt with sunblocking"
# NOTE(review): this rebinds `docs` from the indexed corpus to the search hits,
# so re-running the cell would index only those hits. The prompt-building code
# further down reads this rebound `docs`.
docs = db.similarity_search(query)

# Bare expressions in the middle of a cell are evaluated but not displayed.
len(docs)

docs[0]

# Retriever over the same store; the RetrievalQA cell later in this section uses it.
retriever = db.as_retriever()

# Call a local Ollama model (phi3:3.8b by default) in place of ChatOpenAI
def call_phi3(prompt, model='phi3:3.8b', temperature=0.0):
    """Send `prompt` to an Ollama model and return the generated text.

    Args:
        prompt: Full prompt text forwarded to `ollama.generate`.
        model: Ollama model tag. Defaults to 'phi3:3.8b', the value that was
            previously hard-coded, so existing one-argument calls are unchanged.
        temperature: Value forwarded as the 'temperature' option. Defaults to
            0.0, matching the previous hard-coded setting.

    Returns:
        The value stored under the 'response' key of the reply.
    """
    response = ollama.generate(
        model=model,
        prompt=prompt,
        options={
            'temperature': temperature,
        }
    )
    return response['response']

# Concatenate the text of every document in `docs` (no separator) to form the prompt context.
qdocs = "".join(doc.page_content for doc in docs)
# The question is appended directly after the context; the two adjacent
# literals form one string with no line break in between.
response = call_phi3(
    f"{qdocs} Question: Please list all your "
    "shirts with sun protection in a table in markdown and summarize each one."
)
display(Markdown(response))

384
[-0.06155332177877426, -0.06207887455821037, -0.018952982500195503, 0.04829912260174751, -0.028553958982229233]




| Shirt Name                           | Sun Protection Rating | Fabric Composition               | Additional Features                                      | Care Instructions       | Imported (Y/N) | Summary                                                                                                   |
|--------------------------------------|----------------------|---------------------------------|---------------------------------------------------------|------------------------|---------------|----------------------------------------------------------------------------------------------------------|
| Infant and Toddler Girls' Coastal Chill Swimsuit, Two-Piece  | UPF 50+               | Four-way stretch chlorine resistant fabric with a crossover no-slip strap. Lined bottom for maximum coverage.    | Bright colors, ruffles and whimsical prints                      | Machine wash & line dry| Y             | A two-piece swimsuit designed to keep its shape while providing high sun protection; suitable for infants/toddlers with a fun design that's easy on the skin.  |
| Refresh Swimwear, V-Neck Tankini Contrasts   | UPF 50+               | Recycled nylon and Lycra spandex blend; lined in recycled materials for breathability and quick drying with abrasion resistance.    | Colorblock style that moves well during activities like swimming or SUP riding                      | Handwash & line dry  | Y             | A tankini top made from eco-friendly, stretchy fabric offering sun protection; designed for comfort in water sports and casual wear with a colorful look.    |
| Women's Campside Oxfords              | N/A                  | Soft canvas material over EVA innersole with antimicrobial odor control featuring camping motifs on the liner, moderate arch contour for support and cushioning from a rubber outsole.    | Super-soft feel; lace-toe design without added bulk                      | N/A (not applicable)  | Y             | A comfortable pair of women's camping boots with quality construction, antimicrobial odor control and vintage motifs for a classic look suitable for outdoor activities.   |
| Recycled Waterhog Dog Mat, Chevron Weave    | N/A                  | Polyester fabric made from recycled materials; rubber backing with an exclusive design featuring thick and thin fibers to scrape dirt and absorb water     | Quick-drying properties resistant to fading, rotting, mildew and shedding   | Vacuum or hose clean  | Y             | A durable dog mat made from recycled plastics for indoor/outdoor use that protects floors by absorbing water and dirt while being eco-friendly.                      |

Please note: The table above assumes the availability of sun protection ratings, which are not provided in all product descriptions except where explicitly mentioned (e.g., UPF 50+ for swimwear). Additionally, some products may have features that contribute to overall comfort and care but do not specifically mention UV or SPF rating; these should be considered when evaluating sun protection capabilities of clothing items like the Infant & Toddler Girls' Swimsuit.

In [19]:
from langchain.chains import RetrievalQA
from langchain_ollama import ChatOllama

# Chat model used by the chain; temperature 0.0 is passed explicitly.
llm = ChatOllama(temperature=0.0, model="phi3:3.8b")
# `retriever` is the one created from the DocArrayInMemorySearch store earlier in this section.
# NOTE(review): chain_type="stuff" presumably places all retrieved documents
# into a single prompt — confirm against the LangChain docs.
# verbose=True makes the chain log when it is entered and finished.
qa_stuff = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=retriever, 
    verbose=True
)

In [18]:
# Question for the RetrievalQA chain. The trailing backslash continues the
# string literal onto the next line without inserting a newline.
query =  "Please list all your shirts with sun protection in a table \
in markdown and summarize each one."

In [20]:
# Run the chain on `query`; it was built with verbose=True, so it logs when it starts and finishes.
response = qa_stuff.invoke(query)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


In [26]:
# Render only the generated answer (the 'result' entry of the response) as Markdown.
display(Markdown(response.get('result')))

| Shirt Name                      | Sun Protection Rating | Description Summary                                                                                   | Imported (Y/N) | Size & Fit Notes                                                                     | Additional Features                                                                                                       | Fabric & Care Details                                                                                  |
|---------------------------------|----------------------|-------------------------------------------------------------------------------------------------------|----------------|-------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------|
| Infant and Toddler Girls' Coastal Chill Swimsuit, Two-Piece | UPF 50+               | Bright colors with ruffles for a whimsical look; chlorine resistant fabric that retains shape.             | Y              | Fits snugly on toddlers                                                                | No slip straps and fully lined bottom ensure secure fit and maximum coverage. Machine wash, line dry recommended.  | Four-way stretch cotton blend with UPF protection; machine washable for durability.                        |
| Refresh Swimwear, V-Neck Tankini Contrasts | Not specified         | Colorblock style tankini top that moves well and is comfortable while providing sun protection up to 98%.   | Y              | Fits close with racerback straps for ease of use                                        | Lightweight racerback straps, V-neck silhouette enhance comfort. Made from recycled materials where possible.    | Recycled nylon and Lycra spandex blend; quick drying and abrasion resistant with UPF protection up to 50+. Handwash recommended.|

Please note that the "Women's Campside Oxfords" do not provide sun protection, so they are omitted from this table as per your request for shirts only. The Recycled Waterhog Dog Mat is also excluded because it does not qualify as a shirt and therefore doesn't offer direct sun protection to the wearer but rather protects floors against spills and splashes, which indirectly contributes to maintaining cleanliness in outdoor environments.

## Similarity with OllamaEmbeddings

In [None]:
import ollama
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class SimilarityOllamaEmbeddings:
    """Small wrapper that gets embeddings from Ollama and compares them."""

    def __init__(self, model_name="minilm-embed"):
        # Tag of the Ollama model used for every embedding request.
        self.model_name = model_name

    def _embed_one(self, text):
        """Request the embedding of a single text and return its 'embedding' entry."""
        reply = ollama.embeddings(model=self.model_name, prompt=text)
        return reply['embedding']

    def encode(self, sentences):
        """Encode one sentence or a collection of sentences into embedding vectors.

        A single string is treated as a one-element batch. Returns a NumPy array
        built from the per-sentence vectors, or None (after printing the error)
        as soon as embedding any sentence fails.
        """
        batch = [sentences] if isinstance(sentences, str) else sentences

        vectors = []
        for text in batch:
            try:
                vectors.append(self._embed_one(text))
            except Exception as e:
                # Best-effort by design: report the failure and give up on the whole batch.
                print(f"编码句子时出错: {e}")
                return None

        return np.array(vectors)

    def similarity(self, embeddings1, embeddings2):
        """Return the cosine similarity between two sets of embedding vectors."""
        scores = cosine_similarity(embeddings1, embeddings2)
        return scores

# Use Ollama in place of sentence-transformers
model = SimilarityOllamaEmbeddings("all-minilm-l6-v2:latest")  # use the Ollama model you created

sentences = [
    "That is a happy person",
    "That is a happy dog",
    "That is a very happy person",
    "Today is a sunny day"
]

embeddings = model.encode(sentences)
# encode() returns None when embedding any sentence failed, so guard before comparing.
if embeddings is not None:
    similarities = model.similarity(embeddings, embeddings)
    print(similarities.shape)
    print("相似度矩阵:")
    print(similarities)

(4, 4)
相似度矩阵:
[[1.         0.67739325 0.94911729 0.22696173]
 [0.67739325 1.         0.6087489  0.2149515 ]
 [0.94911729 0.6087489  1.         0.1910378 ]
 [0.22696173 0.2149515  0.1910378  1.        ]]


![Additional methods](image.png)