# 1. Get a Data Loader


In [None]:
from langchain.document_loaders import TextLoader
from langchain.schema import Document

# Path to the plain-text source document (placeholder; point it at the real file).
file_path = 'your book.txt'

# Decode explicitly as UTF-8, matching the other file reads in this notebook;
# without it TextLoader falls back to the platform's default encoding, which
# breaks on non-ASCII text on systems whose locale is not UTF-8.
loader = TextLoader(file_path, encoding='utf-8')

# Read the file into LangChain documents.
text_documents = loader.load()



In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Cut the loaded documents into chunks (chunk_size=1000) that share a small
# overlap (chunk_overlap=30) with their neighbours.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=30,
)
documents = text_splitter.split_documents(text_documents)

# Notebook-style preview of the first chunk.
documents[0]

In [None]:
from sentence_transformers import SentenceTransformer
from langchain.embeddings.base import Embeddings as BaseEmbeddings
from typing import List

class SentenceTransformerEmbeddings(BaseEmbeddings):
    """LangChain embeddings adapter backed by a ``SentenceTransformer`` model."""

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """Load the sentence-transformers model identified by *model_name*."""
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Encode every entry of *texts* and return the vectors as nested lists."""
        vectors = self.model.encode(texts, convert_to_tensor=False)
        return vectors.tolist()

    def embed_query(self, text: str) -> List[float]:
        """Encode a single query string and return its vector as a list."""
        # The text is wrapped in a one-element batch; only that row is returned.
        batch = self.model.encode([text], convert_to_tensor=False)
        return batch.tolist()[0]


# Shared embedding model instance used by the cells below.
# 'your model name' is a placeholder: replace it with a real sentence-transformers model name or path.
embedding = SentenceTransformerEmbeddings(model_name='your model name')

In [None]:
# Read the chunk file, keeping one entry per line with surrounding
# whitespace (including the trailing newline) stripped.
with open('semantic_chunks.txt', 'r', encoding='utf-8') as file:
    lines = [line.strip() for line in file]

In [None]:
from langchain.vectorstores import FAISS
from langchain.docstore import InMemoryDocstore
from langchain.docstore.document import Document
from typing import Tuple

# Embed every chunk in one call.
document_embeddings = embedding.embed_documents(lines)

# Pair each chunk with its vector as (text, embedding) tuples.
text_embedding_pairs: List[Tuple[str, List]] = list(
    zip(lines, document_embeddings)
)

# Build the FAISS vector store from the precomputed pairs; the embedding
# object is passed along as well.
vectorstore = FAISS.from_embeddings(
    text_embeddings=text_embedding_pairs,
    embedding=embedding,
)

修改数据集格式

In [None]:
import csv

# Source and destination CSV paths (placeholders).
input_file = 'your dataset.csv'
output_file = 'output dataset.csv'

# Copy the CSV row by row, surrounding every field with square brackets.
with open(input_file, mode='r', encoding='utf-8') as infile, \
        open(output_file, mode='w', newline='', encoding='utf-8') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)
    writer.writerows(
        [f'[{field}]' for field in row]
        for row in reader
    )

In [None]:
from langchain.prompts import ChatPromptTemplate
import pandas as pd
# Load the evaluation dataset.
df = pd.read_csv("your dataset.csv")

# Optional (disabled): work on a reproducible 100-row sample instead.
# sample_df = df.sample(n=100, random_state=1)
# print(sample_df)
# df = sample_df

# The 'question' column supplies the retrieval queries.
query = df['question']

In [None]:
import time

results = []
all_contexts = []

# For every question, run a similarity search and keep the text of each hit.
for q in query:
    start_time = time.time()  # NOTE: assigned but not read anywhere in this cell
    hits = vectorstore.similarity_search(q)
    all_contexts.append([hit.page_content for hit in hits])

# 3. Make a RAG pipeline


In [None]:
from langchain.prompts import ChatPromptTemplate


# Disabled: would merge all retrieved contexts into a single string.
#final_context = "\n\n".join(all_contexts)
#print(final_context)

# Chinese-language prompt with two placeholders, {context} and {question}.
# It asks the model to answer from the supplied context only, to be short and
# direct, to give no justification, to reply "don't know" when the context has
# no answer, and to avoid phrases such as "according to the context".
PROMPT_TEMPLATE = """
根据以下上下文回答问题：
{context}
根据上述上下文回答问题：{question}。
提供直接简单明了的答案。
不要为你的答案提供理由。
不要提供上下文信息中未提到的信息,如果你在上下文中找不到，请回答不知道。
不要说“根据上下文”或“在上下文中提到”或类似的话。
"""

# Build the chat prompt template from the raw string above.
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

In [None]:
import os
from langchain_openai import OpenAIEmbeddings,ChatOpenAI
# Export the OpenAI credentials and endpoint (both values are placeholders).
os.environ.update({
    'OPENAI_API_KEY': "your api key",
    'OPENAI_API_BASE': "your api base",
})

# Chat model used to generate the answers.
llm = ChatOpenAI(model='local_model or online_model')

In [None]:
import time

responses = []
results = []
all_contexts = []
total_time = 0
query_count = 0
model = llm  # alias bound once instead of on every iteration

# Answer every question with retrieval-augmented generation and time each
# retrieve-then-generate round trip.
for q in query:
    start_time = time.perf_counter()

    # Retrieve the chunks most similar to this question.
    result = vectorstore.similarity_search(q)
    context_text = [doc.page_content for doc in result]
    all_contexts.append(context_text)

    # Build the prompt from THIS question's chunks only. Passing the running
    # all_contexts list would leak the chunks of earlier questions into the
    # prompt and make it grow with every iteration.
    prompt = prompt_template.format(
        context="\n\n".join(context_text),
        question=q,
    )
    response_text = model.predict(prompt)

    end_time = time.perf_counter()
    total_time += end_time - start_time
    query_count += 1
    responses.append(response_text)

# Guard against an empty query set, which would otherwise divide by zero.
if query_count:
    avg_time = total_time / query_count
    print(f"Average time taken: {avg_time:.2f} seconds")
else:
    avg_time = 0.0
    print("No queries were processed.")



In [None]:
# Sanity check: number of generated answers. It is expected to equal the number of rows in df,
# because responses is assigned as a df column in the following cell.
len(responses)

In [None]:
# Attach the generated answers and the retrieved contexts as new columns.
for column_name, column_values in (
    ('answer', responses),
    ("retrival_contexts", all_contexts),
):
    df[column_name] = column_values

# Show the resulting column set, then write the dataset to disk without the index.
print("Existing columns:", df.columns)
df.to_csv("naive_rag_output_dataset.csv", index=False)