In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
from langchain_teddynote import logging

# 프로젝트 이름을 입력합니다.
logging.langsmith("LangGraph-QA")

LangSmith 추적을 시작합니다.
[프로젝트명]
LangGraph-QA


# 데이터 전처리

## 데이터 로드

In [3]:
import os
import glob
import utils

ROOT_PATH = "C:/Users/skyop/JaehoNote_2/fastcampus-code-qa_copy/langgraph-main"

# 가져올 문서 경로
libs_path = ROOT_PATH + "/libs"
docs_path = ROOT_PATH + "/docs"
examples_path = ROOT_PATH + "/examples"

all_repos = [
    libs_path,
    docs_path,
    examples_path,
]

여러가지 파일형식을 다루는 GenericLoader(ex. text, txt, html, markdown, csv, json, )
```python
from langchain_community.document_loaders.generic import GenericLoader
```

In [4]:
from langchain_text_splitters import Language
from langchain_community.document_loaders.generic import GenericLoader 
from langchain_community.document_loaders.parsers import LanguageParser
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import DirectoryLoader

# 파이썬 파일 로드: 코드 30줄 이상인 py파일만 파싱
def load_python_files(repos):
    documents = []
    for repo in repos:
        loader = GenericLoader.from_filesystem(
            repo,
            glob="**/*",
            suffixes=[".py"],
            parser=LanguageParser(language=Language.PYTHON, parser_threshold=30), 
        )
        documents.extend(loader.load())
    print(f".py 파일의 개수: {len(documents)}")
    return documents

# 마크다운 파일 로드
def load_markdown_files(repos):
    documents = []
    for repo in repos:
        try:
            loader = DirectoryLoader(
                repo,
                glob="**/*.md",
                loader_cls=TextLoader,
                loader_kwargs={"encoding": "utf-8"},
                recursive=True, # 재귀적으로 탐색: 현재 디렉토리와 모든 하위 디렉토리에서 .md 파일을 찾습니다.
            )
            documents.extend(loader.load())
        except Exception as e:
            print(f"Error loading markdown files from {repo}: {e}")
            continue
    print(f".md 파일의 개수: {len(documents)}")
    return documents

### 노트북 파일 로드 (컨버트 필요)
def load_notebook_files(root_path):
    # ipynb파일 찾기
    notebook_files = glob.glob(root_path + "/**/*.ipynb", recursive=True)
    
    # 노트북 파일을 마크다운 파일로 변환 (ipynb파일은 셀로 되있기 때문에, 마크다운 셀로 변환)
    notebook_md_files = []
    for f in notebook_files:
        converted_file = utils.convert_notebook_to_md(f)
        notebook_md_files.append(converted_file)

    # 변환된 마크다운 로드
    documents = []
    for f in notebook_md_files:
        loader = TextLoader(f, encoding="utf-8")
        docs = loader.load()
        # md파일을 ipynb로 변환 (마크다운 셀이 포함된채 저장)
        for doc in docs:
            doc.metadata["source"] = doc.metadata["source"].replace(".md", ".ipynb")
        documents.extend(docs)
        os.remove(f) # 변환된 마크다운 파일은 처리 후 삭제

    print(f".ipynb 파일의 개수: {len(documents)}")
    return documents

# 모든 문서 로드
py_documents = load_python_files(all_repos)
md_documents = load_markdown_files(all_repos)
notebook_documents = load_notebook_files(ROOT_PATH)


# 마크다운 파일 로드
# def load_markdown_files(repos):
#     documents = []
#     for repo in repos:
#         try:
#             loader = DirectoryLoader(
#                 repo,
#                 glob="**/*.md",
#                 loader_cls=TextLoader,
#                 loader_kwargs={"encoding": "utf-8"},
#                 recursive=True, # 재귀적으로 탐색: 현재 디렉토리와 모든 하위 디렉토리에서 .md 파일을 찾습니다.
#             )
#             documents.extend(loader.load())
#         except Exception as e:
#             print(f"Error loading markdown files from {repo}: {e}")
#             continue
#     print(f".md 파일의 개수: {len(documents)}")
#     return documents


.py 파일의 개수: 1661
.md 파일의 개수: 161
.ipynb 파일의 개수: 245


In [5]:
all_docs = py_documents + md_documents + notebook_documents

len(all_docs)

2067

## 데이터 분할(Chunking)

In [6]:
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

# python 전용 splitter
pythion_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, chunk_size=2000, chunk_overlap=200
)

# 일반 텍스트용 splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000, chunk_overlap=200
)


split_py_documents = pythion_splitter.split_documents(py_documents)
split_md_documents = text_splitter.split_documents(md_documents)
split_notebook_documents = text_splitter.split_documents(notebook_documents)

In [7]:
# 모든 문서를 합칩니다.
split_docs = split_md_documents + split_md_documents + split_notebook_documents
len(split_docs)

2599

# SAVE: Qdant Vector Store

In [8]:
from langchain_ollama import OllamaEmbeddings
from langchain_qdrant import QdrantVectorStore, FastEmbedSparse, RetrievalMode
from qdrant_client import QdrantClient, models
from qdrant_client.http.models import Distance, VectorParams, SparseVectorParams

In [9]:
client = QdrantClient(host="localhost", port=6333)

collection_name = "LangChain_LangGraph_QA"

dense_embedding = OllamaEmbeddings(model="bge-m3")
sparse_embedding = FastEmbedSparse(model_name="Qdrant/bm25")


In [10]:
client.create_collection(
    collection_name=collection_name,
    vectors_config={"dense": VectorParams(size=1024, distance=Distance.COSINE)},
    sparse_vectors_config={"sparse": SparseVectorParams(index=models.SparseIndexParams(on_disk=False))},
)

True

In [11]:
qdrant = QdrantVectorStore(
    client=client,
    collection_name=collection_name,
    embedding=dense_embedding,
    sparse_embedding=sparse_embedding,
    retrieval_mode=RetrievalMode.HYBRID,
    vector_name="dense",
    sparse_vector_name="sparse"
)

qdrant.add_documents(split_docs)

['c935ae34f5f34879a1e0257f5b4c7c93',
 'c2bfcdef7384493bb5792243b01027f7',
 'df6458a933764854b55d6cc84334b90b',
 'f4adc17498cb42bd9bde53c36170599c',
 '3b1ec1d36fda4693bcf910c9db7d9240',
 'e4ff146a69984a14a55b01f3c74b4832',
 'e6906eefc39d42afbd77379f9f70a1f3',
 '075f0063154844c59722e77f431487fa',
 '0cc2989dc43240809ab7dc649bda8bdf',
 'e06c8fd334b24472a3acdcb985782335',
 '0b62126810fc4e1e823781592aa603f6',
 '1ec577d7f5e847f5890ee9955de71d88',
 '43824864e989496387671feb2aa490f7',
 '5eb8890ff84048f78f3a4085a0cc031b',
 '03bb16f3adc249009eb69f1dd66365f9',
 '1cab492283cf4cfc86ca37b549f437c8',
 '4b1ae2a6187c40d0b3d92b447c2ad8e9',
 '9ca50430040a461083e42d77cbed1f4c',
 '2725341109fd4fbfbeee1952d2432866',
 'a7aa89d77fe94c89808633a6802b1f5a',
 'cf0a652e0ae74af6acc54f32ace1bafb',
 'dbae0b88c3f1485fbcebc9a2e32f6203',
 '737f2499d12345aaaf172b1dbbb372e3',
 'f4a56b9389014519bf6bf31d6ef72478',
 '7e1171721ff4408faf9a604f54f3fa69',
 '6b16c16deaac4eeab4a333b5761e659f',
 '7cad41d4d6bb4430be9514cd35729333',
 

In [12]:
response = qdrant.as_retriever(search_kwargs={"k": 4})

In [13]:
query = "self-rag"
print(response.invoke(query)[0].page_content)

# Self RAG

Self-RAG is a strategy for RAG that incorporates self-reflection / self-grading on retrieved documents and generations. 

[Paper](https://arxiv.org/abs/2310.11511)

![Screenshot 2024-04-01 at 12.41.50 PM.png](attachment:15cba0ab-a549-4909-8373-fb761e384eff.png)

# Environment 

```python
%pip install -qU langchain-pinecone langchain-openai langchainhub langgraph
```

### Tracing

Use [LangSmith](https://docs.smith.langchain.com/) for tracing (shown at bottom)

```python
import os

os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_API_KEY"] = "<your-api-key>"
```

```python
import os

os.environ["LANGCHAIN_PROJECT"] = "pinecone-devconnect"
```

## Retriever
 
Let's use Pinecone's sample movies database

```python
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore

# use pinecone movies database

# Add to vectorDB
vectorstore = PineconeVectorStore