In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
from langchain_community.document_loaders import PyPDFLoader

In [3]:
pdf_docs = PyPDFLoader("..\data\Sustainability_report_2024_kr.pdf").load()

In [4]:
len(pdf_docs)

83

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [6]:
rec_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

chunk_docs = rec_splitter.split_documents(pdf_docs)
len(chunk_docs)

207

In [7]:
print(chunk_docs[0])

page_content='A Journey Towards  
a Sustainable Future
삼성전자 지속가능경영보고서 2024' metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 15.1 (Macintosh)', 'creationdate': '2024-11-25T11:10:32+09:00', 'moddate': '2024-11-25T11:10:46+09:00', 'trapped': '/False', 'source': '..\\data\\Sustainability_report_2024_kr.pdf', 'total_pages': 83, 'page': 0, 'page_label': '1'}


In [8]:
for item in chunk_docs:
    item.metadata.pop("creator", None)
    item.metadata.pop("producer", None)


chunk_docs[0].metadata

{'creationdate': '2024-11-25T11:10:32+09:00',
 'moddate': '2024-11-25T11:10:46+09:00',
 'trapped': '/False',
 'source': '..\\data\\Sustainability_report_2024_kr.pdf',
 'total_pages': 83,
 'page': 0,
 'page_label': '1'}

In [9]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [10]:
dim_size = len(embeddings.embed_query("안녕하세요"))
dim_size 

3072

In [11]:
# uv add faiss-cpu 

In [12]:
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

In [13]:
db = FAISS.from_documents(documents=[chunk_docs[0]], embedding=embeddings)

In [14]:
db.index_to_docstore_id

{0: '5ed652c3-6cac-48f1-8116-2ecab1303a03'}

In [15]:
db.docstore.__dict__['_dict']

{'5ed652c3-6cac-48f1-8116-2ecab1303a03': Document(id='5ed652c3-6cac-48f1-8116-2ecab1303a03', metadata={'creationdate': '2024-11-25T11:10:32+09:00', 'moddate': '2024-11-25T11:10:46+09:00', 'trapped': '/False', 'source': '..\\data\\Sustainability_report_2024_kr.pdf', 'total_pages': 83, 'page': 0, 'page_label': '1'}, page_content='A Journey Towards  \na Sustainable Future\n삼성전자 지속가능경영보고서 2024')}

In [16]:
db.similarity_search("삼성전자",k=10)

[Document(id='5ed652c3-6cac-48f1-8116-2ecab1303a03', metadata={'creationdate': '2024-11-25T11:10:32+09:00', 'moddate': '2024-11-25T11:10:46+09:00', 'trapped': '/False', 'source': '..\\data\\Sustainability_report_2024_kr.pdf', 'total_pages': 83, 'page': 0, 'page_label': '1'}, page_content='A Journey Towards  \na Sustainable Future\n삼성전자 지속가능경영보고서 2024')]

In [17]:
vectorstore_db_path = '../7_vercorstore/samsung_faiss.db'
index_name = 'samsung2025'
db.save_local(
    folder_path = vectorstore_db_path,
    index_name = index_name 
)

In [18]:
# 저장된 db 불러오기 
updated_db = FAISS.load_local(
    folder_path = vectorstore_db_path,
    index_name = index_name,
    embeddings=embeddings,
    allow_dangerous_deserialization=True
)

In [19]:
updated_db.docstore.__dict__['_dict']

{'5ed652c3-6cac-48f1-8116-2ecab1303a03': Document(id='5ed652c3-6cac-48f1-8116-2ecab1303a03', metadata={'creationdate': '2024-11-25T11:10:32+09:00', 'moddate': '2024-11-25T11:10:46+09:00', 'trapped': '/False', 'source': '..\\data\\Sustainability_report_2024_kr.pdf', 'total_pages': 83, 'page': 0, 'page_label': '1'}, page_content='A Journey Towards  \na Sustainable Future\n삼성전자 지속가능경영보고서 2024')}

### 문서 추가하기 

In [20]:
chunk_docs[1:10]

[Document(metadata={'creationdate': '2024-11-25T11:10:32+09:00', 'moddate': '2024-11-25T11:10:46+09:00', 'trapped': '/False', 'source': '..\\data\\Sustainability_report_2024_kr.pdf', 'total_pages': 83, 'page': 1, 'page_label': '2'}, page_content='A Journey Towards  \na Sustainable Future\n삼성전자 지속가능경영보고서 2024\nCEO 메시지\n회사 소개\n이해관계자 소통\nOur Company\n04\n05\n06\n준법과 윤리경영\nPrinciple\n53\n중대성 평가\nMateriality Assessment\n08\n임직원\n공급망\n사회공헌\n개인정보보호/보안\n고객의 안전/품질\nPeople\n31\n39\n45\n48\n50\n경제성과\n사회성과\n환경성과\n지역별 수자원 현황   \n사업부문별 환경성과\nFacts & Figures\n56\n57\n62\n65\n66\n독립된 인증인의 인증보고서\nScope 1, 2 온실가스 배출량 검증 의견서\nScope 3 온실가스 배출량 검증 의견서\nGRI Index\nTCFD 대조표\nSASB 대조표\n전사차원의 기후변화 대응 협력 활동\nAbout This Report\nAppendix\n70\n71\n72\n74\n77\n79\n81\n82\n[DX부문] \n추진체계 및 주요성과\n기후변화\n자원순환\n수자원 및 오염물질\n[DS부문]  \n추진체계 및 주요성과 \n기후변화\n수자원\n폐기물\n오염물질\nPlanet\n12\n13\n15\n17\n19\n20\n23\n26\n28\n삼성전자 지속가능경영보고서 2024 02Our Company AppendixMateriality Assessment Facts & Figures PrinciplePlanet People'),
 Doc

In [21]:
updated_db.add_documents(
    chunk_docs[1:10]
)

['90735d07-8b13-4629-ac1e-393699ddb10d',
 'b9673e09-f71b-4dc1-9392-836ab745662a',
 '1677bc5d-652b-4597-92d8-9b05b207f58f',
 'b143af4e-76fb-49c1-a43d-435536d468f3',
 '9b84e0a3-75e8-4ffa-a2da-9c27f9bc76b8',
 '3ba56c7c-caf6-4bfd-88f8-3df6ebb18275',
 '41c00b59-1489-4ad1-b8ff-34f0bcf2fe76',
 '23fe30c2-28b4-43c6-a136-f747b8cf0918',
 'a3044650-b37c-4c7b-902b-c04025f824be']

In [22]:
updated_db.index_to_docstore_id

{0: '5ed652c3-6cac-48f1-8116-2ecab1303a03',
 1: '90735d07-8b13-4629-ac1e-393699ddb10d',
 2: 'b9673e09-f71b-4dc1-9392-836ab745662a',
 3: '1677bc5d-652b-4597-92d8-9b05b207f58f',
 4: 'b143af4e-76fb-49c1-a43d-435536d468f3',
 5: '9b84e0a3-75e8-4ffa-a2da-9c27f9bc76b8',
 6: '3ba56c7c-caf6-4bfd-88f8-3df6ebb18275',
 7: '41c00b59-1489-4ad1-b8ff-34f0bcf2fe76',
 8: '23fe30c2-28b4-43c6-a136-f747b8cf0918',
 9: 'a3044650-b37c-4c7b-902b-c04025f824be'}

In [23]:
chunk_docs[2]

Document(metadata={'creationdate': '2024-11-25T11:10:32+09:00', 'moddate': '2024-11-25T11:10:46+09:00', 'trapped': '/False', 'source': '..\\data\\Sustainability_report_2024_kr.pdf', 'total_pages': 83, 'page': 2, 'page_label': '3'}, page_content='Our Company\n04    CEO 메시지\n05    회사 소개\xa0\n06    이해관계자 소통\n삼성전자 지속가능경영보고서 2024 03Our Company AppendixMateriality Assessment Facts & Figures PrinciplePlanet People')

In [24]:
from langchain_core.documents import Document

updated_db.add_documents(
    [
        Document(page_content="안녕하세요. 반갑습니다.", metadata={"source": "수동"}),
        Document(page_content="2024년 삼성 전자 주식 사지마세요", metadata={"source": "윤택한"})
    ]
)

['be3e063b-e6af-41b8-8070-e56c67657057',
 '7c30adfb-92c4-421c-9231-17fa9f1cfeb0']

In [25]:
updated_db.index_to_docstore_id

{0: '5ed652c3-6cac-48f1-8116-2ecab1303a03',
 1: '90735d07-8b13-4629-ac1e-393699ddb10d',
 2: 'b9673e09-f71b-4dc1-9392-836ab745662a',
 3: '1677bc5d-652b-4597-92d8-9b05b207f58f',
 4: 'b143af4e-76fb-49c1-a43d-435536d468f3',
 5: '9b84e0a3-75e8-4ffa-a2da-9c27f9bc76b8',
 6: '3ba56c7c-caf6-4bfd-88f8-3df6ebb18275',
 7: '41c00b59-1489-4ad1-b8ff-34f0bcf2fe76',
 8: '23fe30c2-28b4-43c6-a136-f747b8cf0918',
 9: 'a3044650-b37c-4c7b-902b-c04025f824be',
 10: 'be3e063b-e6af-41b8-8070-e56c67657057',
 11: '7c30adfb-92c4-421c-9231-17fa9f1cfeb0'}

In [27]:
updated_db.delete([ '23fe30c2-28b4-43c6-a136-f747b8cf0918'])
updated_db.index_to_docstore_id

{0: '5ed652c3-6cac-48f1-8116-2ecab1303a03',
 1: '90735d07-8b13-4629-ac1e-393699ddb10d',
 2: 'b9673e09-f71b-4dc1-9392-836ab745662a',
 3: '1677bc5d-652b-4597-92d8-9b05b207f58f',
 4: 'b143af4e-76fb-49c1-a43d-435536d468f3',
 5: '9b84e0a3-75e8-4ffa-a2da-9c27f9bc76b8',
 6: '3ba56c7c-caf6-4bfd-88f8-3df6ebb18275',
 7: '41c00b59-1489-4ad1-b8ff-34f0bcf2fe76',
 8: 'a3044650-b37c-4c7b-902b-c04025f824be',
 9: 'be3e063b-e6af-41b8-8070-e56c67657057',
 10: '7c30adfb-92c4-421c-9231-17fa9f1cfeb0'}

In [28]:
FAISS.save_local(
    updated_db,
    folder_path = vectorstore_db_path,
    index_name = index_name
)

In [29]:
test_db = FAISS.load_local(
    folder_path = vectorstore_db_path,
    index_name = index_name,
    embeddings=embeddings,
    allow_dangerous_deserialization=True
)


벡터 스토어 합치기 
- 물리적으로 합치기
- 검색기만 하이브리드로 사용 

In [30]:
db1 = FAISS.from_documents(documents=chunk_docs[:10], embedding=embeddings)
db2 = FAISS.from_documents(documents=chunk_docs[10:20], embedding=embeddings)

In [31]:
db1.index_to_docstore_id

{0: '79df7af9-a2f9-4dfa-a9e2-85674ca2f499',
 1: '77de1f34-10f8-4a79-abcc-c42049c96871',
 2: 'b094a3a1-dd4a-4796-9e46-bfdfe434d6cb',
 3: '4c50aa8b-cad3-4a90-8ea0-b2f4bf6e9b16',
 4: '53a86c4e-9693-413b-a88d-559cfd0c8b99',
 5: '09b983ab-2729-42f0-913d-af3960375773',
 6: 'f08976cf-b60b-4b04-9471-0f57685eef7a',
 7: '1980dd32-2900-481d-a09c-b2aa3fdd4d27',
 8: '4d870d7f-454d-492f-9ea9-2444b33ee263',
 9: 'ac298646-0758-406d-9b24-1cf34f20ea43'}

In [32]:
db2.index_to_docstore_id

{0: '60bca951-c41c-4447-9d45-f98498c7d8ed',
 1: 'e4bc31b8-b0be-45b8-bfdd-3d5bae8b41e3',
 2: 'f751d00a-e813-499d-b8f6-23916a37bdd2',
 3: 'd549acff-1130-4602-839d-b238b8a1385b',
 4: '679e0b24-1caf-4af0-9575-dc81ee12468b',
 5: '9e8fcad7-5b6d-4682-8b8d-b0a11ebcf47e',
 6: 'ec2681e2-6825-471c-9bdd-5b39a103b4c7',
 7: 'ed30906e-a6a4-4b41-8317-95d83f340114',
 8: 'd3121c96-d6dc-42b1-b210-6c7c92576afa',
 9: '8a8bddac-040b-47fb-bd7e-3a90177b1039'}

In [33]:
#1. 새로운 공간에 합친 데이터베이스 만들기 -> DB3
#2. db1 에 db2 를 합쳐버리기

In [34]:
db1.merge_from(
    target=db2
)
db1.index_to_docstore_id

{0: '79df7af9-a2f9-4dfa-a9e2-85674ca2f499',
 1: '77de1f34-10f8-4a79-abcc-c42049c96871',
 2: 'b094a3a1-dd4a-4796-9e46-bfdfe434d6cb',
 3: '4c50aa8b-cad3-4a90-8ea0-b2f4bf6e9b16',
 4: '53a86c4e-9693-413b-a88d-559cfd0c8b99',
 5: '09b983ab-2729-42f0-913d-af3960375773',
 6: 'f08976cf-b60b-4b04-9471-0f57685eef7a',
 7: '1980dd32-2900-481d-a09c-b2aa3fdd4d27',
 8: '4d870d7f-454d-492f-9ea9-2444b33ee263',
 9: 'ac298646-0758-406d-9b24-1cf34f20ea43',
 10: '60bca951-c41c-4447-9d45-f98498c7d8ed',
 11: 'e4bc31b8-b0be-45b8-bfdd-3d5bae8b41e3',
 12: 'f751d00a-e813-499d-b8f6-23916a37bdd2',
 13: 'd549acff-1130-4602-839d-b238b8a1385b',
 14: '679e0b24-1caf-4af0-9575-dc81ee12468b',
 15: '9e8fcad7-5b6d-4682-8b8d-b0a11ebcf47e',
 16: 'ec2681e2-6825-471c-9bdd-5b39a103b4c7',
 17: 'ed30906e-a6a4-4b41-8317-95d83f340114',
 18: 'd3121c96-d6dc-42b1-b210-6c7c92576afa',
 19: '8a8bddac-040b-47fb-bd7e-3a90177b1039'}

In [35]:
#만약에 나는 전혀 다른 DB3 에다가 만들고 싶어

In [36]:
db3 = FAISS(
    docstore = InMemoryDocstore(),
    index_to_docstore_id={},
    embedding_function=embeddings,
    index = faiss.IndexFlatL2(dim_size)
)

In [37]:
len(db3.index_to_docstore_id)

0

In [38]:
db3.merge_from(
    target = db1
)

db3.index_to_docstore_id

{0: '79df7af9-a2f9-4dfa-a9e2-85674ca2f499',
 1: '77de1f34-10f8-4a79-abcc-c42049c96871',
 2: 'b094a3a1-dd4a-4796-9e46-bfdfe434d6cb',
 3: '4c50aa8b-cad3-4a90-8ea0-b2f4bf6e9b16',
 4: '53a86c4e-9693-413b-a88d-559cfd0c8b99',
 5: '09b983ab-2729-42f0-913d-af3960375773',
 6: 'f08976cf-b60b-4b04-9471-0f57685eef7a',
 7: '1980dd32-2900-481d-a09c-b2aa3fdd4d27',
 8: '4d870d7f-454d-492f-9ea9-2444b33ee263',
 9: 'ac298646-0758-406d-9b24-1cf34f20ea43',
 10: '60bca951-c41c-4447-9d45-f98498c7d8ed',
 11: 'e4bc31b8-b0be-45b8-bfdd-3d5bae8b41e3',
 12: 'f751d00a-e813-499d-b8f6-23916a37bdd2',
 13: 'd549acff-1130-4602-839d-b238b8a1385b',
 14: '679e0b24-1caf-4af0-9575-dc81ee12468b',
 15: '9e8fcad7-5b6d-4682-8b8d-b0a11ebcf47e',
 16: 'ec2681e2-6825-471c-9bdd-5b39a103b4c7',
 17: 'ed30906e-a6a4-4b41-8317-95d83f340114',
 18: 'd3121c96-d6dc-42b1-b210-6c7c92576afa',
 19: '8a8bddac-040b-47fb-bd7e-3a90177b1039'}