# install

In [None]:
pip install -qU python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install -qU langchain-community

Note: you may need to restart the kernel to use updated packages.


In [None]:
from bs4 import BeautifulSoup
from collections import Counter
from dotenv import load_dotenv
import faiss
import getpass
from IPython import get_ipython
import json
from langchain.docstore.document import Document
from langchain.schema import Document
from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter,
)
from langchain.vectorstores import Chroma
from langchain_upstage import UpstageEmbeddings
import numpy as np
import openai
import os
import pandas as pd
import re
import uuid
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Names of the environment variables that hold the API keys. Every value
# starts as None; load_env() fills them in.
API_KEYS = dict.fromkeys(
    ("UPSTAGE_API_KEY", "LANGCHAIN_API_KEY", "TAVILY_API_KEY")
)

def load_env():
    """Load the API keys named in ``API_KEYS`` and return their values.

    In Google Colab the keys are taken from the process environment, falling
    back to Colab's ``userdata`` secrets for any key that is not set yet (the
    secret is then exported into ``os.environ``). Anywhere else, ``.env`` is
    loaded with ``load_dotenv()`` and the keys are read from the environment.

    Side effects:
        Updates the module-level ``API_KEYS`` dict in place and may add
        entries to ``os.environ``.

    Returns:
        tuple: The key values in the order the keys are declared in
        ``API_KEYS``. Outside Colab a missing key yields ``None``.
    """
    # Running in Google Colab.
    if "google.colab" in str(get_ipython()):
        from google.colab import userdata

        for key in API_KEYS:
            # Only consult the Colab secret store when the variable is not
            # already exported, so a key that is present in the environment
            # never depends on a matching Colab secret being configured.
            if key not in os.environ:
                secret = userdata.get(key)
                if secret is not None:
                    os.environ[key] = secret
            API_KEYS[key] = os.environ.get(key)

    # Running in a local Jupyter notebook.
    else:
        load_dotenv()  # read the .env file into the environment
        for key in API_KEYS:
            API_KEYS[key] = os.environ.get(key)

    return tuple(API_KEYS.values())

# Load the API key values and bind them to module-level names. The unpacking
# order follows the key order declared in API_KEYS.
UPSTAGE_API_KEY, LANGCHAIN_API_KEY, TAVILY_API_KEY = load_env()


# DB

In [None]:
# Read the two documents.json dumps (from the 사진 and pdf folders); both
# files are opened before either one is parsed.
with open(r'C:\Users\wnsgu\Desktop\upstage\cookbook\file\사진\documents.json', 'r', encoding='utf-8') as f1, \
     open(r'C:\Users\wnsgu\Desktop\upstage\cookbook\file\pdf\documents.json', 'r', encoding='utf-8') as f2:
    docs1, docs2 = json.load(f1), json.load(f2)

# Merge both sources, then keep only the entries that carry a 'content'
# field; entries without one are reported and skipped.
combined_docs = docs1 + docs2
documents = []
for doc in combined_docs:
    if 'content' not in doc:
        print(f"문서에 'content'가 없습니다: {doc}")
        continue
    documents.append({'content': doc['content'], 'metadata': doc.get('metadata', {})})

In [None]:
# Load the two CSV inputs. Both paths are absolute and machine-specific, so
# they must be adjusted before running this cell on another machine.
youth_policies_df = pd.read_csv(r'C:\Users\wnsgu\Desktop\upstage\youth_policies_new.csv')
df_sorted = pd.read_csv(r'C:\Users\wnsgu\Desktop\upstage\cookbook\Solar-Fullstack-LLM-101\df_sorted.csv')

In [None]:
# Remove control characters and the '▶' bullet, and normalise whitespace.
def clean_special_characters(text):
    """Return *text* without control characters or '▶', whitespace-normalised.

    Tabs, newlines and other whitespace control characters are converted to
    spaces *before* the remaining control characters are deleted, so that
    words on adjacent lines stay separated instead of being glued together.
    Runs of whitespace are then collapsed to a single space and the result is
    stripped.

    Args:
        text: The string to clean.

    Returns:
        str: The cleaned, single-line string.
    """
    # Whitespace-like control characters become a separator, not nothing.
    text = re.sub(r'[\t\n\r\f\v]', ' ', text)
    # Drop the remaining control characters and the '▶' marker.
    text = re.sub(r'[\x00-\x1F▶]', '', text)
    # Collapse consecutive whitespace into one space.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Strip HTML tags, then tidy up special characters.
def clean_html(content):
    """Return the text of *content* with HTML markup removed and cleaned.

    The markup is parsed with BeautifulSoup's ``html.parser`` and the
    extracted text is passed through ``clean_special_characters``.
    """
    plain_text = BeautifulSoup(content, 'html.parser').get_text()
    return clean_special_characters(plain_text)  # extra pre-processing
# Clean the raw documents and wrap them as Document objects.
def chunking(docs):
    """Clean each raw document and return it as a ``Document``.

    Despite the name, no splitting happens here: every input document yields
    exactly one ``Document`` whose ``page_content`` is the HTML-stripped,
    cleaned text. ``clean_html`` already applies ``clean_special_characters``,
    so the text is cleaned once rather than twice.

    Args:
        docs: Iterable of dicts with a ``'content'`` key and an optional
            ``'metadata'`` dict.

    Returns:
        list[Document]: One document per input entry. The metadata dict is
        passed through as-is (not copied); entries without metadata get an
        empty dict.

    Raises:
        KeyError: If an entry has no ``'content'`` key.
    """
    return [
        Document(
            page_content=clean_html(doc['content']),
            metadata=doc.get('metadata', {}),
        )
        for doc in docs
    ]

# Clean every loaded document and wrap it as a Document (one per input entry).
split_docs = chunking(documents)

In [None]:
# Enrich each document with the youth_policies_df row whose '정책 ID' equals
# the document's 'title' metadata.
policy_id_column = youth_policies_df['정책 ID']
for doc in split_docs:
    title = doc.metadata.get('title')
    matches = youth_policies_df[policy_id_column == title]

    if matches.empty:
        # No matching policy: fall back to the title as the policy ID, leave
        # the policy name empty, and do NOT append any "Additional
        # Information" (there is no row to take it from).
        doc.metadata['정책 ID'] = title
        doc.metadata['정책명'] = ''
        continue

    # Several rows may share an ID; use the first one.
    matching_row = matches.iloc[0]

    # Add the policy ID and policy name to the existing metadata.
    doc.metadata['정책 ID'] = matching_row['정책 ID']
    doc.metadata['정책명'] = matching_row['정책명']

    # Append the full CSV row as JSON after a fixed delimiter; later cells
    # split the content on this exact "\n\nAdditional Information:\n" marker.
    additional_content = matching_row.to_dict()
    doc.page_content = (
        f"{doc.page_content}\n\nAdditional Information:\n"
        f"{json.dumps(additional_content, ensure_ascii=False, indent=2)}"
    )

# Append every youth_policies_df row whose policy ID is not yet represented in
# split_docs (duplicate guard).
existing_ids = {doc.metadata.get('정책 ID') for doc in split_docs}  # IDs already present
metadata_columns = ['정책 ID', '정책명']
content_columns = [col for col in youth_policies_df.columns if col not in metadata_columns]

for _, row in youth_policies_df.iterrows():
    policy_id = row['정책 ID']
    if policy_id in existing_ids:  # already represented: skip
        continue
    # Record the ID immediately so a policy ID that appears more than once in
    # the CSV is only added the first time.
    existing_ids.add(policy_id)

    # Policy ID and policy name become metadata (both columns must exist).
    metadata = {
        '정책 ID': policy_id,
        '정책명': row['정책명'],
    }

    # All remaining columns become the JSON content (Korean kept as-is).
    content = {col: row[col] for col in content_columns}

    split_docs.append(
        Document(
            page_content=json.dumps(content, ensure_ascii=False, indent=2),
            metadata=metadata,
        )
    )

# Metadata clean-up: once a document carries both '정책 ID' and '정책명',
# drop its 'title' and 'total_pages' entries (missing keys are ignored).
required_keys = ('정책 ID', '정책명')
for doc in split_docs:
    meta = doc.metadata
    if not all(key in meta for key in required_keys):
        continue
    for stale_key in ('title', 'total_pages'):
        meta.pop(stale_key, None)

# Wrap every df_sorted row in a Document: the 'text' column is the content,
# and the metadata records the row's title, a fixed source tag and its index.
df_sorted_docs = [
    Document(
        page_content=row['text'],
        metadata={'title': row['title'], 'source': 'df_sorted', 'index': index},
    )
    for index, row in df_sorted.iterrows()
]


In [None]:
# Pull the number out of an "R<digits>" token.
def extract_number_from_policy_name(policy_name):
    """Return the integer following the first 'R' that precedes digits.

    Args:
        policy_name: String to search.

    Returns:
        int: The digits after the first ``R<digits>`` match, or ``-1`` when
        the string contains no such token.
    """
    found = re.search(r'R(\d+)', policy_name)
    if found is None:
        return -1  # no R+digits token present
    return int(found.group(1))

def _policy_id_number(doc):
    """Return the R+digits number found in the document's '정책 ID', or -1."""
    return extract_number_from_policy_name(str(doc.metadata.get('정책 ID', '')))


# Group the documents by their (non-empty) policy name in a single pass.
docs_by_policy_name = {}
for doc in split_docs:
    name = doc.metadata.get('정책명', '')
    if name != '':
        docs_by_policy_name.setdefault(name, []).append(doc)

# How often each policy name occurs, and which names occur more than once.
policy_name_counts = Counter({name: len(group) for name, group in docs_by_policy_name.items()})
duplicates = [name for name, count in policy_name_counts.items() if count > 1]

# For every duplicated policy name keep only the entry with the largest number.
updated_split_docs = []
seen_policy_names = set()

for name in duplicates:
    # All documents in a group share the same policy name, so ranking them by
    # a number taken from the name can never tell them apart. Rank by the
    # R+digits number in the policy ID instead; when the IDs carry no such
    # number every entry scores -1 and the first one is kept.
    # NOTE(review): assumes '정책 ID' is the field holding the R+digits value —
    # confirm against the CSV.
    updated_split_docs.append(max(docs_by_policy_name[name], key=_policy_id_number))
    seen_policy_names.add(name)

# Everything that was not part of a duplicate group is carried over unchanged:
# documents with an empty policy name and documents with a unique policy name.
for doc in split_docs:
    if '정책명' not in doc.metadata:
        continue
    name = doc.metadata['정책명']
    if name == '' or name not in seen_policy_names:
        updated_split_docs.append(doc)


In [None]:
# Content lengths, bucketed by how the document relates to the
# "Additional Information" section.
content_with_additional_info_before = []  # text before Additional Information
content_with_additional_info_after = []   # text after Additional Information
content_without_additional_info = []      # named policy without the section

# The exact delimiter used when the section was appended. Testing for the
# delimiter itself (rather than the bare phrase "Additional Information")
# guarantees that both halves exist whenever the split branch is taken.
additional_info_delimiter = '\n\nAdditional Information:\n'

# Step 1: Iterate over split_docs and classify based on conditions
for doc in split_docs:
    text = doc.page_content

    if doc.metadata.get('정책명') == '':
        # Empty policy name: the whole text counts as "before" content.
        content_with_additional_info_before.append(len(text))
        continue

    # Policy name is set: split once on the delimiter, if it is present.
    before_text, found, after_text = text.partition(additional_info_delimiter)
    if found:
        content_with_additional_info_before.append(len(before_text))
        content_with_additional_info_after.append(len(after_text))
    else:
        content_without_additional_info.append(len(text))

# Step 2: Report the average and maximum content length for one bucket
def print_content_analysis(content_list, label):
    """Print the mean and maximum of *content_list*, tagged with *label*.

    Args:
        content_list: Sequence of content lengths (numbers).
        label: Text appended to each printed line to identify the bucket.

    Prints a "No documents found" line instead when the list is empty.
    """
    if not content_list:
        print(f"No documents found for {label}")
        return

    mean_length = sum(content_list) / len(content_list)
    longest = max(content_list)
    print(f"Average content length {label}: {mean_length:.2f}")
    print(f"Max content length {label}: {longest}")

# Report each bucket in turn.
for lengths, bucket_label in (
    (content_with_additional_info_before, "before Additional Information"),
    (content_with_additional_info_after, "after Additional Information"),
    (content_without_additional_info, "without Additional Information"),
):
    print_content_analysis(lengths, bucket_label)



Average content length before Additional Information: 6203.77
Max content length before Additional Information: 74247
Average content length after Additional Information: 1954.22
Max content length after Additional Information: 3880
Average content length without Additional Information: 1342.50
Max content length without Additional Information: 3978


In [None]:
from uuid import uuid4

# Embedding model and the on-disk location of the Chroma DB. The persist
# directory is an absolute, machine-specific path.
embedding_function = UpstageEmbeddings(model="solar-embedding-1-large")
persist_directory = r'C:\Users\wnsgu\Desktop\upstage\cookbook\chroma_db\policy_combined'
db = Chroma(embedding_function=embedding_function, persist_directory=persist_directory)

# Maximum chunk size (characters), batch size for DB inserts, and the number
# of characters shared between consecutive chunks.
MAX_CHUNK_SIZE = 2000
MAX_BATCH_SIZE = 100
OVERLAP_SIZE = 50

# Step 1: split a text into overlapping chunks
def split_into_chunks_with_overlap(text, max_chunk_size=MAX_CHUNK_SIZE, overlap_size=OVERLAP_SIZE):
    """Split *text* into chunks of at most *max_chunk_size* characters.

    Each chunk after the first starts *overlap_size* characters before the
    end of the previous one, so consecutive chunks share that many characters.

    Args:
        text: The string to split.
        max_chunk_size: Maximum length of a chunk; must be positive.
        overlap_size: Characters shared by consecutive chunks; must be smaller
            than *max_chunk_size*, otherwise the window would never advance.

    Returns:
        list[str]: The chunks in order; an empty list for empty *text*.

    Raises:
        ValueError: If *max_chunk_size* is not positive or *overlap_size* is
            not smaller than *max_chunk_size* (either would loop forever).
    """
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be positive")
    if overlap_size >= max_chunk_size:
        raise ValueError("overlap_size must be smaller than max_chunk_size")

    step = max_chunk_size - overlap_size  # distance between chunk starts
    chunks = []
    for start in range(0, len(text), step):
        end = start + max_chunk_size
        chunks.append(text[start:end])  # slicing clamps at the end of text
        if end >= len(text):  # this chunk reached the end of the text
            break
    return chunks

# Step 2: process the documents and split them into chunks
all_docs_for_embedding = []

# Turn source documents into the chunk-level documents that get embedded.
def process_documents(docs):
    """Chunk *docs* and append the pieces to ``all_docs_for_embedding``.

    For each document an ``"id"`` metadata entry is set to its ``'정책 ID'``,
    or to a fresh UUID string when that key is missing or ``None``. If the
    content contains the exact ``"\\n\\nAdditional Information:\\n"``
    delimiter, the part after it is stored as one document and the part
    before it is split with ``split_into_chunks_with_overlap``; otherwise the
    whole content is split.

    Args:
        docs: Iterable of ``Document`` objects, or dicts with
            ``"page_content"`` and ``"metadata"`` keys.

    Side effects:
        Appends to the module-level ``all_docs_for_embedding`` list and writes
        the ``"id"`` entry into each source document's metadata. Every
        produced chunk gets its own copy of the metadata dict, so editing one
        chunk's metadata does not affect its siblings.
    """
    delimiter = '\n\nAdditional Information:\n'

    for doc in docs:
        if isinstance(doc, Document):
            original_content = doc.page_content
            source_metadata = doc.metadata
        else:
            original_content = doc.get("page_content", "")
            source_metadata = doc.get("metadata", {})

        # Use the policy ID as id; fall back to a UUID when it is absent/None.
        policy_id = source_metadata.get("정책 ID")
        source_metadata["id"] = str(uuid4()) if policy_id is None else policy_id

        # Split on the exact delimiter: a text that merely mentions
        # "Additional Information" without it is chunked as ordinary content.
        content_before, found, content_after = original_content.partition(delimiter)
        if found:
            # The appended section stays one piece and comes first, followed
            # by the chunks of the text that preceded it.
            pieces = [content_after, *split_into_chunks_with_overlap(content_before)]
        else:
            pieces = split_into_chunks_with_overlap(original_content)

        all_docs_for_embedding.extend(
            Document(page_content=piece, metadata=dict(source_metadata))
            for piece in pieces
        )

# Step 3: process both datasets into all_docs_for_embedding
# NOTE(review): this indexes split_docs, not the de-duplicated
# updated_split_docs built in the earlier cell — confirm which is intended.
process_documents(split_docs)       # first dataset
process_documents(df_sorted_docs)   # second dataset

# Step 4: add the documents to ChromaDB in batches and persist
def add_documents_in_batches(db, documents, max_batch_size=MAX_BATCH_SIZE):
    """Add *documents* to the vector store in batches, then persist it.

    Args:
        db: Vector store exposing ``add_documents(list)`` and ``persist()``.
        documents: List of Document objects to add.
        max_batch_size: Maximum number of documents per ``add_documents`` call.

    Prints one progress line per batch. The reported total is the ceiling of
    ``len(documents) / max_batch_size``, so it is also correct when the
    document count is an exact multiple of the batch size.
    """
    total_batches = -(-len(documents) // max_batch_size)  # ceiling division
    batch_starts = range(0, len(documents), max_batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        batch = documents[start:start + max_batch_size]
        db.add_documents(batch)  # pass a list of Document objects
        print(f"Added batch {batch_number} of {total_batches}")
    db.persist()  # save the current state

# Step 5: add the documents to the vector database
# NOTE(review): any failure is only printed, not re-raised; in that case
# `retriever` is left undefined for later cells.
try:
    add_documents_in_batches(db, all_docs_for_embedding)  # pass the processed documents
    retriever = db.as_retriever()  # create a searchable retriever
    print("All documents added successfully!")
except Exception as e:
    print(f"An error occurred: {e}")


Added batch 1 of 33
Added batch 2 of 33
Added batch 3 of 33
Added batch 4 of 33
Added batch 5 of 33
Added batch 6 of 33
Added batch 7 of 33
Added batch 8 of 33
Added batch 9 of 33
Added batch 10 of 33
Added batch 11 of 33
Added batch 12 of 33
Added batch 13 of 33
Added batch 14 of 33
Added batch 15 of 33
Added batch 16 of 33
Added batch 17 of 33
Added batch 18 of 33
Added batch 19 of 33
Added batch 20 of 33
Added batch 21 of 33
Added batch 22 of 33
Added batch 23 of 33
Added batch 24 of 33
Added batch 25 of 33
Added batch 26 of 33
Added batch 27 of 33
Added batch 28 of 33
Added batch 29 of 33
Added batch 30 of 33
Added batch 31 of 33
Added batch 32 of 33
Added batch 33 of 33
All documents added successfully!
