In [1]:
import os, glob, hashlib
from dotenv import load_dotenv
import pandas as pd
from tqdm import tqdm
import torch
from langchain.schema import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
import chromadb
from chromadb.config import Settings
from langchain_chroma import Chroma

In [None]:
# Load environment variables from a local .env file (python-dotenv).
# The Chroma connection cell reads CHROMA_HOST / CHROMA_PORT via os.getenv.
load_dotenv()

In [10]:
# Connect to the Chroma server.
# CHROMA_HOST is the container name when running inside the Docker network.

# Fail fast with a readable message instead of the opaque
# `int(None)` TypeError that an unset CHROMA_PORT would otherwise produce.
_missing_env = [name for name in ("CHROMA_HOST", "CHROMA_PORT") if not os.getenv(name)]
if _missing_env:
    raise RuntimeError(
        f"Missing environment variable(s): {', '.join(_missing_env)}. "
        "Define them in .env and run load_dotenv() before connecting to Chroma."
    )

CHROMA_HOST = os.getenv("CHROMA_HOST")
CHROMA_PORT = int(os.getenv("CHROMA_PORT"))

client = chromadb.HttpClient(
    host=CHROMA_HOST,
    port=CHROMA_PORT,
    settings=Settings()
)

In [4]:
# Embedding model configuration.
model_name = "BAAI/bge-m3"

# Use the GPU when one is visible to torch; otherwise fall back to CPU so the
# notebook still runs (slowly) on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Separate dicts: one is forwarded to the model constructor, the other to
# encode(); sharing a single dict object between them invites accidental
# cross-contamination if either is edited later.
model_kwargs = {"device": device}
encode_kwargs = {"device": device}

embedding = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
    show_progress=True,
)

In [5]:
# Load the crawled news articles and split them into chunks.

# Number of per-topic backup pickles (files are numbered 0 .. N_TOPICS - 1).
N_TOPICS = 48

# Splits on "." only, so a sentence longer than chunk_size still becomes one
# oversized chunk (the splitter logs a warning for each such case).
text_splitter = CharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=80,
    separator=".",
)

docs = []
dfs = []

for topic_id in tqdm(range(N_TOPICS)):
    pkl_path = f"../public_crawling_base_data/backup_files/article_contents_backup_df_{topic_id}.pkl"
    # NOTE(security): unpickling executes arbitrary code; only load files
    # produced by our own crawler.
    part = pd.read_pickle(pkl_path)

    # Tag every article with the index of the file it came from, without
    # mutating the freshly loaded frame.
    dfs.append(part[['title', 'content']].assign(topic_id=topic_id))

df = pd.concat(dfs, ignore_index=True)
# The same article can appear under several topics; keep the first occurrence.
df = df.drop_duplicates(subset=['content'], keep='first').reset_index(drop=True)
print(len(df))

# Missing title/content becomes "" so the concatenation never yields NaN
# (a NaN text would slip past the emptiness check below and break the splitter).
df["__doc_text__"] = (
    df["title"].fillna("").astype(str)
    + "\n\n"
    + df["content"].fillna("").astype(str)
).str.strip()

for text, topic_id in zip(df["__doc_text__"], df["topic_id"]):
    if not text:
        continue

    # Build a fresh metadata dict per chunk so Documents never share one
    # mutable object. topic_id always comes from the loop index above, so it
    # is never missing.
    docs.extend(
        Document(
            page_content=chunk,
            metadata={"doc_type": "news_articles", "topic_id": int(topic_id)},
        )
        for chunk in text_splitter.split_text(text)
    )

print(f"총 문서 개수: {len(docs)}")

  0%|          | 0/48 [00:00<?, ?it/s]Created a chunk of size 913, which is longer than the specified 400
Created a chunk of size 438, which is longer than the specified 400
  4%|▍         | 2/48 [00:00<00:07,  5.89it/s]Created a chunk of size 493, which is longer than the specified 400
 10%|█         | 5/48 [00:00<00:03, 11.50it/s]Created a chunk of size 463, which is longer than the specified 400
Created a chunk of size 640, which is longer than the specified 400
Created a chunk of size 403, which is longer than the specified 400
Created a chunk of size 402, which is longer than the specified 400
Created a chunk of size 800, which is longer than the specified 400
Created a chunk of size 410, which is longer than the specified 400
Created a chunk of size 414, which is longer than the specified 400
Created a chunk of size 403, which is longer than the specified 400
Created a chunk of size 555, which is longer than the specified 400
 15%|█▍        | 7/48 [00:01<00:06,  6.40it/s]Created 

총 문서 개수: 181643





In [6]:
# Upsert the news chunks into the "news_articles" collection.
#
# Without explicit ids every run gets fresh random ids, so re-running this
# cell appends a second copy of every chunk. Deriving the id from the chunk's
# content + metadata makes the write a true upsert. Identical chunks collapse
# to one entry, which also avoids duplicate-id errors inside a single batch.
# NOTE: a collection filled before this change still holds the randomly-keyed
# entries; drop and recreate it once to get rid of them.
news_by_id = {}
for doc in docs:
    fingerprint = f"{sorted(doc.metadata.items())!r}\n{doc.page_content}"
    doc_id = hashlib.sha256(fingerprint.encode("utf-8")).hexdigest()
    news_by_id.setdefault(doc_id, doc)

Chroma.from_documents(
    documents=list(news_by_id.values()),
    ids=list(news_by_id.keys()),
    embedding=embedding,
    collection_name="news_articles",
    client=client,
    collection_metadata={"hnsw:space": "cosine"}
)

Batches:   0%|          | 0/1303 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Batches:   0%|          | 0/1303 [00:00<?, ?it/s]

Batches:   0%|          | 0/1303 [00:00<?, ?it/s]

Batches:   0%|          | 0/1303 [00:00<?, ?it/s]

Batches:   0%|          | 0/469 [00:00<?, ?it/s]

In [24]:
# 재무제표 데이터 불러오기
def normalize_company_from_filename(path: str) -> str:
    """Extract the company name from a financial-statement file path.

    The file stem (base name without extension) is split on "_" and the
    second segment is returned, e.g. ``"dir/fs_삼성전자_2024.csv"`` ->
    ``"삼성전자"``.

    Args:
        path: Path to the CSV file; only the base name is inspected.

    Returns:
        The second "_"-separated segment of the stem, or the whole stem when
        the name contains no underscore (previously this raised IndexError).
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    segments = stem.split("_")
    return segments[1] if len(segments) > 1 else stem

def build_docs_from_fs_csv(csv_path: str) -> list[Document]:
    """Turn one financial-statement CSV into one Document per (metric, period) cell.

    The first CSV column is treated as the metric name (e.g. 'IFRS(개별)') and
    every remaining column as a reporting period (e.g. '2022/12', '2023/12').
    The table is melted to long form and each cell is rendered as the sentence
    ``"<company> <period> <metric>: <value>"``.

    Args:
        csv_path: Path to the CSV file. The company name is derived from the
            file name via ``normalize_company_from_filename``.

    Returns:
        A list of Documents with ``doc_type="financials"`` metadata. Missing
        cells are kept and rendered with an empty value.
    """
    df = pd.read_csv(csv_path)
    df = df.fillna("")  # render missing cells as empty strings

    # First column holds the metric names; all other columns are periods.
    metric_col = df.columns[0]

    # Wide -> long: one row per (metric, period, value).
    long_df = df.melt(id_vars=[metric_col], var_name="period", value_name="value")

    company = normalize_company_from_filename(csv_path)

    docs = []

    # Column-wise zip instead of iterrows(): same values in the same order,
    # without building a Series object for every row.
    for raw_metric, raw_period, raw_value in zip(
        long_df[metric_col], long_df["period"], long_df["value"]
    ):
        metric = str(raw_metric).strip()
        period = str(raw_period).strip()
        value = str(raw_value).strip()

        # Sentence form is friendlier to embedding-based search.
        text = f"{company} {period} {metric}: {value}"

        meta = {
            "doc_type": "financials",
            "company": company,
            "source_file": csv_path,
            "period": period,
            "metric": metric,
        }

        docs.append(Document(page_content=text, metadata=meta))

    return docs

# Directory that holds the financial-statement CSV files.
csv_dir = "./stocks_training_data"

# Flatten the per-file Document lists into one list; a file that yields no
# documents simply contributes nothing.
all_docs = [
    doc
    for fs_csv in glob.glob(os.path.join(csv_dir, "*.csv"))
    for doc in build_docs_from_fs_csv(fs_csv)
]

"""
변환 결과 예시:
삼성전자 2022/12 영업수익: 0.0
삼성전자 2023/12 영업수익: 0.0
삼성전자 2024/12 영업수익: 0.0
삼성전자 2025/03 영업수익: 0.0

삼성전자 2022/12 영업이익: 100
삼성전자 2023/12 영업이익: 120
삼성전자 2024/12 영업이익: 150
삼성전자 2025/03 영업이익: 130
"""

print(f"재무제표 문서(행 단위) 개수: {len(all_docs)}")

재무제표 문서(행 단위) 개수: 809970


In [34]:
# Upsert the financial-statement documents into the "financials" collection.
#
# Without explicit ids every run gets fresh random ids, so re-running this
# cell appends a second copy of every document. Deriving the id from the
# document's content + metadata makes the write a true upsert. Identical
# documents collapse to one entry, which also avoids duplicate-id errors
# inside a single batch.
# NOTE: a collection filled before this change still holds the randomly-keyed
# entries; drop and recreate it once to get rid of them.
financials_by_id = {}
for doc in all_docs:
    fingerprint = f"{sorted(doc.metadata.items())!r}\n{doc.page_content}"
    doc_id = hashlib.sha256(fingerprint.encode("utf-8")).hexdigest()
    financials_by_id.setdefault(doc_id, doc)

Chroma.from_documents(
    documents=list(financials_by_id.values()),
    ids=list(financials_by_id.keys()),
    embedding=embedding,
    collection_name="financials",
    client=client,
    collection_metadata={"hnsw:space": "cosine"}
)

Batches:   0%|          | 0/1303 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Batches:   0%|          | 0/1303 [00:00<?, ?it/s]

Batches:   0%|          | 0/1303 [00:00<?, ?it/s]

Batches:   0%|          | 0/1303 [00:00<?, ?it/s]

Batches:   0%|          | 0/1303 [00:00<?, ?it/s]

Batches:   0%|          | 0/1303 [00:00<?, ?it/s]

Batches:   0%|          | 0/1303 [00:00<?, ?it/s]

Batches:   0%|          | 0/1303 [00:00<?, ?it/s]

Batches:   0%|          | 0/1303 [00:00<?, ?it/s]

Batches:   0%|          | 0/1303 [00:00<?, ?it/s]

Batches:   0%|          | 0/1303 [00:00<?, ?it/s]

Batches:   0%|          | 0/1303 [00:00<?, ?it/s]

Batches:   0%|          | 0/1303 [00:00<?, ?it/s]

Batches:   0%|          | 0/1303 [00:00<?, ?it/s]

Batches:   0%|          | 0/1303 [00:00<?, ?it/s]

Batches:   0%|          | 0/1303 [00:00<?, ?it/s]

Batches:   0%|          | 0/1303 [00:00<?, ?it/s]

Batches:   0%|          | 0/1303 [00:00<?, ?it/s]

Batches:   0%|          | 0/1303 [00:00<?, ?it/s]

Batches:   0%|          | 0/573 [00:00<?, ?it/s]

In [5]:
# Load the company / industry-keyword table.
topic_csv_path = './public_crawling_base_data/company_keyword.csv'

# Read the CSV and replace missing cells with empty strings in one step.
topic_df = pd.read_csv(topic_csv_path).fillna("")

# Split a "<company>_<keyword>" string at the first underscore.
def split_keyword(s: str):
    """Return ``(company, keyword)`` from a ``"company_keyword"`` string.

    The input is coerced to ``str`` and stripped. Everything before the first
    "_" is the company, everything after it is the keyword; when there is no
    underscore the keyword is an empty string. Both parts are stripped.
    """
    head, _, tail = str(s).strip().partition("_")
    return head.strip(), tail.strip()

docs_topic = []

# One Document per row of the "stock_keywords" column.
for company, keyword in map(split_keyword, topic_df["stock_keywords"]):
    docs_topic.append(
        Document(
            # page_content: a search-friendly sentence
            page_content=f"{company} 키워드: {keyword}",
            metadata={
                "doc_type": "topics",
                "company": company,
                "keyword": keyword,
            },
        )
    )


print(f"만든 문서 수: {len(docs_topic)}")

만든 문서 수: 2761


In [7]:
# Upsert the company / industry-keyword documents into the "topics" collection.
#
# Without explicit ids every run gets fresh random ids, so re-running this
# cell appends a second copy of every document. Deriving the id from the
# document's content + metadata makes the write a true upsert. Identical
# documents collapse to one entry, which also avoids duplicate-id errors
# inside a single batch.
# NOTE: a collection filled before this change still holds the randomly-keyed
# entries; drop and recreate it once to get rid of them.
topics_by_id = {}
for doc in docs_topic:
    fingerprint = f"{sorted(doc.metadata.items())!r}\n{doc.page_content}"
    doc_id = hashlib.sha256(fingerprint.encode("utf-8")).hexdigest()
    topics_by_id.setdefault(doc_id, doc)

Chroma.from_documents(
    documents=list(topics_by_id.values()),
    ids=list(topics_by_id.keys()),
    embedding=embedding,
    collection_name="topics",
    client=client,
    collection_metadata={"hnsw:space": "cosine"}
)

Batches:   0%|          | 0/87 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


<langchain_chroma.vectorstores.Chroma at 0x7facb25411b0>