In [47]:
import requests, os, fitz, faiss, PyPDF2, tiktoken, pdfplumber, re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from pathlib import Path

In [7]:
from sentence_transformers import SentenceTransformer

### 삼성증권 리포트 다운로드

In [76]:
# 1. Request the research-report listing page.
url = "https://www.samsungpop.com/sscommon/jsp/search/research/research_pop.jsp"
headers = {"User-Agent": "Mozilla/5.0 (compatible; ResearchBot/1.0)"}
resp = requests.post(url, headers=headers, timeout=10)

if resp.status_code != 200:
    raise Exception(f"Request failed: {resp.status_code}")

soup = BeautifulSoup(resp.text, "html.parser")

# 2. Collect the `fileName` value from every report link in the board table.
table = soup.find("table", class_="tbl-type board")
if table is None:
    # soup.find returns None when nothing matches; fail with a clear message
    # instead of an opaque AttributeError on `table.find_all` below.
    raise RuntimeError("Report table 'tbl-type board' not found in the listing page")

file_names = []
for a_tag in table.find_all("a", href=True, title=True):
    href = a_tag['href']
    if "fileName=" in href:
        # Everything after the last "fileName=" is treated as the file path.
        file_name = href.split("fileName=")[-1]
        file_names.append(file_name)

# 3. Download each PDF into ./samsungpop, skipping files already on disk.
os.makedirs("samsungpop", exist_ok=True)
# NOTE(review): plain http here while the listing page uses https — confirm
# whether the download endpoint is also reachable over https.
base_url = "http://www.samsungpop.com/common.do?cmd=down&saveKey=research.pdf&"

for file_name in file_names:
    pdf_url = base_url + "fileName=" + file_name
    filename = file_name.split("/")[-1]  # keep only the final path component
    filepath = os.path.join("samsungpop", filename)

    if os.path.exists(filepath):
        print(f"Already exists, skipped: {filepath}")
        continue

    # Write to a temporary name and rename on success: because existing files
    # are skipped on re-runs, a partially written PDF at the final path would
    # otherwise never be downloaded again.
    tmp_path = filepath + ".part"
    try:
        r = requests.get(pdf_url, headers=headers, timeout=20)
        r.raise_for_status()
        with open(tmp_path, "wb") as f:
            f.write(r.content)
        os.replace(tmp_path, filepath)
        print(f"Downloaded: {filepath}")
    except Exception as e:
        # Best effort: report the failure and continue with the next file.
        print(f"Failed to download {file_name}: {e}")

Downloaded: samsungpop\2025091217070763K_02_01.pdf
Downloaded: samsungpop\2025091216121269K_02_03.pdf
Downloaded: samsungpop\2025091209474022K_02_05.pdf
Downloaded: samsungpop\2025091209083373K_02_03.pdf
Downloaded: samsungpop\2025091210282804K_02_11.pdf
Downloaded: samsungpop\2025091212435266K_02_03.pdf
Already exists, skipped: samsungpop\2025091207551323K_02_02.pdf


### PDF 파일 읽기 및 텍스트 전처리

In [80]:
# Input directory holding the downloaded PDFs, and the directory that
# receives the extracted text files (created here if it is missing).
pdf_dir, output_dir = "samsungpop", "processed_pdf"
os.makedirs(name=output_dir, exist_ok=True)

In [81]:
def extract_lines(word_list, line_tol=3):
    """Group words into text lines by vertical position.

    Words are ordered by their ``top`` coordinate and a new line is started
    whenever a word's ``top`` differs from the previous word's ``top`` by more
    than ``line_tol``. Inside each line the words are then ordered
    left-to-right by ``x0``, so small vertical jitter between words of the
    same line cannot scramble the reading order.

    Args:
        word_list: Iterable of dicts providing ``'text'``, ``'top'`` and
            ``'x0'``; the two coordinates must be convertible with ``float``.
        line_tol: Largest ``top`` difference between consecutive words that
            still counts as the same line.

    Returns:
        list[str]: One space-joined string per line, ordered top to bottom.
        An empty input yields an empty list.
    """
    ordered = sorted(word_list, key=lambda w: (float(w['top']), float(w['x0'])))

    grouped = []      # list of lines, each a list of word dicts
    current = []
    last_top = None
    for w in ordered:
        top = float(w['top'])
        # Compare against the previous word (not the first word of the line),
        # so a line may drift vertically as long as each step stays in tolerance.
        if last_top is not None and abs(top - last_top) > line_tol:
            grouped.append(current)
            current = []
        current.append(w)
        last_top = top
    if current:
        grouped.append(current)

    # Re-sort each line by x0: the global sort above is dominated by `top`,
    # which would otherwise decide the order of words within a line.
    return [
        " ".join(w['text'] for w in sorted(line, key=lambda w: float(w['x0'])))
        for line in grouped
    ]

def process_text(lines):
    """Rebuild sentence-level paragraphs from raw text lines.

    Blank lines are dropped. A line consisting only of digits, dots, percent
    signs, parentheses, hyphens and spaces, or a line starting with a bullet
    or an ``N.`` marker, is emitted on its own after any pending text has
    been flushed. Every other line is appended to the pending text, which is
    split after ``.``, ``!``, ``?``, ``;`` or ``:`` followed by whitespace:
    the complete pieces are emitted and the trailing piece stays pending.

    Args:
        lines: Iterable of strings, one per extracted text line.

    Returns:
        str: The resulting paragraphs joined with newline characters.
    """
    standalone_patterns = (r"^[\d\.\%\(\)\- ]+$", r"^(\•|\d+\.)\s*")

    out = []
    pending = ""
    for raw in lines:
        text = raw.strip()
        if not text:
            continue

        # Numeric/table rows and list items are kept as their own paragraph.
        if any(re.match(pattern, text) for pattern in standalone_patterns):
            if pending:
                out.append(pending.strip())
                pending = ""
            out.append(text)
            continue

        pending = text if not pending else f"{pending} {text}"
        *complete, tail = re.split(r'(?<=[.!?;:])\s+', pending)
        if complete:
            out.extend(piece.strip() for piece in complete if piece.strip())
            pending = tail

    if pending:
        out.append(pending.strip())
    return "\n".join(out)

def process_pdf(pdf_file, output_dir):
    """Extract left/right column text from each page of a PDF and save it.

    Every page except the last one is processed. Pages without any words are
    skipped. For each remaining page the words are split into a left and a
    right group at an estimated x position, turned into lines with
    ``extract_lines`` and into paragraphs with ``process_text``, and written
    to two UTF-8 text files:

        ``<output_dir>/<pdf file name>_page<N>_left_final.txt``
        ``<output_dir>/<pdf file name>_page<N>_right_final.txt``

    ``<pdf file name>`` is the ``.name`` of ``pdf_file`` (extension included)
    and ``N`` starts at 1.

    Args:
        pdf_file: Path to the PDF. A ``pathlib.Path`` or a plain ``str`` path;
            any other object is passed through unchanged and must expose
            ``.name`` and be accepted by ``pdfplumber.open``.
        output_dir: Directory for the text files; created if it is missing.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    if isinstance(pdf_file, str):
        # `.name` below needs a Path-like object; accept string paths too.
        pdf_file = Path(pdf_file)
    pdf_name = pdf_file.name
    print(f"Processing {pdf_name} ...")

    # Make the function usable on its own, without a prior setup cell.
    os.makedirs(output_dir, exist_ok=True)

    with pdfplumber.open(pdf_file) as pdf:
        for page_num, page in enumerate(pdf.pages[:-1], start=1):  # last page is excluded
            words = page.extract_words(x_tolerance=3, y_tolerance=3, keep_blank_chars=False)
            if not words:
                continue

            # Estimate the left/right column boundary from a 50-bin histogram
            # of word start positions: take the most populated bin, ignoring
            # the first bin, and place the boundary at the centre of the bin
            # immediately to its left.
            x_coords = np.array([float(w['x0']) for w in words])
            hist, bin_edges = np.histogram(x_coords, bins=50)
            split_index = np.argmax(hist[1:]) + 1
            split_x = (bin_edges[split_index] + bin_edges[split_index-1]) / 2

            # Words starting at or left of the boundary go to the left column.
            left_lines = extract_lines([w for w in words if float(w['x0']) <= split_x])
            right_lines = extract_lines([w for w in words if float(w['x0']) > split_x])

            left_final = process_text(left_lines)
            right_final = process_text(right_lines)

            # Save both columns; a file is written even when its column is empty.
            left_file = os.path.join(output_dir, f"{pdf_name}_page{page_num}_left_final.txt")
            with open(left_file, "w", encoding="utf-8") as f:
                f.write(left_final)

            right_file = os.path.join(output_dir, f"{pdf_name}_page{page_num}_right_final.txt")
            with open(right_file, "w", encoding="utf-8") as f:
                f.write(right_final)


# Run the extraction over every PDF found directly inside `pdf_dir`.
pdf_paths = Path(pdf_dir).glob("*.pdf")
for pdf_file in pdf_paths:
    process_pdf(pdf_file, output_dir)

Processing 2025091207551323K_02_02.pdf ...
