- 복사할 웹페이지 URL 입력(webwave)
- 저장할 디렉토리 입력

In [3]:
import os

# 뒤에 / 없음
root_url = "https://ooctvx.webwave.dev"
page_dir_name = "./test_webpage"
os.makedirs(page_dir_name, exist_ok=False)

크롤링 및 추출 함수 정의

In [5]:
import re

import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def scroll_down(driver, scroll_pause_time=0.5):
    # 현재 페이지 높이 가져오기
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        # 현재 스크롤 위치에서 일정 부분 내리기
        driver.execute_script("window.scrollBy(0, 1500);")
        time.sleep(scroll_pause_time)

        # 새로운 페이지 높이 계산
        new_height = driver.execute_script("return document.body.scrollHeight")

        # 페이지 높이가 변하지 않거나, 스크롤바가 아래까지 내려갔으면 스크롤 다운 종료
        if new_height == last_height and driver.execute_script("return window.innerHeight + window.pageYOffset >= document.body.offsetHeight"):
            break

        # 페이지 높이 업데이트
        last_height = new_height

def crawl_website(url, wait_time=10, scroll_pause_time=0.5):
    # Chrome 드라이버 초기화
    driver = webdriver.Chrome()
    driver.get(url)

    # 페이지 로딩 대기
    wait = WebDriverWait(driver, wait_time)
    wait.until(EC.presence_of_element_located((By.TAG_NAME, 'body')))

    # 페이지 맨 아래까지 스크롤 다운
    scroll_down(driver, scroll_pause_time)

    # 페이지 소스 코드 가져오기
    page_source = driver.page_source
    driver.quit()
    
    return page_source

def open_html_file(html_file_path):
    with open(html_file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
    return html_content

def extract_file_paths_from_html_content(html_content):
    # 이미지 파일 경로 추출
    # image_pattern = r'["\']([^"\']+\.)(png|jpg|jpeg|gif|webp)["\']'
    image_pattern = r'(?:["\']|:&quot;)([^{"\']+?\.)(png|jpg|jpeg|gif|webp)(?:["\']|&quot;)'
    image_paths = re.findall(image_pattern, html_content)
    # quot_pattern = r'(?<=:&quot;)([^"\']*?\.)(png|jpg|jpeg|gif|webp)(?=&quot;)'
    # quot_paths = re.findall(quot_pattern, html_content)
    # image_paths.extend(quot_paths)

    # CSS 파일 경로 추출
    # css_pattern = r'["\']([^"\']+\.css)["\']'
    css_pattern = r'(?:["\']|:&quot;)([^{"\']+?\.css)(?:["\']|&quot;)'
    css_paths = re.findall(css_pattern, html_content)

    # JavaScript 파일 경로 추출
    # js_pattern = r'["\']([^"\']+\.js)["\']'
    js_pattern = r'(?:["\']|:&quot;)([^{"\']+?\.js)(?:["\']|&quot;)'
    js_paths = re.findall(js_pattern, html_content)

    # 파일 경로 리스트 반환
    return {
        "images": [path[0] + path[1] for path in image_paths],
        "css": css_paths,
        "js": js_paths
    }

def extract_url_from_html_content(html_content, root_url):
    # url_pattern = fr'(?:["\']|:&quot;)({root_url}[^"\']+?)(?:["\']|&quot;)'
    url_pattern = fr'href=(?:["\']|:&quot;)((?!http)[/#][^"\']+?(?<!\.json))(?:["\']|&quot;)'
    urls = re.findall(url_pattern, html_content)

    return {root_url + cur for cur in urls}

def save_html(html_content, save_path):
    try:
        with open(save_path, mode="w") as f:
            f.write(html_content)
    except:
        print(f"Error occured saving {save_path}")
    return save_path

import io
from PIL import Image

def convert_to_webp(input_data, output_path="./webp_data.webp", input_type="path"):
    
    if not output_path.endswith(".webp"):
        output_path = os.path.splitext(output_path)[0] + ".webp"

    if input_type == 'path':
        # 경로일 경우
        img = Image.open(input_data)
        img.save(output_path, 'WEBP')
        return output_path

    elif input_type == 'binary':
        # 바이너리일 경우
        img = Image.open(io.BytesIO(input_data))
        webp_bytes = io.BytesIO()
        img.save(webp_bytes, format='WEBP')
        webp_bytes.seek(0)
        with open(output_path, 'wb') as f:
            f.write(webp_bytes.getvalue())
        return output_path

    elif input_type == 'pil':
        # PIL 객체일 경우
        webp_bytes = io.BytesIO()
        input_data.save(webp_bytes, format='WEBP')
        webp_bytes.seek(0)
        with open(output_path, 'wb') as f:
            f.write(webp_bytes.getvalue())
        return output_path

    else:
        raise ValueError('Invalid input type. Must be "path", "binary", or "pil".')

웹페이지 구성 URL 크롤링

In [6]:
import queue

crawl_url_queue = queue.Queue()
crawl_url_queue.put(root_url)
crawl_done_urls = set()

while not crawl_url_queue.empty():
    target_url = crawl_url_queue.get()
    if target_url in crawl_done_urls:
        continue

    html_content = crawl_website(target_url)
    urls = extract_url_from_html_content(html_content, root_url=root_url)
    
    if target_url == root_url:
        save_html(html_content, os.path.join(page_dir_name, "home.html"))
    else:
        save_html(html_content, os.path.join(page_dir_name, 
                                             target_url.replace(root_url+"/","")+".html"))

    crawl_done_urls.add(target_url)
    for url in urls:
        if url in crawl_done_urls:
            continue
        else:
            crawl_url_queue.put(url)

크롤링된 구성 URL 전부 다운로드 필요

In [7]:
crawl_done_urls

{'https://ooctvx.webwave.dev/',
 'https://ooctvx.webwave.dev//about-us',
 'https://ooctvx.webwave.dev//contact',
 'https://ooctvx.webwave.dev//gallery',
 'https://ooctvx.webwave.dev//offer',
 'https://ooctvx.webwave.dev//offer?anchorElement=wSection_11&amp;scrollMargin=50',
 'https://ooctvx.webwave.dev//offer?anchorElement=wSection_23&amp;scrollMargin=50',
 'https://ooctvx.webwave.dev//portfolio',
 'https://ooctvx.webwave.dev//pricing'}

다운로드 함수 정의

In [8]:
import re
import os
import requests

def download_single(url:str, file_path):

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.5735.134 Safari/537.36',
        'Referer': 'https://www.google.com/',  # 이전 페이지의 주소를 여기에 추가
        'X-Requested-With': 'XMLHttpRequest',  # AJAX 요청 여부를 나타내는 헤더
        'Accept-Language': 'en-US,en;q=0.9',  # 사용자가 선호하는 언어 설정
        'Connection': 'keep-alive',  # 서버와의 연결 유지 설정
    }

    response = requests.get(url, headers=headers)

    with open(file_path, 'wb') as f:
        f.write(response.content)

def download_file(url:str, root_http:str):

    try:
        if not url.startswith("http"):
            file_path = "." + url
            url = root_http + url
        else:
            http_pattern = r'^http.*\.com'
            match_str = re.search(http_pattern, url)[0]
            file_path = "." + url.replace(match_str, "")
        
        file_name = os.path.basename(file_path)
        dir_name = os.path.dirname(file_path)

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.5735.134 Safari/537.36',
            'Referer': 'https://www.google.com/',  # 이전 페이지의 주소를 여기에 추가
            'X-Requested-With': 'XMLHttpRequest',  # AJAX 요청 여부를 나타내는 헤더
            'Accept-Language': 'en-US,en;q=0.9',  # 사용자가 선호하는 언어 설정
            'Connection': 'keep-alive',  # 서버와의 연결 유지 설정
        }
        
        if os.path.exists(file_path):
            return file_path[1:]
        
        response = requests.get(url, headers=headers, timeout=20)

        if response.status_code != 200:
            raise Exception("Crawl Blocked")
        
        os.makedirs(dir_name, exist_ok=True)
        with open(file_path, 'wb') as f:
            f.write(response.content)

        ext = os.path.splitext(url)[1].lower()
        if ext in ['.jpg', '.jpeg', '.png', '.gif']:
            webp_file_path = file_path.replace(ext, ".webp")
            if os.path.exists(webp_file_path):
                return file_path[1:]
            
            convert_to_webp(response.content, output_path=webp_file_path, input_type="binary")
        
        return file_path[1:]

    except Exception as e:
        print(e, " occured while processing ", file_name)
        return url

html 재구성 및 필요 파일 크롤링

In [9]:
from glob import glob

download_single(f"{root_url}/manifest.json", os.path.join(page_dir_name, "manifest.json"))
html_fns = glob(os.path.join(page_dir_name, "*.html"))
html_fns

['./test_webpage/offer.html',
 './test_webpage/portfolio.html',
 './test_webpage/gallery.html',
 './test_webpage/pricing.html',
 './test_webpage/offer?anchorElement=wSection_23&amp;scrollMargin=50.html',
 './test_webpage/offer?anchorElement=wSection_11&amp;scrollMargin=50.html',
 './test_webpage/about-us.html',
 './test_webpage/home.html',
 './test_webpage/contact.html']

In [10]:
from tqdm import tqdm

for html_fn in html_fns:
    print("PROCESSING", html_fn)

    html_content = open_html_file(html_fn)
    file_paths = extract_file_paths_from_html_content(html_content)

    print("Image paths:", len(file_paths["images"]))
    print("CSS paths:", len(file_paths["css"]))
    print("JavaScript paths:", len(file_paths["js"]))

    print("IMAGE FILE CRAWL")
    for cur in tqdm(file_paths["images"]):
        if cur.startswith("./"): continue
        cur_file_path = download_file(url=cur, root_http='https://yourbrand-18274.kxcdn.com')
        html_content = html_content.replace(cur, cur_file_path)

    print("CSS FILE CRAWL")
    for cur in tqdm(file_paths["css"]):
        if cur.startswith("./"): continue
        cur_file_path = download_file(url=cur, root_http='https://yourbrand-18274.kxcdn.com')
        html_content = html_content.replace(cur, cur_file_path)

    print("JS FILE CRAWL")
    for cur in tqdm(file_paths["js"]):
        if cur.startswith("./"): continue
        if "service-worker.js" in cur: continue
        if "datePickerService.js" in cur: continue

        cur_file_path = download_file(url=cur, root_http='https://yourbrand-18274.kxcdn.com')
        html_content = html_content.replace(cur, cur_file_path)
    
    html_content = html_content.replace(f"{root_url}/", "./")
    html_content = html_content.replace('data-element-type="button', 'data-element-type="image')

    with open(html_fn, 'w', encoding='utf-8') as file:
        file.write(html_content)

    print("----------")

PROCESSING ./test_webpage/offer.html
Image paths: 50
CSS paths: 7
JavaScript paths: 22
IMAGE FILE CRAWL


100%|██████████| 50/50 [00:00<00:00, 38657.18it/s]


CSS FILE CRAWL


100%|██████████| 7/7 [00:00<00:00, 20082.17it/s]


JS FILE CRAWL


100%|██████████| 22/22 [00:00<00:00, 32117.89it/s]


----------
PROCESSING ./test_webpage/portfolio.html
Image paths: 50
CSS paths: 7
JavaScript paths: 22
IMAGE FILE CRAWL


100%|██████████| 50/50 [00:00<00:00, 46510.36it/s]


CSS FILE CRAWL


100%|██████████| 7/7 [00:00<00:00, 20488.58it/s]


JS FILE CRAWL


100%|██████████| 22/22 [00:00<00:00, 38803.49it/s]


----------
PROCESSING ./test_webpage/gallery.html
Image paths: 53
CSS paths: 7
JavaScript paths: 22
IMAGE FILE CRAWL


100%|██████████| 53/53 [00:00<00:00, 39275.28it/s]


CSS FILE CRAWL


100%|██████████| 7/7 [00:00<00:00, 22025.60it/s]


JS FILE CRAWL


100%|██████████| 22/22 [00:00<00:00, 35098.78it/s]


----------
PROCESSING ./test_webpage/pricing.html
Image paths: 48
CSS paths: 7
JavaScript paths: 22
IMAGE FILE CRAWL


100%|██████████| 48/48 [00:00<00:00, 33632.91it/s]


CSS FILE CRAWL


100%|██████████| 7/7 [00:00<00:00, 20178.78it/s]


JS FILE CRAWL


100%|██████████| 22/22 [00:00<00:00, 32791.29it/s]


----------
PROCESSING ./test_webpage/offer?anchorElement=wSection_23&amp;scrollMargin=50.html
Image paths: 50
CSS paths: 7
JavaScript paths: 22
IMAGE FILE CRAWL


100%|██████████| 50/50 [00:00<00:00, 42904.09it/s]


CSS FILE CRAWL


100%|██████████| 7/7 [00:00<00:00, 20734.55it/s]


JS FILE CRAWL


100%|██████████| 22/22 [00:00<00:00, 32140.26it/s]


----------
PROCESSING ./test_webpage/offer?anchorElement=wSection_11&amp;scrollMargin=50.html
Image paths: 49
CSS paths: 7
JavaScript paths: 22
IMAGE FILE CRAWL


100%|██████████| 49/49 [00:00<00:00, 44659.04it/s]


CSS FILE CRAWL


100%|██████████| 7/7 [00:00<00:00, 27160.16it/s]


JS FILE CRAWL


100%|██████████| 22/22 [00:00<00:00, 28808.83it/s]


----------
PROCESSING ./test_webpage/about-us.html
Image paths: 74
CSS paths: 7
JavaScript paths: 22
IMAGE FILE CRAWL


100%|██████████| 74/74 [00:00<00:00, 42728.32it/s]


CSS FILE CRAWL


100%|██████████| 7/7 [00:00<00:00, 26570.25it/s]


JS FILE CRAWL


100%|██████████| 22/22 [00:00<00:00, 28158.28it/s]


----------
PROCESSING ./test_webpage/home.html
Image paths: 110
CSS paths: 7
JavaScript paths: 22
IMAGE FILE CRAWL


100%|██████████| 110/110 [00:00<00:00, 32929.37it/s]


CSS FILE CRAWL


100%|██████████| 7/7 [00:00<00:00, 22141.88it/s]


JS FILE CRAWL


100%|██████████| 22/22 [00:00<00:00, 24966.10it/s]


----------
PROCESSING ./test_webpage/contact.html
Image paths: 38
CSS paths: 7
JavaScript paths: 22
IMAGE FILE CRAWL


100%|██████████| 38/38 [00:00<00:00, 42377.97it/s]


CSS FILE CRAWL


100%|██████████| 7/7 [00:00<00:00, 29360.13it/s]


JS FILE CRAWL


100%|██████████| 22/22 [00:00<00:00, 40015.04it/s]

----------





make flask file

In [23]:
def make_flask_app_py(page_dir_name):

    app_template = """from flask import Flask

app = Flask(__name__, static_folder=".", static_url_path="/")
"""

    for html_fn in glob(os.path.join(page_dir_name, "*.html")):
        routing = re.search(fr"(?<={page_dir_name}).*(?=\.html)", html_fn)[0]
        function_name = re.sub(r'\W+', '_', routing[1:])
        
        if "home" in routing:
            routing = "/"
        
        route_template = f"""
@app.route("{routing}")
def {function_name}():
    return app.send_static_file("{html_fn.replace("./", "")}")
"""

        app_template += route_template

    app_template += f"""
if __name__ == '__main__':
    app.run(debug=True, port=8000, host="0.0.0.0")
"""

    return app_template

In [24]:
falsk_app_py = make_flask_app_py(page_dir_name)

In [25]:
with open(f"{page_dir_name.replace('./', '')}_app.py", mode="w") as f:
    f.write(falsk_app_py)