### Test with RecursiveUrlLoader (discarded)

In [None]:
import re
import time
from bs4 import BeautifulSoup
from langchain_community.document_loaders import RecursiveUrlLoader

# Extracts text content from HTML, removing extra newlines and formatting it for readability.
def bs4_extractor(html: str) -> str:
    """
    Return the plain text of an HTML document with blank-line runs collapsed.

    The markup is parsed with the lxml backend. Every run of two or more
    newlines in the extracted text is reduced to a single blank line, and
    leading/trailing whitespace is stripped from the result.
    """
    page_text = BeautifulSoup(html, "lxml").text
    collapsed = re.sub(r"\n\n+", "\n\n", page_text)
    return collapsed.strip()

# Extracts the HTML content, preserving text and <img> tags, and placing the image in the text where found.
def extract_text_with_images(html: str) -> str:
    """
    Extract text and <img> tags from HTML, keeping images at their position in the text.

    Text nodes that live inside <script> or <style> are skipped. Each <img>
    is re-emitted as a tag carrying its src, data-src, alt, width and height
    attributes. Runs of blank lines are collapsed to a single newline.

    Parameters:
    html (str): The HTML markup to process.

    Returns:
    str: The extracted text with image tags interleaved, one item per line.
    """
    soup = BeautifulSoup(html, "lxml")

    pieces = []
    for node in soup.descendants:
        if isinstance(node, str):
            # A text node has no tag name of its own, so the script/style
            # filter has to inspect the enclosing tag instead.
            if getattr(node.parent, "name", None) in ("script", "style"):
                continue
            stripped = node.strip()
            # Whitespace-only nodes would only add newlines that the final
            # collapse removes anyway.
            if stripped:
                pieces.append(stripped + "\n\n")
        elif node.name == "img":
            # NOTE(review): the commas between attributes are kept from the
            # original output format; confirm whether consumers rely on them.
            pieces.append(
                f'<img src="{node.get("src")}", data-src="{node.get("data-src")}", alt="{node.get("alt")}", width="{node.get("width")}", height="{node.get("height")}">\n\n'
            )

    return re.sub(r"\n\n+", "\n", "".join(pieces)).strip()

# Saves the given content to a file with the specified filename.
def save_to_file(content: str, filename: str) -> None:
    """
    Write `content` to `filename` as UTF-8 text, replacing any existing file.
    """
    with open(filename, "w", encoding="utf-8") as handle:
        handle.write(content)

# Process the loaded content and save the results
def process_html_content(html_content: str, doc_i) -> None:
    """
    Extract text and images from one HTML document and save the results to files.

    Three files are written, all named after the page title plus the
    document index: the original HTML, the plain text, and the text with
    image tags interleaved.

    Parameters:
    html_content (str): The raw HTML of the page.
    doc_i: Index of the document, embedded in the filenames to keep them unique.
    """
    # Use BeautifulSoup to extract the title
    soup = BeautifulSoup(html_content, "lxml")

    # soup.title.string is None when <title> is empty or contains nested
    # markup, so fall back to "untitled" in that case as well.
    raw_title = soup.title.string if soup.title else None
    title = raw_title.strip() if raw_title else ""
    if not title:
        title = "untitled"

    # Replace characters that are not valid in filenames (including
    # backslashes and line breaks inside the title).
    safe_title = re.sub(r'[\\/:*?"<>|\r\n\t]', "_", title)

    # Extract text content
    extracted_text = bs4_extractor(html_content)

    # Extract text with images
    text_with_images = extract_text_with_images(html_content)

    # Save files using the title as part of the filename
    save_to_file(html_content, f"{safe_title}[{doc_i}]_original_html.txt")
    save_to_file(extracted_text, f"{safe_title}[{doc_i}]_text.txt")
    save_to_file(text_with_images, f"{safe_title}[{doc_i}]_text_with_images.txt")

    print(f"Files saved with the title '{safe_title}'")

# Define the URL to be processed
currenturl = "https://www.gamersky.com/z/bmwukong/1314156_195585/"

# No extractor is configured, so each document keeps the original HTML
loader_html = RecursiveUrlLoader(
    currenturl,
    max_depth=10,
    use_async=False,
    extractor=None,
    metadata_extractor=None,
    exclude_dirs=(),
    timeout=10,
    check_response_status=True,
    continue_on_failure=True,
    prevent_outside=True,
    base_url=None,
)

# Fetch every page the loader can reach
docs_html = loader_html.load()

if not docs_html:
    print("Failed to load any content from the URL.")
else:
    # Extract text and images from each loaded page; the index keeps the
    # output filenames distinct.
    for i, doc in enumerate(docs_html):
        html_content = doc.page_content
        print(f"Processing document {i + 1}/{len(docs_html)}")
        process_html_content(html_content, i)


In [None]:
from IPython.display import Markdown, display


# Render html_content in the notebook. The variable is the one left over from
# the processing loop in the previous cell, i.e. the last document handled there.
display(Markdown(html_content))

In [None]:
from langchain_community.document_loaders import RecursiveUrlLoader

# Start URL for the crawl
currenturl = "https://www.gamersky.com/z/bmwukong/"

# Configure the RecursiveUrlLoader
loader = RecursiveUrlLoader(
    currenturl,
    max_depth=2,  # Recursion depth limit for the crawl
    use_async=False,  # Fetch synchronously
    extractor=None,  # No extractor, so the raw HTML is kept
    metadata_extractor=None,  # No custom metadata extractor
    exclude_dirs=(),  # No directories excluded
    timeout=5,  # Timeout for each page request
    check_response_status=True,  # Check the HTTP response status code
    continue_on_failure=True,  # Keep crawling when a page fails
    prevent_outside=False,  # The outside-link restriction is switched off
    base_url=currenturl,  # Base URL handed to the loader
)

# Load the documents; the result is a list of all pages reached recursively
docs = loader.load()

# Print a short summary for each crawled document
for i, doc in enumerate(docs):
    print("\n-----------------")
    print(f"Document {i+1}:")
    print(len(doc.page_content))  # Length of the page content
    print(doc.metadata)  # Metadata of the document
    


### 1. Complete page crawling and data extraction

In [1]:
import requests
import time
from lxml import html, etree
from urllib.parse import urljoin
import os
from datetime import datetime
from tqdm import tqdm  # Progress bar
import json
import urllib.request
from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI 
from langchain_nvidia_ai_endpoints import ChatNVIDIA
from io import BytesIO
import base64

In [29]:

# Log function to save logs to a file with date and time
def log_message(message, filename="docs/mmgamerag.log"):
    """
    Append a log message to a file, prefixed with the current date and time.

    The directory that holds the log file is created when it does not exist
    yet, so logging works on a fresh checkout.

    Parameters:
    message (str): The text to log.
    filename (str): Path of the log file; entries are appended to it.
    """
    log_dir = os.path.dirname(filename)
    if log_dir:  # a bare filename has no directory component to create
        os.makedirs(log_dir, exist_ok=True)

    current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    with open(filename, "a", encoding="utf-8") as log_file:
        log_file.write(f"[{current_time}] {message}\n")


# Base64 encode the image
def get_base64_encoded_image(image_url):
    """
    Placeholder for Base64-encoding the image at `image_url`.

    Nothing is fetched at the moment: the function ignores its argument and
    always returns an empty string.
    """
    placeholder = ""  # Base64 payload is temporarily disabled
    return placeholder

# Save content to file, for back up only, structured data is saved in JSON file.
def save_content_to_file(content, filename):
    """
    Write `content` to `filename` as UTF-8 text and log the save.

    The parent directory is created first when it is missing, so the first
    page of a crawl can be saved even if no image has been downloaded yet.

    Parameters:
    content (str): The text to write.
    filename (str): Destination path; an existing file is overwritten.
    """
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, "w", encoding="utf-8") as file:
        file.write(content)
    log_message(f"Content successfully saved to {filename}")

def save_image_to_file(img_src):
    """
    Download the image at `img_src` into docs/rawdata/img under a URL-derived filename.

    An image that is already on disk (non-empty file) is not downloaded
    again. A failed download is logged and skipped instead of aborting the
    caller, and any partially written file is removed.

    Parameters:
    img_src (str): The URL of the image to be saved.
    """
    # Clean the URL to make it filename-safe
    filename_safe_url = img_src.replace(":", "=").replace("/", "|")

    # Specify the save path
    save_directory = "docs/rawdata/img"
    os.makedirs(save_directory, exist_ok=True)
    filename = os.path.join(save_directory, filename_safe_url)

    # Avoid re-downloading images fetched by an earlier page or run
    if os.path.exists(filename) and os.path.getsize(filename) > 0:
        log_message(f"Image already saved, skipping download: {filename}")
        return

    try:
        urllib.request.urlretrieve(img_src, filename)
    except (OSError, ValueError) as e:
        # URLError/HTTPError are OSError subclasses; ValueError covers
        # malformed URLs. Drop any partial file so a later run retries.
        if os.path.exists(filename):
            os.remove(filename)
        log_message(f"Failed to download image {img_src}. Error: {e}")
        return

    log_message(f"Image saved as: {filename}")


# Save content to JSON file in a specified format
def save_data_json_with_format(content, filename):
    """
    Append the items in `content` to the JSON array stored in `filename`.

    The current array is read first (a missing file or undecodable JSON
    counts as an empty array), the new items are added at the end, and the
    whole array is written back with 4-space indentation and non-ASCII
    characters preserved.
    """
    merged = []
    if os.path.exists(filename):
        with open(filename, 'r', encoding="utf-8") as source:
            try:
                merged = json.load(source)
            except json.JSONDecodeError:
                # Unreadable JSON: start over from an empty array
                merged = []

    # New items go after whatever was already stored
    merged.extend(content)

    with open(filename, 'w', encoding="utf-8") as target:
        json.dump(merged, target, indent=4, ensure_ascii=False)

    log_message(f"JSON content successfully saved to {filename}")

# Extract image description from the image
def get_image_description(image_src, content_before_image_str, content_after_image_str):
    """
    Ask a vision-capable chat model to describe a previously downloaded image.

    The image is read from docs/rawdata/img (using the same URL-derived
    filename that save_image_to_file writes), re-encoded as JPEG, and sent
    inline as a Base64 data URL together with the text surrounding the image.

    Parameters:
    image_src (str): Original URL of the image; also used to locate the local copy.
    content_before_image_str (str): Page content that precedes the image.
    content_after_image_str (str): Page content that follows the image.

    Returns:
    The content of the model's response.
    """
    # No credentials are passed here; never embed API keys in source or comments.
    imgdesllm = ChatOpenAI(name="image_des_llm", model="gpt-4o-mini")

    # Locate the local copy of the image via its filename-safe URL
    filename_safe_url = image_src.replace(":", "=").replace("/", "|")
    image_path = 'docs/rawdata/img/' + filename_safe_url

    from PIL import Image

    # Convert the image to Base64 so it is sent inline rather than by URL
    with Image.open(image_path) as image_open:
        # JPEG cannot store alpha or palette data, so normalise those modes first
        jpeg_ready = image_open if image_open.mode in ("RGB", "L") else image_open.convert("RGB")
        buffered = BytesIO()
        jpeg_ready.save(buffered, format="JPEG")
    img_base64 = base64.b64encode(buffered.getvalue()).decode()

    message = HumanMessage(
        content=[
            {"type": "text", "text": f"用300字以内的中文描述这张图片（以下简称此图）的内容。并将此图的上文和下文中的内容详细总结到此图的描述中。此图的上文：\n{content_before_image_str}\n此图的下文：\n{content_after_image_str}。如果在此图的上文和下文中找到了图片标签<img>，则按原来的顺序注明所有图片的src（从标签<img>中获取），不要遗漏任何的图片标签。严格遵循以下格式：\n 图片描述。\n上文图片的src:\n 此图片的src:\n{image_src}\n 下文图片的src:\n"},
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}},
        ],
    )
    img_response = imgdesllm.invoke([message])
    print(f'\n------------------------\n{img_response.content}')
    image_description = img_response.content

    return image_description

def get_all_image_description(file_path):
    """
    Fill in the image_description field of every item in a JSON file.

    The file is loaded, each item is described via get_image_description,
    and the full data set is checkpointed to a temporary file every 100
    items. After the loop the data is always written once more, and only
    when that final write succeeds does the temporary file replace the
    original. Items that fail are logged and left as they were.

    Args:
        file_path (str): The path to the JSON file.
    """
    # Log the start of the process
    log_message(f"=== Starting to get image description from file: {file_path} === ")

    # Read the JSON file
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        log_message(f"Successfully loaded the JSON file: {file_path}")
    except Exception as e:
        log_message(f"Error loading the JSON file: {file_path}. Error: {e}")
        return

    temp_file_path = file_path + ".tmp"
    batch_size = 100  # Checkpoint interval, in items

    for index, item in enumerate(tqdm(data, desc="Processing images")):
        # Bound before the try so the error log below can always reference it
        image_src = ''
        try:
            # Extract relevant fields
            image_src = item.get('src', '')
            content_before_image_str = item.get('content_before_image', '')
            content_after_image_str = item.get('content_after_image', '')

            # Call the image description function and store the result
            item['image_description'] = get_image_description(
                image_src, content_before_image_str, content_after_image_str
            )

            # Every batch_size items, checkpoint the data to the temporary file
            if (index + 1) % batch_size == 0:
                with open(temp_file_path, 'w', encoding='utf-8') as temp_file:
                    json.dump(data, temp_file, ensure_ascii=False, indent=4)
                log_message(f"Batch write: Processed and wrote {index+1}/{len(data)} items.")
        except Exception as e:
            log_message(f"Error processing item {index+1} and src {image_src}. Error: {e}")

    # Always write the final state: a checkpoint may have been skipped because
    # the item at a batch boundary failed, or the data set may be empty.
    try:
        with open(temp_file_path, 'w', encoding='utf-8') as temp_file:
            json.dump(data, temp_file, ensure_ascii=False, indent=4)
        log_message(f"Final batch write: Processed and wrote all remaining items.")
    except Exception as e:
        log_message(f"Error writing final batch to temporary file: {temp_file_path}. Error: {e}")
        # Do not replace the original with a stale or missing checkpoint
        log_message(f"=== Finished getting image description from file: {file_path} ===")
        return

    # Rename the temporary file to the original file after processing all items
    try:
        os.replace(temp_file_path, file_path)
        log_message(f"Successfully replaced original file with updated data: {file_path}")
    except Exception as e:
        log_message(f"Error replacing the original file with updated data: {file_path}. Error: {e}")

    # Log the completion of the process
    log_message(f"=== Finished getting image description from file: {file_path} ===")
    


# Extract text and images from the part with class="Mid2L_con" and save to docs first, then JSON. n means how many lines of text was stored before and after each image.
def extract_text_and_images(currenturl, tree, n=20):
    """
    Extract article text and images from a parsed page and persist them.

    The title block (class="Mid2L_tit") and the article body
    (class="Mid2L_con") are read from `tree`. Two files are written under
    docs/rawdata/: the text only, and the text with <img> tags interleaved.
    Every image is downloaded via save_image_to_file. Finally the page text
    is appended to docs/mmtext.json and one metadata record per image
    (including up to `n` lines of surrounding content on each side) is
    appended to docs/mmimg.json.

    Parameters:
    currenturl (str): URL of the page; used to resolve relative image paths,
        to build filenames, and stored in the JSON records.
    tree: Parsed lxml document of the page, or None when fetching failed.
    n (int): Number of lines kept before and after each image as context.

    Returns:
    str: A short status message.
    """
    # lxml elements are falsy when they have no children (and truth-testing
    # them is deprecated), so test for None explicitly.
    if tree is None:
        return "Failed to fetch content."

    # Title block with date/author details, e.g.
    # "2023-08-21 14:06:02 来源：游民星空 作者：LIN木木 编辑：LIN木木　浏览：17536"
    title_text = ''
    mid2L_tit_elements = tree.xpath('//div[@class="Mid2L_tit"]')
    if mid2L_tit_elements:
        raw_title = mid2L_tit_elements[0].text_content()
        # Drop surrounding whitespace and empty lines
        title_text = "\n".join(line.strip() for line in raw_title.splitlines() if line.strip())

    # Extract the part of the page with class="Mid2L_con"
    mid2l_con = tree.xpath('//div[@class="Mid2L_con"]')
    if not mid2l_con:
        return "No content found with class='Mid2L_con'."

    # Clean the URL to make it filename-safe
    filename_safe_url = currenturl.replace(":", "=").replace("/", "|")

    # Lines containing any of these phrases are dropped from the output
    unwanted_phrases = (
        "更多相关内容请关注",
        "责任编辑",
        "友情提示：",
        "本文是否解决了您的问题",
        "已解决",
        "未解决",
        "黑神话：悟空专区",
        "上一页",
        "下一页",
    )

    text_content_list = [f"Title: {title_text}"]
    text_with_images_list = [f"Title: {title_text}"]
    # One (position in text_with_images_list, src, alt, title) tuple per image,
    # kept structured so attributes never have to be parsed back out of a tag string.
    image_records = []

    for element in mid2l_con[0].iter():
        # Text inside the element (comments and the like have a non-str tag)
        text = element.text.strip() if element.text and isinstance(element.tag, str) else ""
        # Text that follows the element's closing tag
        tail_text = element.tail.strip() if element.tail else ""
        combined_text = text + " " + tail_text if text or tail_text else ""

        if combined_text:
            # Everything from this line onwards is skipped
            if combined_text.startswith("本文由游民星空"):
                break
            if not any(phrase in combined_text for phrase in unwanted_phrases):
                text_content_list.append(combined_text)
                text_with_images_list.append(combined_text)

        if element.tag == 'img':
            # Use data-src if available (and non-empty), otherwise fall back to src
            raw_src = element.get('data-src') or element.get('src')
            if not raw_src:
                # Without a source there is nothing to download or reference
                log_message(f"Skipping <img> without src on {currenturl}")
                continue
            img_alt = element.get('alt', '')
            img_title = element.get('title', '')
            img_width = element.get('width', '')
            img_height = element.get('height', '')

            # Convert relative paths to absolute URLs; the '_S.jpg' suffix is
            # replaced by '.jpg' (presumably thumbnail vs. full size — TODO confirm)
            img_src = urljoin(currenturl, raw_src).replace('_S.jpg', '.jpg')

            # Save the raw image to a file
            save_image_to_file(img_src)

            img_tag = f'<img src="{img_src}" alt="{img_alt}" width="{img_width}" height="{img_height}" title="{img_title}">'
            image_records.append((len(text_with_images_list), img_src, img_alt, img_title))
            text_with_images_list.append(img_tag)

    text_content_str = '\n'.join(text_content_list)
    text_with_images_list_str = '\n'.join(text_with_images_list)

    # Save content to docs folder first
    text_only_filename = os.path.join("docs/rawdata/", f"{filename_safe_url}_text_only.txt")
    text_with_images_filename = os.path.join("docs/rawdata/", f"{filename_safe_url}_text_with_images.html")
    save_content_to_file(text_content_str, text_only_filename)
    save_content_to_file(text_with_images_list_str, text_with_images_filename)

    # Build one metadata record per image
    img_data_list = []
    for img_index, img_src, img_alt, img_title in image_records:
        # Up to n lines on each side of the image; other <img> lines are
        # kept as part of the context.
        content_before_image = text_with_images_list[max(0, img_index - n):img_index]
        content_after_image = text_with_images_list[img_index + 1:img_index + 1 + n]

        img_data_list.append({
            "src": img_src,
            "base64": get_base64_encoded_image(img_src),  # Currently an empty string
            "title": img_title,
            "alt": img_alt,
            "content_before_image": '\n'.join(content_before_image),
            "image_description": '',  # Filled in later by get_all_image_description
            "content_after_image": '\n'.join(content_after_image),
            "url": currenturl,  # Current page url
            "type": "img"
        })

    txt_data_list = [{
        "txt": text_content_str,
        "url": currenturl,
        "type": "text"
    }]

    # Save the entire text_content_str directly to mmtext.json
    save_data_json_with_format(txt_data_list, "docs/mmtext.json")

    # Save the image metadata to JSON as a list of objects
    save_data_json_with_format(img_data_list, "docs/mmimg.json")

    return "Content saved to files in docs and JSON files processed."

# Load existing links from the JSON file
def load_existing_links(filename):
    """
    Load the list of links stored in a JSON file.

    A missing file, a file that is not valid JSON (for example one left
    truncated by an interrupted write), or a file whose top-level value is
    not a list all yield an empty list, matching how
    save_data_json_with_format treats undecodable JSON.

    Parameters:
    filename (str): Path of the JSON file.

    Returns:
    list: The stored links, or [] when none can be read.
    """
    try:
        with open(filename, 'r', encoding="utf-8") as json_file:
            links = json.load(json_file)
    except (FileNotFoundError, json.JSONDecodeError):
        return []
    # Callers append to the result, so anything but a list is unusable
    return links if isinstance(links, list) else []
    


# Global variable to track new links added across function calls
new_link_count = 0

def save_link_to_json(new_link, filename="docs/links.json"):
    """
    Record a link in the JSON link file unless it is already there.

    Anything from the '?' of a '.shtml?' onwards is cut off before the
    lookup. A link already on file is only logged and False is returned.
    A new link is appended, the file is rewritten, the module-level counter
    of links added so far is incremented and logged, and True is returned.
    """
    global new_link_count  # Counter shared across calls

    # Strip whatever follows '.shtml' when a query string is attached
    prefix, marker, _query = new_link.partition(".shtml?")
    if marker:
        new_link = prefix + ".shtml"

    known_links = load_existing_links(filename)

    if new_link in known_links:
        log_message(f"Link already in links.json: {new_link}")
        return False

    known_links.append(new_link)
    new_link_count += 1
    with open(filename, 'w', encoding="utf-8") as out_file:
        json.dump(known_links, out_file, indent=4, ensure_ascii=False)
    log_message(f"New link added to links.json: {new_link}. Total links: {new_link_count}")
    return True



# Global variable to track new URLs added across function calls
new_url_count = 0

# Save crawled URLs to the JSON file and count added URLs
def save_crawled_url_to_json(new_url, filename="docs/crawled_urls.json"):
    """
    Record a URL in the JSON file of crawled URLs unless it is already there.

    Returns True when the URL was new and has been written to the file,
    False when it was already recorded. Each call also writes a separator
    line to the log; a new URL bumps the module-level counter, which is
    included in the log entry.
    """
    global new_url_count  # Counter shared across calls

    # Start from the URLs already on file, if any
    seen_urls = []
    if os.path.exists(filename):
        with open(filename, 'r', encoding="utf-8") as in_file:
            seen_urls = json.load(in_file)

    log_message(f"-----------------------------------------------------------")

    if new_url in seen_urls:
        log_message(f"Url already in crawled_urls.json: {new_url}")
        return False

    seen_urls.append(new_url)
    new_url_count += 1
    with open(filename, 'w', encoding="utf-8") as out_file:
        json.dump(seen_urls, out_file, indent=4, ensure_ascii=False)
    log_message(f"New crawled url added to crawled_urls.json: {new_url}. Total urls: {new_url_count}")
    return True
    

# Check if the link exists in the JSON file
def check_link_in_json(new_link, filename="docs/links.json"):
    """
    Report whether a link is already stored in the JSON link file.

    Anything from the '?' of a '.shtml?' onwards is cut off before the
    lookup. Returns True when the link is on file, False otherwise.
    """
    marker = ".shtml?"
    if marker in new_link:
        # Keep everything up to the first marker and restore the extension
        new_link = new_link[:new_link.index(marker)] + ".shtml"

    return new_link in load_existing_links(filename)



# Send request with retry mechanism
def fetch_url_with_retries(url, max_retries=2):
    """
    Fetch the content at `url`, making up to `max_retries` attempts.

    Each attempt uses a 3-second timeout. After a failed attempt the
    function waits 1 second before trying again; no wait is added after the
    last attempt, since nothing follows it.

    Parameters:
    url (str): The URL to fetch.
    max_retries (int): Maximum number of attempts.

    Returns:
    bytes or None: The response body on HTTP 200, otherwise None.
    """
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, timeout=3)  # Set timeout to 3 seconds
        except requests.RequestException as e:
            # Request-level failures such as timeouts or connection errors
            log_message(f"Attempt {attempt} failed with error: {e}")
        else:
            if response.status_code == 200:
                log_message(f"Success on attempt {attempt} for {url}")
                return response.content
            log_message(f"Attempt {attempt} failed with status code {response.status_code}")

        # Pause only when another attempt is coming
        if attempt < max_retries:
            time.sleep(1)

    log_message(f"Failed to fetch the URL after {max_retries} attempts.")
    return None

# New function to crawl the webpage and its linked pages up to a given depth
def crawl_and_extract(url, keyword="黑神话", linkdepth=2):
    """
    Crawl a page and the pages it links to, extracting content from matching pages.

    Pages with a class="Mid2L_con" block are treated as articles: when
    `keyword` appears in that block or in the page title, the link is
    recorded in links.json, content is extracted for newly recorded links,
    and links inside the block are followed. Pages without such a block are
    treated as overviews: when `keyword` appears in the title, the link is
    recorded, the depth is reset to 0, and all links on the page are
    followed. Every visited URL is recorded in crawled_urls.json and is
    never visited twice.

    Parameters:
    url (str): The URL to start from.
    keyword (str): Term a page must contain to be recorded and followed.
    linkdepth (int): Maximum link depth to crawl.
    """
    skipped_suffixes = ('.jpg', '.png')

    def collect_links(tree, page_url, href_xpath):
        """Return the de-duplicated absolute http(s)/root-relative links selected by `href_xpath`, minus image files."""
        hrefs = tree.xpath(href_xpath)
        # Only absolute and root-relative hrefs are kept, which already
        # rules out 'javascript:' pseudo-links.
        absolute = list(set(urljoin(page_url, href) for href in hrefs if href.startswith(('http', '/'))))
        return [link for link in absolute if not link.endswith(skipped_suffixes)]

    def crawl(url, current_depth, max_depth, pbar):
        # Skip URLs that have been crawled before
        if not save_crawled_url_to_json(url):
            return

        # A link that is already recorded and sits at the maximum depth needs no further work
        if check_link_in_json(url) and current_depth == max_depth:
            log_message(f"Link found in links.json: {url} (Depth: {current_depth})")
            return

        # Fetch the page content
        html_content = fetch_url_with_retries(url)
        if not html_content:
            log_message(f"Failed to fetch content for {url} (Depth: {current_depth})")
            return

        # Parse the HTML using lxml; a page that cannot be parsed must not abort the crawl
        try:
            tree = html.fromstring(html_content)
        except etree.LxmlError as e:
            log_message(f"Failed to parse content for {url} (Depth: {current_depth}). Error: {e}")
            return

        mid2l_con_elements = tree.xpath('//div[@class="Mid2L_con"]')
        title_elements = tree.xpath('//title/text()')

        mid2l_con_text = mid2l_con_elements[0].text_content() if mid2l_con_elements else ""
        title_text = title_elements[0] if title_elements else ""

        links = []

        if mid2l_con_text:
            # Article page
            if keyword in mid2l_con_text or keyword in title_text:
                if current_depth == 0:
                    current_depth = 1  # 0 is for url without mid2l_con, so we set it to 1
                log_message(f"Found '{keyword}' in Mid2L_con or Title at {url} (Depth: {current_depth})")
                # Extract content only the first time the link is recorded
                if save_link_to_json(url):
                    extract_text_and_images(url, tree)
                if current_depth < max_depth:
                    links = collect_links(tree, url, '//div[@class="Mid2L_con"]//a[@href]/@href')
                    log_message(f"Found {len(links)} links on {url} (Depth: {current_depth}). Crawling deeper...")
        elif keyword in title_text:
            # Overview page: record the link but do not extract content
            current_depth = 0
            log_message(f"Found '{keyword}' in full HTML at {url} (Depth: {current_depth})")
            save_link_to_json(url)
            if current_depth < max_depth:
                links = collect_links(tree, url, '//a[@href]/@href')
                log_message(f"Found {len(links)} links on {url} (Depth: {current_depth}). Crawling deeper...")
        else:
            log_message(f"No '{keyword}' found at {url} (Depth: {current_depth})")

        if current_depth < max_depth and links:
            current_depth = current_depth + 1
            # Recursively crawl the found links, with increased depth
            for link in tqdm(links, desc=f"Crawling depth {current_depth}/{max_depth}", leave=False, position=1, dynamic_ncols=True):
                crawl(link, current_depth, max_depth, pbar)
                pbar.update(1)

    # Write the start message
    log_message("=== Crawl Start ===")

    with tqdm(total=100, desc="Crawling", position=0, dynamic_ncols=True) as pbar:
        crawl(url, current_depth=0, max_depth=linkdepth, pbar=pbar)

    # Write the end message with two empty lines
    log_message("=== Crawl End ===\n\n")

# Start URL and the keyword a page must contain to be recorded and followed
url = "https://www.gamersky.com/z/bmwukong/"
keyword = "黑神话"

# Run the crawl from the start URL, following links up to depth 2
crawl_and_extract(url, keyword=keyword, linkdepth=2) # Crawl the URL and its linked pages up to a depth
# get_all_image_description('docs/mmimg.json') # Get image description for all images in the JSON file

# image_src='http://img1.gamersky.com/image2024/08/20240819_qy_372_15/image077.jpg'
# content_before_image_str =''
# content_after_image_str=''
# content_before_image_str='Title: 《黑神话悟空》珍玩图鉴 珍玩获取方法及效果一览\n2024-08-20 10:19:28 来源：游民星空[原创] 作者：瑞破受气包  我要投稿\n第13页：特品-金棕衣 \n展开 \n特品-金棕衣 \n获取方法：【 \n可能是焦面鬼王概率掉落 】。从第三回【极乐谷-长生大道】土地庙出发，进入土地庙前方木门之后往左前方走，击败前方雪地上的焦面鬼王（超大巨人）即可获得。具体路线请参考下文。\n<img src=\"http://img1.gamersky.com/image2024/08/20240819_qy_372_15/image073.jpg\" alt=\"游民星空\" width=\"\" height=\"\" title=\"\">\n<img src=\"http://img1.gamersky.com/image2024/08/20240819_qy_372_15/image075.jpg\" alt=\"游民星空\" width=\"\" height=\"\" title=\"\">\n从第三回【极乐谷-长生大道】土地庙出发，进入土地庙前方木门之后往左前方走。 '
# content_after_image_str='继续沿路前进，在拐弯处往右走，可以看到一大片雪地，还有一个超大巨人，巨人就是焦面鬼王，击杀即可获得。 \n<img src=\"http://img1.gamersky.com/image2024/08/20240819_qy_372_15/image079.jpg\" alt=\"游民星空\" width=\"\" height=\"\" title=\"\">\n11 \n12 \n13 \n14 \n15 \n16 \n17 \n18 \n19 \n20 \n21 \n0 \n0 \n文章内容导航 \n第1页：上品-猫睛宝串 \n第2页：上品-玛瑙罐 \n第3页：上品-不求人 \n第4页：上品-砗磲佩 '

# get_image_description(image_src, content_before_image_str, content_after_image_str)


  if tree:
Crawling: 506it [04:21,  1.93it/s]                         


KeyboardInterrupt: 

### 2. Vector store

In [2]:
from dotenv import load_dotenv,find_dotenv
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain_chroma import Chroma
from langchain.schema.document import Document
from langchain_core.output_parsers import StrOutputParser
from IPython.display import Markdown, display

# Load environment variables from the .env file located by find_dotenv()
load_dotenv(find_dotenv())

# ---- Vectorstore used for retrieval in the RAG pipeline ----
embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")

vectorstore_path = "vectorstore/chromadb-mmgamerag"

# Report up front whether the persist directory already exists; the store
# itself is created the same way in both cases by the Chroma(...) call below.
print(
    f"Loaded vectorstore from disk: {vectorstore_path}"
    if os.path.exists(vectorstore_path)
    else f"Initialized an empty vectorstore in {vectorstore_path}"
)

vectorstore = Chroma(
    embedding_function=embedding_model,
    persist_directory=vectorstore_path,
)

def add_text_documents_to_vectorstore(vectorstore, documents):
    """Add the text documents whose page URL is not stored as a text entry yet.

    Args:
        vectorstore: Store exposing ``get()`` (returning a mapping with a
            ``"metadatas"`` list) and ``add_documents(list)``.
        documents: Documents carrying a ``metadata`` dict; its ``"url"`` entry
            is the de-duplication key.

    Stored entries that have a ``"src"`` key are ignored when collecting the
    known URLs: image entries carry the URL of the page they were found on,
    and must not make the text of that page look as if it were already
    stored. Documents without a ``"url"`` are always added.
    """
    existing_docs = vectorstore.get()

    # Stored entries may have no metadata at all (None) or no 'url' key;
    # skip those instead of raising.
    existing_urls = [
        metadata["url"]
        for metadata in (existing_docs.get("metadatas") or [])
        if metadata and "url" in metadata and "src" not in metadata
    ]
    print(f"Quantity of existing_urls: {len(existing_urls)}")

    # Set lookup keeps the filter linear in the number of documents.
    known_urls = set(existing_urls)
    new_documents = [doc for doc in documents if doc.metadata.get("url") not in known_urls]

    if new_documents:
        vectorstore.add_documents(new_documents)
        print(f"Added {len(new_documents)} new text documents.")
    else:
        print("No new text documents to add.")


def add_img_documents_to_vectorstore(vectorstore, documents):
    """Add the image documents whose ``src`` is not stored yet.

    Args:
        vectorstore: Store exposing ``get()`` (returning a mapping with a
            ``"metadatas"`` list) and ``add_documents(list)``.
        documents: Documents carrying a ``metadata`` dict; its ``"src"`` entry
            (the image address) is the de-duplication key.

    Stored entries without metadata or without a ``"src"`` key (for example
    text entries) are ignored. Documents without a ``"src"`` are always added.
    """
    existing_docs = vectorstore.get()

    # Stored entries may have no metadata at all (None); skip those instead
    # of raising on the membership test.
    existing_srcs = [
        metadata["src"]
        for metadata in (existing_docs.get("metadatas") or [])
        if metadata and "src" in metadata
    ]
    print(f"Quantity of existing_srcs: {len(existing_srcs)}")

    # Set lookup keeps the filter linear in the number of documents.
    known_srcs = set(existing_srcs)
    new_documents = [doc for doc in documents if doc.metadata.get("src") not in known_srcs]

    if new_documents:
        vectorstore.add_documents(new_documents)
        print(f"Added {len(new_documents)} new img documents.")
    else:
        print("No new img documents to add.")


def add_txt_img():
    """Index the crawled text and image records in the vectorstore.

    Reads ``docs/mmtext.json`` and ``docs/mmimg.json``, wraps every record in
    a ``Document`` and passes the two batches to
    ``add_text_documents_to_vectorstore`` and
    ``add_img_documents_to_vectorstore``, both operating on the module-level
    ``vectorstore``.
    """
    # Both files are read before anything is added, so a missing or
    # malformed file aborts the run with the store untouched.
    with open('docs/mmtext.json', 'r', encoding='utf-8') as text_file:
        txt_data_list = json.load(text_file)
    with open('docs/mmimg.json', 'r', encoding='utf-8') as img_file:
        img_data_list = json.load(img_file)

    # Text records: the page text is the content; URL and type go to metadata.
    mmtexts = []
    for record in txt_data_list:
        body = record['txt']
        meta = {"url": record['url'], "type": record['type']}
        mmtexts.append(Document(page_content=body, metadata=meta))
    add_text_documents_to_vectorstore(vectorstore, mmtexts)

    # Image records: the content is the text around the image plus its
    # description, each part under its own label.
    mmimgs = []
    for record in img_data_list:
        body = "".join((
            "\ncontent_before_image:\n", record['content_before_image'],
            "\n\nimage_description:\n", record['image_description'],
            "\n\ncontent_after_image:\n", record['content_after_image'],
            '\n',
        ))
        meta = {"url": record['url'], "type": record['type'], "src": record['src']}
        mmimgs.append(Document(page_content=body, metadata=meta))
    add_img_documents_to_vectorstore(vectorstore, mmimgs)

add_txt_img() # Load docs/mmtext.json and docs/mmimg.json and add their texts and images to the vectorstore

Initialized an empty vectorstore in vectorstore/chromadb-mmgamerag
Quantity of existing_urls: 0
Added 38 new text documents.
Quantity of existing_srcs: 0
Added 139 new img documents.


### 3. Retrieval

In [None]:
# retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

# retrieved_docs = retriever.invoke("猫睛宝串")
# retrieved_docs

# Top-5 entries of type "text" for the query, each paired with a relevance score.
retrieved_docs = vectorstore.similarity_search_with_relevance_scores(query="君子牌", k=5, filter={"type": "text"})

# Iterate over retrieved_docs and extract the url, page_content, and score
for doc, score in retrieved_docs:
    url = doc.metadata.get('url', 'No URL found')  # Extract the URL from the metadata
    # Named doc_type (not `type`) so the builtin type() stays usable in later cells.
    doc_type = doc.metadata.get('type', '')
    page_content = doc.page_content  # Get the page content
    # print(f"URL: {url}\nContent: {page_content}\nScore: {score}\nType: {doc_type}\n")

# Same query against the entries of type "img"; these also carry the image address in 'src'.
retrieved_docs = vectorstore.similarity_search_with_relevance_scores(query="君子牌", k=5, filter={"type": "img"})

# Iterate over retrieved_docs and extract the url, src, page_content, and score
for doc, score in retrieved_docs:
    url = doc.metadata.get('url', 'No URL found')  # Extract the URL from the metadata
    doc_type = doc.metadata.get('type', '')
    src = doc.metadata.get('src', '')
    page_content = doc.page_content  # Get the page content
    print(f"URL: {url}\nSRC: {src}\nContent: {page_content}\nScore: {score}\nType: {doc_type}\n")



### 4. Q&A with LLM

In [None]:
from IPython.display import Markdown, display, Image

# Chat model that writes the final answer: gpt-4o-mini, temperature 0.6, streaming enabled
mmgamellm = ChatOpenAI(name="MMGameRag", model_name="gpt-4o-mini", temperature=0.6, streaming=True)

def format_docs(docs_with_scores):
    """
    Render ``(document, score)`` pairs as one numbered text block.

    Each entry starts with its 1-based position, followed by the content,
    the page URL and the score. Entries whose metadata has a non-empty
    ``src`` are labelled as images and additionally list that ``src``;
    all others are labelled as text. Entries are separated by a blank line.
    """
    rendered = []

    for position, (doc, score) in enumerate(docs_with_scores, start=1):
        meta = doc.metadata
        image_src = meta.get('src', '')

        # A non-empty 'src' is what distinguishes an image entry from a text entry.
        heading = "Image Content" if image_src else "Text Content"
        lines = [
            f"{position}.",
            f"{heading}:",
            f"{doc.page_content}",
            f"Page Url: {meta.get('url', '')}",
        ]
        if image_src:
            lines.append(f"Image Src: {image_src}")
        lines.append(f"Score: {score}")

        # Every entry ends with its own newline before entries are joined.
        rendered.append("\n".join(lines) + "\n")

    return "\n".join(rendered)

# Prompt for the game-guide answer: Markdown output with the retrieved images
# placed inline and the source pages listed at the end.
# The field names mentioned in the prompt (content_before_image,
# image_description, content_after_image, Image Src, Page Url) are the labels
# that actually appear in the {context} / {image} text passed to the model.
prompt_template = """你是《黑神话：悟空》这款游戏的AI助手，根据Question和Context专门为玩家提供详尽的游戏攻略并以Markdown的格式输出.请注意：
1. 在Image中找到与Question和Answer最相关的图像。每个Image都有content_before_image，image_description和content_after_image，可以用来判断这个Image应该被插入到与文本答案最匹配的上下文的哪个段落当中。格式如下：
    
    文本答案段落
    [![](图像1的Image Src)](图像1的Page Url)
    文本答案段落
    [![](图像2的Image Src)](图像2的Page Url)
    文本答案段落
    ...

2. 在输出答案的最后，根据问题找到context中的最相关的几个参考文档，并列出Url链接，以供用户参考原始文档。

Question: 
{question}

Context: 
{context}

Image:
{image}

Answer:
"""

prompt_code = ChatPromptTemplate.from_template(prompt_template)

# prompt -> chat model -> plain string
chain = (
    prompt_code
    | mmgamellm
    | StrOutputParser()
)

gamer_question = "黑神话一共有多少上品珍宝？举几个例子"

# Retrieve with relevance scores (higher = more relevant), as the Retrieval
# cell does. similarity_search_with_score returns a distance instead (lower =
# more similar), which would be misleading under the "Score:" label that
# format_docs writes into the prompt.
context_retrieval = format_docs(vectorstore.similarity_search_with_relevance_scores(query=gamer_question, k=5, filter={"type": "text"}))
# print(context_retrieval + "\n------------------------\n")
img_retrieval = format_docs(vectorstore.similarity_search_with_relevance_scores(query=gamer_question, k=5, filter={"type": "img"}))
# print(img_retrieval + "\n------------------------\n")

# Fill the three prompt slots and run the chain; the result is the answer as a Markdown string.
result = chain.invoke({
    "question": gamer_question,
    "context": context_retrieval,
    "image": img_retrieval,
})


display(Markdown(result))
# display(Image(url="http://img1.gamersky.com/image2024/08/20240819_qy_372_15/image001_S.jpg"))
