### Test with RecursiveUrlLoader (discarded)

In [37]:
import re
import time
from bs4 import BeautifulSoup
from langchain_community.document_loaders import RecursiveUrlLoader

# Extracts text content from HTML, removing extra newlines and formatting it for readability.
def bs4_extractor(html: str) -> str:
    """
    Extracts clean text from the given HTML content, removing extra newlines for better readability.
    """
    soup = BeautifulSoup(html, "lxml")
    return re.sub(r"\n\n+", "\n\n", soup.text).strip()

# Extracts the HTML content, preserving text and <img> tags, and placing the image in the text where found.
def extract_text_with_images(html: str) -> str:
    """
    Extracts both text and <img> tags from the given HTML content, placing images within the text where found.
    """
    soup = BeautifulSoup(html, "lxml")
    
    output = ""
    for element in soup.descendants:
        if element.name not in ['script', 'style'] and isinstance(element, str):
            output += element.strip() + "\n\n"
        elif element.name == "img":
            img_tag = f'<img src="{element.get("src")}", data-src="{element.get("data-src")}", alt="{element.get("alt")}", width="{element.get("width")}", height="{element.get("height")}">\n\n'
            output += img_tag
    
    return re.sub(r"\n\n+", "\n", output).strip()

# Saves the given content to a file with the specified filename.
def save_to_file(content: str, filename: str) -> None:
    """
    Saves the content to a file with the given filename.
    """
    with open(filename, "w", encoding="utf-8") as file:
        file.write(content)

# Process the loaded content and save the results
def process_html_content(html_content: str, doc_i) -> None:
    """
    Processes the HTML content by extracting text and images, and saves the results into files.
    It also extracts the title and uses it to name the saved files.
    """
    # Use BeautifulSoup to extract the title
    soup = BeautifulSoup(html_content, "lxml")
    title = soup.title.string if soup.title else "untitled"
    
    # Clean title for valid filename usage
    safe_title = re.sub(r'[\/:*?"<>|]', "_", title)
    
    # Extract text content
    extracted_text = bs4_extractor(html_content)
    
    # Extract text with images
    text_with_images = extract_text_with_images(html_content)
    
    # Save files using the title as part of the filename
    save_to_file(html_content, f"{safe_title}[{doc_i}]_original_html.txt")
    save_to_file(extracted_text, f"{safe_title}[{doc_i}]_text.txt")
    save_to_file(text_with_images, f"{safe_title}[{doc_i}]_text_with_images.txt")
    
    print(f"Files saved with the title '{safe_title}'")

# Define the URL to be processed
currenturl = "https://www.gamersky.com/z/bmwukong/1314156_195585/"

# Use RecursiveUrlLoader without an extractor to get the original HTML content
loader_html = RecursiveUrlLoader(
    currenturl,
    max_depth=10,
    use_async=False,
    extractor=None,  # No extractor here to get the original HTML
    metadata_extractor=None,
    exclude_dirs=(),
    timeout=10,
    check_response_status=True,
    continue_on_failure=True,
    prevent_outside=True,
    base_url=None,
)

# Load the original HTML content
docs_html = loader_html.load()

# Process all documents loaded from RecursiveUrlLoader
if docs_html and len(docs_html) > 0:
    for i, doc in enumerate(docs_html):
        html_content = doc.page_content
        
        # Optional: Include document index as part of the filename to differentiate files
        print(f"Processing document {i + 1}/{len(docs_html)}")

        # Process the HTML content by extracting text and images for each document
        process_html_content(html_content, i)
else:
    print("Failed to load any content from the URL.")


Processing document 1/2
Files saved with the title '黑神话悟空精魄_黑神话悟空攻略秘籍_ 游民星空 Gamersky.com '
Processing document 2/2
Files saved with the title '黑神话悟空精魄_黑神话悟空攻略秘籍_ 游民星空 Gamersky.com '


In [None]:
from IPython.display import Markdown, display


display(Markdown(html_content))

In [79]:
from langchain_community.document_loaders import RecursiveUrlLoader

# 定义要抓取的初始 URL
currenturl = "https://www.gamersky.com/z/bmwukong/"

# 配置 RecursiveUrlLoader
loader = RecursiveUrlLoader(
    currenturl,
    max_depth=2,  # 设置递归抓取深度，例如 3 表示抓取当前页面及其链接的两级页面
    use_async=False,  # 是否异步抓取
    extractor=None,  # 提取器设为 None 以获取原始 HTML
    metadata_extractor=None,  # 不使用元数据提取器
    exclude_dirs=(),  # 可选，排除不需要抓取的目录
    timeout=5,  # 每个页面的抓取超时时间
    check_response_status=True,  # 是否检查 HTTP 响应状态码
    continue_on_failure=True,  # 是否在遇到错误时继续抓取
    prevent_outside=False,  # 防止抓取超出指定 URL 域名或目录的链接
    base_url=currenturl,  # 确保只抓取从这个 URL 开始的页面
)

# 加载文档，返回一个包含所有递归抓取到页面的列表
docs = loader.load()

# 处理抓取到的文档
for i, doc in enumerate(docs):
    print(f"\n-----------------")
    print(f"Document {i+1}:")  
    print(doc.page_content.__len__())  # 输出每个文档的内容
    print(doc.metadata)  # 输出每个文档的内容
    


Unable to load from https://www.gamersky.com/z/bmwukong/25?gsAppOpenWithNewWindow=true. Received error Received HTTP status 404 of type ValueError



-----------------
Document 1:
425874
{'source': 'https://www.gamersky.com/z/bmwukong/', 'content_type': 'text/html', 'title': '黑神话悟空游戏专区_黑神话悟空中文版下载及攻略秘籍 _ 游民星空 GamerSky.com', 'description': '游民星空黑神话悟空游戏专题,提供悟空中文版下载,悟空攻略,悟空修改器,悟空武器,悟空装备,悟空boss,悟空剧情,悟空配置,悟空DLC,视频,汉化,补丁等游戏资料。《黑神话：悟空》是一款以中国神话为背景的动作角色扮演游戏。你将扮演一位“天命人”，为了探寻昔日传说的真相，踏上一条充满危险与惊奇的西游之路。本作除了重新去塑造一个个大主角外，还运用顶尖的画面、丰富的细节、沉浸的战斗体验、足量的剧情演绎，去还原心中一直存在的东方魔幻世界，谱写充满东方韵味的超级英雄史诗。', 'language': 'en'}

-----------------
Document 2:
51627
{'source': 'https://www.gamersky.com/handbook/202408/1803625_8.shtml?gsAppOpenWithNewWindow=true', 'content_type': 'text/html', 'title': '《黑神话悟空》武器图鉴 武器获取方法及铸造材料一览_特品-鳞棍·亢金-游民星空 GamerSky.com', 'description': '《黑神话悟空》中的武器可以通过推进主线、支线剧情或铸造获得，许多小伙伴可能还不太清楚一些武器该如何解锁，下面就为大家带来《黑神话悟空》武器图鉴，希望对大家有所帮助。', 'language': None}

-----------------
Document 3:
53474
{'source': 'https://www.gamersky.com/handbook/202408/1807939_2.shtml?gsAppOpenWithNewWindow=true', 'content_type': 'text/html', 'title': '《黑神话悟空》全种子收集攻略 种子齐备五十二难解锁方法_碧藕

### 1. Complete page scrwling and data extraction

In [12]:
import requests
import time
from lxml import html, etree
from urllib.parse import urljoin
import os
from datetime import datetime
from tqdm import tqdm  # Progress bar
import json
import urllib.request

In [13]:

# Log function to save logs to a file with date and time
def log_message(message, filename="docs/mmgamerag.log"):
    """
    Saves the provided log message to a file with the current date and time.
    """
    current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    with open(filename, "a", encoding="utf-8") as log_file:
        log_file.write(f"[{current_time}] {message}\n")


# Base64 encode the image
def get_base64_encoded_image(image_url):
    """
    Fetches the image from the given URL and returns its Base64 encoded string.
    """
    return ''  # Temporarily returning an empty string for base64

# Save content to file, for back up only, structured data is saved in JSON file.
def save_content_to_file(content, filename):
    """
    Saves the provided content to a file.
    """
    with open(filename, "w", encoding="utf-8") as file:
        file.write(content)
    log_message(f"Content successfully saved to {filename}")

def save_image_to_file(img_src):
    """
    Save the image from the provided URL to a specified directory with a safe filename.
    
    Parameters:
    img_src (str): The URL of the image to be saved.
    """
    # Clean the URL to make it filename-safe
    filename_safe_url = img_src.replace(":", "=").replace("/", "|")
    
    # Specify the save path
    save_directory = "docs/rawdata/img"
    os.makedirs(save_directory, exist_ok=True)

    # Define the filename
    filename = os.path.join(save_directory, f"{filename_safe_url}.jpg")
    
    # Download and save the image
    urllib.request.urlretrieve(img_src, filename)
    log_message(f"Image saved as: {filename}")


# Save content to JSON file in a specified format
def save_data_json_with_format(content, filename):
    """
    Saves the provided content to a JSON file with specified indentation format.
    Ensures the content is appended correctly to an existing JSON array.
    """
    # Check if the file exists and load its content if it does
    if os.path.exists(filename):
        with open(filename, 'r', encoding="utf-8") as json_file:
            try:
                existing_data = json.load(json_file)
            except json.JSONDecodeError:
                existing_data = []
    else:
        existing_data = []
    
    # Append new content to the existing data
    existing_data.extend(content)
    
    # Save the updated data back to the file
    with open(filename, 'w', encoding="utf-8") as json_file:
        json.dump(existing_data, json_file, indent=4, ensure_ascii=False)
    
    log_message(f"JSON content successfully saved to {filename}")   

# Extract text and images from the part with class="Mid2L_con" and save to docs first, then JSON. n means how many lines of text was stored before and after each image.
def extract_text_and_images(currenturl, tree, n=3):
    """
    Extracts text and images from the part of the webpage with class="Mid2L_con", 
    and splits into two parts: one with text only, and one with text + images.
    Saves content to files first in 'docs', then processes and saves image metadata to JSON files.
    """
    
    if tree:

        # Extract the page title
        # title = tree.xpath('//title/text()') # regular title
        # if title:
        #     title_text = title[0].strip()
        # else:
        #     title_text = 'No Title'
        
        # Extract title with detailed data and author info, such as "2023-08-21 14:06:02 来源：游民星空 作者：LIN木木 编辑：LIN木木　浏览：17536"
        mid2L_tit_elements = tree.xpath('//div[@class="Mid2L_tit"]')
        if mid2L_tit_elements:
            title_text = mid2L_tit_elements[0].text_content()           
            # Remove leading and trailing whitespace and empty lines in the middle
            lines = [line.strip() for line in title_text.splitlines() if line.strip()] 
            title_text = "\n".join(lines)

        # Extract the part of the page with class="Mid2L_con"
        mid2l_con = tree.xpath('//div[@class="Mid2L_con"]')

        # Clean the URL to make it filename-safe
        filename_safe_url = currenturl.replace(":", "=").replace("/", "|")

        if mid2l_con:
            text_content_list = [f"Title: {title_text}"]
            text_with_images_list = [f"Title: {title_text}"]
            txt_data_list = []
            img_data_list = []
            img_elements = []
            stop_extraction = False

            # First pass: gather all text and image elements
            for element in mid2l_con[0].iter():
                if stop_extraction:
                    break

                # If it's a text node, extract the text and tail
                if element.text and isinstance(element.tag, str):
                    text = element.text.strip()
                else:
                    text = ""

                # Also check the 'tail' for text outside the tag
                if element.tail:
                    tail_text = element.tail.strip()
                else:
                    tail_text = ""

                # Combine text and tail_text
                combined_text = text + " " + tail_text if text or tail_text else ""

                # Append the combined text to the list if it's not empty and does not contain certain phrases
                if combined_text:
                    # Check if the combined text starts with "本文由游民星空"
                    if combined_text.startswith("本文由游民星空"):
                        stop_extraction = True
                    else:
                        # Check if combined_text contains any of the unwanted phrases
                        unwanted_phrases = [
                            "更多相关内容请关注",
                            "责任编辑",
                            "友情提示：",
                            "本文是否解决了您的问题",
                            "已解决",
                            "未解决",
                            "黑神话：悟空专区",
                            "上一页",
                            "下一页"
                        ]
                        if not any(phrase in combined_text for phrase in unwanted_phrases):
                            text_content_list.append(combined_text)
                            text_with_images_list.append(combined_text)


                # If it's an <img> tag, store the img element for processing later
                if element.tag == 'img' and not stop_extraction:
                    img_src = element.get('src')
                    img_data_src = element.get('data-src', img_src)  # Use data-src if available, otherwise fallback to src
                    img_alt = element.get('alt', '')
                    img_title = element.get('title', '')
                    img_width = element.get('width', '')
                    img_height = element.get('height', '')

                    # Convert relative paths to absolute URLs
                    img_src = urljoin(currenturl, img_data_src)

                    img_elements.append({
                        "src": img_src,
                        "alt": img_alt,
                        "title": img_title,
                        "width": img_width,
                        "height": img_height
                    })

                    # Add placeholder for image tag in the content
                    text_with_images_list.append('<img placeholder>')

            # After gathering text and images, process them outside the loop
            for idx, img_element in enumerate(img_elements):
                img_src = img_element["src"]
                img_alt = img_element["alt"]
                img_title = img_element["title"]
                img_width = img_element["width"]
                img_height = img_element["height"]

                # Get the Base64 encoded image content (currently returning empty string)
                img_base64 = get_base64_encoded_image(img_src)

                # Save the raw image to a file
                save_image_to_file(img_src)

                # Find the index of the current image in the content
                img_index = text_with_images_list.index('<img placeholder>', idx)

                # Get n lines before and after the image
                txt_bef_img = []
                txt_aft_img = []
                txt_bef_img_str = ''
                txt_aft_img_str = ''

                # Extract n lines before the image, stop if another <img> tag is encountered
                for i in range(img_index-1, max(0, img_index-n)-1, -1):
                    if '<img' in text_with_images_list[i]:
                        break
                    txt_bef_img.append(text_with_images_list[i])
                    txt_bef_img.reverse()

                # Extract n lines after the image, stop if another <img> tag is encountered
                for i in range(img_index+1, min(len(text_with_images_list), img_index+1+n)):
                    if '<img' in text_with_images_list[i]:
                        break
                    txt_aft_img.append(text_with_images_list[i])

                txt_bef_img_str = '\n'.join(txt_bef_img)
                txt_aft_img_str = '\n'.join(txt_aft_img)

                # Add the image metadata to the img_data_list
                img_data_list.append({
                    "src": img_src,
                    "base64": img_base64,  # Temporarily set to an empty string
                    "title": img_title,
                    "alt": img_alt,
                    "txt_bef_img": txt_bef_img_str,
                    "txt_aft_img": txt_aft_img_str,
                    "url": currenturl,
                    "type": "img"
                })

                # Replace the placeholder with the actual image tag
                img_tag = f'<img src="{img_src}" alt="{img_alt}" width="{img_width}" height="{img_height}" title="{img_title}">'
                text_with_images_list[img_index] = img_tag

            # Convert text_content_list to a single string
            text_content_str = '\n'.join(text_content_list)

            # Save content to docs folder first
            text_only_filename = os.path.join("docs/rawdata/", f"{filename_safe_url}_text_only.txt")
            text_with_images_filename = os.path.join("docs/rawdata/", f"{filename_safe_url}_text_with_images.html")
            save_content_to_file(text_content_str, text_only_filename)
            save_content_to_file('\n'.join(text_with_images_list), text_with_images_filename)

            txt_data_list.append({
                    "txt": text_content_str,
                    "url": currenturl,
                    "type": "text"
                })

            # Save the entire text_content_str directly to mmtext.json
            save_data_json_with_format(txt_data_list, "docs/mmtext.json")

            # Save the image metadata to JSON as a list of objects
            save_data_json_with_format(img_data_list, "docs/mmimg.json")

            return f"Content saved to files in docs and JSON files processed."
        else:
            return "No content found with class='Mid2L_con'."
    else:
        return "Failed to fetch content."

# Load existing links from the JSON file
def load_existing_links(filename="docs/links.json"):
    """
    Loads existing links from the specified JSON file.
    If the file doesn't exist, it returns an empty list.
    """
    if os.path.exists(filename):
        with open(filename, 'r', encoding="utf-8") as json_file:
            return json.load(json_file)
    return []

# Save new link to the JSON file
def save_link_to_json(new_link, filename="docs/links.json"):
    """
    Saves the provided link to the specified JSON file.
    If the link contains .shtml?, it removes the string after .shtml.
    If the link already exists, it returns False.
    If the link is new and added, it returns True.
    """
    # Check if the new_link contains '.shtml?'
    if ".shtml?" in new_link:
        new_link = new_link.split(".shtml?")[0] + ".shtml"
    
    links = load_existing_links(filename)
    if new_link not in links:
        links.append(new_link)
        with open(filename, 'w', encoding="utf-8") as json_file:
            json.dump(links, json_file, indent=4, ensure_ascii=False)
        log_message(f"New link added to JSON: {new_link}")
        return False
    else:
        log_message(f"Link alredy in JSON.")
        return True


# Check if the link exists in the JSON file
def check_link_in_json(new_link, filename="docs/links.json"):
    """
    Checks if the provided link exists in the specified JSON file.
    If the link contains .shtml?, it removes the string after .shtml.
    Returns True if the link is found, otherwise False.
    """
    # Check if the new_link contains '.shtml?'
    if ".shtml?" in new_link:
        new_link = new_link.split(".shtml?")[0] + ".shtml"
    
    # Load existing links from the JSON file
    links = load_existing_links(filename)
    
    # Return True if the link is found, otherwise False
    if new_link in links:
        return True
    else:
        return False



# Send request with retry mechanism
def fetch_url_with_retries(url, max_retries=2):
    """
    Attempts to fetch content from the given URL, retrying up to max_retries times.
    If the request fails, it waits for 1 second before retrying.
    """
    retries = 0
    while retries < max_retries:
        try:
            response = requests.get(url, timeout=3)  # Set timeout to 3 seconds
            
            # If the status code is 200, the request was successful, return the content
            if response.status_code == 200:
                log_message(f"-----------------------------------------------------------")
                log_message(f"Success on attempt {retries + 1} for {url}")
                return response.content
            
            # If the status code is not 200, log the failure reason
            else:
                log_message(f"Attempt {retries + 1} failed with status code {response.status_code}")
        
        except requests.RequestException as e:
            # Capture request exceptions like timeout or connection errors
            log_message(f"Attempt {retries + 1} failed with error: {e}")
        
        # Increment retry count
        retries += 1
        
        # Wait for 1 second before retrying
        time.sleep(1)

    # If max retries are exceeded, return None or handle the error accordingly
    log_message(f"Failed to fetch the URL after {max_retries} attempts.")
    return None

# New function to crawl the webpage and its linked pages up to a given depth
def crawl_and_extract(url, keyword="黑神话", linkdepth=2):
    """
    Crawls the webpage starting from the given URL, and checks for links within the page.
    If a page contains the term specified in 'keyword' in either "Mid2L_con" class or in the title,
    or in the whole HTML content, it saves the link in 'links.json'.
    Crawls up to the given linkdepth (including the original URL).
    """
    def crawl(url, current_depth, max_depth, pbar):

        # Check if the link exists in the JSON file and it is in the max depth, if yes, just return.
        if check_link_in_json(url) == True and current_depth == max_depth:
            # log_message(f"Link found in JSON: {url} (Depth: {current_depth})")
            return
        
        # Fetch the page content
        html_content = fetch_url_with_retries(url)
        if not html_content:
            log_message(f"Failed to fetch content for {url} (Depth: {current_depth})")
            return
        
        # Parse the HTML using lxml
        tree = html.fromstring(html_content)

        # Check if class="Mid2L_con" or title contains the keyword
        mid2l_con_elements = tree.xpath('//div[@class="Mid2L_con"]')
        title_elements = tree.xpath('//title/text()')
        
        # Check if keyword exists in Mid2L_con or Title
        mid2l_con_text = mid2l_con_elements[0].text_content() if mid2l_con_elements else ""
        title_text = title_elements[0] if title_elements else ""
        
        links = []

        if mid2l_con_text:
            if keyword in mid2l_con_text or keyword in title_text:
                log_message(f"Found '{keyword}' in Mid2L_con or Title at {url} (Depth: {current_depth})")
                linkexist = save_link_to_json(url)  # Save the link to JSON
                if linkexist == False:
                    extract_text_and_images(url, tree)
                if current_depth < max_depth:
                    # Get all links on the page
                    alinks = tree.xpath('//div[@class="Mid2L_con"]//a[@href]/@href')
                    links = [urljoin(url, link) for link in alinks if link.startswith(('http', '/'))]
                    links = list(set(links))
                    # Remove unwanted link, links starting with 'javascript:', and those ending with '.jpg' or '.png'
                    unwanted_link = "https://www.gamersky.com/z/bmwukong/"
                    filtered_links = [link for link in links if link != unwanted_link and not link.startswith('javascript:') and not link.endswith(('.jpg', '.png'))]
                    links = filtered_links
                    log_message(f"Found {len(links)} links on {url} (Depth: {current_depth}). Crawling deeper...")
        else:
            # If not found in Mid2L_con, check the full HTML content
            if keyword in title_text: 
                log_message(f"Found '{keyword}' in full HTML at {url} (Depth: {current_depth})")
            # if keyword in html_content.decode('utf-8', errors='ignore'):                
                linkexist = save_link_to_json(url)  # Save the link to JSON
                if linkexist == False:
                    pass
                    # extract_text_and_images(url, tree)   # Don't extract if it is just an overview
                if current_depth < max_depth:
                    # Get all links on the page
                    alinks = tree.xpath('//a[@href]/@href')
                    links = [urljoin(url, link) for link in alinks if link.startswith(('http', '/'))]
                    links = list(set(links))
                    # Remove unwanted link, links starting with 'javascript:', and those ending with '.jpg' or '.png'
                    unwanted_link = "https://www.gamersky.com/z/bmwukong/"
                    filtered_links = [link for link in links if link != unwanted_link and not link.startswith('javascript:') and not link.endswith(('.jpg', '.png'))]
                    links = filtered_links
                    log_message(f"Found {len(links)} links on {url} (Depth: {current_depth}). Crawling deeper...")
            else:
                log_message(f"No '{keyword}' found at {url} (Depth: {current_depth})")

        
        if current_depth < max_depth and links:
            # Recursively crawl the found links, with increased depth
            for link in tqdm(links, desc=f"Crawling depth {current_depth}/{max_depth}", leave=False, position=1, dynamic_ncols=True):
                crawl(link, current_depth + 1, max_depth, pbar)
                pbar.update(1)

    # Start crawling from the given URL 
    crawled_urls = set()  # To avoid re-crawling the same URL

    # Write the start message
    log_message("=== Crawl Start ===")

    with tqdm(total=100, desc="Crawling", position=0, dynamic_ncols=True) as pbar:
        crawl(url, current_depth=1, max_depth=linkdepth, pbar=pbar)

    # Write the end message with two empty lines
    log_message("=== Crawl End ===\n\n")

# Test URL and Keyword
url = "https://www.gamersky.com/handbook/202408/1803760.shtml"
keyword = "黑神话"
crawl_and_extract(url, keyword=keyword, linkdepth=2)


  if tree:
Crawling:  39%|███▉      | 39/100 [00:14<00:22,  2.72it/s]


### 2. Vector store

In [5]:
from dotenv import load_dotenv,find_dotenv
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain_chroma import Chroma
from langchain.schema.document import Document
from langchain_core.output_parsers import StrOutputParser
from IPython.display import Markdown, display

load_dotenv(find_dotenv()) 

# Preparation of documents for RAG-------------------------
# Vectorstore, for retrieval
embedding_model=OpenAIEmbeddings(model="text-embedding-3-large")

vectorstore_path = "vectorstore/chromadb-mmgamerag"
if os.path.exists(vectorstore_path):
    print(f"Loaded vectorstore from disk: {vectorstore_path}")
else:
    # Initialize an empty vectorstore and persist to disk
    print(f"Initialized an empty vectorstore in {vectorstore_path}")

vectorstore = Chroma(
                embedding_function=embedding_model,
                persist_directory=vectorstore_path,
                ) 

def add_text_documents_to_vectorstore(vectorstore, documents):
    # Retrieve existing documents from the vectorstore
    existing_docs = vectorstore.get()
    
    existing_urls = [metadata['url'] for metadata in existing_docs['metadatas']] #???metadata
    print(f"Quantity of existing_urls: {len(existing_urls)}")
    # print(existing_urls)
    # Filter out documents that already exist based on URL
    new_documents = [doc for doc in documents if doc.metadata["url"] not in existing_urls]

    if new_documents:
        vectorstore.add_documents(new_documents)
        # vectorstore.persist()  # Persist the vectorstore after adding documents
        print(f"Added {len(new_documents)} new text documents.")
    else:
        print("No new text documents to add.")


def add_img_documents_to_vectorstore(vectorstore, documents):
    # Retrieve existing documents from the vectorstore
    existing_docs = vectorstore.get()
    
    # Use `get` to avoid KeyError if some metadata does not have 'src'
    existing_srcs = [metadata.get('src') for metadata in existing_docs['metadatas'] if 'src' in metadata]
    print(f"Quantity of existing_srcs: {len(existing_srcs)}")
    print(existing_srcs)
    
    # Filter out documents that already exist based on src
    new_documents = [doc for doc in documents if doc.metadata.get("src") not in existing_srcs]

    if new_documents:
        vectorstore.add_documents(new_documents)
        # vectorstore.persist()  # Persist the vectorstore after adding documents
        print(f"Added {len(new_documents)} new img documents.")
    else:
        print("No new img documents to add.")


def add_txt_img():
    txt_data_list = []
    img_data_list = []

    # Directly load and assign to txt_data_list from mmtext.json
    with open('docs/mmtext.json', 'r', encoding='utf-8') as text_file:
        txt_data_list = json.load(text_file)  # Assuming the JSON structure matches the required format

    # Directly load and assign to img_data_list from mmimg.json
    with open('docs/mmimg.json', 'r', encoding='utf-8') as img_file:
        img_data_list = json.load(img_file)  # Assuming the JSON structure matches the required format

    # Add texts
    mmtexts = [
        Document(page_content=item['txt'], metadata={"url": item['url'], "type": item['type']})
        for item in txt_data_list
    ]

    # Add documents and save to vectorstore
    add_text_documents_to_vectorstore(vectorstore, mmtexts)


    def get_img_description(src):
        return ""

    # Add imgs
    mmimgs = [
        Document(
            page_content="Text before image: " + item['txt_bef_img'] + "\nImage descriptioin: " + get_img_description(item['src']) + "\nText after image:" + item['txt_aft_img'],  
            metadata={"url": item['url'], "type": item['type'], "src": item['src']}
        )
        for item in img_data_list  # Iterate over each item in img_data_list
    ]

    # Add documents and save to vectorstore
    add_img_documents_to_vectorstore(vectorstore, mmimgs)

Loaded vectorstore from disk: vectorstore/chromadb-mmgamerag


### 3. Retrieval

In [6]:
# retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

# retrieved_docs = retriever.invoke("猫睛宝串")
# retrieved_docs

retrieved_docs = vectorstore.similarity_search_with_relevance_scores(query="君子牌", k=5, filter={"type": "text"})

# Iterate over retrieved_docs and extract the url, page_content, and score
for doc, score in retrieved_docs:
    url = doc.metadata.get('url', 'No URL found')  # Extract the URL from the metadata
    type = doc.metadata.get('type', '') 
    page_content = doc.page_content  # Get the page content
    # print(f"URL: {url}\nContent: {page_content}\nScore: {score}\nType: {type}\n")

retrieved_docs = vectorstore.similarity_search_with_relevance_scores(query="君子牌", k=5, filter={"type": "img"})

# Iterate over retrieved_docs and extract the url, page_content, and score
for doc, score in retrieved_docs:
    url = doc.metadata.get('url', 'No URL found')  # Extract the URL from the metadata
    type = doc.metadata.get('type', '') 
    src = doc.metadata.get('src', '') 
    page_content = doc.page_content  # Get the page content
    print(f"URL: {url}\nSRC: {src}\nContent: {page_content}\nScore: {score}\nType: {type}\n")



URL: https://www.gamersky.com/handbook/202408/1803760_29.shtml
SRC: http://img1.gamersky.com/image2024/08/20240823_qy_372_2/image031_S.jpg
Content: Text before image: 展开 
获取方法：击败黄花观中的虫羽士有概率掉落。 
上品-君子牌 
Image descriptioin: 
Text after image:
Score: 0.262758268011249
Type: img

URL: https://www.gamersky.com/handbook/202408/1803760_26.shtml
SRC: http://img1.gamersky.com/image2024/08/20240822_qy_372_50/image013_S.jpg
Content: Text before image: 第26页：特品-仙箓 
特品-仙箓 
展开 
Image descriptioin: 
Text after image:
Score: 0.12574522394022047
Type: img

URL: https://www.gamersky.com/handbook/202408/1803760_24.shtml
SRC: http://img1.gamersky.com/image2024/08/20240822_qy_372_4/image016_S.jpg
Content: Text before image: 第24页：上品-三清令 
上品-三清令 
展开 
Image descriptioin: 
Text after image:
Score: 0.10174461676199487
Type: img

URL: https://www.gamersky.com/handbook/202408/1803760_33.shtml
SRC: http://img1.gamersky.com/image2024/08/20240824_qy_372_2/image030_S.jpg
Content: Text before image: 第33页：上品-阳燧珠 
上品-阳燧珠

### 4. Q&A with LLM

In [8]:
from IPython.display import Markdown, display, Image

mmgamellm = ChatOpenAI(name="MMGameRag", model_name="gpt-4o-mini", temperature=0.6, streaming=True)

def format_docs(docs_with_scores):
    """
    Formats the retrieved documents into a string with their content, URL, and score,
    and lists them in order with numbering.
    """
    formatted_docs = []
    
    # Iterate over the documents and their associated scores
    for i, (doc, score) in enumerate(docs_with_scores, 1):  # Enumerate to add numbering starting from 1
        imgsrc = doc.metadata.get('src', '')
        if imgsrc: # Image
            formatted_doc = (
                f"{i}.\n"
                f"Image Content:\n{doc.page_content}\n"  # Content of the document
                f"Page Url: {doc.metadata.get('url', '')}\n"  # Assuming URL is stored in metadata
                f"Image Src: {doc.metadata.get('src', '')}\n"  # Assuming URL is stored in metadata
                f"Score: {score}\n"  # Similarity score for the document
            )
        else:  # Text
            formatted_doc = (
                f"{i}.\n"
                f"Text Content:\n{doc.page_content}\n"  # Content of the document
                f"Page Url: {doc.metadata.get('url', '')}\n"  # Assuming URL is stored in metadata
                f"Score: {score}\n"  # Similarity score for the document
            )
        formatted_docs.append(formatted_doc)  # Add formatted document to the list
    
    return "\n".join(formatted_docs)  # Join all formatted documents into a single string

# Prompt for code generation
prompt_template = """你是《黑神话：悟空》这款游戏的AI助手，根据Question和Context专门为玩家提供详尽的游戏攻略并以Markdown的格式输出.请注意：
1. 在Image中找到与Question和Answer最相关的图像。每个Image都有Text before image，Image descriptioin和Text after image，可以用来判断这个Image应该被插入到与文本答案最匹配的上下文的哪个段落当中。格式如下：
    
    文本答案段落
    [![](图像1的Src)](图像1的Url)
    文本答案段落
    [![](图像2的Src)](图像2的Url)
    文本答案段落
    ...

2. 在输出答案的最后，根据问题找到context中的最相关的几个参考文档，并列出Url链接，以供用户参考原始文档。

Question: 
{question}

Context: 
{context}

Image:
{image}

Answer:
"""

prompt_code = ChatPromptTemplate.from_template(prompt_template)

chain = (
    prompt_code
    | mmgamellm
    | StrOutputParser()
)

gamer_question = "黑神话一共有多少上品珍宝？举几个例子"
context_retrieval = format_docs(vectorstore.similarity_search_with_score(query=gamer_question, k=5, filter={"type": "text"}))
# print(context_retrieval + "\n------------------------\n")
img_retrieval = format_docs(vectorstore.similarity_search_with_score(query=gamer_question, k=5, filter={"type": "img"}))
# print(img_retrieval + "\n------------------------\n")
result = chain.invoke({
    "question": gamer_question, 
    "context": context_retrieval,
    "image": img_retrieval
})


display(Markdown(result))
# display(Image(url="http://img1.gamersky.com/image2024/08/20240819_qy_372_15/image001_S.jpg"))


在《黑神话：悟空》中，上品的珍玩一共有37种。以下是几个上品的例子及其获取方法：

1. **上品-不求人**
   - **获取方法**：将主线章节推进至第二回后，返回第一回的任意土地庙购买获得。
   - [![](http://img1.gamersky.com/image2024/08/20240824_qy_372_2/image030_S.jpg)](https://www.gamersky.com/handbook/202408/1803760_33.shtml)

2. **上品-兽与佛**
   - **获取方法**：在第三回小雷音寺中击败双刀僧（拿着双弯刺的红袍僧人）概率掉落。
   - [![](http://img1.gamersky.com/image2024/08/20240822_qy_372_4/image006_S.jpg)](https://www.gamersky.com/handbook/202408/1803760_20.shtml)

3. **上品-君子牌**
   - **获取方法**：击败黄花观中的虫羽士有概率掉落。
   - [![](http://img1.gamersky.com/image2024/08/20240823_qy_372_2/image031_S.jpg)](https://www.gamersky.com/handbook/202408/1803760_29.shtml)

4. **上品-卵中骨**
   - **获取方法**：在第四回盘丝洞中击碎地上的虫卵有概率获得（非唯一）。
   - [![](http://img1.gamersky.com/image2024/08/20240822_qy_372_4/image006_S.jpg)](https://www.gamersky.com/handbook/202408/1803760_21.shtml)

5. **上品-阳燧珠**
   - **获取方法**：获取方法未详细说明。
   - [![](http://img1.gamersky.com/image2024/08/20240824_qy_372_2/image030_S.jpg)](https://www.gamersky.com/handbook/202408/1803760_33.shtml)

这些上品珍玩不仅在游戏中具有独特的效果，还能帮助玩家在冒险中更为顺利。你可以通过不同的方式获得这些珍玩，提升你的游戏体验。

### 参考文档
- [上品-不求人](https://www.gamersky.com/handbook/202408/1803760_3.shtml)
- [上品-兽与佛](https://www.gamersky.com/handbook/202408/1803760_19.shtml)
- [上品-君子牌](https://www.gamersky.com/handbook/202408/1803760_29.shtml)
- [上品-卵中骨](https://www.gamersky.com/handbook/202408/1803760_21.shtml)
- [上品-阳燧珠](https://www.gamersky.com/handbook/202408/1803760_33.shtml)