In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install gradio
!pip install duckduckgo_search
#!pip install faiss-cpu
#!pip install faiss-gpu


## **Download models**

In [10]:
import kagglehub

# Fetch the latest version of the model through kagglehub and report where
# the files ended up on disk.
MODEL_HANDLE = "deepseek-ai/deepseek-r1/transformers/deepseek-r1-distill-qwen-7b"
path = kagglehub.model_download(MODEL_HANDLE)

print("Path to model files:", path)

Path to model files: /kaggle/input/deepseek-r1/transformers/deepseek-r1-distill-qwen-7b/1


## Loading the model and tokenizer using the Transformers library


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Local Kaggle copy of the DeepSeek-R1 distilled Qwen-7B checkpoint.
# Alternative checkpoint id left by the author: "Qwen/Qwen2.5-7B-Instruct"
model_name = model_folder = "/kaggle/input/deepseek-r1/transformers/deepseek-r1-distill-qwen-7b/1"
print(model_name, ' - model')

# dtype is taken from the checkpoint ("auto"); weights are placed on the CUDA device.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="cuda")
tokenizer = AutoTokenizer.from_pretrained(model_name)


## No RAG, search only

In [None]:
import os
import time
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from duckduckgo_search import DDGS  # 请确保已安装 duckduckgo_search

# ------------------------------
# 1. 主生成模型加载部分
# ------------------------------
# Every sub-directory of the parent folder is offered as a selectable model.
model_parent_folder = "/kaggle/input/deepseek-r1/transformers"
available_models = list(
    filter(
        lambda entry: os.path.isdir(os.path.join(model_parent_folder, entry)),
        os.listdir(model_parent_folder),
    )
)
print("可用模型：", available_models)

def load_main_model(selected_model):
    """Load the causal LM and tokenizer stored under ``selected_model``.

    The checkpoint is read from ``<model_parent_folder>/<selected_model>/1``;
    every model directory is assumed to contain a version directory named "1".

    Args:
        selected_model: Name of a sub-directory of ``model_parent_folder``.

    Returns:
        A ``(model, tokenizer)`` tuple; the model is loaded with
        ``torch_dtype="auto"`` and ``device_map="cuda"``.
    """
    checkpoint_dir = os.path.join(model_parent_folder, selected_model, "1")
    load_options = {"torch_dtype": "auto", "device_map": "cuda"}
    causal_lm = AutoModelForCausalLM.from_pretrained(checkpoint_dir, **load_options)
    return causal_lm, AutoTokenizer.from_pretrained(checkpoint_dir)

# Cache keyed by model directory name; each value is the (model, tokenizer)
# pair produced by load_main_model, so a model is only loaded once per session.
main_model_cache = dict()

# ------------------------------
# 2. 外部搜索接口（使用 DuckDuckGo 的 DDGS）
# ------------------------------
def external_api_search(query, max_results=5):
    """Fetch DuckDuckGo text results for ``query`` and join their snippets.

    The search is a best-effort enrichment step: if the query is blank or the
    search call raises (network error, rate limiting, ...), a warning is
    printed and an empty string is returned so the caller can continue
    without search context.

    Args:
        query: Search string.
        max_results: Maximum number of results requested from DuckDuckGo.

    Returns:
        The ``"body"`` field of every result, each followed by a newline.
        Results with a missing or ``None`` body contribute an empty line.
        Returns ``""`` when there are no results or the search failed.
    """
    if not query or not query.strip():
        return ""

    try:
        with DDGS() as ddgs:
            # Materialise inside the context manager: some duckduckgo_search
            # versions return a lazy iterator that must be consumed while the
            # underlying session is still open.
            results = list(ddgs.text(query, max_results=max_results) or [])
    except Exception as exc:  # search is optional; never abort generation because of it
        print(f"联网搜索失败，将在没有搜索上下文的情况下继续：{exc}")
        return ""

    # `or ""` guards against results whose body is present but None.
    return "".join((res.get("body") or "") + "\n" for res in results)

# ------------------------------
# 3. 分块生成函数（分段生成策略）
# ------------------------------
def chunk_generation(prompt, model, tokenizer, max_chunk=512, max_iter=4, max_total_tokens=2048):
    """Generate a completion for ``prompt`` in several bounded chunks.

    Each round feeds ``prompt`` plus everything generated so far back into the
    model and asks for at most ``max_chunk`` further tokens. Generation stops
    when an EOS token is produced, when ``max_iter`` rounds have run, when the
    model yields no new tokens, or when the token budget is used up.

    Args:
        prompt: Text the model should continue.
        model: Causal LM exposing ``generate``, ``device`` and
            ``config.eos_token_id``.
        tokenizer: Tokenizer matching ``model``.
        max_chunk: Maximum number of new tokens requested per round.
        max_iter: Maximum number of generation rounds.
        max_total_tokens: Budget for prompt plus generated tokens; a round is
            skipped once the current input already reaches it.

    Returns:
        Only the newly generated text (the prompt is not repeated), with
        special tokens stripped. Empty string if the prompt alone already
        exhausts ``max_total_tokens``.
    """
    eos_token_id = model.config.eos_token_id
    # Some model configs declare several EOS ids; normalise for the stop check
    # and pick a single id for padding.
    if isinstance(eos_token_id, (list, tuple)):
        eos_ids = set(eos_token_id)
        pad_token_id = eos_token_id[0] if eos_token_id else None
    else:
        eos_ids = {eos_token_id}
        pad_token_id = eos_token_id

    full_output = ""
    for _ in range(max_iter):
        # Continue from the original prompt plus what has been generated so far.
        current_prompt = prompt + full_output
        inputs = tokenizer(current_prompt, return_tensors="pt").to(model.device)
        input_length = inputs.input_ids.shape[1]

        # Shrink the request so prompt + completion stays within the budget.
        max_new_tokens = min(max_chunk, max_total_tokens - input_length)
        if max_new_tokens <= 0:
            # generate() rejects a non-positive max_new_tokens; nothing more fits.
            break

        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,         # cap on newly generated tokens
            min_length=10,                         # NOTE: counts prompt tokens as well
            do_sample=True,                        # enable sampling
            temperature=0.7,                       # sampling randomness
            top_k=50,                              # restrict the sampling pool
            top_p=0.95,                            # nucleus sampling
            num_beams=3,                           # beam search width
            early_stopping=True,                   # stop beams early
            repetition_penalty=1.2,                # discourage repetition
            no_repeat_ngram_size=3,                # forbid repeated 3-grams
            use_cache=True,                        # reuse the KV cache
            pad_token_id=pad_token_id,             # pad with the EOS token
            output_scores=False,
            output_attentions=False,
            output_hidden_states=False,
            num_return_sequences=1
        )

        # The returned sequence starts with the input tokens; keep only what
        # this round added, otherwise the prompt and all earlier chunks would
        # be appended to the result again on every iteration.
        new_tokens = outputs[0][input_length:]
        if len(new_tokens) == 0:
            break
        full_output += tokenizer.decode(new_tokens, skip_special_tokens=True)

        # Stop only when EOS was produced in this round's tokens.
        if eos_ids.intersection(new_tokens.tolist()):
            break
    return full_output

# ------------------------------
# 4. 混合生成函数（只使用主模型和联网搜索，不使用 RAG 功能）
# 并同时返回生成文本和统计信息
# ------------------------------
def hybrid_generation(query, model_choice, use_external_search, use_chunk_generation):
    """Answer ``query`` with the selected model, optionally using web search.

    Args:
        query: The user's question.
        model_choice: Name of the model directory to use; loaded through
            ``load_main_model`` on first use and cached in ``main_model_cache``.
        use_external_search: When true, ``external_api_search(query)`` is
            called and its text is prepended to the prompt.
        use_chunk_generation: When true, generation is delegated to
            ``chunk_generation``; otherwise a single ``generate`` call is made.

    Returns:
        A ``(answer, stats)`` tuple of strings. ``answer`` is the reply
        prefixed with "最终回答："; ``stats`` reports generated token count,
        elapsed seconds and tokens per second. If no model is selected or the
        prompt is too long for the single-pass budget, ``answer`` is an
        explanatory message and ``stats`` is empty.
    """
    # An unselected dropdown can deliver None; fail with a readable message
    # instead of a TypeError from os.path.join.
    if not model_choice:
        return "请先选择主模型。", ""

    # Load the chosen model once and reuse it on later calls.
    if model_choice not in main_model_cache:
        main_model_cache[model_choice] = load_main_model(model_choice)
    main_model, main_tokenizer = main_model_cache[model_choice]

    # Optional web-search context.
    search_context = external_api_search(query) if use_external_search else ""

    combined_query = ""
    if search_context:
        combined_query += f"搜索上下文: {search_context}\n"
    combined_query += f"原问题: {query}"

    start_time = time.time()

    if use_chunk_generation:
        reply = chunk_generation(combined_query, main_model, main_tokenizer, max_chunk=512, max_iter=4)
        # No token ids are available on this path, so re-encode the reply;
        # special tokens are excluded so they do not inflate the count.
        tokens_generated = len(main_tokenizer.encode(reply, add_special_tokens=False))
    else:
        inputs = main_tokenizer(combined_query, return_tensors="pt").to(main_model.device)
        input_length = inputs.input_ids.shape[1]
        # Shrink the request so prompt + completion stays within 2048 tokens.
        max_new_tokens = min(800, 2048 - input_length)
        if max_new_tokens <= 0:
            # generate() rejects a non-positive max_new_tokens.
            return (
                f"输入过长（{input_length} 个 token），已超出 2048 token 的上下文预算，请缩短问题或关闭联网搜索。",
                "",
            )
        outputs = main_model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,         # cap on newly generated tokens
            min_length=10,                         # NOTE: counts prompt tokens as well
            do_sample=True,                        # enable sampling
            temperature=0.7,                       # sampling randomness
            top_k=50,                              # restrict the sampling pool
            top_p=0.95,                            # nucleus sampling
            num_beams=3,                           # beam search width
            early_stopping=True,                   # stop beams early
            repetition_penalty=1.2,                # discourage repetition
            no_repeat_ngram_size=3,                # forbid repeated 3-grams
            use_cache=True,                        # reuse the KV cache
            pad_token_id=main_model.config.eos_token_id,  # pad with the EOS token
            output_scores=False,
            output_attentions=False,
            output_hidden_states=False,
            num_return_sequences=1
        )
        # The returned sequence starts with the prompt tokens; drop them so the
        # answer does not echo the search context and the count covers only
        # what was actually generated.
        new_tokens = outputs[0][input_length:]
        reply = main_tokenizer.decode(new_tokens, skip_special_tokens=True)
        tokens_generated = len(new_tokens)

    elapsed_time = time.time() - start_time
    # Guard against a zero interval on very fast (or coarse-clock) runs.
    tokens_per_second = tokens_generated / elapsed_time if elapsed_time > 0 else 0.0
    stats = (
        f"生成 token 数量: {tokens_generated}\n"
        f"生成耗时: {elapsed_time:.2f} 秒\n"
        f"生成速度: {tokens_per_second:.2f} tokens/second"
    )

    # Release cached GPU memory held by the allocator after generation.
    torch.cuda.empty_cache()

    # First element is the final answer, second the statistics text.
    return "最终回答：" + reply, stats

# ------------------------------
# 5. 构建 Gradio 界面
# ------------------------------
# 这里我们设置两个输出组件：一个显示回答，一个显示统计信息
# Select the first discovered model explicitly. Whether a Dropdown created
# without `value` starts unselected (callback then receives None) or with its
# first choice appears to depend on the Gradio release; setting it here makes
# the behaviour the same either way.
default_model = available_models[0] if available_models else None

# Two outputs: the generated answer and the generation statistics.
interface = gr.Interface(
    fn=hybrid_generation,
    inputs=[
        gr.Textbox(lines=2, placeholder="请输入你的问题..."),
        gr.Dropdown(choices=available_models, value=default_model, label="选择主模型"),
        gr.Checkbox(label="启用 DuckDuckGo 联网搜索", value=True),
        gr.Checkbox(label="启用分块生成策略", value=False)
    ],
    outputs=[
        gr.Textbox(label="生成回答"),
        gr.Textbox(label="统计信息")
    ],
    title="混合对话生成系统",
    description="选择主模型、是否启用联网搜索和分块生成策略，系统将整合搜索上下文和动态生成参数生成最终回复，并显示生成统计信息。"
)

# share=True requests a public share link in addition to the local server.
interface.launch(share=True)
