<a href="https://colab.research.google.com/github/Lilo-Denise/Personal-AI-Teaching-Assistant/blob/main/Personal_AI_Teaching_Assistant.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# === Cell 1: 挂载云盘与路径设置 ===
import os
from google.colab import drive

# Step 1: mount Google Drive.
drive.mount('/content/drive')

# Step 2: permanent storage locations.
# Everything lives in your Google Drive under the Personal_TA_Project folder.
PROJECT_ROOT = "/content/drive/MyDrive/Personal_TA_Project"
DATA_PATH, DB_PATH = (
    os.path.join(PROJECT_ROOT, subdir)
    for subdir in ("uploaded_pdfs", "vector_db")  # original PDFs, vector store
)

# Step 3: create both folders; exist_ok makes this a no-op when they are already there.
for _folder in (DATA_PATH, DB_PATH):
    os.makedirs(_folder, exist_ok=True)

print("✅ 云盘挂载成功！")
print(f"📂 文件存储位置: {DATA_PATH}")
print(f"🧠 记忆存储位置: {DB_PATH}")

Mounted at /content/drive
✅ 云盘挂载成功！
📂 文件存储位置: /content/drive/MyDrive/Personal_TA_Project/uploaded_pdfs
🧠 记忆存储位置: /content/drive/MyDrive/Personal_TA_Project/vector_db


In [4]:
# === Cell 2: 修正版安装 (Fix Dependencies) ===
!apt-get install poppler-utils
# 强制安装最新版的 transformers 以修复报错
!pip install -q git+https://github.com/huggingface/transformers
!pip install -q langchain langchain-community chromadb gradio pypdf
!pip install -q torch accelerate bitsandbytes sentence-transformers
!pip install -q pillow pdf2image protobuf

print("✅ 环境修复并安装完成！请继续。")

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.12).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sentence-transformers 5.1.2 requires transformers<5.0.0,>=4.41.0, but you have transformers 5.0.0.dev0 which is incompatible.[0m[31m
[0m✅ 环境修复并安装完成！请继续。


In [5]:
# === Cell 3: 定义多模态处理工具 ===
import os
from pdf2image import convert_from_path
from transformers import pipeline
from langchain.schema import Document
from langchain_community.document_loaders import PyPDFLoader
from PIL import Image

print("正在初始化视觉模型 (用于看图说话)...")
# Load a lightweight image-captioning model.
# device=0 requires a CUDA GPU, so choose the device at runtime and fall back to
# CPU (-1) when no GPU is attached to the runtime.
import torch  # local import: this cell does not otherwise import torch

image_captioner = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-base",
    device=0 if torch.cuda.is_available() else -1,
)

def process_pdf_multimodal(file_path):
    """Read a PDF and return one Document per text page: page text plus a caption.

    Each page's extracted text is followed by a "[Visual Description: ...]" line
    produced by `image_captioner` from the rendered page image.

    Rendering pages to images is best-effort: if it fails, or yields fewer
    images than there are text pages, the affected pages still come back with
    their text and a placeholder description instead of being dropped.

    Args:
        file_path: Path of the PDF to process.

    Returns:
        A list of Document objects whose metadata holds "source" (the file's
        base name) and "page" (1-based page number).
    """
    file_name = os.path.basename(file_path)
    print(f"正在深入分析文件: {file_name} ...")

    no_image_text = "[Visual Description: No clear image detected]"

    # 1. Try to render every page to an image (best-effort).
    try:
        images = convert_from_path(file_path)
    except Exception as e:
        print(f"图片提取失败: {e}")
        images = []

    # 2. Extract the text of every page.
    loader = PyPDFLoader(file_path)
    text_pages = loader.load()

    # 3. Merge page by page. The text pages drive the loop so that a failed or
    #    short image list never discards extracted text.
    documents = []
    for i, text_page in enumerate(text_pages):
        visual_text = no_image_text
        if i < len(images):
            try:
                caption = image_captioner(images[i], max_new_tokens=50)[0]['generated_text']
                visual_text = f"[Visual Description: {caption}]"
            except Exception as e:
                # Keep going with the placeholder, but report the failure.
                print(f"第 {i + 1} 页图片描述失败: {e}")

        # Combined content: original text + visual description.
        combined_content = f"{text_page.page_content}\n\n{visual_text}"

        documents.append(
            Document(
                page_content=combined_content,
                metadata={"source": file_name, "page": i + 1},
            )
        )

    return documents

print("✅ 多模态工具准备就绪！")

正在初始化视觉模型 (用于看图说话)...


config.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Device set to use cuda:0


✅ 多模态工具准备就绪！


In [6]:
# === Cell 4: 知识库管理 (含永久存储) ===
import shutil
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

# 1. Embedding model (turns text into vectors).
print("正在加载 Embedding 模型...")
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# 2. Reuse the store persisted under DB_PATH when that folder has content;
#    otherwise leave vector_db as None until the first upload creates it.
if not os.path.exists(DB_PATH) or not os.listdir(DB_PATH):
    print("未发现已有知识库，初始化为空库。")
    vector_db = None
else:
    print(f"发现已有知识库，正在从 {DB_PATH} 加载...")
    vector_db = Chroma(persist_directory=DB_PATH, embedding_function=embeddings)

def add_files_to_knowledge_base(uploaded_files):
    """Copy uploaded PDFs into DATA_PATH, parse them, and index them in the vector store.

    Args:
        uploaded_files: Iterable of upload objects exposing the temp-file path
            as `.name`; falsy values (None / empty) are rejected with a message.

    Returns:
        A status string describing the outcome.

    Side effects:
        Writes the files into DATA_PATH and rebinds the module-level
        `vector_db` when the store is created for the first time.
    """
    global vector_db

    if not uploaded_files:
        return "⚠️ 请先选择文件！"

    page_docs = []
    for upload in uploaded_files:
        # A. Copy the upload into the Drive-backed data folder.
        temp_path = upload.name
        saved_path = os.path.join(DATA_PATH, os.path.basename(temp_path))
        shutil.copy(temp_path, saved_path)

        # B. Multimodal parsing of the saved copy.
        page_docs += process_pdf_multimodal(saved_path)

    if len(page_docs) == 0:
        return "❌ 文件解析失败，可能是空文件。"

    # C. Split pages into chunks.
    chunks = RecursiveCharacterTextSplitter(
        chunk_size=500, chunk_overlap=50
    ).split_documents(page_docs)

    # D. Append to the existing store, or create it on first use.
    if vector_db is not None:
        vector_db.add_documents(chunks)
    else:
        vector_db = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=DB_PATH)

    return f"✅ 成功！已添加 {len(uploaded_files)} 个文件，记忆库已更新。"

print("✅ 存储管理模块准备就绪！")

正在加载 Embedding 模型...


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

发现已有知识库，正在从 /content/drive/MyDrive/Personal_TA_Project/vector_db 加载...


  vector_db = Chroma(persist_directory=DB_PATH, embedding_function=embeddings)


✅ 存储管理模块准备就绪！


In [7]:
# === Cell 5: 加载 Qwen2 模型 (更稳定) ===
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_community.llms import HuggingFacePipeline

# Use Qwen2-1.5B-Instruct as the local chat model.
model_id = "Qwen/Qwen2-1.5B-Instruct"

print(f"正在下载新模型 {model_id} (速度快，且不报错)...")

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Qwen2 is implemented in transformers itself, so trust_remote_code (which lets
# code from the model repository run locally) is not needed and is left off.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
)

# Text-generation pipeline wrapped for LangChain below.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=True,        # temperature / top_p only take effect when sampling is enabled
    temperature=0.1,       # low temperature: keep answers factual rather than inventive
    top_p=0.95,
    repetition_penalty=1.15,
)

local_llm = HuggingFacePipeline(pipeline=pipe)

print("✅ 新的 AI 大脑 (Qwen2) 加载完成！")

正在下载新模型 Qwen/Qwen2-1.5B-Instruct (速度快，且不报错)...


Device set to use cuda:0


✅ 新的 AI 大脑 (Qwen2) 加载完成！


In [None]:
# === Cell 6: 最终完美版 (修复聊天文字颜色) ===
import gradio as gr
import pandas as pd
import os

# --- 1. 核心逻辑 (不变) ---
def answer_question_styled(message, history):
    """Answer `message` from the knowledge base; return (answer_text, sources_html).

    Retrieves the 4 most relevant chunks from `vector_db`, builds a prompt in
    the ChatML layout used by Qwen2 chat models
    (<|im_start|>role ... <|im_end|>), and runs it through `local_llm`.

    Args:
        message: The user's question.
        history: Chat history; accepted for interface compatibility, not read.

    Returns:
        A tuple of the cleaned model answer and an HTML fragment listing the
        retrieved sources. When the knowledge base is missing or empty, a
        warning pair is returned and the model is not called.
    """
    import html  # local import: only needed to escape values placed into HTML

    if vector_db is None or vector_db._collection.count() == 0:
        return "⚠️ Knowledge base empty.", "<div class='source-item' style='color:#ef4444'>Please upload files first.</div>"

    retriever = vector_db.as_retriever(search_kwargs={"k": 4})
    relevant_docs = retriever.invoke(message)

    sources_html_parts = []
    context_parts = []

    for i, doc in enumerate(relevant_docs, start=1):
        s_name = doc.metadata.get('source', 'Unknown')
        p_num = doc.metadata.get('page', '?')
        # File names originate from user uploads, so escape them before
        # embedding them in markup.
        sources_html_parts.append(
            f"<div class='source-item'>"
            f"<span class='source-tag'>#{i}</span> "
            f"<span class='source-name'>{html.escape(str(s_name))}</span> "
            f"<span class='source-page'>Pg.{html.escape(str(p_num))}</span>"
            f"</div>"
        )
        context_parts.append(
            f"Content {i} (Source: {s_name}, Page: {p_num}):\n{doc.page_content}\n\n"
        )
    context_str = "".join(context_parts)

    assistant_marker = "<|im_start|>assistant"
    prompt = (
        "<|im_start|>system\n"
        "You are a direct academic assistant.\n"
        "Answer strictly based on context. No fluff.\n"
        "Cite inline like [1].\n"
        "Context:\n"
        f"{context_str}<|im_end|>\n"
        f"<|im_start|>user\n{message}<|im_end|>\n"
        f"{assistant_marker}\n"
    )

    response = local_llm.invoke(prompt)
    # The wrapper may return the prompt followed by the completion, or the
    # completion alone; handle both.
    if response.startswith(prompt):
        completion = response[len(prompt):]
    else:
        completion = response.split(assistant_marker)[-1]
    # Drop anything after the end-of-turn tag if it leaks into the text.
    clean_response = completion.split("<|im_end|>")[0].strip()

    sources_html = "<div class='source-box'>" + "".join(sources_html_parts) + "</div>"
    return clean_response, sources_html

# --- 2. 辅助逻辑 ---
def get_file_list_df():
    """Return a one-column ("File Name") DataFrame of the .pdf files in DATA_PATH.

    When DATA_PATH does not exist, an empty frame with the same column is returned.
    """
    if os.path.exists(DATA_PATH):
        pdf_names = list(filter(lambda entry: entry.endswith('.pdf'), os.listdir(DATA_PATH)))
        return pd.DataFrame(pdf_names, columns=["File Name"])
    return pd.DataFrame(columns=["File Name"])

def refresh_options():
    """Return the names of the .pdf files in DATA_PATH, or [] when the folder is missing."""
    pdf_names = []
    if os.path.exists(DATA_PATH):
        for entry in os.listdir(DATA_PATH):
            if entry.endswith('.pdf'):
                pdf_names.append(entry)
    return pdf_names

def handle_upload(files):
    """Index the uploaded files and return refreshed UI values.

    Returns a 3-tuple: the status message from add_files_to_knowledge_base,
    the current file table, and a dropdown update carrying the current choices.
    """
    status = add_files_to_knowledge_base(files)
    library_table = get_file_list_df()
    dropdown_update = gr.update(choices=refresh_options())
    return status, library_table, dropdown_update

def handle_delete(file_name):
    """Delete one library PDF and its indexed chunks, then refresh the UI values.

    The deletion is implemented in this block (see _delete_library_file) so the
    Delete button does not depend on a helper defined outside the notebook.

    Args:
        file_name: Name selected in the dropdown; None/empty when nothing is selected.

    Returns:
        A 3-tuple: status message, current file table, and a dropdown update
        with the current choices and a cleared selection.
    """
    if not file_name:
        msg = "⚠️ Please select a file to delete."
    else:
        # basename() keeps the operation inside DATA_PATH whatever value arrives.
        msg = _delete_library_file(os.path.basename(file_name))
    return msg, get_file_list_df(), gr.update(choices=refresh_options(), value=None)


def _delete_library_file(file_name):
    """Remove `file_name` from DATA_PATH and drop its chunks from `vector_db`; return a status string."""
    pdf_path = os.path.join(DATA_PATH, file_name)
    if not os.path.isfile(pdf_path):
        return f"❌ File not found: {file_name}"

    # Remove the indexed chunks first: if this step fails the PDF is still on
    # disk and the deletion can simply be retried.
    if vector_db is not None:
        # Chunks are stored with metadata["source"] set to the file's base name.
        stale_ids = vector_db.get(where={"source": file_name})["ids"]
        if stale_ids:
            vector_db.delete(ids=stale_ids)

    os.remove(pdf_path)
    return f"✅ Deleted {file_name} and removed it from the knowledge base."

# --- 3. CSS overrides (includes the chat text colour fix) ---
# Stylesheet handed to gr.Blocks(css=...) below. It forces a black background
# with white text and styles the custom classes this notebook attaches through
# elem_classes (.section-label, .file-box, .status-box, .table-box,
# .purple-btn, .chat-window), the .main-header title block, and the .source-*
# classes emitted in the sources HTML.
# NOTE(review): the .message-row / .user-row / .bot-row selectors look like
# Gradio-internal class names — confirm they still match the installed version.
fix_css = """
/* 1. 全局强制黑底白字 */
body, .gradio-container {
    background-color: #000000 !important;
    color: #ffffff !important;
    font-family: 'Helvetica Neue', Arial, sans-serif !important;
}

/* 2. 修复标题 */
.main-header {
    font-size: 42px !important;
    font-weight: 800 !important;
    background: linear-gradient(90deg, #a78bfa, #ffffff);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    margin-bottom: 20px !important;
    padding-left: 5px;
}

/* 3. Section Label 颜色 */
.section-label, .section-label * {
    color: #a78bfa !important;
    font-size: 12px !important;
    text-transform: uppercase;
    letter-spacing: 2px;
    font-weight: 700 !important;
    margin-top: 20px !important;
    margin-bottom: 5px !important;
}

/* 4. 输入框背景 */
textarea, input {
    background-color: #111111 !important;
    color: #ffffff !important;
    border: 1px solid #333333 !important;
}
.status-box, .status-box label, .status-box textarea {
    background-color: #111111 !important;
    border-color: #333333 !important;
}

/* 5. Drop File Here 文字颜色 */
.file-box, .file-box span, .file-box div {
    background-color: #111111 !important;
    color: #e4e4e7 !important;
    border-color: #333333 !important;
}

/* 6. 表格修复 */
.table-box, .table-box tr, .table-box td, .table-box th, .table-box span {
    background-color: #111111 !important;
    color: #ffffff !important;
    border-color: #333333 !important;
}

/* 7. 按钮统一紫色 */
.purple-btn {
    background: linear-gradient(135deg, #8b5cf6 0%, #d8b4fe 100%) !important;
    border: none !important;
    color: #000000 !important;
    font-weight: 800 !important;
    border-radius: 8px !important;
}

/* 8. 去除所有边框 */
.gradio-container .block, .gradio-container .form {
    border: none !important;
    background: transparent !important;
}

/* 9. 【关键修复】聊天框文字颜色 */
.chat-window { height: 600px !important; }

/* 用户气泡样式 */
.message-row.user-row .message {
    background-color: #27272a !important; /* 深灰背景 */
    border: none !important;
}
/* 【核心点】强制用户文字变白，包括段落和span */
.message-row.user-row .message,
.message-row.user-row .message span,
.message-row.user-row .message p {
     color: #ffffff !important;
}

/* AI回复样式 */
.message-row.bot-row .message {
    background-color: transparent !important; /* 透明背景 */
    padding-left: 0 !important;
    border: none !important;
}
/* 【核心点】强制AI文字变白，包括Markdown里的粗体和代码 */
.message-row.bot-row .message,
.message-row.bot-row .message span,
.message-row.bot-row .message p,
.message-row.bot-row .message strong,
.message-row.bot-row .message code {
     color: #ffffff !important;
}


/* 10. Sources 样式 */
.source-item {
    background-color: #111111;
    border-left: 3px solid #8b5cf6;
    padding: 10px;
    margin-bottom: 8px;
    border-radius: 4px;
    font-size: 13px;
}
.source-tag { color: #8b5cf6; font-weight: bold; margin-right: 8px; }
.source-name { color: white; font-weight: 500; }
.source-page { color: #71717a; float: right; }
"""

# --- 4. Build the interface ---
with gr.Blocks(css=fix_css, theme=gr.themes.Monochrome()) as demo:

    with gr.Row():
        # === Left column: upload, library, maintenance ===
        with gr.Column(scale=1):
            gr.HTML("<div class='main-header'>PERSONAL AI<br>LEARNING<br>ASSISTANT</div>")

            # UPLOAD
            gr.Markdown("UPLOAD MATERIALS", elem_classes=["section-label"])
            file_input = gr.File(file_count="multiple", file_types=[".pdf"], container=True, elem_classes=["file-box"])

            # Status line shared by the upload and delete actions
            upload_status = gr.Textbox(show_label=False, placeholder="System Ready.", interactive=False, elem_classes=["status-box"], max_lines=1)

            upload_btn = gr.Button("Process & Save", elem_classes=["purple-btn"])

            # LIBRARY
            gr.Markdown("KNOWLEDGE LIBRARY", elem_classes=["section-label"])
            file_table = gr.Dataframe(
                headers=["File Name"],
                value=get_file_list_df(),
                interactive=False,
                elem_classes=["table-box"]
            )

            # MANAGE
            gr.Markdown("MAINTENANCE", elem_classes=["section-label"])
            delete_dropdown = gr.Dropdown(label=None, choices=refresh_options(), interactive=True, container=True, elem_classes=["file-box"])
            delete_btn = gr.Button("Delete File", elem_classes=["purple-btn"])

        # === Right column: chat, sources, query box ===
        with gr.Column(scale=3):
            # bubble_full_width is no longer passed: it is deprecated in current
            # Gradio releases, and True was already the default where it applied.
            chatbot = gr.Chatbot(
                label=None, show_label=False, type="messages",
                avatar_images=(None, None), elem_classes=["chat-window"]
            )

            gr.Markdown("VERIFIED SOURCES", elem_classes=["section-label"])
            sources_output = gr.HTML("<div style='color:#555; font-style:italic'>No citations yet.</div>")

            gr.Markdown("YOUR QUERY", elem_classes=["section-label"])
            with gr.Row():
                msg_input = gr.Textbox(show_label=False, placeholder="Ask a question...", container=False, scale=6, elem_classes=["status-box"])
                submit_btn = gr.Button("Ask", elem_classes=["purple-btn"], scale=2)

            clear_btn = gr.Button("Clear History", elem_classes=["purple-btn"])

    # --- 5. Event wiring ---
    def process_query(message, history):
        """Stream one chat turn: echo the question, then append the answer.

        Yields (chat history, query-box text, sources HTML) tuples. The first
        yield shows the user's message with "Searching..." in the query box;
        the second adds the assistant reply and clears the query box.
        """
        history = list(history or [])  # tolerate a None history

        # Ignore empty / whitespace-only submissions instead of querying the
        # retriever and the model with nothing.
        if not message or not message.strip():
            yield history, "", gr.update()
            return

        history.append({"role": "user", "content": message})
        yield history, "Searching...", ""

        try:
            ai_text, sources_html = answer_question_styled(message, history)
        except Exception as exc:
            # Surface the failure in the chat rather than leaving the turn
            # unanswered with "Searching..." stuck in the query box.
            ai_text = f"⚠️ Failed to answer: {exc}"
            sources_html = ""

        history.append({"role": "assistant", "content": ai_text})
        yield history, "", sources_html

    msg_input.submit(process_query, [msg_input, chatbot], [chatbot, msg_input, sources_output])
    submit_btn.click(process_query, [msg_input, chatbot], [chatbot, msg_input, sources_output])

    upload_btn.click(handle_upload, inputs=[file_input], outputs=[upload_status, file_table, delete_dropdown])
    delete_btn.click(handle_delete, inputs=[delete_dropdown], outputs=[upload_status, file_table, delete_dropdown])
    clear_btn.click(lambda: ([], "<div style='color:#555'>Cleared.</div>"), None, [chatbot, sources_output])

demo.launch(debug=True, share=True)

  chatbot = gr.Chatbot(


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://02ff6dba87df524187.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/uvicorn/protocols/http/httptools_impl.py", line 409, in run_asgi
    result = await app(  # type: ignore[func-returns-value]
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/uvicorn/middleware/proxy_headers.py", line 60, in __call__
    return await self.app(scope, receive, send)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/fastapi/applications.py", line 1134, in __call__
    await super().__call__(scope, receive, send)
  File "/usr/local/lib/python3.12/dist-packages/starlette/applications.py", line 113, in __call__
    await self.middleware_stack(scope, receive, send)
  File "/usr/local/lib/python3.12/dist-packages/starlette/middleware/errors.py", line 186, in __call__
    raise exc
  File "/usr/local/lib/python3.12/dist-packages/starlette/middleware/error

正在深入分析文件: Week 4_ LM - RNN.pdf ...
正在深入分析文件: Week 5_ Language Models_ Transformers.pdf ...
