In [None]:
# nb47_safety_moderation_rules.ipynb
# Stage 5: Text Adventure - Safety and Content Moderation

# %% Cell 1: Shared Cache Bootstrap
import os, pathlib, torch
import sys
from datetime import datetime

# Shared cache configuration (copy this cell into every notebook).
# Points the Hugging Face and torch cache environment variables at
# sub-directories of a single root and makes sure those directories exist.
AI_CACHE_ROOT = os.environ.get("AI_CACHE_ROOT", "../ai_warehouse/cache")

# Environment variable name -> sub-directory below AI_CACHE_ROOT.
_cache_subdirs = {
    "HF_HOME": "hf",
    "TRANSFORMERS_CACHE": "hf/transformers",
    "HF_DATASETS_CACHE": "hf/datasets",
    "HUGGINGFACE_HUB_CACHE": "hf/hub",
    "TORCH_HOME": "torch",
}

for env_name, subdir in _cache_subdirs.items():
    cache_dir = f"{AI_CACHE_ROOT}/{subdir}"
    os.environ[env_name] = cache_dir
    # Create the directory (and any missing parents); no error if it exists.
    pathlib.Path(cache_dir).mkdir(parents=True, exist_ok=True)

print("[Cache]", AI_CACHE_ROOT, "| GPU:", torch.cuda.is_available())

In [None]:
# %% Cell 2: Safety Word Lists and Age Rating System
import re
import json
from typing import Dict, List, Tuple, Optional
from enum import Enum
from dataclasses import dataclass


class AgeRating(Enum):
    """Audience age tiers for content filtering.

    Each member's value is its short rating code.
    """

    # Suitable for all ages.
    GENERAL = "G"
    # Ages 13 and up.
    TEEN = "T"
    # Ages 17 and up.
    MATURE = "M"
    # Ages 18 and up.
    ADULT = "A"


class ContentType(Enum):
    """Categories of potentially harmful content.

    Each member's value is the lowercase, snake_case name of the category.
    """

    VIOLENCE = "violence"
    SEXUAL = "sexual"
    PROFANITY = "profanity"
    HATE_SPEECH = "hate_speech"
    DRUG_REFERENCE = "drug_reference"
    GAMBLING = "gambling"


@dataclass
class ContentFilter:
    """Settings bundle that configures content filtering."""

    # Audience tier this filter is configured for.
    age_rating: AgeRating
    # Content categories that are scanned for and filtered.
    blocked_types: List[ContentType]
    # Severity on a 0.0-1.0 scale.
    severity_threshold: float = 0.5
    # One of "euphemistic", "censored" or "alternative".
    replacement_style: str = "euphemistic"


class SafetyModerator:
    """Keyword-based content safety and moderation for text adventure games.

    Text is scanned for the categorized terms returned by ``_load_word_lists``.
    Only categories listed in ``filter_config.blocked_types`` are scored, and a
    category whose severity reaches ``filter_config.severity_threshold`` is a
    violation.
    """

    # Tokenizer used to measure text length for the severity ratio: every CJK
    # ideograph is one token, and every other run of non-whitespace characters
    # is one token. Space-delimited text therefore counts exactly like
    # ``len(text.split())``, while unsegmented Chinese text is no longer
    # collapsed into a single "word".
    _TOKEN_PATTERN = re.compile(
        r"[\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]"
        r"|[^\s\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]+"
    )

    def __init__(self, filter_config: ContentFilter):
        """Store the filter configuration and load the static word tables.

        Args:
            filter_config: Rating, blocked categories, threshold and
                replacement style to enforce.
        """
        self.config = filter_config
        self.word_lists = self._load_word_lists()
        self.replacement_dict = self._load_replacements()

    def _load_word_lists(self) -> Dict[ContentType, List[str]]:
        """Return the flagged terms for each content category."""
        return {
            ContentType.VIOLENCE: [
                "殺死", "謀殺", "暴力", "攻擊", "傷害", "血腥", "殘忍",
                "屠殺", "砍殺", "刺殺", "毆打", "虐待", "折磨",
            ],
            ContentType.SEXUAL: [
                "性行為", "色情", "裸體", "性器官", "做愛", "性交",
                "誘惑", "調情", "挑逗", "親密", "床戲",
            ],
            ContentType.PROFANITY: [
                "混蛋", "王八蛋", "狗屎", "他媽的", "幹你娘", "操",
                "臭婊子", "賤人", "白痴", "智障", "廢物",
            ],
            ContentType.HATE_SPEECH: [
                "種族歧視", "性別歧視", "仇恨", "歧視", "偏見", "排外",
                "極端主義", "恐怖主義",
            ],
            ContentType.DRUG_REFERENCE: [
                "毒品", "吸毒", "大麻", "海洛因", "古柯鹼", "搖頭丸",
                "興奮劑", "致幻劑", "藥物濫用",
            ],
            ContentType.GAMBLING: [
                "賭博", "賭場", "下注", "彩票", "老虎機", "撲克牌賭博",
                "賭債", "賭癮", "博弈",
            ],
        }

    def _load_replacements(self) -> Dict[str, Dict[str, str]]:
        """Return, per replacement style, a map of flagged term -> substitute."""
        return {
            "euphemistic": {
                "殺死": "擊敗",
                "暴力": "衝突",
                "血腥": "激烈",
                "性行為": "親密接觸",
                "裸體": "衣著不整",
                "混蛋": "壞人",
                "他媽的": "該死的",
                "毒品": "禁藥",
                "賭博": "遊戲",
            },
            "censored": {
                "殺死": "K**",
                "暴力": "**力",
                "血腥": "**腥",
                "性行為": "***為",
                "裸體": "**體",
                "混蛋": "**蛋",
                "他媽的": "***的",
                "毒品": "**品",
                "賭博": "**博",
            },
            "alternative": {
                "殺死": "阻止",
                "暴力": "對抗",
                "血腥": "危險",
                "性行為": "浪漫",
                "裸體": "換裝",
                "混蛋": "反派",
                "他媽的": "可惡",
                "毒品": "藥草",
                "賭博": "競技",
            },
        }

    def _count_tokens(self, text: str) -> int:
        """Count length units in ``text`` (see ``_TOKEN_PATTERN``)."""
        return len(self._TOKEN_PATTERN.findall(text))

    def detect_content_issues(self, text: str) -> Dict[ContentType, float]:
        """Score ``text`` for each blocked content category.

        Severity is ``distinct flagged terms found / token count * 10``, capped
        at 1.0. Matching is a case-insensitive substring test.

        Args:
            text: The text to scan.

        Returns:
            Map of category -> severity in (0.0, 1.0]. Categories with no hit,
            or not listed in ``config.blocked_types``, are omitted.
        """
        issues: Dict[ContentType, float] = {}
        text_lower = text.lower()
        # Invariant across categories, so computed once. max(..., 1) guards the
        # division for empty or whitespace-only text.
        token_count = max(self._count_tokens(text), 1)

        for content_type, words in self.word_lists.items():
            if content_type not in self.config.blocked_types:
                continue

            matches = sum(1 for word in words if word.lower() in text_lower)
            severity = min(matches / token_count * 10, 1.0)
            if severity > 0:
                issues[content_type] = severity

        return issues

    def moderate_content(self, text: str) -> Tuple[str, bool, Dict]:
        """Moderate ``text`` against the configured filter.

        Args:
            text: The text to check.

        Returns:
            ``(result_text, approved, report)``. When approved, ``result_text``
            is ``text`` unchanged. Otherwise it is ``text`` with the flagged
            terms of the violated categories replaced. ``report`` always has
            the keys ``issues``, ``violations``, ``age_rating`` and ``action``
            (``"approved"`` or ``"content_filtered"``).
        """
        issues = self.detect_content_issues(text)

        # A category is a violation once its severity reaches the threshold.
        violations = {
            content_type: severity
            for content_type, severity in issues.items()
            if severity >= self.config.severity_threshold
        }

        report = {
            "issues": issues,
            "violations": list(violations.keys()),
            "age_rating": self.config.age_rating.value,
            "action": "content_filtered" if violations else "approved",
        }

        if not violations:
            return text, True, report

        cleaned_text = self._apply_replacements(text, list(violations.keys()))
        return cleaned_text, False, report

    def _apply_replacements(
        self, text: str, violated_types: Optional[List[ContentType]] = None
    ) -> str:
        """Replace flagged terms in ``text`` according to the configured style.

        Args:
            text: The text to clean.
            violated_types: Categories whose terms must be removed. Every term
                of these categories is replaced: with its style-specific
                substitute when one exists, otherwise with asterisks of the
                same length. Terms of other categories are left alone. When
                ``None``, every substitute of the configured style is applied
                and nothing else is masked.

        Returns:
            The text with replacements applied.
        """
        style_map = self.replacement_dict.get(self.config.replacement_style, {})

        if violated_types is None:
            targets = dict(style_map)
        else:
            targets = {}
            for content_type in violated_types:
                for word in self.word_lists.get(content_type, []):
                    if word:
                        targets[word] = style_map.get(word, "*" * len(word))

        if not targets:
            return text

        # Longest term first so that e.g. "種族歧視" wins over its substring
        # "歧視"; one combined pass so substituted text is never re-scanned.
        ordered = sorted(targets, key=len, reverse=True)
        pattern = re.compile(
            "|".join(re.escape(word) for word in ordered), flags=re.IGNORECASE
        )
        lookup = {word.lower(): repl for word, repl in targets.items()}

        def _substitute(match):
            found = match.group(0)
            # Fall back to the matched text if case folding finds no key.
            return lookup.get(found.lower(), found)

        return pattern.sub(_substitute, text)

    def generate_alternative_content(
        self, original_text: str, context: str = ""
    ) -> str:
        """Return a canned stand-in narration for blocked content.

        Args:
            original_text: The blocked text (not used to build the result).
            context: Scene description interpolated into the message for the
                "euphemistic" and "alternative" styles.

        Returns:
            A message chosen by ``config.replacement_style``; any style other
            than "euphemistic" or "censored" gets the "alternative" wording.
        """
        if self.config.replacement_style == "euphemistic":
            return f"[故事轉向更溫和的發展] {context}中，情況得到了和平的解決。"
        elif self.config.replacement_style == "censored":
            return "[內容已過濾] 由於內容不適合當前設定，故事將跳過這個部分。"
        else:  # alternative
            return f"[替代敘述] {context}中，角色們選擇了更合適的行動方案。"

In [None]:
# %% Cell 3: Content Detector with Scoring
class AdvancedContentDetector:
    """Regex-based content detection whose scores are scaled by context."""

    def __init__(self):
        """Set up the regex patterns and the context multiplier table."""
        self.violence_patterns = [
            r"(殺|砍|刺|打|揍|毆|擊).{0,3}(死|傷|害|倒)",
            r"(血|屍|骨|肉).{0,2}(流|飛|散|碎)",
            r"(武器|刀|劍|槍|炸彈).{0,5}(攻擊|傷害|殺)",
        ]

        self.sexual_patterns = [
            r"(脫|除|褪).{0,2}(衣|褲|裙)",
            r"(撫摸|愛撫|親吻).{0,3}(身體|肌膚)",
            r"(床|房間|私密).{0,5}(活動|行為)",
        ]

        # Keyword -> multiplier; values below 1 lower the score, above 1 raise it.
        self.context_modifiers = {
            "遊戲": 0.7,  # Gaming context reduces severity
            "虛構": 0.8,  # Fictional context
            "幻想": 0.8,  # Fantasy context
            "現實": 1.2,  # Reality increases severity
            "教學": 0.5,  # Educational context
        }

    def analyze_with_context(self, text: str, game_context: Dict) -> Dict:
        """Score ``text`` and scale the scores by context and narrative tone.

        Args:
            text: The text to analyze.
            game_context: Optional ``"genre"`` and ``"tone"`` entries are read;
                a missing or ``None`` genre is treated as empty, and ``None``
                for the whole context is treated as an empty dict.

        Returns:
            Dict with ``base_scores`` (pattern scores), ``context_modifier``
            (the combined multiplier), ``final_scores`` (base scores times the
            multiplier, capped at 1.0) and ``game_context`` (as passed in).
        """
        base_score = self._pattern_based_detection(text)

        context = game_context or {}
        genre = context.get("genre") or ""

        # Every context keyword found in the text or the genre contributes.
        context_modifier = 1.0
        for ctx_word, modifier in self.context_modifiers.items():
            if ctx_word in text or ctx_word in genre:
                context_modifier *= modifier

        # Narrative tone adds one more multiplier; unknown tones change nothing.
        narrative_tone = context.get("tone", "neutral")
        if narrative_tone == "comedic":
            context_modifier *= 0.6
        elif narrative_tone == "dark":
            context_modifier *= 1.3
        elif narrative_tone == "educational":
            context_modifier *= 0.4

        # Multipliers above 1 could otherwise push a score past the 0-1 scale
        # that the base scores are already capped to.
        adjusted_scores = {
            content_type: min(score * context_modifier, 1.0)
            for content_type, score in base_score.items()
        }

        return {
            "base_scores": base_score,
            "context_modifier": context_modifier,
            "final_scores": adjusted_scores,
            "game_context": game_context,
        }

    def _pattern_based_detection(self, text: str) -> Dict[ContentType, float]:
        """Score ``text`` by how many patterns of each category match.

        Each matching violence pattern adds 0.3 and each matching sexual
        pattern adds 0.4, capped at 1.0 per category. Categories with no
        matching pattern are omitted.
        """
        # (category, patterns, score contributed by each matching pattern)
        rules = (
            (ContentType.VIOLENCE, self.violence_patterns, 0.3),
            (ContentType.SEXUAL, self.sexual_patterns, 0.4),
        )

        scores = {}
        for content_type, patterns, weight in rules:
            hits = sum(1 for pattern in patterns if re.search(pattern, text))
            if hits > 0:
                scores[content_type] = min(hits * weight, 1.0)

        return scores

In [None]:
# %% Cell 4: RAG Integration with Safety Checks
from sentence_transformers import SentenceTransformer
import numpy as np


class SafeGameNarrator:
    """Game narrator that runs generated text through a safety moderator.

    Text generation is currently a template placeholder. The sentence
    embedding model is loaded on first access to ``embedder`` rather than in
    ``__init__``, because nothing in this class uses it yet.
    """

    def __init__(
        self, moderator: SafetyModerator, embedding_model: str = "BAAI/bge-m3"
    ):
        """Store the moderator and build the fallback narration table.

        Args:
            moderator: Moderator used to check every generated snippet.
            embedding_model: Model name passed to ``SentenceTransformer`` when
                ``embedder`` is first accessed.
        """
        self.moderator = moderator
        self.embedding_model = embedding_model
        self._embedder = None  # created lazily by the ``embedder`` property
        self.safe_alternatives_db = self._build_safe_alternatives()

    @property
    def embedder(self):
        """The ``SentenceTransformer`` instance, created on first access."""
        if self._embedder is None:
            self._embedder = SentenceTransformer(self.embedding_model)
        return self._embedder

    @embedder.setter
    def embedder(self, value):
        """Allow callers to inject or replace the embedding model."""
        self._embedder = value

    def _build_safe_alternatives(self) -> Dict[str, List[str]]:
        """Return pre-written safe narration lines, keyed by scene type."""
        return {
            "conflict_resolution": [
                "雙方達成了和解",
                "通過談判解決了爭端",
                "找到了共同點",
                "智慧戰勝了暴力",
                "和平的解決方案出現了",
            ],
            "character_interaction": [
                "角色間建立了友誼",
                "相互理解加深了",
                "合作關係形成了",
                "信任逐漸建立",
                "共同目標團結了大家",
            ],
            "plot_advancement": [
                "故事朝著積極的方向發展",
                "新的機會出現了",
                "希望的曙光照耀著前路",
                "智慧指引著行動",
                "勇氣克服了困難",
            ],
            "world_building": [
                "世界充滿了奇妙的景色",
                "神秘而美麗的地方等待探索",
                "古老的智慧隱藏在各處",
                "自然的力量帶來和諧",
                "文明展現出進步的光輝",
            ],
        }

    def generate_safe_narrative(
        self, prompt: str, context: Dict, max_attempts: int = 3
    ) -> Dict:
        """Generate narrative text, retrying and falling back until it is safe.

        Args:
            prompt: What the narration should describe.
            context: Scene information; ``"location"`` and ``"scene_type"``
                are read by the helpers.
            max_attempts: Maximum number of generation attempts. With zero or
                fewer, no text is generated and the fallback is returned.

        Returns:
            Dict with ``final_text``, ``safety_report`` (from the last
            moderation run, ``{}`` if none ran), ``generation_attempts`` and
            ``safety_fallback_used``.
        """
        result = {
            "final_text": "",
            "safety_report": {},
            "generation_attempts": 0,
            "safety_fallback_used": False,
        }

        # Retries are always derived from the caller's prompt so the
        # "safer" prefix is applied once instead of piling up per attempt.
        base_prompt = prompt

        for attempt in range(max_attempts):
            result["generation_attempts"] = attempt + 1

            # Generate content (simplified - would use LLM in practice)
            generated_text = self._generate_text_snippet(prompt, context)

            moderated_text, is_safe, safety_report = self.moderator.moderate_content(
                generated_text
            )
            result["safety_report"] = safety_report

            if is_safe:
                result["final_text"] = moderated_text
                return result

            if attempt < max_attempts - 1:
                prompt = self._create_safer_prompt(base_prompt, safety_report)

        # No attempt produced safe text (or none was allowed): use canned text.
        result["final_text"] = self._get_safe_fallback(context)
        result["safety_fallback_used"] = True
        return result

    def _generate_text_snippet(self, prompt: str, context: Dict) -> str:
        """Placeholder generator: fill one randomly chosen template.

        Replace with an actual LLM call in a real implementation.
        """
        templates = [
            f"在{context.get('location', '某處')}，{prompt}的情況發展為...",
            f"角色面對{prompt}的挑戰時，決定採取行動...",
            f"故事在{prompt}的轉折點上，展現出新的可能性...",
        ]
        # np.random.choice yields numpy.str_; convert to a plain str.
        return str(np.random.choice(templates))

    def _create_safer_prompt(self, original_prompt: str, safety_report: Dict) -> str:
        """Prefix the prompt with guidance matching the reported violations.

        Args:
            original_prompt: The prompt to rewrite.
            safety_report: Moderation report; its ``"violations"`` list is read.

        Returns:
            The prompt with one modifier per recognised violation, wrapped in
            an all-ages instruction.
        """
        safe_modifiers = {
            ContentType.VIOLENCE: "以和平的方式",
            ContentType.SEXUAL: "以純潔的友誼",
            ContentType.PROFANITY: "以禮貌的對話",
            ContentType.HATE_SPEECH: "以相互尊重",
        }

        for violation in safety_report.get("violations", []):
            if violation in safe_modifiers:
                original_prompt = f"{safe_modifiers[violation]}, {original_prompt}"

        return f"請以適合全年齡的方式描述: {original_prompt}"

    def _get_safe_fallback(self, context: Dict) -> str:
        """Pick a random canned line for the scene type in ``context``.

        Unknown or missing scene types use the "plot_advancement" lines.
        """
        context_type = context.get("scene_type", "plot_advancement")
        alternatives = self.safe_alternatives_db.get(
            context_type, self.safe_alternatives_db["plot_advancement"]
        )
        # np.random.choice yields numpy.str_; convert to a plain str.
        return str(np.random.choice(alternatives))

In [None]:
# %% Cell 5: Game Integration Example
class SafeGameEngine:
    """Text adventure game engine with integrated safety moderation.

    Every processed action, accepted or rejected, is appended to
    ``content_log`` so that ``get_safety_statistics`` reflects all inputs.
    """

    def __init__(self, age_rating: AgeRating = AgeRating.TEEN):
        """Build the moderator and narrator for the given age rating.

        Args:
            age_rating: Selects the blocked categories and the severity
                threshold. Any rating other than GENERAL, TEEN or MATURE gets
                the ADULT settings.
        """
        # rating -> (blocked content types, severity threshold)
        rating_rules = {
            AgeRating.GENERAL: (list(ContentType), 0.1),  # Block everything
            AgeRating.TEEN: (
                [
                    ContentType.SEXUAL,
                    ContentType.PROFANITY,
                    ContentType.HATE_SPEECH,
                ],
                0.3,
            ),
            AgeRating.MATURE: ([ContentType.SEXUAL, ContentType.HATE_SPEECH], 0.5),
        }
        adult_rules = ([ContentType.HATE_SPEECH], 0.7)
        blocked_types, threshold = rating_rules.get(age_rating, adult_rules)

        filter_config = ContentFilter(
            age_rating=age_rating,
            blocked_types=blocked_types,
            severity_threshold=threshold,
            replacement_style="euphemistic",
        )

        self.moderator = SafetyModerator(filter_config)
        self.narrator = SafeGameNarrator(self.moderator)
        self.content_log = []  # Log for monitoring

    def _log_interaction(
        self, action: str, input_safe: bool, narrative_safe: Optional[bool]
    ) -> None:
        """Append one monitoring record to ``content_log``.

        ``narrative_safe`` is ``None`` when no narrative was generated because
        the input itself was rejected.
        """
        self.content_log.append(
            {
                "timestamp": str(np.datetime64("now")),
                "player_input": action,
                "input_safe": input_safe,
                "narrative_safe": narrative_safe,
                "age_rating": self.moderator.config.age_rating.value,
            }
        )

    def process_player_action(self, action: str, game_state: Dict) -> Dict:
        """Moderate a player action and, if accepted, narrate its outcome.

        Args:
            action: The player's input text.
            game_state: Optional ``"current_location"``, ``"scene_type"`` and
                ``"narrative_tone"`` entries are read.

        Returns:
            On rejection: ``success=False`` with ``message`` and
            ``safety_report``. On acceptance: ``success=True`` with
            ``narrative``, ``safety_report`` and ``fallback_used``.
        """
        # First check if player input is appropriate
        input_moderated, input_safe, input_report = self.moderator.moderate_content(
            action
        )

        if not input_safe:
            # Rejected inputs are logged too; otherwise the safe-input rate
            # could never drop below 100%.
            self._log_interaction(action, input_safe=False, narrative_safe=None)
            return {
                "success": False,
                "message": "抱歉，該行動不被允許。請嘗試其他方式。",
                "safety_report": input_report,
            }

        context = {
            "location": game_state.get("current_location", "未知地點"),
            "scene_type": game_state.get("scene_type", "plot_advancement"),
            "tone": game_state.get("narrative_tone", "neutral"),
            "genre": "文字冒險遊戲",
        }

        narrative_result = self.narrator.generate_safe_narrative(
            prompt=f"玩家執行了動作: {input_moderated}", context=context
        )

        # A narrative counts as safe when no canned fallback was needed.
        self._log_interaction(
            action,
            input_safe=True,
            narrative_safe=not narrative_result["safety_fallback_used"],
        )

        return {
            "success": True,
            "narrative": narrative_result["final_text"],
            "safety_report": narrative_result["safety_report"],
            "fallback_used": narrative_result["safety_fallback_used"],
        }

    def get_safety_statistics(self) -> Dict:
        """Summarise the moderation log.

        Returns:
            Dict with ``total_interactions``, ``safe_input_rate`` (share of
            logged inputs that were accepted), ``safe_narrative_rate`` (share
            of generated narratives that needed no fallback),
            ``current_age_rating`` and ``blocked_content_types``. A rate is
            0.0 when there is nothing to measure, so every key is always
            present.
        """
        total = len(self.content_log)
        safe_inputs = sum(1 for log in self.content_log if log["input_safe"])

        # Only entries that actually produced a narrative count towards its rate.
        narrative_flags = [
            log.get("narrative_safe")
            for log in self.content_log
            if log.get("narrative_safe") is not None
        ]
        safe_narratives = sum(1 for flag in narrative_flags if flag)

        return {
            "total_interactions": total,
            "safe_input_rate": safe_inputs / total if total else 0.0,
            "safe_narrative_rate": (
                safe_narratives / len(narrative_flags) if narrative_flags else 0.0
            ),
            "current_age_rating": self.moderator.config.age_rating.value,
            "blocked_content_types": [
                ct.value for ct in self.moderator.config.blocked_types
            ],
        }

In [None]:
# %% Cell 6: Smoke Test
print("=== 內容安全系統煙霧測試 ===\n")

# Player inputs run through the engine at each age rating.
test_content = [
    "角色拔出劍來攻擊敵人",  # Mild violence: contains the violence term "攻擊"
    "他們互相親吻擁抱",  # Mild sexual: contains no term from the keyword lists
    "這個混蛋真是可惡",  # Profanity: contains the profanity term "混蛋"
    "探索神秘的森林",  # Safe content
]

age_ratings = [AgeRating.GENERAL, AgeRating.TEEN, AgeRating.MATURE]

for rating in age_ratings:
    print(f"Age Rating: {rating.value}")
    game = SafeGameEngine(rating)

    for content in test_content:
        result = game.process_player_action(
            content,
            {"current_location": "測試環境", "scene_type": "conflict_resolution"},
        )

        status = "✅ 允許" if result["success"] else "❌ 被阻擋"
        print(f"  '{content}' -> {status}")

    stats = game.get_safety_statistics()
    # The statistics may omit the rate keys when nothing was logged (e.g. all
    # inputs rejected), so read them defensively instead of raising KeyError.
    input_rate = stats.get("safe_input_rate", 0.0)
    narrative_rate = stats.get("safe_narrative_rate", 0.0)
    print(f"  安全率: 輸入 {input_rate:.1%}, 敘述 {narrative_rate:.1%}\n")

# Exercise the pattern-based detector with different genres.
print("=== 內容檢測準確性測試 ===")
detector = AdvancedContentDetector()

# (text, game context) pairs; each genre contains one context keyword.
test_cases = [
    ("遊戲中的角色進行了激烈的戰鬥", {"genre": "遊戲"}),
    ("現實中發生了暴力事件", {"genre": "新聞"}),
    ("教學演示如何安全地使用工具", {"genre": "教育"}),
]

for text, context in test_cases:
    analysis = detector.analyze_with_context(text, context)
    print(f"文本: '{text}'")
    print(f"情境: {context}")
    print(f"基礎分數: {analysis['base_scores']}")
    print(f"情境調整: {analysis['context_modifier']:.2f}")
    print(f"最終分數: {analysis['final_scores']}\n")

print("煙霧測試完成 ✅")

In [None]:
# Safety settings for each age rating: which content types are blocked, the
# severity threshold, and the replacement style.
SAFETY_CONFIGS = {
    AgeRating.GENERAL: {
        "blocked_types": list(ContentType),  # block every content type
        "threshold": 0.1,
        "replacement_style": "alternative",
    },
    AgeRating.TEEN: {
        "blocked_types": [
            ContentType.SEXUAL,
            ContentType.PROFANITY,
            ContentType.HATE_SPEECH,
        ],
        "threshold": 0.3,
        "replacement_style": "euphemistic",
    },
    AgeRating.MATURE: {
        "blocked_types": [ContentType.SEXUAL, ContentType.HATE_SPEECH],
        "threshold": 0.5,
        "replacement_style": "censored",
    },
    # Added so that every AgeRating member has an entry. Blocked types and
    # threshold mirror the ADULT branch of SafeGameEngine.__init__.
    # NOTE(review): no replacement style is specified for ADULT elsewhere;
    # "censored" is assumed to match MATURE - confirm.
    AgeRating.ADULT: {
        "blocked_types": [ContentType.HATE_SPEECH],
        "threshold": 0.7,
        "replacement_style": "censored",
    },
}