### 下載Hugging face shibing624/bert4ner-base-chinese

In [21]:
import os
import logging
from pathlib import Path
from transformers import AutoTokenizer, pipeline

# 設置日誌
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

def setup_model(model_name: str = "shibing624/bert4ner-base-chinese"):
    """
    設置和初始化模型
    
    Args:
        model_name: Hugging Face 模型名稱
    """
    try:
        logger.info(f"正在載入模型: {model_name}") 
        classifier = pipeline(
            "token-classification",
            model=model_name,
            aggregation_strategy="simple"
        )
        logger.info("模型載入完成")
        return classifier
        
    except Exception as e:
        logger.error(f"模型載入失敗: {str(e)}")
        raise

def process_text(text: str, classifier) -> list:
    """
    處理文本並進行命名實體識別
    
    Args:
        text: 輸入文本
        classifier: NER pipeline
    """
    try:
        logger.info(f"處理文本: {text}")
        results = classifier(text)
        
        # 整理結果
        entities = []
        for item in results:
            entities.append([
                item['word'],
                item['entity_group']
            ])
        
        return entities
        
    except Exception as e:
        logger.error(f"文本處理失敗: {str(e)}")
        raise

def main():
    try:
        # 初始化模型
        classifier = setup_model()
        
        # 測試文本
        sample_text = "中華民國民眾黨主席柯文哲涉政治獻金假帳案，調查局北機站清查金流發現，民眾黨利用「網紅帶貨」銷售手法，先藉由「學姐」黃瀞瑩等人高知名度，吸引選民捐贈政治獻金，再用「折扣碼」發放KP競選小物，進而從中抽佣分潤，抽佣的錢疑來自政治獻金，涉及違反政治獻金法，黃瀞瑩與「戰狼小姐姐」陳智菡、許甫、吳怡萱等4人恐由證人轉列被告偵辦"
        
        # 處理文本
        entities = process_text(sample_text, classifier)
        
        logger.info(f"識別結果: {entities}")
        
    except Exception as e:
        logger.error(f"程序執行出錯: {str(e)}")
        raise

if __name__ == "__main__":
    main()

2024-12-18 17:04:23,991 - INFO - 正在載入模型: shibing624/bert4ner-base-chinese
2024-12-18 17:04:24,253 - INFO - 模型載入完成
2024-12-18 17:04:24,253 - INFO - 處理文本: 中華民國民眾黨主席柯文哲涉政治獻金假帳案，調查局北機站清查金流發現，民眾黨利用「網紅帶貨」銷售手法，先藉由「學姐」黃瀞瑩等人高知名度，吸引選民捐贈政治獻金，再用「折扣碼」發放KP競選小物，進而從中抽佣分潤，抽佣的錢疑來自政治獻金，涉及違反政治獻金法，黃瀞瑩與「戰狼小姐姐」陳智菡、許甫、吳怡萱等4人恐由證人轉列被告偵辦
2024-12-18 17:04:24,288 - INFO - 識別結果: [['中 華 民 國 民 眾 黨', 'ORG'], ['柯 文 哲', 'PER'], ['黃 瀞 瑩', 'PER'], ['黃 瀞 瑩', 'PER'], ['陳 智 菡', 'PER'], ['許 甫', 'PER'], ['吳 怡 萱', 'PER']]


### 引用本地端

設置模型路徑

model_path = "./examples/outputs/cner_bertsoftmax/best_model"  # 請確保這個路徑指向你的模型目錄

In [19]:
import os
import logging
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
from torch import nn
import json

# 設置日誌
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

def load_model_config(model_path: str):
    """
    載入模型配置
    """
    try:
        config_path = os.path.join(model_path, "config.json")
        with open(config_path, 'r', encoding='utf-8') as f:
            config = json.load(f)
        return config
    except Exception as e:
        logger.error(f"載入配置文件失敗: {str(e)}")
        raise

def setup_model(model_path: str = "./best_model"):
    """
    從本地檔案載入模型
    """
    try:
        logger.info(f"正在從本地載入模型: {model_path}")
        
        # 檢查必要文件
        required_files = [
            "config.json",
            "model.safetensors",  # 使用 safetensors 而不是 pytorch_model.bin
            "tokenizer_config.json",
            "vocab.txt"
        ]
        
        for file in required_files:
            file_path = os.path.join(model_path, file)
            if not os.path.exists(file_path):
                logger.warning(f"注意: 找不到文件: {file}")
        
        # 載入 tokenizer
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            local_files_only=True
        )
        
        # 載入模型
        model = AutoModelForTokenClassification.from_pretrained(
            model_path,
            local_files_only=True
        )
        
        # 設置為評估模式
        model.eval()
        
        logger.info("模型載入完成")
        return model, tokenizer
        
    except Exception as e:
        logger.error(f"模型載入失敗: {str(e)}")
        raise

def process_text(text: str, model, tokenizer) -> list:
    """
    處理文本並進行命名實體識別
    """
    try:
        logger.info(f"處理文本: {text}")
        
        # 讀取標籤映射
        config = load_model_config(model.config.name_or_path)
        id2label = config.get('id2label', {})
        
        # 對文本進行編碼
        inputs = tokenizer(
            text,
            return_tensors="pt",
            padding=True,
            truncation=True
        )
        
        # 進行預測
        with torch.no_grad():
            outputs = model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=2)
        
        # 解碼結果
        tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        predictions = predictions[0].tolist()
        
        # 整理實體
        entities = []
        current_entity = []
        current_label = None
        
        for token, pred_id in zip(tokens, predictions):
            if token in ['[CLS]', '[SEP]', '[PAD]']:
                continue
                
            label = id2label.get(str(pred_id), 'O')
            
            if label.startswith('B-'):
                if current_entity:
                    entities.append([''.join(current_entity), current_label])
                current_entity = [token.replace('##', '')]
                current_label = label[2:]
            elif label.startswith('I-') and current_entity:
                current_entity.append(token.replace('##', ''))
            elif label == 'O':
                if current_entity:
                    entities.append([''.join(current_entity), current_label])
                    current_entity = []
                    current_label = None
        
        if current_entity:
            entities.append([''.join(current_entity), current_label])
        
        return entities
        
    except Exception as e:
        logger.error(f"文本處理失敗: {str(e)}")
        raise

def main():
    try:
        # 設置模型路徑
        model_path = "./examples/outputs/cner_bertspan/best_model"
        
        # 檢查模型目錄是否存在
        if not os.path.exists(model_path):
            raise FileNotFoundError(f"找不到模型目錄: {model_path}")
        
        # 初始化模型
        model, tokenizer = setup_model(model_path)
        
        # 測試文本
        sample_text = "您好，我是常建良有多模態客服機器人開發應用，(北京國科會、玉山、電商客服開發經驗)。支援語音、文字、檔案上傳"
        
        # 處理文本
        entities = process_text(sample_text, model, tokenizer)
        
        logger.info(f"識別結果: {entities}")
        
    except Exception as e:
        logger.error(f"程序執行出錯: {str(e)}")
        raise

if __name__ == "__main__":
    main()

2024-12-18 17:03:55,110 - INFO - 正在從本地載入模型: ./examples/outputs/cner_bertspan/best_model
Some weights of BertForTokenClassification were not initialized from the model checkpoint at ./examples/outputs/cner_bertspan/best_model and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2024-12-18 17:03:55,158 - INFO - 模型載入完成
2024-12-18 17:03:55,159 - INFO - 處理文本: 您好，我是常建良有多模態客服機器人開發應用，(北京國科會、玉山、電商客服開發經驗)。支援語音、文字、檔案上傳
2024-12-18 17:03:55,175 - INFO - 識別結果: []


In [22]:
from nerpy import NERModel

model = NERModel("bert", "shibing624/bert4ner-base-chinese")
sentences = [ 
    "李明在上海的騰訊公司擔任工程師"
]
# set split_on_space=False if you use Chinese text
predictions, raw_outputs, entities = model.predict(sentences, split_on_space=False)
print(predictions, entities)

ModuleNotFoundError: No module named 'loguru'