In [25]:
# Download the small Chinese spaCy pipeline (zh_core_web_sm) via a shell command.
!python -m spacy download zh_core_web_sm


Collecting zh-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.8.0/zh_core_web_sm-3.8.0-py3-none-any.whl (48.5 MB)
     ---------------------------------------- 0.0/48.5 MB ? eta -:--:--
     ---------------------------------------- 0.0/48.5 MB ? eta -:--:--
     ---------------------------------------- 0.0/48.5 MB ? eta -:--:--
     ---------------------------------------- 0.0/48.5 MB ? eta -:--:--
     ---------------------------------------- 0.0/48.5 MB ? eta -:--:--
     ---------------------------------------- 0.3/48.5 MB ? eta -:--:--
     ---------------------------------------- 0.3/48.5 MB ? eta -:--:--
     --------------------------------------- 0.5/48.5 MB 524.3 kB/s eta 0:01:32
     --------------------------------------- 0.5/48.5 MB 524.3 kB/s eta 0:01:32
     --------------------------------------- 0.5/48.5 MB 524.3 kB/s eta 0:01:32
      -------------------------------------- 0.8/48.5 MB 493.2 kB/s eta 0:01:37


In [31]:
import spacy
import pandas as pd

# Load the small Chinese spaCy pipeline.
nlp_zh = spacy.load("zh_core_web_sm")

# Parse a Chinese sentence into a sequence of tokens.
doc = nlp_zh('台灣是一個位於亞洲東部的島嶼國家。')

# One table row per token: text, pos_, tag_, dep_, is_alpha, is_stop.
columns = ['詞', '詞類', '詞性標注', '單詞依存關係', '是否為純字母組成', '是否為停用詞']
dim = [
    [token.text, token.pos_, token.tag_, token.dep_, token.is_alpha, token.is_stop]
    for token in doc
]
pd.DataFrame(dim, columns=columns)

Unnamed: 0,詞,詞類,詞性標注,單詞依存關係,是否為純字母組成,是否為停用詞
0,台灣,NOUN,NN,nsubj,True,False
1,是,VERB,VC,cop,True,True
2,一,NUM,CD,nummod,True,True
3,個,NUM,M,mark:clf,True,False
4,位於,NOUN,NN,nsubj,True,False
5,亞洲,VERB,VV,compound:vc,True,False
6,東部,VERB,VV,acl,True,False
7,的,PART,DEC,mark,True,True
8,島嶼,NOUN,NN,compound:nn,True,False
9,國家,NOUN,NN,ROOT,True,False


In [44]:
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

# Use a Hugging Face model suited to Chinese text.
model_name = "ckiplab/bert-base-chinese-ner"  # BERT NER model provided by Taiwan's CKIP

# Load the model and its tokenizer.
model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Build the NER pipeline.
ner = pipeline('ner', model=model, tokenizer=tokenizer)

# Try the NER pipeline on a sample sentence.
text = "台灣是一個位於亞洲東部的島嶼國家。"
doc = ner(text)

# Each prediction is a dict; the table keeps the token text and its start/end
# offsets, so the headers name those offsets rather than part-of-speech fields.
columns = ['詞', '起始位置', '結束位置']
dim = [[x['word'], x['start'], x['end']] for x in doc]
pd.DataFrame(dim, columns=columns)


Unnamed: 0,詞,詞類,詞性標注
0,台,0,1
1,灣,1,2
2,亞,7,8
3,洲,8,9


In [40]:
# `pipeline` is imported here as well so this cell does not depend on another
# cell having imported it first.
from transformers import AutoModelForTokenClassification, AutoTokenizer, AutoConfig, pipeline

# Second Chinese NER checkpoint to compare against; the id is kept in one place
# so the model and tokenizer cannot drift apart.
model_name = 'uer/roberta-base-finetuned-cluener2020-chinese'
model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

ner = pipeline('ner', model=model, tokenizer=tokenizer)
doc = ner('台灣是一個位於亞洲東部的島嶼國家。')
# Show the raw predictions (entity label, score, token index, word, offsets).
print(doc)

# The table keeps the token text and its start/end offsets, so the headers name
# those offsets rather than part-of-speech fields.
columns = ['詞', '起始位置', '結束位置']
dim = [[x['word'], x['start'], x['end']] for x in doc]
pd.DataFrame(dim, columns=columns)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity': 'B-address', 'score': np.float32(0.8082326), 'index': 1, 'word': '台', 'start': 0, 'end': 1}, {'entity': 'I-address', 'score': np.float32(0.5574732), 'index': 2, 'word': '灣', 'start': 1, 'end': 2}, {'entity': 'I-address', 'score': np.float32(0.6146184), 'index': 9, 'word': '洲', 'start': 8, 'end': 9}]


Unnamed: 0,詞,詞類,詞性標注
0,台,0,1
1,灣,1,2
2,洲,8,9
