# spaCy 功能測試

## 安裝套件

In [1]:
# 安裝套件及預先訓練的模型
# 這些都要在自己的虛擬環境下安裝，路徑才會對
!pip install -U spacy
!python -m spacy download en_core_web_sm # 小型的英文模型
!python -m spacy download en_core_web_md
!python -m spacy download zh_core_web_sm # 小型的中文模型

You should consider upgrading via the '/Users/hanklee/.pyenv/versions/3.8.0/bin/python3.8 -m pip install --upgrade pip' command.[0m
Collecting en-core-web-sm==3.2.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.2.0
You should consider upgrading via the '/Users/hanklee/.pyenv/versions/3.8.0/bin/python -m pip install --upgrade pip' command.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Collecting zh-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.2.0/zh_core_web_sm-3.2.0-py3-none-any.whl (49.5 MB)
     |████████████████████████████████| 49.5 MB 1.3 MB/s            
Collecting spacy-pkuseg<0.1.0,>=0.0.27
  Downloading spacy_pkuseg-0.0.28-cp38-cp38-macosx_10_9_x86_64.whl (2.4 

## 載入相關套件及預先訓練的模型

In [3]:
# 載入相關套件
import spacy

In [5]:
# 載入詞向量模型
nlp = spacy.load("en_core_web_sm")

In [7]:
# 分詞及取得詞性標籤(POS Tagging)
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
    print(token.text, token.pos_, token.dep_)

Apple PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.K. PROPN dobj
startup VERB dep
for ADP prep
$ SYM quantmod
1 NUM compound
billion NUM pobj


In [8]:
# 取得詳細的詞性標籤(POS Tagging)
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP dobj X.X. False False
startup startup VERB VBD dep xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


In [9]:
# 顯示語意分析圖
from spacy import displacy

displacy.serve(doc, style="dep")




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...



127.0.0.1 - - [14/Mar/2022 04:03:50] "GET / HTTP/1.1" 200 9246
127.0.0.1 - - [14/Mar/2022 04:03:51] "GET /favicon.ico HTTP/1.1" 200 9246


Shutting down server on port 5000.


In [11]:
# 標示實體
text = "When Sebastian Thrun started working on self-driving cars " + \
       "at Google in 2007, few people outside of the company took him seriously."

doc = nlp(text)
# style="ent"：實體
displacy.serve(doc, style="ent")


Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [12]:
# 繁體中文分詞
import spacy

nlp = spacy.load("zh_core_web_sm")
doc = nlp("清華大學位於新竹")
for token in doc:
    print(token.text, token.pos_, token.dep_)

清華 NOUN nsubj
大 ADV advmod
學位 ADV dep
於 ADP case
新竹 PROPN ROOT


In [13]:
# 簡體中文分詞
import spacy

nlp = spacy.load("zh_core_web_sm")
doc = nlp("清华大学位于北京")
for token in doc:
    print(token.text, token.pos_, token.dep_)

清华 PROPN compound:nn
大学 NOUN nsubj
位于 VERB ROOT
北京 PROPN dobj


In [14]:
# 顯示中文語意分析圖
from spacy import displacy

displacy.serve(doc, style="dep")


Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [15]:
# 顯示依存關係
nlp = spacy.load("zh_core_web_sm")
doc = nlp("清华大学位于北京")
for token in doc:
    print(token.text, token.pos_, token.dep_)

清华 PROPN compound:nn
大学 NOUN nsubj
位于 VERB ROOT
北京 PROPN dobj


In [17]:
# 分詞，並判斷是否不在字典中(Out of Vocabulary, OOV)
nlp = spacy.load("en_core_web_md")
tokens = nlp("dog cat banana afskfsd")

for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

dog True 6.27752 True
cat True 6.0239015 True
banana True 6.650522 True
afskfsd True 6.1036773 True


In [18]:
# 相似度比較
nlp = spacy.load("en_core_web_md")

# 測試兩語句
doc1 = nlp("I like salty fries and hamburgers.")
doc2 = nlp("Fast food tastes very good.")

# 兩語句的相似度比較
print(doc1, "<->", doc2, doc1.similarity(doc2))

# 關鍵字的相似度比較
french_fries = doc1[2:4]
burgers = doc1[5]
print(french_fries, "<->", burgers, french_fries.similarity(burgers))

I like salty fries and hamburgers. <-> Fast food tastes very good. 0.3325843940415502
salty fries <-> hamburgers 0.4129716753959656


  print(doc1, "<->", doc2, doc1.similarity(doc2))
  print(french_fries, "<->", burgers, french_fries.similarity(burgers))
