# 英文辭典 en_core_web_sm ：

## 下載辭典

#### SpaCy 官網：https://spacy.io/usage

In [1]:
!pip install -U spacy

Collecting spacy
  Downloading spacy-3.0.6-cp38-cp38-macosx_10_9_x86_64.whl (12.5 MB)
[K     |████████████████████████████████| 12.5 MB 655 kB/s eta 0:00:01
Collecting cymem<2.1.0,>=2.0.2
  Downloading cymem-2.0.5-cp38-cp38-macosx_10_9_x86_64.whl (31 kB)
Collecting wasabi<1.1.0,>=0.8.1
  Downloading wasabi-0.8.2-py3-none-any.whl (23 kB)
Collecting srsly<3.0.0,>=2.4.1
  Downloading srsly-2.4.1-cp38-cp38-macosx_10_9_x86_64.whl (450 kB)
[K     |████████████████████████████████| 450 kB 67 kB/s  eta 0:00:01
[?25hCollecting typer<0.4.0,>=0.3.0
  Downloading typer-0.3.2-py3-none-any.whl (21 kB)
Collecting spacy-legacy<3.1.0,>=3.0.4
  Downloading spacy_legacy-3.0.6-py2.py3-none-any.whl (12 kB)
Collecting blis<0.8.0,>=0.4.0
  Downloading blis-0.7.4-cp38-cp38-macosx_10_9_x86_64.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 758 kB/s eta 0:00:01
[?25hCollecting catalogue<2.1.0,>=2.0.3
  Downloading catalogue-2.0.4-py3-none-any.whl (16 kB)
Collecting murmurhash<1.1.0,>=0.28.0
 

In [2]:
!pip install -U spacy-lookups-data

Collecting spacy-lookups-data
  Downloading spacy_lookups_data-1.0.2-py2.py3-none-any.whl (97.3 MB)
[K     |████████████████████████████████| 97.3 MB 325 kB/s eta 0:00:01     |███████████████████████████▋    | 83.8 MB 248 kB/s eta 0:00:55
Installing collected packages: spacy-lookups-data
Successfully installed spacy-lookups-data-1.0.2
You should consider upgrading via the '/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


In [3]:
#en_core_web_sm 是 model , sm = small size
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl (13.7 MB)
[K     |████████████████████████████████| 13.7 MB 656 kB/s eta 0:00:01
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.0.0
You should consider upgrading via the '/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


## 詞性標註 Part-of-speech tagging

In [5]:
import spacy
import en_core_web_sm  # load the dictionary (the installed English model package)

# Sample English sentence; later English cells in this notebook reuse my_text.
my_text = "Apple is looking at buying U.K. startup for $1 billion"

# Build the pipeline object through the model package's own load() helper.
nlp = en_core_web_sm.load()


In [6]:
# Pass the string into the pipeline; the result is shown as the cell output.
doc = nlp(my_text)
doc

Apple is looking at buying U.K. startup for $1 billion

In [11]:
# Walk the parsed document and print each token's text on its own line.
for tok in doc:
    print(tok.text)

Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion


#### Attributes : https://spacy.io/api/token

In [15]:
# Token attributes collected into the table below:
#   .text     = the original text
#   .lemma_   = the base (root) form of the word
#   .pos_     = (coarse-grained) part-of-speech tag
#   .tag_     = (fine-grained) part-of-speech tag
#   .dep_     = dependency relation
#   .shape_   = shape (upper/lower case pattern)
#   .is_alpha = whether the token is made up of letters
#   .is_stop  = whether the token is a stop word

import pandas as pd

# NOTE(review): 'shap' looks like a typo for 'shape'; the label is kept unchanged
# so the resulting column name stays exactly as it was.
cols = ['text', 'lemma', 'pos', 'tag', 'dep', 'shap', 'is_alpha', 'is_stop']

# One row per token, with values listed in the same order as `cols`.
rows = [
    [
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        token.dep_,
        token.shape_,
        token.is_alpha,
        token.is_stop,
    ]
    for token in doc
]

df = pd.DataFrame(rows, columns=cols)
df

Unnamed: 0,text,lemma,pos,tag,dep,shap,is_alpha,is_stop
0,Apple,Apple,PROPN,NNP,nsubj,Xxxxx,True,False
1,is,be,AUX,VBZ,aux,xx,True,True
2,looking,look,VERB,VBG,ROOT,xxxx,True,False
3,at,at,ADP,IN,prep,xx,True,True
4,buying,buy,VERB,VBG,pcomp,xxxx,True,False
5,U.K.,U.K.,PROPN,NNP,dobj,X.X.,False,False
6,startup,startup,NOUN,NN,advcl,xxxx,True,False
7,for,for,ADP,IN,prep,xxx,True,True
8,$,$,SYM,$,quantmod,$,False,False
9,1,1,NUM,CD,compound,d,False,False


## 視覺化文字相依性

In [16]:
from spacy import displacy

# Alternative way to load the same model, left here for reference:
#nlp = spacy.load("en_core_web_sm")
nlp = en_core_web_sm.load()
doc = nlp(my_text)

# Render the parsed sentence with displaCy's "dep" style.
displacy.render(doc, style="dep")

## 語法解析樹 Dependency Parsing

In [17]:
# Alternative way to load the same model, left here for reference:
#nlp = spacy.load("en_core_web_sm")
# Reload the English pipeline and re-parse my_text so this section starts from
# its own nlp/doc bindings; the parsed document is shown as the cell output.
nlp = en_core_web_sm.load()
doc = nlp(my_text)
doc

Apple is looking at buying U.K. startup for $1 billion

### Noun Phrases 名詞短語

In [21]:
# For every noun phrase: print the phrase itself, then a summary line with the
# phrase text, its root token, the root's dependency label and the root's head,
# followed by an empty line as a separator.
for noun_phrase in doc.noun_chunks:
    root_token = noun_phrase.root
    print(noun_phrase)
    print(
        noun_phrase.text,
        root_token.text,
        root_token.dep_,
        root_token.head.text,
    )
    print()

Apple
Apple Apple nsubj looking

U.K.
U.K. U.K. dobj buying



### 命名實體辨識 Named Entity Recognition

In [22]:
# Render the same parsed document again, this time with displaCy's "ent" style.
displacy.render(doc, style="ent")

# 中文詞典 zh_core_web_sm：

## 下載辭典

In [23]:
!python -m spacy download zh_core_web_sm

Collecting zh-core-web-sm==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.0.0/zh_core_web_sm-3.0.0-py3-none-any.whl (49.5 MB)
[K     |████████████████████████████████| 49.5 MB 752 kB/s eta 0:00:01
Collecting spacy-pkuseg<0.1.0,>=0.0.27
  Downloading spacy_pkuseg-0.0.28-cp38-cp38-macosx_10_9_x86_64.whl (2.4 MB)
[K     |████████████████████████████████| 2.4 MB 212 kB/s eta 0:00:01
Installing collected packages: spacy-pkuseg, zh-core-web-sm
Successfully installed spacy-pkuseg-0.0.28 zh-core-web-sm-3.0.0
You should consider upgrading via the '/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('zh_core_web_sm')


## 詞性標註 Part-of-speech tagging

In [24]:
import spacy
import zh_core_web_sm

# Choose which dictionary (model) is used for word segmentation.
nlp = zh_core_web_sm.load()

# Chinese sample sentence. Note: this rebinds my_text (and nlp/doc below), which
# the English cells earlier in the notebook also use.
my_text = "三倍券還可以買公益彩券及買黃金，業者也會推出相關的優惠活動。"

# Parse the sentence; the parsed document is shown as the cell output.
doc = nlp(my_text)
doc

三倍券還可以買公益彩券及買黃金，業者也會推出相關的優惠活動。

In [25]:
# Print each segmented token of the parsed document on its own line.
for tok in doc:
    print(tok.text)

三
倍
券還
可以
買
公益
彩券
及
買黃金
，
業者
也
會
推出
相關
的
優惠
活動
。


## Attribute

In [26]:
import pandas as pd

# NOTE(review): 'shap' looks like a typo for 'shape'; the label is kept unchanged
# so the resulting column name stays exactly as it was.
cols = ['text', 'lemma', 'pos', 'tag', 'dep', 'shap', 'is_alpha', 'is_stop']

# Attribute values for every token, one row per token, ordered like `cols`.
rows = [
    [
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        token.dep_,
        token.shape_,
        token.is_alpha,
        token.is_stop,
    ]
    for token in doc
]

df = pd.DataFrame(rows, columns=cols)
df

Unnamed: 0,text,lemma,pos,tag,dep,shap,is_alpha,is_stop
0,三,,NUM,CD,dep,x,True,True
1,倍,,NUM,M,mark:clf,x,True,False
2,券還,,VERB,VV,ROOT,xx,True,False
3,可以,,VERB,VV,aux:modal,xx,True,True
4,買,,VERB,VV,ccomp,x,True,False
5,公益,,NOUN,NN,compound:nn,xx,True,False
6,彩券,,NOUN,NN,conj,xx,True,False
7,及,,CCONJ,CC,cc,x,True,True
8,買黃金,,NOUN,NN,dobj,xxx,True,False
9,，,,PUNCT,PU,punct,，,False,True


## 視覺化相依性

In [27]:
from spacy import displacy

# Alternative way to load the same model, left here for reference:
#nlp = spacy.load("zh_core_web_sm")
nlp = zh_core_web_sm.load()
doc = nlp(my_text)
# Render the parsed sentence with displaCy's "dep" style.
displacy.render(doc, style="dep")

### 命名實體辨識 Named Entity Recognition

In [30]:
# Render the same parsed document again, this time with displaCy's "ent" style.
displacy.render(doc, style="ent")

In [32]:
# Second Chinese sample text, parsed into its own document (doc2).
text2 = "銀河系其實滿是外星人？NASA爆銀河奧秘：大多數都死了"
nlp = zh_core_web_sm.load()
doc2 = nlp(text2)

# Show the document's vector as the cell output.
doc2.vector

array([ 0.16723259, -0.6700667 ,  0.97925323, -0.51825786,  0.31154615,
        0.5469232 , -0.06836909, -0.24315631,  0.17094064,  0.05445863,
        0.4444161 , -0.5943327 , -0.62821436, -0.49415278, -0.00581951,
        0.05655156, -0.20414378,  0.00221996,  0.13637055,  0.19772285,
        0.01026848, -0.00603558, -0.54161227, -0.44664335, -0.32252198,
        0.07145989, -0.03847126, -0.25116184, -0.01609054, -0.12755124,
       -0.13498253,  0.6028678 , -0.82629657, -0.13215956,  0.22498178,
       -0.08991627, -0.27909178, -0.25237194,  0.2723955 ,  0.969721  ,
       -0.6090539 ,  0.0369646 ,  0.18440229, -0.5436306 ,  0.6924165 ,
        0.47033   ,  0.05933892,  0.11893684,  0.27713415, -0.3006593 ,
       -0.08915594,  0.706766  , -0.5574853 , -0.0230346 ,  0.03670768,
        0.59210813, -0.00830769, -0.48893183, -0.08817853, -0.10035253,
        0.66523   ,  0.08886814,  0.17716011, -0.2878304 , -0.17199862,
       -0.5939613 , -0.5822407 , -0.08819171, -0.5936948 , -0.38

In [33]:
# Print "<token text>, <coarse part-of-speech>" for every token in doc2.
row_template = '{}, {}'
for tok in doc2:
    print(row_template.format(tok.text, tok.pos_))

銀河, NOUN
系, NOUN
其, PRON
實滿, NOUN
是, VERB
外星人, NOUN
？, PUNCT
NASA, PROPN
爆銀河, NOUN
奧秘, VERB
：, PUNCT
大多, ADV
數, ADV
都, ADV
死, VERB
了, PART


In [34]:
# Print only the text of every token in doc2, one per line.
for tok in doc2:
    token_line = '{}'.format(tok.text)
    print(token_line)

銀河
系
其
實滿
是
外星人
？
NASA
爆銀河
奧秘
：
大多
數
都
死
了


In [35]:
# Similarity score between the second text (doc2) and the earlier sentence (doc).
# NOTE(review): the echo of this line in the saved output looks like the tail of
# a spaCy warning — presumably about the small model having no word vectors, in
# which case the score may not be very meaningful; confirm.
doc2.similarity(doc)

  doc2.similarity(doc)


0.38726488087555544

In [36]:
# Copy the pipeline's default suffix patterns into a plain list and display it.
suffixes = [*nlp.Defaults.suffixes]
suffixes

['…',
 '……',
 ',',
 ':',
 ';',
 '\\!',
 '\\?',
 '¿',
 '؟',
 '¡',
 '\\(',
 '\\)',
 '\\[',
 '\\]',
 '\\{',
 '\\}',
 '<',
 '>',
 '_',
 '#',
 '\\*',
 '&',
 '。',
 '？',
 '！',
 '，',
 '、',
 '；',
 '：',
 '～',
 '·',
 '।',
 '،',
 '۔',
 '؛',
 '٪',
 '\\.\\.+',
 '…',
 "\\'",
 '"',
 '”',
 '“',
 '`',
 '‘',
 '´',
 '’',
 '‚',
 ',',
 '„',
 '»',
 '«',
 '「',
 '」',
 '『',
 '』',
 '（',
 '）',
 '〔',
 '〕',
 '【',
 '】',
 '《',
 '》',
 '〈',
 '〉',
 '[\\u00A6\\u00A9\\u00AE\\u00B0\\u0482\\u058D\\u058E\\u060E\\u060F\\u06DE\\u06E9\\u06FD\\u06FE\\u07F6\\u09FA\\u0B70\\u0BF3-\\u0BF8\\u0BFA\\u0C7F\\u0D4F\\u0D79\\u0F01-\\u0F03\\u0F13\\u0F15-\\u0F17\\u0F1A-\\u0F1F\\u0F34\\u0F36\\u0F38\\u0FBE-\\u0FC5\\u0FC7-\\u0FCC\\u0FCE\\u0FCF\\u0FD5-\\u0FD8\\u109E\\u109F\\u1390-\\u1399\\u1940\\u19DE-\\u19FF\\u1B61-\\u1B6A\\u1B74-\\u1B7C\\u2100\\u2101\\u2103-\\u2106\\u2108\\u2109\\u2114\\u2116\\u2117\\u211E-\\u2123\\u2125\\u2127\\u2129\\u212E\\u213A\\u213B\\u214A\\u214C\\u214D\\u214F\\u218A\\u218B\\u2195-\\u2199\\u219C-\\u219F\\u21A1\\u21A2\\u21

In [37]:
# Third, short sample text, parsed with whichever pipeline `nlp` currently holds
# (the Chinese model, if the cells above were run in order).
text3 = "跨年玩很大"
doc3 = nlp(text3)

In [38]:
# Render doc3 with displaCy. No style is passed, so displaCy's default applies
# (presumably the dependency view — confirm against the displaCy docs).
displacy.render(doc3)