In [9]:
# spaCy (https://spacy.io/)

!pip install spacy
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_md

import spacy
# nlp = spacy.load("en_core_web_sm")  # Smallest model, see https://spacy.io/models
nlp = spacy.load("en_core_web_md")

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Collecting en-core-web-md==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.5.0/en_core_web_md-3.5.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [4]:
# Named Entity Recognition

text = 'GitHub is a development platform inspired by the way you work. From open source to business, you can host and review code, manage projects, and build software alongside 40 million developers.'

doc = nlp(text)
print(doc.ents)
print([(X.text, X.label_) for X in doc.ents])

(GitHub, 40 million)
[('GitHub', 'ORG'), ('40 million', 'CARDINAL')]


In [8]:
# Part-of-Speech tagging

import pandas as pd

# https://spacy.io/api/token#attributes
pd.DataFrame({
    "text": [token.text for token in doc],
    "lemma": [token.lemma_ for token in doc],
    "pos": [token.pos_ for token in doc],
    "tag": [token.tag_ for token in doc],
    "dep": [token.dep_ for token in doc],
    "is_alpha": [token.is_alpha for token in doc],
    "is_stop": [token.is_stop for token in doc],
})

Unnamed: 0,text,lemma,pos,tag,dep,is_alpha,is_stop
0,GitHub,github,NOUN,NN,nsubj,True,False
1,is,be,AUX,VBZ,ROOT,True,True
2,a,a,DET,DT,det,True,True
3,development,development,NOUN,NN,compound,True,False
4,platform,platform,NOUN,NN,attr,True,False
5,inspired,inspire,VERB,VBN,acl,True,False
6,by,by,ADP,IN,agent,True,True
7,the,the,DET,DT,det,True,True
8,way,way,NOUN,NN,pobj,True,False
9,you,you,PRON,PRP,nsubj,True,True


In [13]:
# Noun chunks

pd.DataFrame({
    "text": [chunk.text for chunk in doc.noun_chunks],
    "root_text": [chunk.root.text for chunk in doc.noun_chunks],
    "root_dep": [chunk.root.dep_ for chunk in doc.noun_chunks],
})

Unnamed: 0,text,root_text,root_dep
0,GitHub,GitHub,nsubj
1,a development platform,platform,attr
2,the way,way,pobj
3,you,you,nsubj
4,open source,source,pobj
5,business,business,pobj
6,you,you,nsubj
7,code,code,dobj
8,projects,projects,dobj
9,software,software,dobj


In [15]:
# Visualizing dependencies
from spacy import displacy
displacy.render(doc, style='dep')

In [22]:
# Vectorization

pd.DataFrame({
    "text": [token.text for token in doc],
    "has_vector": [token.has_vector for token in doc],
    "vector_norm": [token.vector_norm for token in doc],  # The L2 norm of the token’s vector representation
})

for token in doc:
    print(token.text, token.vector)
    print()

GitHub [-0.958542   -0.8885067   0.6854658   0.5634964   0.7406775   0.4190148
  1.1173648   0.0131558  -1.0927213  -0.7050892   1.6076323   1.2278186
 -0.9925396  -0.39343774 -0.3300205  -0.42633897  0.23058683  0.659056
 -0.48065463 -0.04616186 -1.3125254  -0.42827985  0.21309543  0.21925814
 -0.7718686  -0.32905096  1.0016267   1.1639466   0.10356015  0.23845612
  0.60400236 -1.0806301   1.256082   -0.41795254 -0.49336562  1.4915501
 -1.6364648   0.01978634 -0.5703151   2.7204041  -1.5517608  -0.16586176
  0.17122298  0.9028639  -0.6210215   0.27839398 -0.5492257   1.7777401
  1.5776463  -0.84760654 -0.27268237  0.6318884  -0.5144081  -1.6937017
  1.0595229  -0.52651596 -0.07239401 -0.5639115   0.36187428 -0.26961416
  0.16449663 -0.1054411  -0.35247427 -0.5175969   0.33490148  0.07078918
  0.13122247 -0.65874803  0.0689415  -2.029108   -0.1200383   0.06342262
 -0.16674577 -0.5232294  -0.5784075  -0.818814    0.0581431  -0.89360404
  0.3875055  -0.46358484 -0.6643305   0.5533908   0

In [23]:
# Using vectors to compute document similarity
# https://spacy.io/usage/linguistic-features#vectors-similarity
#
# The similarity of Doc and Span objects defaults to the average of the token vectors. This means that the vector for “fast food” is the average of the vectors for “fast” and “food”, which isn’t necessarily representative of the phrase “fast food”.
# Vector averaging means that the vector of multiple tokens is insensitive to the order of the words. Two documents expressing the same meaning with dissimilar wording will return a lower similarity score than two documents that happen to contain the same words while expressing different meanings.

doc1 = nlp("I like salty fries and hamburgers.")
doc2 = nlp("Fast food tastes very good.")

# Similarity of two documents
print(doc1, "<->", doc2, doc1.similarity(doc2))
# Similarity of tokens and spans
french_fries = doc1[2:4]
burgers = doc1[5]
print(french_fries, "<->", burgers, french_fries.similarity(burgers))

I like salty fries and hamburgers. <-> Fast food tastes very good. 0.691649353055761
salty fries <-> hamburgers 0.6938489675521851
