In [2]:
import spacy

In [3]:
# Build the small English pipeline from the locally installed model package.
nlp = spacy.load("en_core_web_sm")

In [4]:
# Run a sample sentence through the pipeline to obtain an annotated Doc.
# NOTE(review): "Telsa" looks like a typo for "Tesla"; kept as-is because the
# recorded outputs were produced from this exact text.
doc = nlp("Telsa is looking at buying U.S. startup for $6 million")

In [5]:
# One line per token: surface text, coarse part-of-speech tag, dependency label.
for token in doc:
    print(f"{token.text} {token.pos_} {token.dep_}")

Telsa NOUN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [6]:
# Display the names of the components in the loaded pipeline.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']

In [7]:
# Tokenize a sentence containing the apostrophe-less contraction "isnt"
# and list the resulting tokens, one per line.
doc2 = nlp("Telsa isnt looking into startups anymore.")

for token in doc2:
    print(token.text)

Telsa
is
nt
looking
into
startups
anymore
.


## Tokenization

In [8]:
mystring = '"We\'re moving to L.A.!"'

In [9]:
# Process the quoted sample string; `doc` is rebound to the new Doc.
doc = nlp(mystring)

In [10]:
# List the tokens one per line to show how quotes, the contraction,
# the abbreviation and the punctuation were split.
for token in doc:
    print(token.text)

"
We
're
moving
to
L.A.
!
"


In [11]:
# Parse a sentence containing a hyphenated word; `doc2` is rebound here.
# NOTE(review): the result is never displayed in the visible cells, and
# "sail-mail" is presumably meant to be "snail-mail" — confirm.
doc2 = nlp("Send us a sail-mail")

In [12]:
from spacy import displacy

In [13]:
# Sentence used for the dependency-parse visualization.
doc = nlp("Apple is going to build a U.K. factory for $6 million")

In [14]:
# Draw the dependency parse of `doc` inline in the notebook.
displacy.render(doc, style="dep", jupyter=True)

In [15]:
# Sentence for the named-entity visualization.
# The verb was previously misspelled as "sol", which left the sentence without
# a real verb and fed the model malformed input; it now reads "sold".
doc = nlp("Apple sold 20 thousand airpods for $6 million")

# Highlight the recognized named entities inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

# Stemming

## Stemming cuts off the ends of words until only the root (stem) of the word remains. It is not very effective for the English language because of the language's complexity.

In [16]:
import nltk

In [17]:
from nltk.stem.porter import PorterStemmer

# Create the Porter stemmer instance used to stem the sample words.
p_stemmer = PorterStemmer()

In [18]:
# Sample words whose stems are compared across stemmers.
words = ["run", "runner", "ran", "runs", "easily", "fairly"]

# Show each word next to its Porter stem.
for word in words:
    print(f"{word}------>{p_stemmer.stem(word)}")

run------>run
runner------>runner
ran------>ran
runs------>run
easily------>easili
fairly------>fairli


In [19]:
from nltk.stem.snowball import SnowballStemmer

# Create the Snowball stemmer configured for English.
s_stemmer = SnowballStemmer("english")

In [20]:
# Show each sample word next to its Snowball stem.
for word in words:
    print(f"{word}------>{s_stemmer.stem(word)}")

run------>run
runner------>runner
ran------>ran
runs------>run
easily------>easili
fairly------>fair


## spaCy does not include a stemmer, because stemming is less effective than lemmatization

# Lemmatization

## Lemmatization looks at the surrounding text to understand a word's morphology

In [21]:
# Sentence with several inflections of "run" for the lemmatization demo.
doc = nlp("I am a runner running in a race because I love to run since I ran")

In [23]:
# One row per token: text, coarse POS tag, integer lemma ID, lemma string.
# The separator reproduces the space-tab-space spacing between columns.
for token in doc:
    print(token.text, token.pos_, token.lemma, token.lemma_, sep=" \t ")

I 	 PRON 	 4690420944186131903 	 I
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 4690420944186131903 	 I
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 4690420944186131903 	 I
ran 	 VERB 	 12767647472892411841 	 run


## Stop Words

In [24]:
# Grab the default stop words that ship with the pipeline's language defaults.
stop_words = nlp.Defaults.stop_words

In [28]:
# Display how many default stop words there are.
len(stop_words)

326

In [29]:
from spacy.matcher import Matcher
# Create a rule-based token matcher that shares the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

In [30]:
# Three token-level spellings of the same phrase:
#   pattern1 -> one token:    "solarpower"
#   pattern2 -> three tokens: "solar", one punctuation token, "power"
#   pattern3 -> two tokens:   "solar", "power"
# LOWER compares against the lowercased token text.
pattern1 = [dict(LOWER="solarpower")]
pattern2 = [dict(LOWER="solar"), dict(IS_PUNCT=True), dict(LOWER="power")]
pattern3 = [dict(LOWER="solar"), dict(LOWER="power")]

In [36]:
# Register all three patterns under the single rule name "SolarPower".
matcher.add("SolarPower", [pattern1, pattern2, pattern3])

In [37]:
# Sentence containing two occurrences of "solar power" for the matcher to find.
# Fixed the typo "grow a solar" -> "grow as solar"; the number of tokens is
# unchanged, so the token offsets of both occurrences stay the same.
doc = nlp("The solar power industry continues to grow as solar power increases")

In [38]:
# Run the matcher over the Doc and keep the list of matches it returns.
found_matches = matcher(doc)

In [39]:
# Display the matches; each entry is a (match_id, start, end) tuple, where
# start/end are token offsets into `doc`.
found_matches

[(8656102463236116519, 1, 3), (8656102463236116519, 8, 10)]