### Basics

In [1]:
import spacy
# Load the 'en_core_web_sm' pipeline once; `nlp` is reused by every cell below.
nlp = spacy.load('en_core_web_sm')

In [2]:
# Sample sentence for the POS / dependency demo. The spellings 'buyin' and
# 'millions' are kept as-is: the recorded output below reflects them.
doc = nlp('Tesla is looking at buyin U.S. startup for $6 millions')

In [4]:
# One row per token: surface text, part-of-speech tag, dependency label.
for tok in doc:
    row = (tok.text, tok.pos_, tok.dep_)
    print(*row)

Tesla PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buyin NOUN compound
U.S. PROPN compound
startup NOUN pobj
for ADP prep
$ SYM quantmod
6 NUM nummod
millions NOUN pobj


In [5]:
# Inspect the pipeline's processing steps as (name, component) pairs.
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x277008edcc0>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x27700a45ac8>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x27700a45b28>)]

In [10]:
# The run of extra spaces is deliberate: it shows up as its own SPACE token
# in the output below.
doc2 = nlp("Tesla isn't looking into buying      startups anymore.")

In [11]:
# Same three columns as before (text, POS tag, dependency label), now for doc2.
for tok in doc2:
    text, pos, dep = tok.text, tok.pos_, tok.dep_
    print(text, pos, dep)

Tesla PROPN nsubj
is AUX aux
n't PART neg
looking VERB ROOT
into ADP prep
buying VERB pcomp
      SPACE 
startups NOUN dobj
anymore ADV advmod
. PUNCT punct


### Tokenization
- 1 - Split on whitespace
- 2 - Prefix --> Characters at the beginning *$ ( " ¿*
- 3 - Exception --> Special-case rules to split or prevent splitting a word into several tokens --> *let's U.S.*
- 4 - Suffix --> Characters at the end *km ) , . ! "*
- 5 - Exception
- 6 - Done

In [12]:
# The apostrophe is escaped because the literal is single-quoted; the double
# quotes are part of the text itself and later become tokens of their own.
mystring = '"We\'re moving to L.A!"'
print(mystring)

"We're moving to L.A!"


In [13]:
# Tokenize the quoted sentence (note: this rebinds `doc` from the earlier example).
doc = nlp(mystring)

In [14]:
# Emit the surface form of every token, one per line.
for surface in (tok.text for tok in doc):
    print(surface)

"
We
're
moving
to
L.A
!
"


In [15]:
# Sample with a hyphenated word, an e-mail address and a URL, to show how the
# tokenizer treats each (note: this rebinds `doc2`).
doc2 = nlp(u"We're here to help! Send snail-mail, email support@oursite.com or visit us at http://support.com")

In [17]:
# List the tokens of doc2, one per line.
for piece in [tok.text for tok in doc2]:
    print(piece)

We
're
here
to
help
!
Send
snail
-
mail
,
email
support@oursite.com
or
visit
us
at
http://support.com


In [18]:
# Sentence with a company, a place and a money amount, used for the
# named-entity examples that follow.
doc3 = nlp("Apple to build a Hong Kong factory for $6 million")

In [21]:
# For each named entity: its text, its label, and spaCy's description of that
# label, followed by two blank lines.
for ent in doc3.ents:
    label = ent.label_
    description = str(spacy.explain(label))
    print(ent, label, description, '\n', sep='\n')

Apple
ORG
Companies, agencies, institutions, etc.


Hong Kong
GPE
Countries, cities, states


$6 million
MONEY
Monetary values, including unit




In [22]:
from spacy import displacy

In [23]:
# Render doc3 with displaCy's entity ('ent') style, inline in the notebook.
displacy.render(doc3,style='ent',jupyter=True)

### Stemming
spaCy doesn't include a stemmer; instead, it relies entirely on lemmatization.  
NLTK is therefore used for the stemming examples below.  

#### Porter Algorithm
Sequence of steps with rules

#### Snowball Algorithm
Improvement over the original Porter Algorithm

In [28]:
import nltk
from nltk.stem import PorterStemmer
# Porter stemmer instance, reused by the loop cell below.
p_stemmer = PorterStemmer()

In [41]:
# Related word forms used to compare the output of the two stemmers.
words = ['run', 'runner', 'ran', 'runs', 'easily', 'fairly', 'fairness']

In [42]:
# Show each word next to its Porter stem.
for word in words:
    stem = p_stemmer.stem(word)
    print(''.join([word, '----->', stem]))

run----->run
runner----->runner
ran----->ran
runs----->run
easily----->easili
fairly----->fairli
fairness----->fair


In [43]:
from nltk.stem import SnowballStemmer
# Snowball stemmer configured for English, compared against p_stemmer on the
# same word list.
s_stemmer = SnowballStemmer(language='english')

In [44]:
# Show each word next to its Snowball stem.
for word in words:
    pair = (word, s_stemmer.stem(word))
    print(pair[0] + '----->' + pair[1])

run----->run
runner----->runner
ran----->ran
runs----->run
easily----->easili
fairly----->fair
fairness----->fair


#### Lemmatization

In [45]:
# Sentence with several forms related to 'run' (runner, running, run, ran) so
# their lemmas can be compared (note: this rebinds `doc`).
doc = nlp(u"I'm a runner running on a race because I love to run since I ran today")

In [46]:
# Per token: text, POS tag, lemma hash (`lemma`) and lemma string (`lemma_`),
# with a tab character printed between the fields.
for tok in doc:
    row = (tok.text, '\t', tok.pos_, '\t', tok.lemma, '\t', tok.lemma_)
    print(*row)

I 	 PRON 	 561228191312463089 	 -PRON-
'm 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
on 	 ADP 	 5640369432778651323 	 on
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 561228191312463089 	 -PRON-
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 561228191312463089 	 -PRON-
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


In [None]:
def show_lemmas(text):
    """Print an aligned table of text, POS tag, lemma hash and lemma per token.

    Parameters
    ----------
    text : iterable
        Token-like objects exposing ``text``, ``pos_``, ``lemma`` and
        ``lemma_`` attributes (for example a parsed spaCy document).

    Returns
    -------
    None
        One line is printed per token; nothing is printed for empty input.
    """
    for t in text:
        # One replacement field per attribute, each with its own width, so the
        # columns line up. The lemma hashes shown earlier in this notebook are
        # up to 20 digits long, hence the 22-wide left-aligned column.
        print(f'{t.text:{12}} {t.pos_:{6}} {t.lemma:<{22}} {t.lemma_}')