In [4]:
import spacy

In [5]:
# Load the installed model
nlp = spacy.load('en_core_web_sm')

In [4]:
doc = nlp(u"Telsa is looking at buying U.S. startup for $6 million")

In [5]:
for token in doc:
  print(token.text, token.pos_, token.dep_)

Telsa NOUN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [6]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']

In [7]:
doc2 = nlp(u"Telsa isnt looking into startups anymore.")

for token in doc2:
  print(token.text)

Telsa
is
nt
looking
into
startups
anymore
.


## Tokenization

In [8]:
mystring = '"We\'re moving to L.A.!"'

In [9]:
doc = nlp(mystring)

In [10]:
for token in doc:
  print(token.text)

"
We
're
moving
to
L.A.
!
"


In [11]:
doc2 = nlp(u"Send us a sail-mail")

In [12]:
from spacy import displacy

In [13]:
doc = nlp(u"Apple is going to build a U.K. factory for $6 million")

In [14]:
displacy.render(doc, style="dep", jupyter=True)

In [15]:
doc = nlp(u"Apple sol 20 thousand airpods for $6 million")

displacy.render(doc, style='ent', jupyter=True)

# Stemming

## Stemming is cutting of words until we get to the root of the word. Not effective for the english language due to the level of complexity

In [16]:
import nltk

In [17]:
from nltk.stem.porter import PorterStemmer

p_stemmer = PorterStemmer()

In [18]:
words = ['run', "runner", "ran", "runs", 'easily', 'fairly']

for word in words:
  print(word + "------>" + p_stemmer.stem(word))

run------>run
runner------>runner
ran------>ran
runs------>run
easily------>easili
fairly------>fairli


In [19]:
from nltk.stem.snowball import SnowballStemmer

s_stemmer = SnowballStemmer(language='english')

In [20]:
for word in words:
  print(word + "------>" + s_stemmer.stem(word))

run------>run
runner------>runner
ran------>ran
runs------>run
easily------>easili
fairly------>fair


## Spacy does not have stemming due to its lack of efficiency

# Lemmatization

## Looks at surronding text to understand the morphology

In [21]:
doc = nlp(u"I am a runner running in a race because I love to run since I ran")

In [23]:
for token in doc:
  print(token.text, '\t', token.pos_, '\t', token.lemma, '\t',token.lemma_)

I 	 PRON 	 4690420944186131903 	 I
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 4690420944186131903 	 I
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 4690420944186131903 	 I
ran 	 VERB 	 12767647472892411841 	 run


## Stop Words

In [24]:
stop_words = nlp.Defaults.stop_words

In [28]:
len(stop_words)

326

In [6]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

In [30]:
pattern1 = [{"LOWER":"solarpower"}]
pattern2 = [{"LOWER":"solar"}, {"IS_PUNCT":True}, {"LOWER":"power"}]
pattern3 = [{"LOWER":"solar"}, {"LOWER":"power"}]

In [36]:
matcher.add('SolarPower',[ pattern1, pattern2, pattern3])

In [37]:
doc = nlp(u"The solar power industry continues to grow a solar power increases")

In [38]:
found_matches = matcher(doc)

In [39]:
found_matches

[(8656102463236116519, 1, 3), (8656102463236116519, 8, 10)]

In [7]:
filename = "/Users/test/Documents/Software-projects/Python Projects/Deep-Learning-Projects/Deep-Learning-Overfitting-Cook-Book/data/reaganomics.txt"

In [8]:
with open(filename, encoding='ISO-8859-1') as f:
  doc3 = nlp(f.read())

In [15]:
phrase_list = ["voodo economics", "supply-side economics", 'trickle-down economics', "Reagan"]

In [16]:
# Convert phrases to token patterns
phrase_patterns = [[{"LOWER": phrase.lower()}] for phrase in phrase_list]
# # Add the patterns to the matcher
matcher.add("EconMatcher", phrase_patterns)

In [17]:
phrase_patterns

[[{'LOWER': 'voodo economics'}],
 [{'LOWER': 'supply-side economics'}],
 [{'LOWER': 'trickle-down economics'}],
 [{'LOWER': 'reagan'}]]

In [18]:
found_matches = matcher(doc3)

In [19]:
for match_id, start, end in found_matches:
  string_id = nlp.vocab.strings[match_id]
  span=doc3[start:end]
  print(match_id, string_id, start, end, span.text)

3680293220734633682 EconMatcher 12 13 Reagan
3680293220734633682 EconMatcher 30 31 Reagan
3680293220734633682 EconMatcher 74 75 Reagan
3680293220734633682 EconMatcher 191 192 Reagan
3680293220734633682 EconMatcher 363 364 Reagan
3680293220734633682 EconMatcher 374 375 Reagan
3680293220734633682 EconMatcher 392 393 Reagan
3680293220734633682 EconMatcher 459 460 Reagan
3680293220734633682 EconMatcher 498 499 Reagan
3680293220734633682 EconMatcher 527 528 Reagan
3680293220734633682 EconMatcher 548 549 Reagan
3680293220734633682 EconMatcher 574 575 Reagan
3680293220734633682 EconMatcher 604 605 Reagan
3680293220734633682 EconMatcher 630 631 Reagan
3680293220734633682 EconMatcher 699 700 Reagan
3680293220734633682 EconMatcher 804 805 Reagan
3680293220734633682 EconMatcher 845 846 Reagan
3680293220734633682 EconMatcher 917 918 Reagan
3680293220734633682 EconMatcher 951 952 Reagan
3680293220734633682 EconMatcher 1009 1010 Reagan
3680293220734633682 EconMatcher 1053 1054 Reagan
368029322073463

## Part of Speech Tagging

In [20]:
doc = nlp(u"I read books on NLP.")

In [22]:
for token in doc:
  print(token.text, token.pos_)

I PRON
read VERB
books NOUN
on ADP
NLP PROPN
. PUNCT


In [23]:
import spacy.attrs


POS_counts = doc.count_by(spacy.attrs.POS)

In [24]:
POS_counts

{95: 1, 100: 1, 92: 1, 85: 1, 96: 1, 97: 1}

In [27]:
TAG_counts = doc.count_by(spacy.attrs.TAG)

for k,v in sorted(TAG_counts.items()):
  print(doc.vocab[k].text,v)

NNS 1
IN 1
. 1
PRP 1
NNP 1
VBD 1


## Visualizing Parts Of Speech

In [29]:
from spacy import displacy

In [30]:
displacy.render(doc, style="dep", jupyter=True)

In [34]:
options = {"distance":110, "compact":'True', "color":"blue", "bg":"yellow", 'font':"Times"}

In [35]:
displacy.render(doc, style="dep", jupyter=True, options=options)