# NLP Practice


## Spacy 

In [1]:
!pip install spacy

Collecting spacy
  Using cached spacy-3.7.5-cp312-cp312-win_amd64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Using cached spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Using cached spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Using cached murmurhash-1.0.10-cp312-cp312-win_amd64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Using cached cymem-2.0.8-cp312-cp312-win_amd64.whl.metadata (8.6 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Using cached preshed-3.0.9-cp312-cp312-win_amd64.whl.metadata (2.2 kB)
Collecting thinc<8.3.0,>=8.2.2 (from spacy)
  Using cached thinc-8.2.5-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Using cached wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Using cached srsly-2.4.8-cp312-cp312-w


[notice] A new release of pip is available: 24.0 -> 24.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.1/12.8 MB 653.6 kB/s eta 0:00:20
     - -------------------------------------- 0.6/12.8 MB 4.1 MB/s eta 0:00:03
     --- ------------------------------------ 1.2/12.8 MB 7.1 MB/s eta 0:00:02
     ------ --------------------------------- 2.1/12.8 MB 8.9 MB/s eta 0:00:02
     ---------- ----------------------------- 3.5/12.8 MB 12.4 MB/s eta 0:00:01
     ------------- -------------------------- 4.4/12.8 MB 13.5 MB/s eta 0:00:01
     ------------------ --------------------- 5.9/12.8 MB 15.6 MB/s eta 0:00:01
     ---------------------- ----------------- 7.1/12.8 MB 16.9 MB/s eta 0:00:01
     -------------------------- -----------


[notice] A new release of pip is available: 24.0 -> 24.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
import spacy 

#loading a spacy model 
nlp = spacy.load('en_core_web_sm')
#creating a doc object the nlp object holds the spacy model that would parse the below text into separate components for us for eg: token, tag etc.
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')

#print each token separately 
#Dependency Parser 
#Every sentence has an inherent structure in which the words have an interdependent relationship with each other. 
# Dependency parsing can be thought of as a directed graph wherein the nodes are words and the edges are relationships between the words.
# It extracts the information on what one word means to another grammatically; whether it is a subject, an auxiliary verb, or a root, and so on. 
# spaCy has a method ‘.dep_’ of the ‘doc’ object which describes the syntactic dependencies of the tokens. 
print(doc)
for token in doc:
    print(token.text,token.tag,token.dep_,token.pos_)

Tesla is looking at buying U.S. startup for $6 million
Tesla 15794550382381185553 nsubj PROPN
is 13927759927860985106 aux AUX
looking 1534113631682161808 ROOT VERB
at 1292078113972184607 prep ADP
buying 1534113631682161808 pcomp VERB
U.S. 15794550382381185553 compound PROPN
startup 15308085513773655218 dobj NOUN
for 1292078113972184607 prep ADP
$ 11283501755624150392 quantmod SYM
6 8427216679587749980 compound NUM
million 8427216679587749980 pobj NUM


In [9]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x2012da205f0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x2012da203b0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x2012da549e0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x2012ddca990>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x2012d3c5fd0>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x2012da54c10>)]

In [11]:
nlp.pipe_names


['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

## Tokenization

In [2]:
import spacy 
nlp = spacy.load('en_core_web_sm')

In [3]:
nlp

<spacy.lang.en.English at 0x213b7296840>

In [4]:
string = '"We\'re moving to L.A.!"'

In [5]:
string

'"We\'re moving to L.A.!"'

In [6]:
doc = nlp(string)

In [11]:
for token in doc:
    print(token.text,token.pos_,token.lemma)

" PUNCT 15884554869126768810
We PRON 16064069575701507746
're AUX 10382539506755952630
moving VERB 13534686644065735227
to ADP 3791531372978436496
L.A. PROPN 11715335255722627455
! PUNCT 17494803046312582752
" PUNCT 15884554869126768810


In [18]:
doc3 = nlp(u"A 5km of ride to New york!")

In [19]:
for token in doc3:
    print(token.text)

A
5
km
of
ride
to
New
york
!


In [20]:
doc3.vocab

<spacy.vocab.Vocab at 0x213b926fe20>

In [21]:
len(doc3.vocab)

777

In [None]:
#once the doc object created we can do the reassignment

In [26]:
#named entities 
for ent in doc3.ents:
    print(ent)
    print(ent.label_)
    print(str(spacy.explain(ent.label_)))

5km
QUANTITY
Measurements, as of weight or distance
New york
GPE
Countries, cities, states


In [27]:
for noun in doc3.noun_chunks:
    print(noun)

A 5km
ride
New york


In [28]:
from spacy import displacy 

doc4 = nlp(u"Apple is going to build a manufacturing factory of $4 million in U.K.")



In [30]:
displacy.render(doc4,style='dep',jupyter=True,options={'distance':100})

In [31]:
doc5= nlp(u"apple is going to get a large revenue this year by selling all ipods at the price of $200")

In [32]:
displacy.render(doc5,style='ent',jupyter=True,options={'distance':100})

In [34]:
displacy.get_doc_settings(doc5)

{'lang': 'en', 'direction': 'ltr'}

In [35]:
displacy.parse_ents(doc5)

{'text': 'apple is going to get a large revenue this year by selling all ipods at the price of $200',
 'ents': [{'start': 0, 'end': 5, 'label': 'ORG', 'kb_id': '', 'kb_url': '#'},
  {'start': 38, 'end': 47, 'label': 'DATE', 'kb_id': '', 'kb_url': '#'},
  {'start': 86, 'end': 89, 'label': 'MONEY', 'kb_id': '', 'kb_url': '#'}],
 'title': None,
 'settings': {'lang': 'en', 'direction': 'ltr'}}

In [36]:
displacy.parse_deps(doc5)

{'words': [{'text': 'apple', 'tag': 'NOUN', 'lemma': None},
  {'text': 'is', 'tag': 'AUX', 'lemma': None},
  {'text': 'going', 'tag': 'VERB', 'lemma': None},
  {'text': 'to', 'tag': 'PART', 'lemma': None},
  {'text': 'get', 'tag': 'VERB', 'lemma': None},
  {'text': 'a', 'tag': 'DET', 'lemma': None},
  {'text': 'large', 'tag': 'ADJ', 'lemma': None},
  {'text': 'revenue', 'tag': 'NOUN', 'lemma': None},
  {'text': 'this', 'tag': 'DET', 'lemma': None},
  {'text': 'year', 'tag': 'NOUN', 'lemma': None},
  {'text': 'by', 'tag': 'ADP', 'lemma': None},
  {'text': 'selling', 'tag': 'VERB', 'lemma': None},
  {'text': 'all', 'tag': 'DET', 'lemma': None},
  {'text': 'ipods', 'tag': 'NOUN', 'lemma': None},
  {'text': 'at', 'tag': 'ADP', 'lemma': None},
  {'text': 'the', 'tag': 'DET', 'lemma': None},
  {'text': 'price', 'tag': 'NOUN', 'lemma': None},
  {'text': 'of', 'tag': 'ADP', 'lemma': None},
  {'text': '$', 'tag': 'SYM', 'lemma': None},
  {'text': '200', 'tag': 'NUM', 'lemma': None}],
 'arcs': [

In [37]:
# displacy.serve(doc5,style='dep')




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


## Stemming

In [39]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting joblib (from nltk)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.5.15-cp312-cp312-win_amd64.whl.metadata (41 kB)
     ---------------------------------------- 0.0/42.0 kB ? eta -:--:--
     ---------------------------------------- 0.0/42.0 kB ? eta -:--:--
     ------------------ ------------------- 20.5/42.0 kB 217.9 kB/s eta 0:00:01
     -------------------------------------- 42.0/42.0 kB 337.6 kB/s eta 0:00:00
Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------- -------------------------------- 0.3/1.5 MB 8.6 MB/s eta 0:00:01
   ---------------------- ----------------- 0.8/1.5 MB 10.8 MB/s eta 0:00:01
   --------------------------------- ------ 1.3/1.5 MB 10.1 MB/s eta 0:00:01
   ---------------------------------------  1.5/1.5 MB 10.7 MB/s eta


[notice] A new release of pip is available: 24.0 -> 24.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [40]:
import nltk

In [41]:
from nltk.stem.porter import PorterStemmer

In [44]:

p_stemmer = PorterStemmer()

In [45]:
words = ['run','runner','running','ran','runs','easily','fairly']

In [47]:
for word in words:
    print(f'original word : {word} stemmed word : {p_stemmer.stem(word)}')

original word : run stemmed word : run
original word : runner stemmed word : runner
original word : running stemmed word : run
original word : ran stemmed word : ran
original word : runs stemmed word : run
original word : easily stemmed word : easili
original word : fairly stemmed word : fairli


In [48]:
from nltk.stem.snowball import SnowballStemmer

In [51]:
snowball_stemmer = SnowballStemmer(language='english')

In [52]:
for word in words: 
    print(f'Original Word : {word},snowball_stemmer : {snowball_stemmer.stem(word)}')

Original Word : run,snowball_stemmer : run
Original Word : runner,snowball_stemmer : runner
Original Word : running,snowball_stemmer : run
Original Word : ran,snowball_stemmer : ran
Original Word : runs,snowball_stemmer : run
Original Word : easily,snowball_stemmer : easili
Original Word : fairly,snowball_stemmer : fair


In [53]:
words1 = ['generous','generation','generously','generate']

In [54]:
for word in words1:
    print(f'Original Word : {word}, SnowballStemmer : {snowball_stemmer.stem(word)}')

Original Word : generous, SnowballStemmer : generous
Original Word : generation, SnowballStemmer : generat
Original Word : generously, SnowballStemmer : generous
Original Word : generate, SnowballStemmer : generat


## Lemmatization

In [55]:
import spacy 

In [56]:
nlp = spacy.load('en_core_web_sm')

In [57]:
doc1 = nlp(u'I am a runner running in a race because I love to run since I ran today!')

In [64]:
for token in doc1:
    print(f'word : {token.text:<{9}}, POS : {token.pos_:<{4}}, Lemma : {token.lemma:<{12}}, Lemma word : {token.lemma_:<{12}} ')

word : I        , POS : PRON, Lemma : 4690420944186131903, Lemma word : I            
word : am       , POS : AUX , Lemma : 10382539506755952630, Lemma word : be           
word : a        , POS : DET , Lemma : 11901859001352538922, Lemma word : a            
word : runner   , POS : NOUN, Lemma : 12640964157389618806, Lemma word : runner       
word : running  , POS : VERB, Lemma : 12767647472892411841, Lemma word : run          
word : in       , POS : ADP , Lemma : 3002984154512732771, Lemma word : in           
word : a        , POS : DET , Lemma : 11901859001352538922, Lemma word : a            
word : race     , POS : NOUN, Lemma : 8048469955494714898, Lemma word : race         
word : because  , POS : SCONJ, Lemma : 16950148841647037698, Lemma word : because      
word : I        , POS : PRON, Lemma : 4690420944186131903, Lemma word : I            
word : love     , POS : VERB, Lemma : 3702023516439754181, Lemma word : love         
word : to       , POS : PART, Lemma : 379153137

## Stop Words

In [65]:
import spacy

In [66]:
nlp = spacy.load('en_core_web_sm')

In [68]:
print(nlp.Defaults.stop_words)

{'are', 'why', 'name', 'throughout', 'say', 'during', 'then', 'though', 'more', 'must', 'side', 're', 'whither', 'four', 'many', 'herself', 'yet', 'top', 'put', 'nothing', 'ten', 'really', 'ever', 'by', 'her', 'did', 'hers', 'someone', 'she', 'sixty', 'we', 'thru', 'whose', 'out', 'nowhere', 'others', 'not', 'amongst', 'could', 'under', 'hereafter', 'most', 'somehow', 'full', 'just', 'nobody', '’s', 'too', 'up', 'on', 'become', '‘ll', 'does', 'everything', 'it', 'those', 'formerly', 'own', '‘ve', 'due', '’m', "n't", 'an', 'where', 'might', 'nor', "'ll", 'perhaps', "'ve", '‘d', 'after', 'part', 'have', 'seemed', 'next', 'yours', 'twelve', 'some', 'myself', 'n’t', 'amount', 'beside', "'s", 'above', 'himself', 'together', 'used', 'hereby', 'within', 'becomes', 'behind', 'show', 'yourself', 'least', 'make', 'much', 'him', 'front', 'see', 'however', 'any', 'than', 'get', 'whenever', 'to', 'thereby', 'such', 'thereafter', 'should', '’ve', 'n‘t', 'upon', 'nevertheless', 'few', 'from', 'well',

In [70]:
# we can manually add stop words 
len(nlp.Defaults.stop_words)

326

In [71]:
nlp.Defaults.stop_words.add('btw')

In [72]:
len(nlp.Defaults.stop_words)

327

In [75]:

nlp.vocab['btw'].is_stop

True

In [76]:
nlp.vocab.lang

'en'

In [78]:
nlp.vocab.writing_system

{'direction': 'ltr', 'has_case': True, 'has_letters': True}

In [79]:
nlp.Defaults.stop_words.remove('beyond')

In [80]:
nlp.vocab['beyond'].is_stop

False

## Phrase Matching and Vocabulary

In [81]:
import spacy

In [82]:
nlp = spacy.load('en_core_web_sm')

In [87]:
from spacy.matcher import Matcher

In [88]:
matcher = Matcher(nlp.vocab)

In [139]:
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]
pattern3 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}]
patterns = [ [{'LOWER':'solarpower'}],[{'LOWER':'solar'},{'LOWER':'power'}],[{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}] ]
matcher.add('Solar Power',patterns)

In [140]:
doc1 = nlp(u'The Solar Power industry continues to grow as demand for solarpower increases. Solar-power cars are gaining popularity.')

In [141]:
found_matches = matcher(doc1)

In [142]:
found_matches

[(6244237227832970528, 1, 3),
 (6244237227832970528, 10, 11),
 (6244237227832970528, 13, 16)]

In [137]:
def on_match(matcher, doc, id, matches):
    print('Matched!', matches)

matcher = Matcher(nlp.vocab)
patterns = [
   [{"LOWER": "hello"}, {"LOWER": "world"}],
   [{"ORTH": "Google"}, {"ORTH": "Maps"}]
]
matcher.add("TEST_PATTERNS", patterns)
doc = nlp("HELLO WORLD on Google Maps.")
matches = matcher(doc)

In [138]:
#matches returns a list of tuples that contains match id , start and end index
for match_id,start, end in matches:
    string_id = nlp.vocab.strings[match_id]
    span_text = doc[start:end]
    print(match_id,start,end,span_text,string_id)


3004906285683798724 0 2 HELLO WORLD TEST_PATTERNS
3004906285683798724 3 5 Google Maps TEST_PATTERNS


In [151]:
#remove older set of patterns 
matcher.remove('TEST_PATTERNS')

ValueError: [E175] Can't remove rule for unknown match pattern ID: TEST_PATTERNS

In [152]:
pattern_new = [[{'LOWER':'solarpower'}],[{'LOWER':'solar'},{'IS_PUNCT':True,'OP':'*'},{'LOWER':'power'}]]

In [153]:
matcher.add('SolarPower',pattern_new)

In [154]:
doc_new = nlp('Solar-Power is solarpower yay!')

In [155]:
matches_new = matcher(doc_new)

In [158]:
matches_new

[(6244237227832970528, 0, 3),
 (8656102463236116519, 0, 3),
 (6244237227832970528, 4, 5),
 (8656102463236116519, 4, 5)]

In [160]:
for string_id,start,end in matches_new:
    text_found = doc_new[start:end]
    print(string_id,start,end,text_found)

6244237227832970528 0 3 Solar-Power
8656102463236116519 0 3 Solar-Power
6244237227832970528 4 5 solarpower
8656102463236116519 4 5 solarpower


#### Phrase Matcher

In [162]:
import spacy
nlp=spacy.load('en_core_web_sm')

In [163]:
from spacy.matcher import PhraseMatcher

In [165]:
matcher = PhraseMatcher(nlp.vocab)

In [167]:
with open('C:/Users/sunny/OneDrive/Desktop/Jyotsana_Projects/udemy_NLP/UPDATED_NLP_COURSE/UPDATED_NLP_COURSE/TextFiles/reaganomics.txt') as f:
    doc3=nlp(f.read())

In [169]:
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']



In [170]:
phrase_pattern = [nlp(text) for text in phrase_list]

In [173]:
type(phrase_pattern[0])

spacy.tokens.doc.Doc

In [177]:
matcher.add('EconMatcher',None,*phrase_pattern)

In [178]:
found_matches = matcher(doc3)

In [179]:
found_matches

[(3680293220734633682, 41, 45),
 (3680293220734633682, 49, 53),
 (3680293220734633682, 54, 56),
 (3680293220734633682, 61, 65),
 (3680293220734633682, 673, 677),
 (3680293220734633682, 2987, 2991)]

In [181]:
for match_id,start,end in found_matches:
    string_id = nlp.vocab.strings[match_id]
    span = doc3[start:end]
    print(match_id,string_id,start,end,span.text)

3680293220734633682 EconMatcher 41 45 supply-side economics
3680293220734633682 EconMatcher 49 53 trickle-down economics
3680293220734633682 EconMatcher 54 56 voodoo economics
3680293220734633682 EconMatcher 61 65 free-market economics
3680293220734633682 EconMatcher 673 677 supply-side economics
3680293220734633682 EconMatcher 2987 2991 trickle-down economics


#### POS

In [182]:
import spacy 

In [183]:
nlp = spacy.load('en_core_web_sm')

In [184]:
doc = nlp("The quick brown fox jumped over the lazy dog's back")

In [185]:
doc.text

"The quick brown fox jumped over the lazy dog's back"

In [186]:
doc[4].tag_

'VBD'

In [187]:
doc[4].pos_

'VERB'

In [189]:
for token in doc:
    print(f'{token.text} {token.pos_} {token.tag_} {spacy.explain(token.tag)}')

The DET DT None
quick ADJ JJ None
brown ADJ JJ None
fox NOUN NN None
jumped VERB VBD None
over ADP IN None
the DET DT None
lazy ADJ JJ None
dog NOUN NN None
's PART POS None
back NOUN NN None




In [190]:
doc = nlp("I read books on NLP.")

In [192]:
word = doc[1]
word

read

In [193]:
token = word
print(f'{token.text} {token.pos_} {token.tag_}')

read VERB VBP


In [195]:
doc = nlp("I read a book on NLP.")
token = doc[1]
print(f'{token.text} {token.tag_} {token.pos_} ')

read VBD VERB 


In [196]:
 POS_counts = doc.count_by(spacy.attrs.POS)

In [197]:
POS_counts
#95,100 are pos code


{95: 1, 100: 1, 90: 1, 92: 1, 85: 1, 96: 1, 97: 1}

In [200]:
doc.vocab[95].text

'PRON'

In [201]:
doc[2].pos

90

In [203]:
for key , value in sorted(POS_counts.items()):
    print(f'{key}, {doc.vocab[key].text:{5}} {value}')

85, ADP   1
90, DET   1
92, NOUN  1
95, PRON  1
96, PROPN 1
97, PUNCT 1
100, VERB  1


In [204]:
Tag_counts = doc.count_by(spacy.attrs.TAG)

for key, value in sorted(Tag_counts.items()):
    print(f'{key}, {value} , {doc.vocab[key].text}')

1292078113972184607, 1 , IN
12646065887601541794, 1 , .
13656873538139661788, 1 , PRP
15267657372422890137, 1 , DT
15308085513773655218, 1 , NN
15794550382381185553, 1 , NNP
17109001835818727656, 1 , VBD
