In [2]:
import spacy

In [3]:
# Load the "en_core_web_sm" pipeline once; the cells below call nlp(text)
# to turn raw strings into Doc objects.
nlp = spacy.load("en_core_web_sm")

In [4]:
# Parse a two-sentence example and show, for every token, its coarse
# part-of-speech label next to the human-readable explanation of that label.
doc = nlp("Elon flew to mars yesterday. He carried biryani masala with him")
for token in doc:
    pos_label = token.pos_
    print(token, pos_label, spacy.explain(pos_label), sep=" | ")

Elon | PROPN | proper noun
flew | VERB | verb
to | ADP | adposition
mars | NOUN | noun
yesterday | NOUN | noun
. | PUNCT | punctuation
He | PRON | pronoun
carried | VERB | verb
biryani | ADJ | adjective
masala | NOUN | noun
with | ADP | adposition
him | PRON | pronoun


In [5]:
# Show the names of the components that make up the loaded pipeline.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [6]:
# Compare the coarse POS label (token.pos_) with the fine-grained tag
# (token.tag_) for each token, each followed by its spacy.explain() gloss.
doc = nlp("Wow! Dr. Strange made 265 million $ on the very first day")
for token in doc:
    fields = (
        token,
        token.pos_,
        spacy.explain(token.pos_),
        token.tag_,
        spacy.explain(token.tag_),
    )
    print(*fields, sep=" | ")

Wow | INTJ | interjection | UH | interjection
! | PUNCT | punctuation | . | punctuation mark, sentence closer
Dr. | PROPN | proper noun | NNP | noun, proper singular
Strange | PROPN | proper noun | NNP | noun, proper singular
made | VERB | verb | VBD | verb, past tense
265 | NUM | numeral | CD | cardinal number
million | NUM | numeral | CD | cardinal number
$ | NUM | numeral | CD | cardinal number
on | ADP | adposition | IN | conjunction, subordinating or preposition
the | DET | determiner | DT | determiner
very | ADV | adverb | RB | adverb
first | ADJ | adjective | JJ | adjective (English), other noun-modifier (Chinese)
day | NOUN | noun | NN | noun, singular or mass


In [7]:
# Inspect the fine-grained tag assigned to the verb of a short sentence.
# The original cell also had a bare `doc[1]` statement; because it was not
# the last expression in the cell it displayed nothing, so it is dropped and
# the token is bound to a name once instead of being indexed three times.
doc = nlp("He quit the job")
quit_token = doc[1]  # second token of the sentence ("quit")
print(quit_token.text, "|", quit_token.tag_, "|", spacy.explain(quit_token.tag_))

quit | VBD | verb, past tense


In [8]:
# Sample earnings-release excerpt used as the input text for the cells below.
# The leading space, the blank lines and the "·" bullet prefixes are all part
# of the string and therefore part of what the pipeline tokenizes.
earnings_text = ''' Microsoft Corp. today announced the following results for the quarter ended September 30, 2023, as compared to the corresponding period of last fiscal year:

·        Revenue was $56.5 billion and increased 13% (up 12% in constant currency)

·        Operating income was $26.9 billion and increased 25% (up 24% in constant currency)

·        Net income was $22.3 billion and increased 27% (up 26% in constant currency)

·        Diluted earnings per share was $2.99 and increased 27% (up 26% in constant currency)'''

In [9]:
# Parse the earnings excerpt and print the POS label of every token,
# skipping tokens labelled SPACE, X or PUNCT.
doc = nlp(earnings_text)
for token in doc:
    if token.pos_ in ("SPACE", "X", "PUNCT"):
        continue
    print(f"{token}  |  {token.pos_}  |  {spacy.explain(token.pos_)}")

Microsoft  |  PROPN  |  proper noun
Corp.  |  PROPN  |  proper noun
today  |  NOUN  |  noun
announced  |  VERB  |  verb
the  |  DET  |  determiner
following  |  VERB  |  verb
results  |  NOUN  |  noun
for  |  ADP  |  adposition
the  |  DET  |  determiner
quarter  |  NOUN  |  noun
ended  |  VERB  |  verb
September  |  PROPN  |  proper noun
30  |  NUM  |  numeral
2023  |  NUM  |  numeral
as  |  SCONJ  |  subordinating conjunction
compared  |  VERB  |  verb
to  |  ADP  |  adposition
the  |  DET  |  determiner
corresponding  |  ADJ  |  adjective
period  |  NOUN  |  noun
of  |  ADP  |  adposition
last  |  ADJ  |  adjective
fiscal  |  ADJ  |  adjective
year  |  NOUN  |  noun
Revenue  |  NOUN  |  noun
was  |  AUX  |  auxiliary
$  |  SYM  |  symbol
56.5  |  NUM  |  numeral
billion  |  NUM  |  numeral
and  |  CCONJ  |  coordinating conjunction
increased  |  VERB  |  verb
13  |  NUM  |  numeral
%  |  NOUN  |  noun
up  |  ADV  |  adverb
12  |  NUM  |  numeral
%  |  NOUN  |  noun
in  |  ADP  |  ad

In [10]:
# Re-parse the excerpt and keep only the tokens whose POS label is not
# SPACE, X or PUNCT. The resulting `doc` is also used by the cells below.
doc = nlp(earnings_text)
filtered_tokens = [
    token
    for token in doc
    if token.pos_ not in ("SPACE", "X", "PUNCT")
]

In [11]:
# Display the tokens that survived the SPACE / X / PUNCT filter.
filtered_tokens

[Microsoft,
 Corp.,
 today,
 announced,
 the,
 following,
 results,
 for,
 the,
 quarter,
 ended,
 September,
 30,
 2023,
 as,
 compared,
 to,
 the,
 corresponding,
 period,
 of,
 last,
 fiscal,
 year,
 Revenue,
 was,
 $,
 56.5,
 billion,
 and,
 increased,
 13,
 %,
 up,
 12,
 %,
 in,
 constant,
 currency,
 Operating,
 income,
 was,
 $,
 26.9,
 billion,
 and,
 increased,
 25,
 %,
 up,
 24,
 %,
 in,
 constant,
 currency,
 Net,
 income,
 was,
 $,
 22.3,
 billion,
 and,
 increased,
 27,
 %,
 up,
 26,
 %,
 in,
 constant,
 currency,
 Diluted,
 earnings,
 per,
 share,
 was,
 $,
 2.99,
 and,
 increased,
 27,
 %,
 up,
 26,
 %,
 in,
 constant,
 currency]

In [12]:
# Tally the tokens of `doc` by their POS attribute. As the displayed result
# shows, the keys are integer IDs rather than label strings; a later cell
# resolves them to text through doc.vocab.
count = doc.count_by(spacy.attrs.POS)
count

{103: 9,
 96: 3,
 92: 22,
 100: 10,
 90: 3,
 85: 8,
 93: 17,
 97: 15,
 98: 1,
 84: 8,
 87: 4,
 99: 4,
 89: 4,
 86: 4}

In [13]:
# Resolve one of the integer keys from `count` back to its string form;
# in the recorded output, 96 maps to 'PROPN'.
# NOTE(review): 96 is a hard-coded ID — presumably fixed for this spaCy
# version; confirm before relying on it elsewhere.
doc.vocab[96].text

'PROPN'

In [14]:
# Print every POS count with the integer key translated to its label text
# via the vocab.
for pos_id, n_tokens in count.items():
    pos_label = doc.vocab[pos_id].text
    print(f"{pos_label} | {n_tokens}")

SPACE | 9
PROPN | 3
NOUN | 22
VERB | 10
DET | 3
ADP | 8
NUM | 17
PUNCT | 15
SCONJ | 1
ADJ | 8
AUX | 4
SYM | 4
CCONJ | 4
ADV | 4
