In [1]:
import spacy

In [2]:
# Presumably a kernel smoke test: emits a fixed string and nothing else.
print("test")

test


In [3]:
# Load the small English pipeline once; every later cell reuses `nlp`.
nlp = spacy.load("en_core_web_sm")

In [4]:
# Run the pipeline on a two-sentence sample and list each token next to its
# coarse part-of-speech label (`pos_`).
doc = nlp("Elon flew to mars yesterday. He carried biryani masala with him")
for token in doc:
    print(f"{token.text} -> {token.pos_}")

Elon -> PROPN
flew -> VERB
to -> ADP
mars -> NOUN
yesterday -> NOUN
. -> PUNCT
He -> PRON
carried -> VERB
biryani -> ADJ
masala -> NOUN
with -> ADP
him -> PRON


In [5]:
# Same listing as before, extended with spaCy's human-readable description of
# each coarse POS label. Reuses the `doc` produced by the previous cell.
for token in doc:
    print(token.text, token.pos_, spacy.explain(token.pos_), sep=" -> ")

Elon -> PROPN -> proper noun
flew -> VERB -> verb
to -> ADP -> adposition
mars -> NOUN -> noun
yesterday -> NOUN -> noun
. -> PUNCT -> punctuation
He -> PRON -> pronoun
carried -> VERB -> verb
biryani -> ADJ -> adjective
masala -> NOUN -> noun
with -> ADP -> adposition
him -> PRON -> pronoun


In [6]:
# Show both tag levels per token: the coarse label (`pos_`) and the
# fine-grained label (`tag_`), each followed by spaCy's description of it.
doc = nlp("Wow! Dr. Strange made 256 million $ on the very first day of his Movi")
for token in doc:
    coarse, fine = token.pos_, token.tag_
    print(token.text, coarse, spacy.explain(coarse), fine, spacy.explain(fine), sep=" --> ")

Wow --> INTJ --> interjection --> UH --> interjection
! --> PUNCT --> punctuation --> . --> punctuation mark, sentence closer
Dr. --> PROPN --> proper noun --> NNP --> noun, proper singular
Strange --> PROPN --> proper noun --> NNP --> noun, proper singular
made --> VERB --> verb --> VBD --> verb, past tense
256 --> NUM --> numeral --> CD --> cardinal number
million --> NUM --> numeral --> CD --> cardinal number
$ --> SYM --> symbol --> $ --> symbol, currency
on --> ADP --> adposition --> IN --> conjunction, subordinating or preposition
the --> DET --> determiner --> DT --> determiner
very --> ADV --> adverb --> RB --> adverb
first --> ADJ --> adjective --> JJ --> adjective (English), other noun-modifier (Chinese)
day --> NOUN --> noun --> NN --> noun, singular or mass
of --> ADP --> adposition --> IN --> conjunction, subordinating or preposition
his --> PRON --> pronoun --> PRP$ --> pronoun, possessive
Movi --> PROPN --> proper noun --> NNP --> noun, proper singular


In [8]:
# Fine-grained tag of the verb in a present-tense sentence.
# (The stray bare `doc[1]` expression was removed: it was not the last
# expression of the cell, so it was evaluated and silently discarded.)
doc = nlp('He quits the job')
verb = doc[1]  # index 1 is the token right after "He"
print(verb.text, "-->", verb.tag_, "-->", spacy.explain(verb.tag_))

quits --> VBZ --> verb, 3rd person singular present


In [9]:
# Fine-grained tag of the verb when the sentence uses "quit" instead of "quits".
# (The stray bare `doc[1]` expression was removed: it was not the last
# expression of the cell, so it was evaluated and silently discarded.)
doc = nlp('He quit the job')
verb = doc[1]  # index 1 is the token right after "He"
print(verb.text, "-->", verb.tag_, "-->", spacy.explain(verb.tag_))

quit --> VBD --> verb, past tense


In [12]:
# Sample input: an excerpt of a Microsoft quarterly earnings announcement.
# The literal is kept verbatim (leading newline, bullet characters, padding
# spaces and typographic quotes included) because the tokenizer sees all of it.
earnings_text = """
 Microsoft Corp. today announced the following results for the quarter ended December 31, 2023, as compared to the corresponding period of last fiscal year:

·        Revenue was $62.0 billion and increased 18% (up 16% in constant currency)

·        Operating income was $27.0 billion and increased 33%, and increased 25% non-GAAP (up 23% in constant currency)

·        Net income was $21.9 billion and increased 33%, and increased 26% non-GAAP (up 23% in constant currency)

·        Diluted earnings per share was $2.93 and increased 33%, and increased 26% non-GAAP (up 23% in constant currency)

Microsoft completed the acquisition of Activision Blizzard, Inc. (“Activision”) on October 13, 2023. Financial results from the acquired business are reported in the More Personal Computing segment.

"We’ve moved from talking about AI to applying AI at scale," said Satya Nadella, chairman and chief executive officer of Microsoft. "By infusing AI across every layer of our tech stack, we’re winning new customers and helping drive new benefits and productivity gains across every etc. sector.”
"""

In [13]:
# Tag the earnings excerpt and print every token with its coarse POS label
# and spaCy's description of that label.
doc = nlp(earnings_text)
for token in doc:
    pos_label = token.pos_
    print(f"{token.text} --> {pos_label} --> {spacy.explain(pos_label)}")


  --> SPACE --> space
Microsoft --> PROPN --> proper noun
Corp. --> PROPN --> proper noun
today --> NOUN --> noun
announced --> VERB --> verb
the --> DET --> determiner
following --> VERB --> verb
results --> NOUN --> noun
for --> ADP --> adposition
the --> DET --> determiner
quarter --> NOUN --> noun
ended --> VERB --> verb
December --> PROPN --> proper noun
31 --> NUM --> numeral
, --> PUNCT --> punctuation
2023 --> NUM --> numeral
, --> PUNCT --> punctuation
as --> SCONJ --> subordinating conjunction
compared --> VERB --> verb
to --> ADP --> adposition
the --> DET --> determiner
corresponding --> ADJ --> adjective
period --> NOUN --> noun
of --> ADP --> adposition
last --> ADJ --> adjective
fiscal --> ADJ --> adjective
year --> NOUN --> noun
: --> PUNCT --> punctuation


 --> SPACE --> space
· --> PUNCT --> punctuation
        --> SPACE --> space
Revenue --> NOUN --> noun
was --> AUX --> auxiliary
$ --> SYM --> symbol
62.0 --> NUM --> numeral
billion --> NUM --> numeral
and --> CCO

In [17]:
# Coarse POS labels to drop: whitespace (SPACE), the catch-all "other"
# category (X) and punctuation (PUNCT). A set makes the membership test
# explicit and keeps the labels in one tunable place.
excluded_pos = {"SPACE", "X", "PUNCT"}

# Keep every token of `doc` whose coarse POS is not excluded, in document order.
filtered_tokens = [token for token in doc if token.pos_ not in excluded_pos]

# Preview only the first 20 survivors rather than dumping the whole list.
filtered_tokens[:20]

[Microsoft,
 Corp.,
 today,
 announced,
 the,
 following,
 results,
 for,
 the,
 quarter,
 ended,
 December,
 31,
 2023,
 as,
 compared,
 to,
 the,
 corresponding,
 period]

In [18]:
# Tally how many tokens carry each coarse POS tag. The resulting dict is keyed
# by integer attribute ids, not by label strings.
pos_attr = spacy.attrs.POS
count = doc.count_by(pos_attr)
count

{103: 12,
 96: 17,
 92: 50,
 100: 23,
 90: 8,
 85: 21,
 93: 22,
 97: 35,
 98: 1,
 84: 14,
 87: 8,
 99: 4,
 89: 10,
 86: 4,
 95: 3,
 101: 1}

In [19]:
# Resolve one of the integer keys from `count` back to its label via the
# vocabulary; 103 is the first key shown in the tally above.
sample_pos_id = 103
doc.vocab[sample_pos_id].text

'SPACE'

In [21]:
# Print the tally with readable labels: each integer POS id is translated to
# its string form through the vocabulary before being shown with its count.
for pos_id, n_tokens in count.items():
    print(f"{doc.vocab[pos_id].text} --> {n_tokens}")

SPACE --> 12
PROPN --> 17
NOUN --> 50
VERB --> 23
DET --> 8
ADP --> 21
NUM --> 22
PUNCT --> 35
SCONJ --> 1
ADJ --> 14
AUX --> 8
SYM --> 4
CCONJ --> 10
ADV --> 4
PRON --> 3
X --> 1
