In [9]:
import spacy

In [10]:
# Load the "en_core_web_sm" English pipeline by name; every later cell reuses this `nlp` object.
nlp = spacy.load("en_core_web_sm")

In [11]:
# Names of the processing components in the loaded English pipeline, in pipeline order.
# The POS labels printed in the cells below come from running text through these components.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [12]:
# Tag a two-sentence example and print each token with its coarse POS label and that label's meaning.
doc = nlp("Mahedi moved to Dhaka, yesterday. But he takes all of his tools with him")
for tok in doc:
    pos_label = tok.pos_
    print(tok, "|", pos_label, "|", spacy.explain(pos_label))

Mahedi | PROPN | proper noun
moved | VERB | verb
to | ADP | adposition
Dhaka | PROPN | proper noun
, | PUNCT | punctuation
yesterday | NOUN | noun
. | PUNCT | punctuation
But | CCONJ | coordinating conjunction
he | PRON | pronoun
takes | VERB | verb
all | PRON | pronoun
of | ADP | adposition
his | PRON | pronoun
tools | NOUN | noun
with | ADP | adposition
him | PRON | pronoun


In [13]:
# Show the coarse POS label (pos_) next to the fine-grained tag (tag_) for every token,
# each followed by its human-readable explanation.
text = " I spent 2 million $ for last 20 years which is really a huge amount of money!"
doc = nlp(text)
for tok in doc:
    pos_label, fine_tag = tok.pos_, tok.tag_
    print(tok, "|", pos_label, "| ", spacy.explain(pos_label), "|", fine_tag, spacy.explain(fine_tag))

  | SPACE |  space | _SP whitespace
I | PRON |  pronoun | PRP pronoun, personal
spent | VERB |  verb | VBD verb, past tense
2 | NUM |  numeral | CD cardinal number
million | NUM |  numeral | CD cardinal number
$ | SYM |  symbol | $ symbol, currency
for | ADP |  adposition | IN conjunction, subordinating or preposition
last | ADJ |  adjective | JJ adjective (English), other noun-modifier (Chinese)
20 | NUM |  numeral | CD cardinal number
years | NOUN |  noun | NNS noun, plural
which | PRON |  pronoun | WDT wh-determiner
is | AUX |  auxiliary | VBZ verb, 3rd person singular present
really | ADV |  adverb | RB adverb
a | DET |  determiner | DT determiner
huge | ADJ |  adjective | JJ adjective (English), other noun-modifier (Chinese)
amount | NOUN |  noun | NN noun, singular or mass
of | ADP |  adposition | IN conjunction, subordinating or preposition
money | NOUN |  noun | NN noun, singular or mass
! | PUNCT |  punctuation | . punctuation mark, sentence closer


In [16]:
# Inspect only the first token of a short sentence: its text, coarse POS, fine-grained tag and the tag's meaning.
text = "He is quiet laudable"
doc = nlp(text)
first_token = doc[0]
print(first_token.text, "|", first_token.pos_, "|", first_token.tag_, spacy.explain(first_token.tag_))

He | PRON | PRP pronoun, personal


In [21]:
report =""" Microsoft Corp. today announced the following results for the quarter ended December 31, 2022, as compared to the corresponding period of last fiscal year:

·        Revenue was $52.7 billion and increased 2%  

·        Operating income was $20.4 billion GAAP and $21.6 billion non-GAAP, and decreased 8% and 3%, respectively

·        Net income was $16.4 billion GAAP and $17.4 billion non-GAAP, and decreased 12% and 7%, respectively

·        Diluted earnings per share was $2.20 GAAP and $2.32 non-GAAP, and decreased 11% and 6%, respectively

“The next major wave of computing is being born, as the Microsoft Cloud turns the world’s most advanced AI models into a new computing platform,” said Satya Nadella, chairman and chief executive officer of Microsoft. “We are committed to helping our customers use our platforms and tools to do more with less today and innovate for the future in the new era of AI.”

 

“We are focused on operational excellence as we continue to invest to drive growth. Microsoft Cloud revenue was $27.1 billion, up 22% (up 29% in constant currency) year-over-year as our commercial offerings continue to drive value for our customers,” said Amy Hood, executive vice president and chief financial officer of Microsoft.

The following table reconciles our financial results reported in accordance with generally accepted accounting principles (GAAP) to non-GAAP financial results. Additional information regarding our non-GAAP definition is provided below. All growth comparisons relate to the corresponding period in the last fiscal year."""
# Run the pipeline over the whole report and print every token with its coarse POS label and meaning.
doc = nlp(report)
for tok in doc:
    pos_label = tok.pos_
    print(tok, "|", pos_label, "| ", spacy.explain(pos_label))

  | SPACE |  space
Microsoft | PROPN |  proper noun
Corp. | PROPN |  proper noun
today | NOUN |  noun
announced | VERB |  verb
the | DET |  determiner
following | VERB |  verb
results | NOUN |  noun
for | ADP |  adposition
the | DET |  determiner
quarter | NOUN |  noun
ended | VERB |  verb
December | PROPN |  proper noun
31 | NUM |  numeral
, | PUNCT |  punctuation
2022 | NUM |  numeral
, | PUNCT |  punctuation
as | SCONJ |  subordinating conjunction
compared | VERB |  verb
to | ADP |  adposition
the | DET |  determiner
corresponding | ADJ |  adjective
period | NOUN |  noun
of | ADP |  adposition
last | ADJ |  adjective
fiscal | ADJ |  adjective
year | NOUN |  noun
: | PUNCT |  punctuation


 | SPACE |  space
· | PUNCT |  punctuation
        | SPACE |  space
Revenue | NOUN |  noun
was | AUX |  auxiliary
$ | SYM |  symbol
52.7 | NUM |  numeral
billion | NUM |  numeral
and | CCONJ |  coordinating conjunction
increased | VERB |  verb
2 | NUM |  numeral
% | NOUN |  noun
 

 | SPACE |  spac

Remove unnecessary tokens (whitespace, punctuation, and other noise) from the text

In [22]:
# Keep every token of the report whose coarse POS is not X, SPACE or PUNCT.
doc = nlp(report)
excluded_pos = ["X", "SPACE", "PUNCT"]
filtered_data = [tok for tok in doc if tok.pos_ not in excluded_pos]

In [23]:
# Preview only the first 50 kept tokens instead of displaying the whole list.
filtered_data[:50]

[Microsoft,
 Corp.,
 today,
 announced,
 the,
 following,
 results,
 for,
 the,
 quarter,
 ended,
 December,
 31,
 2022,
 as,
 compared,
 to,
 the,
 corresponding,
 period,
 of,
 last,
 fiscal,
 year,
 Revenue,
 was,
 $,
 52.7,
 billion,
 and,
 increased,
 2,
 %,
 Operating,
 income,
 was,
 $,
 20.4,
 billion,
 GAAP,
 and,
 $,
 21.6,
 billion,
 non,
 -,
 GAAP,
 and,
 decreased,
 8]

Count each part of speech separately

In [25]:
# Tally the tokens of `doc` by their POS attribute. The result is a dict whose keys are
# integer POS IDs (not label strings) and whose values are the number of tokens with that POS.
count = doc.count_by(spacy.attrs.POS)
count


{103: 12,
 96: 15,
 92: 73,
 100: 31,
 90: 13,
 85: 22,
 93: 25,
 97: 38,
 98: 4,
 84: 29,
 87: 10,
 99: 8,
 89: 14,
 86: 8,
 94: 5,
 95: 9}

In [26]:
# 103 is one of the integer keys in `count`; looking it up in the vocab recovers its string label.
doc.vocab[103].text

'SPACE'

In [31]:
# Translate every integer POS ID in `count` back to its label and print it beside its frequency.
for pos_id, n_tokens in count.items():
    pos_label = doc.vocab[pos_id].text
    print(pos_label, "|", n_tokens)

SPACE | 12
PROPN | 15
NOUN | 73
VERB | 31
DET | 13
ADP | 22
NUM | 25
PUNCT | 38
SCONJ | 4
ADJ | 29
AUX | 10
SYM | 8
CCONJ | 14
ADV | 8
PART | 5
PRON | 9


Extract nouns and adjectives from the text

In [39]:
# Split the report's tokens into two lists by coarse POS: nouns and adjectives.
# All other tokens are ignored.
doc = nlp(report)

Noun_tokens = [tok for tok in doc if tok.pos_ == "NOUN"]
adjective_tokens = [tok for tok in doc if tok.pos_ == "ADJ"]

In [40]:
# Preview the first 20 tokens collected as nouns.
Noun_tokens[:20]


[today,
 results,
 quarter,
 period,
 year,
 Revenue,
 %,
 income,
 GAAP,
 non,
 -,
 GAAP,
 %,
 %,
 income,
 GAAP,
 non,
 -,
 GAAP,
 %]

In [41]:
# Preview the first 10 tokens collected as adjectives.
adjective_tokens[:10]

[corresponding,
 last,
 fiscal,
 Net,
 next,
 major,
 advanced,
 new,
 chief,
 executive]