In [1]:
import nltk 
from nltk.stem import PorterStemmer

import spacy
nlp = spacy.load("en_core_web_sm")

In [105]:
lst_words = ['running', 'painting', 'walking', 'dressing', 'likely', 'children', 'whom', 'good', 'ate', 'fishing']


In [106]:
stemmer = PorterStemmer()
for token in lst_words:
    print(token, " | ", stemmer.stem(token))

running  |  run
painting  |  paint
walking  |  walk
dressing  |  dress
likely  |  like
children  |  children
whom  |  whom
good  |  good
ate  |  ate
fishing  |  fish


In [104]:
doc = nlp('running, painting, walking, dressing, likely, children, whom, good, ate, fishing')
for ent in doc:
    print(ent, " | ", ent.lemma_)

running  |  run
,  |  ,
painting  |  painting
,  |  ,
walking  |  walk
,  |  ,
dressing  |  dressing
,  |  ,
likely  |  likely
,  |  ,
children  |  child
,  |  ,
whom  |  whom
,  |  ,
good  |  good
,  |  ,
ate  |  ate
,  |  ,
fishing  |  fish


In [None]:
for ent in doc:
    print(ent, " | ", ent.s)

In [7]:
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")

In [11]:
nlp.pipeline
for ent in doc.ents:
    print(ent, " | ", ent.label_)

Tesla Inc  |  ORG
$45 billion  |  MONEY


In [88]:
from spacy import displacy
displacy.render(doc , style='ent')

In [13]:
nlp.pipe_labels['ner']

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In [14]:
doc = nlp("Michael Bloomberg founded Bloomberg in 1982")

In [18]:
for ent in doc.ents:
    print(ent, " | ", ent.label_, " | ", spacy.explain(ent.label_))

Michael Bloomberg  |  PERSON  |  People, including fictional
Bloomberg  |  PERSON  |  People, including fictional
1982  |  DATE  |  Absolute or relative dates or periods


In [21]:
ent.start_char


39

In [22]:
doc = nlp("Tesla is going to acquire Twitter for $45 billion")


In [24]:
for ent in doc.ents:
    print(ent, " | ", ent.label_)

Tesla  |  ORG
Twitter  |  PRODUCT
$45 billion  |  MONEY


In [25]:
from spacy.tokens import Span

In [27]:
s = doc[2:5]

In [29]:
s1 = Span(doc, 0, 1, label="ORG")
s2 = Span(doc, 5, 6, label="ORG")

In [31]:
doc.set_ents([s1, s2], default="unmodified")

In [32]:
for ent in doc.ents:
    print(ent, " | ", ent.label_)

Tesla  |  ORG
Twitter  |  ORG
$45 billion  |  MONEY


In [33]:
from spacy import displacy
displacy.render(doc, style='ent')

In [34]:
import spacy

In [35]:
nlp = spacy.load("en_core_web_sm")

In [36]:
doc = nlp("Elon flew to mars yesterday. He carried biryani masala with him")

In [39]:
for token in doc:
    print(token, " | ", token.pos_, spacy.explain(token.pos_), " | ", token.tag_, spacy.explain(token.tag_))

Elon  |  PROPN proper noun  |  NNP noun, proper singular
flew  |  VERB verb  |  VBD verb, past tense
to  |  ADP adposition  |  IN conjunction, subordinating or preposition
mars  |  NOUN noun  |  NNS noun, plural
yesterday  |  NOUN noun  |  NN noun, singular or mass
.  |  PUNCT punctuation  |  . punctuation mark, sentence closer
He  |  PRON pronoun  |  PRP pronoun, personal
carried  |  VERB verb  |  VBD verb, past tense
biryani  |  ADJ adjective  |  JJ adjective (English), other noun-modifier (Chinese)
masala  |  NOUN noun  |  NN noun, singular or mass
with  |  ADP adposition  |  IN conjunction, subordinating or preposition
him  |  PRON pronoun  |  PRP pronoun, personal


In [40]:
from sklearn.feature_extraction.text import CountVectorizer

In [42]:
v = CountVectorizer()
v.fit(["Thor Hathodawala is looking for a job"])

In [43]:
v.vocabulary_

{'thor': 5, 'hathodawala': 1, 'is': 2, 'looking': 4, 'for': 0, 'job': 3}

In [46]:
v =CountVectorizer(ngram_range=(2, 2))
v.fit(["Thor Hathodawala is looking for a job"])

In [47]:
v.vocabulary_

{'thor hathodawala': 4,
 'hathodawala is': 1,
 'is looking': 2,
 'looking for': 3,
 'for job': 0}

In [48]:
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

In [50]:
v = CountVectorizer()
v.fit_transform(documents)

<4x9 sparse matrix of type '<class 'numpy.int64'>'
	with 21 stored elements in Compressed Sparse Row format>

In [51]:
v.get_feature_names_out

<bound method CountVectorizer.get_feature_names_out of CountVectorizer()>

In [53]:
v = CountVectorizer()
x = v.fit_transform(documents)
v.get_feature_names_out

<bound method CountVectorizer.get_feature_names_out of CountVectorizer()>

In [54]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample documents
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Create a CountVectorizer instance
vectorizer = CountVectorizer()



In [55]:
# Fit and transform the documents into a bag of words
X = vectorizer.fit_transform(documents)

# Get the feature names (words)
feature_names = vectorizer.get_feature_names_out()

# Convert the bag of words to a dense matrix
dense_matrix = X.toarray()

# Display the result


In [56]:
print("Feature names (words):", feature_names)


Feature names (words): ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']


In [57]:
print("Bag of words (document-term matrix):")
print(dense_matrix) 

Bag of words (document-term matrix):
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


In [58]:
text = """Kiran want to know the famous foods in each state of India. So, he opened Google and search for this question. Google showed that
in Delhi it is Chaat, in Gujarat it is Dal Dhokli, in Tamilnadu it is Pongal, in Andhrapradesh it is Biryani, in Assam it is Papaya Khar,
in Bihar it is Litti Chowkha and so on for all other states"""


In [59]:
doc = nlp(text)

In [60]:
all_gpe_names = []

for ent in doc.ents:
    if ent.label_ == 'GPE':
        all_gpe_names.append(ent)
all_gpe_names

[India, Delhi, Gujarat, Tamilnadu, Pongal, Andhrapradesh, Assam, Bihar]

In [64]:
text = """Kiran want to know the famous foods in each state of India. So, he opened Google and search for this question. Google showed that
in Delhi it is Chaat, in Gujarat it is Dal Dhokli, in Tamilnadu it is Pongal, in Andhrapradesh it is Biryani, in Assam it is Papaya Khar,
in Bihar it is Litti Chowkha and so on for all other states"""

doc = nlp(text)
all_birth_dates = []

for ent in doc.ents:
    if ent.label_ == 'DATE':
        all_birth_dates.append(ent)
all_birth_dates

[]

In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [66]:
corpus = [
    "Thor eating pizza, Loki is eating pizza, Ironman ate pizza already",
    "Apple is announcing new iphone tomorrow",
    "Tesla is announcing new model-3 tomorrow",
    "Google is announcing new pixel-6 tomorrow",
    "Microsoft is announcing new surface tomorrow",
    "Amazon is announcing new eco-dot tomorrow",
    "I am eating biryani and you are eating grapes"
]

In [67]:
v = TfidfVectorizer()
v.fit(corpus)

In [68]:
v.vocabulary_

{'thor': 25,
 'eating': 10,
 'pizza': 22,
 'loki': 17,
 'is': 16,
 'ironman': 15,
 'ate': 7,
 'already': 0,
 'apple': 5,
 'announcing': 4,
 'new': 20,
 'iphone': 14,
 'tomorrow': 26,
 'tesla': 24,
 'model': 19,
 'google': 12,
 'pixel': 21,
 'microsoft': 18,
 'surface': 23,
 'amazon': 2,
 'eco': 11,
 'dot': 9,
 'am': 1,
 'biryani': 8,
 'and': 3,
 'you': 27,
 'are': 6,
 'grapes': 13}

In [72]:
s = str(corpus)
doc = nlp(s)

In [73]:
company = []
for ent in doc.ents:
    if ent.label_ == 'ORG':
        company.append(ent)
    

In [74]:
company

[Apple, Google, Microsoft, Amazon]

In [75]:
import spacy

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Example text
text = "Apple Inc. is a technology company based in Cupertino, California. Steve Jobs was one of its co-founders."

# Process the text with spaCy
doc = nlp(text)

# Extract named entities
for ent in doc.ents:
    print(f"Entity: {ent.text}, Type: {ent.label_}")


Entity: Apple Inc., Type: ORG
Entity: Cupertino, Type: GPE
Entity: California, Type: GPE
Entity: Steve Jobs, Type: PERSON


In [77]:
for ent in doc:
    print(ent.lemma_)

Apple
Inc.
be
a
technology
company
base
in
Cupertino
,
California
.
Steve
Jobs
be
one
of
its
co
-
founder
.


In [78]:
text = ("Thor eating pizza, Loki is eating pizza, Ironman ate pizza already, Apple is announcing new iphone tomorrow,Tesla is announcing new model-3 tomorrow,Google is announcing new pixel-6 tomorrow,Microsoft is announcing new surface tomorrow,Amazon is announcing new eco-dot tomorrow,I am eating biryani and you are eating grapes")

In [79]:
doc = nlp(text)

In [80]:
doc

Thor eating pizza, Loki is eating pizza, Ironman ate pizza already, Apple is announcing new iphone tomorrow,Tesla is announcing new model-3 tomorrow,Google is announcing new pixel-6 tomorrow,Microsoft is announcing new surface tomorrow,Amazon is announcing new eco-dot tomorrow,I am eating biryani and you are eating grapes

In [87]:
for ent in doc.ents:
    if ent.label_ == 'PERSON':
        print(ent)

Loki
Ironman
eating biryani


In [83]:
nlp.pipe_labels['ner']

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In [91]:
doc = nlp("Elon flew to mars yesterday. He carried biryani masala with him")
for ent in doc:
    print(ent, " | ", ent.pos_, " | ", spacy.explain(ent.pos_), " | ", ent.tag_, " | ", spacy.explain(ent.tag_))

Elon  |  PROPN  |  proper noun  |  NNP  |  noun, proper singular
flew  |  VERB  |  verb  |  VBD  |  verb, past tense
to  |  ADP  |  adposition  |  IN  |  conjunction, subordinating or preposition
mars  |  NOUN  |  noun  |  NNS  |  noun, plural
yesterday  |  NOUN  |  noun  |  NN  |  noun, singular or mass
.  |  PUNCT  |  punctuation  |  .  |  punctuation mark, sentence closer
He  |  PRON  |  pronoun  |  PRP  |  pronoun, personal
carried  |  VERB  |  verb  |  VBD  |  verb, past tense
biryani  |  ADJ  |  adjective  |  JJ  |  adjective (English), other noun-modifier (Chinese)
masala  |  NOUN  |  noun  |  NN  |  noun, singular or mass
with  |  ADP  |  adposition  |  IN  |  conjunction, subordinating or preposition
him  |  PRON  |  pronoun  |  PRP  |  pronoun, personal


In [92]:
earnings_text="""Microsoft Corp. today announced the following results for the quarter ended December 31, 2021, as compared to the corresponding period of last fiscal year:

·         Revenue was $51.7 billion and increased 20%
·         Operating income was $22.2 billion and increased 24%
·         Net income was $18.8 billion and increased 21%
·         Diluted earnings per share was $2.48 and increased 22%
“Digital technology is the most malleable resource at the world’s disposal to overcome constraints and reimagine everyday work and life,” said Satya Nadella, chairman and chief executive officer of Microsoft. “As tech as a percentage of global GDP continues to increase, we are innovating and investing across diverse and growing markets, with a common underlying technology stack and an operating model that reinforces a common strategy, culture, and sense of purpose.”
“Solid commercial execution, represented by strong bookings growth driven by long-term Azure commitments, increased Microsoft Cloud revenue to $22.1 billion, up 32% year over year” said Amy Hood, executive vice president and chief financial officer of Microsoft."""

In [93]:
doc = nlp(earnings_text)
doc

Microsoft Corp. today announced the following results for the quarter ended December 31, 2021, as compared to the corresponding period of last fiscal year:

·         Revenue was $51.7 billion and increased 20%
·         Operating income was $22.2 billion and increased 24%
·         Net income was $18.8 billion and increased 21%
·         Diluted earnings per share was $2.48 and increased 22%
“Digital technology is the most malleable resource at the world’s disposal to overcome constraints and reimagine everyday work and life,” said Satya Nadella, chairman and chief executive officer of Microsoft. “As tech as a percentage of global GDP continues to increase, we are innovating and investing across diverse and growing markets, with a common underlying technology stack and an operating model that reinforces a common strategy, culture, and sense of purpose.”
“Solid commercial execution, represented by strong bookings growth driven by long-term Azure commitments, increased Microsoft Cloud r

In [98]:
for ent in doc.ents:
    if ent.label_ == 'WORK_OF_ART':
        print(ent)