In [1]:
import spacy.cli
import spacy.util

# Fetch the small English pipeline only when it is not installed yet, so that
# "Restart & Run All" does not trigger a network download on every run.
if not spacy.util.is_package('en_core_web_sm'):
  spacy.cli.download('en_core_web_sm')



[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
import spacy
nlp=spacy.load('en_core_web_sm')

In [None]:
type(nlp)

spacy.lang.en.English

In [3]:
doc=nlp('Apple is looking at buying U.K. startup for $6 millions.')
# One row per token: surface text, coarse part-of-speech tag, dependency label.
for tok in doc:
  print(f'{tok.text} {tok.pos_} {tok.dep_}')

Apple PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.K. PROPN dobj
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM nummod
millions NOUN pobj
. PUNCT punct


In [5]:
spacy.explain('dobj')

'direct object'

In [6]:
spacy.explain('nsubj')

'nominal subject'

In [7]:
doc

Apple is looking at buying U.K. startup for $6 millions.

In [8]:
type(doc)

spacy.tokens.doc.Doc

In [9]:
doc[6].pos_

'NOUN'

In [10]:
doc[6].dep_

'dobj'

In [11]:
spacy.explain('dobj')

'direct object'

In [12]:
s='i am sending mail to the gopikanthtirumani@gmail.com'
d=nlp(s)
# Same table as before (text, POS tag, dependency label) for a sentence
# that ends in an e-mail address.
for tok in d:
  print(f'{tok.text} {tok.pos_} {tok.dep_}')

i PRON nsubj
am AUX aux
sending VERB ROOT
mail NOUN dobj
to ADP dative
the DET det
gopikanthtirumani@gmail.com NOUN pobj


In [13]:
# Named entities in the spaCy module are words or phrases that represent specific types of objects, such as people, places, organizations, products, and dates.

# SpaCy uses a statistical model to identify and classify named entities in text. The model is trained on a large corpus of text data and uses various features such as part-of-speech tags, dependency parsing, and word embeddings to identify entities.

# Once identified, spaCy assigns a label to each entity based on its type, such as PERSON for people, GPE (Geopolitical Entity) for countries or cities, ORG for organizations, and DATE for dates.

# Named entity recognition (NER) is a useful tool for various natural language processing tasks, such as information extraction, sentiment analysis, and question answering.

In [14]:
doc=nlp('Apple is looking at buying a U.K startup for $6 billions')
# List every named entity as "text - LABEL - explanation of the label".
for entity in doc.ents:
  label_help=str(spacy.explain(entity.label_))
  print(" - ".join([entity.text,entity.label_,label_help]))

Apple - ORG - Companies, agencies, institutions, etc.
U.K - ORG - Companies, agencies, institutions, etc.
$6 billions - MONEY - Monetary values, including unit


In [15]:
from spacy import displacy
displacy.render(doc,style='ent',jupyter=True)

In [16]:
doc1=nlp("Apple is planning to release a new iPhone next month. Elon Musk, the CEO of Tesla, announced that they will build a new factory in Germany. The United Nations held a summit to discuss climate change and sustainable development. Shakespeare was born in Stratford-upon-Avon on April 23, 1564. Harry Potter and the Philosopher's Stone is a novel written by J.K. Rowling.")
# List every named entity of the longer text as "text - LABEL - explanation".
for entity in doc1.ents:
  label_help=str(spacy.explain(entity.label_))
  print(" - ".join([entity.text,entity.label_,label_help]))

Apple - ORG - Companies, agencies, institutions, etc.
iPhone - ORG - Companies, agencies, institutions, etc.
next month - DATE - Absolute or relative dates or periods
Elon Musk - PERSON - People, including fictional
Tesla - ORG - Companies, agencies, institutions, etc.
Germany - GPE - Countries, cities, states
The United Nations - ORG - Companies, agencies, institutions, etc.
Shakespeare - PERSON - People, including fictional
Stratford - ORG - Companies, agencies, institutions, etc.
April 23, 1564 - DATE - Absolute or relative dates or periods
Harry Potter - PERSON - People, including fictional
the Philosopher's Stone - ORG - Companies, agencies, institutions, etc.
J.K. Rowling - PERSON - People, including fictional


In [17]:
displacy.render(doc1,style='ent',jupyter=True)

In [18]:
doc=nlp('Apple is looking at buying a U.K startup for $6 billions')
# Re-create the short example document and list its named entities as
# "text - LABEL - explanation of the label".
for entity in doc.ents:
  label_help=str(spacy.explain(entity.label_))
  print(" - ".join([entity.text,entity.label_,label_help]))

Apple - ORG - Companies, agencies, institutions, etc.
U.K - ORG - Companies, agencies, institutions, etc.
$6 billions - MONEY - Monetary values, including unit


In [19]:
# what are noun chunks

# Noun chunks are phrases in a sentence that consist of a noun and any words that modify it, such as adjectives or determiners. They are a useful way to identify and extract the key information in a sentence, and can be thought of as the "building blocks" of natural language.

# In the spaCy module, noun chunks can be identified using the noun_chunks attribute of a Doc object. This attribute returns a generator that yields Span objects representing the noun chunks in the document.

# For example, consider the sentence "The big brown dog chased the small white cat." The noun chunks in this sentence are "The big brown dog" and "the small white cat". These noun chunks can be extracted using the following code in spaCy:


# import spacy

# nlp = spacy.load("en_core_web_sm")
# doc = nlp("The big brown dog chased the small white cat.")
# for chunk in doc.noun_chunks:
#     print(chunk.text)


# This code will output:

# The big brown dog
# the small white cat



# Noun chunks can be useful for various natural language processing tasks, such as information extraction, summarization, and sentiment analysis.





In [20]:
doc=nlp('the big brown dog chased the small white cat.')
for chunk in doc.noun_chunks:
  print(chunk.text)

the big brown dog
the small white cat


In [21]:
#EXAMPLES


# The tall, dark-haired man in the suit walked into the room.
# Noun chunks: "The tall, dark-haired man", "the suit", "the room".

# The delicious, warm chocolate chip cookies were waiting on the kitchen counter.
# Noun chunks: "The delicious, warm chocolate chip cookies", "the kitchen counter".

# My sister's new, expensive car was stolen from the parking lot.
# Noun chunks: "My sister's new, expensive car", "the parking lot".

# The old, wooden chest in the attic was filled with treasure.
# Noun chunks: "The old, wooden chest", "the attic", "treasure".

# The large, intimidating dog barked loudly at the mailman.
# Noun chunks: "The large, intimidating dog", "the mailman".


In [22]:
import nltk

In [23]:
 # Two common types of stemmers:
 # Porter stemmer
 # Snowball stemmer

In [24]:
from nltk.stem.porter import PorterStemmer

In [25]:
p_stem=PorterStemmer()

In [26]:
words=['run','runner','running','runs','easily','fairly']
# Show each word next to the stem produced by the Porter stemmer.
for term in words:
  print(''.join([term,'----------------->',p_stem.stem(term)]))

run----------------->run
runner----------------->runner
running----------------->run
runs----------------->run
easily----------------->easili
fairly----------------->fairli


In [27]:
# The disadvantage of the Porter stemmer is that it often produces stems that are not dictionary words (e.g. "easili", "fairli").

In [28]:
# To overcome this we use the Snowball stemmer. It offers a slight improvement over the original Porter stemmer, both in logic and speed.

In [29]:
from nltk.stem.snowball import SnowballStemmer

In [30]:
s_stemmer=SnowballStemmer(language='english')

In [31]:
words=['run','runner','running','runs','easily','fairly']
# Show each word next to the stem produced by the Snowball stemmer.
for term in words:
  print(''.join([term,'-------------------->',s_stemmer.stem(term)]))

run-------------------->run
runner-------------------->runner
running-------------------->run
runs-------------------->run
easily-------------------->easili
fairly-------------------->fair


In [32]:
word=['consolingly']
# Run both stemmers on the same adverb so their outputs can be compared.
sample=word[0]
for prefix,stemmer in (('porter stemmer ------>',p_stem),('snowball stemmer ------->',s_stemmer)):
  print(prefix+stemmer.stem(sample))

porter stemmer ------>consolingli
snowball stemmer ------->consol


In [33]:
s='I am doing well. what about you guys!!'
# Stem every whitespace-separated chunk with the Porter stemmer; punctuation
# stays attached to its word because str.split() does not tokenize it.
for chunk in s.split():
  print(''.join((chunk,'-------->',p_stem.stem(chunk))))

I-------->i
am-------->am
doing-------->do
well.-------->well.
what-------->what
about-------->about
you-------->you
guys!!-------->guys!!


In [34]:
# Same whitespace-split chunks as in the previous cell, now run through the Snowball stemmer.
for chunk in s.split():
  print(''.join((chunk,'---------->',s_stemmer.stem(chunk))))

I---------->i
am---------->am
doing---------->do
well.---------->well.
what---------->what
about---------->about
you---------->you
guys!!---------->guys!!


In [35]:
#Lemmatization

In [36]:
# what is lemmatization
# Lemmatization is the process of reducing a word to its base or root form, which is known as the lemma. The lemma is the canonical form of a word, and it represents the word's dictionary form. For example, the lemma of the word "running" is "run", the lemma of "am, is, are" is "be".

# Lemmatization is an important technique in natural language processing, and it is used to standardize words so that they can be analyzed more easily. It helps to reduce the number of forms a word can take, which is important in tasks such as text classification, information retrieval, and machine translation.

# Lemmatization is often used in combination with part-of-speech (POS) tagging, which is the process of identifying the part of speech of each word in a sentence. This is because the lemma of a word can depend on its part of speech. For example, the lemma of the word "better" is "good" when it is used as an adjective (e.g., "a better book"), but it is "well" when it is used as an adverb (e.g., "he speaks English better").

In [37]:
# what is the difference between lemmatization and stemming
# Both lemmatization and stemming are techniques used to reduce words to their base or root form, but they differ in the way they achieve this goal.

# Stemming is a process of removing prefixes and suffixes from words to obtain their root form, which is called a stem. The stem may not necessarily be a word by itself, but it can still convey the general meaning of the original word. For example, the stem of the word "jumping" is "jump", and the stem of the word "running" is "run". Stemming is a simpler and faster process than lemmatization, but it may not always produce the correct root form.

# Lemmatization, on the other hand, is a more sophisticated process that involves analyzing the structure of words based on their context and morphology to determine their root form, which is called a lemma. Unlike stemming, lemmatization produces an actual word that is present in a language's dictionary. For example, the lemma of the word "jumping" is "jump", and the lemma of the word "running" is also "run". Lemmatization is more accurate than stemming, but it is also more computationally intensive and may be slower.

# In summary, stemming is a simpler and faster method of reducing words to their root form, while lemmatization is a more accurate but computationally more expensive process that produces an actual word. The choice between the two techniques depends on the specific task and the level of accuracy required.





In [38]:
doc=nlp('I am a runner I am running day and also and i ran today, i will also run tommorow')
# Tab-separated row per token: surface text, coarse POS tag, lemma string.
for tok in doc:
  print('\t'.join((tok.text,tok.pos_,tok.lemma_)))

I	PRON	I
am	AUX	be
a	DET	a
runner	NOUN	runner
I	PRON	I
am	AUX	be
running	VERB	run
day	NOUN	day
and	CCONJ	and
also	ADV	also
and	CCONJ	and
i	PRON	I
ran	VERB	run
today	NOUN	today
,	PUNCT	,
i	PRON	I
will	AUX	will
also	ADV	also
run	VERB	run
tommorow	NOUN	tommorow


In [39]:
def lemma_show(text):
  """Print an aligned lemma table, one row per token.

  Args:
    text: iterable of token-like objects exposing ``text``, ``pos_``,
      ``lemma`` and ``lemma_`` (the notebook passes a spaCy ``Doc``).

  Each row holds four space-separated columns: the token text padded to
  12 characters, the POS tag padded to 8, the ``lemma`` value left-aligned
  in 22 characters, and the lemma string. Nothing is returned.
  """
  for tok in text:
    columns=(
      format(tok.text,'12'),
      format(tok.pos_,'8'),
      format(tok.lemma,'<22'),
      format(tok.lemma_,''),
    )
    print(' '.join(columns))

In [40]:
doc=nlp('I saw nine mice today')
lemma_show(doc)

I            PRON     4690420944186131903    I
saw          VERB     11925638236994514241   see
nine         NUM      17718451046594752029   nine
mice         NOUN     1384165645700560590    mouse
today        NOUN     11042482332948150395   today


In [41]:
# examples

# Word: "dogs"
# Lemma: "dog"

# Word: "ran"
# Lemma: "run"

# Word: "am"
# Lemma: "be"

# Word: "better"
# Lemma: "good"

# Word: "went"
# Lemma: "go"

# Word: "went"
# Lemma: "go"

# Word: "wolves"
# Lemma: "wolf"

# Word: "feet"
# Lemma: "foot"

# Word: "geese"
# Lemma: "goose"

# Word: "playing"
# Lemma: "play"

In [42]:
doc=nlp("That's an enormous automobile")
lemma_show(doc)

That         PRON     4380130941430378203    that
's           AUX      10382539506755952630   be
an           DET      15099054000809333061   an
enormous     ADJ      17917224542039855524   enormous
automobile   NOUN     7211811266693931283    automobile


In [43]:
# stop words
# words like "a" and "the" appear so frequently that they don't require tagging as thoroughly as nouns, verbs and modifiers. We call these stop words, and they can be filtered from the text to be processed. spaCy holds a built-in list of English stop words (326 in the version used in this notebook).

In [44]:
# what are stop words

# Stop words are common words that are often removed from texts during preprocessing, as they are considered to be uninformative or irrelevant for the analysis or classification task at hand. Stop words are typically the most common words in a language, such as "the", "and", "of", "to", "in", "a", "an", "is", "are", "for", "that", "with", "on", "at", "by", "from", "as", "it", "its", "be", "been", "was", "were", "will", "would", "have", "has", "had", "can", "could", "should", "shall", "may", "might", "must", etc.

# Removing stop words can help to improve the efficiency of text processing by reducing the number of words that need to be analyzed, and can also improve the accuracy of some text analysis tasks by removing irrelevant information. However, in some cases, stop words may be important for the analysis, especially for tasks such as sentiment analysis, where the meaning of a sentence can be heavily influenced by the presence or absence of certain stop words.

# It is worth noting that the list of stop words may vary depending on the specific task or application, as well as the language or dialect being analyzed. Therefore, it is common to create custom lists of stop words for a particular project or use case.

In [45]:
print(nlp.Defaults.stop_words)

{'used', 'can', 'meanwhile', 'someone', 'one', 'any', 'show', 'formerly', 'onto', 'over', 'next', 'it', 'their', 'front', 'hereby', 'whereafter', 'thereupon', 'yours', 'itself', 'noone', 'everything', 'amount', 'upon', 'often', 'a', 'being', 'get', 'give', 'whereupon', 'him', 'own', 'whence', "'ve", "'d", 'are', 'she', 'these', '’ll', 'hereupon', 'them', 'whoever', 'though', "'re", 'least', 'nowhere', 'seem', 'mine', 'that', 'anyway', 'were', 'your', 'somehow', 'toward', 'sixty', 'amongst', 'so', 'had', 'keep', 'thereafter', 'beyond', 'within', 'when', '‘ll', 'below', 'by', 'he', 'after', 'is', 'call', 'top', 'six', '’d', 'fifteen', 'becoming', 'latterly', 'afterwards', 'anyone', 'been', 'hers', 'eleven', 'out', 'most', 'both', '‘m', 'her', 'seems', 'across', 'beforehand', 'again', 'ca', 'in', 'such', '’re', 'nobody', 'rather', 'through', 'between', 'more', 'sometimes', 'about', 'almost', 'take', 'from', 'whatever', '‘re', 'ever', 'towards', 'seeming', 'third', 'those', 'before', 'ours

In [46]:
len(nlp.Defaults.stop_words)

326

In [47]:
nlp.vocab['first'].is_stop  #it checks whether it is stop_word or not

True

In [48]:
nlp.vocab['Gopikanth'].is_stop

False

In [49]:
# adding a stop word to the vocabulary
# if a word repeats many times in our text, it can mislead our ML algorithm and lower its score, so we mark it as a stop word

In [50]:
# why adding a stop word to the vocabulary

# Adding a stop word to the vocabulary can be done for a few different reasons:

# Importance: Sometimes a word that is normally considered a stop word may be important in the context of a specific task or analysis. For example, in sentiment analysis, the word "not" is often considered a stop word, but it can be crucial for understanding the sentiment of a sentence (e.g., "I am not happy" vs. "I am happy"). In such cases, adding the word "not" to the stop word list could negatively impact the accuracy of the analysis, so it may be beneficial to exclude it.

# Filtering: Stop words are often used to filter out words that are common and uninformative. However, in some cases, certain words may be considered uninformative in one context but informative in another. For example, in a study of legal documents, words like "law" and "regulation" might be considered stop words in other contexts, but are important for understanding the content of legal documents. In this case, these words may be added to the vocabulary to ensure that they are not removed from the analysis.

# Consistency: Depending on the tool or library being used, the list of stop words may vary. By adding a specific stop word to the vocabulary, you can ensure that it is consistently treated as a stop word throughout the analysis. This can help to avoid inconsistencies in the results and make it easier to interpret the output.

# In general, adding a stop word to the vocabulary should be done thoughtfully and with a clear understanding of the implications for the analysis. It is important to consider the specific task or application, as well as the language and dialect being analyzed, when deciding which words to include or exclude from the stop word list.

In [51]:
nlp.Defaults.stop_words.add('Gopikanth')
nlp.vocab['Gopikanth'].is_stop=True

In [52]:
nlp.vocab['Gopikanth'].is_stop

True

In [53]:
len(nlp.Defaults.stop_words)

327

In [54]:
# Removing a stop word from the vocabulary
# Removing a stop word from the vocabulary can be done for a few different reasons:

# Relevance: In some cases, a word that is normally considered a stop word may be relevant to the task or analysis at hand. For example, in a study of social media posts, words like "lol" or "omg" might be important for understanding the tone or sentiment of the post. In such cases, removing these words from the stop word list could help to improve the accuracy of the analysis.

# Precision: Depending on the specific analysis, removing certain stop words can help to increase the precision of the results. For example, if you are using a bag-of-words model to classify text, removing stop words can reduce the dimensionality of the feature space and improve the accuracy of the classification.

# Context: Stop words are often used to remove words that are common and uninformative, but the list of stop words may not be appropriate for all contexts. For example, the stop word "not" may be important in some contexts, such as sentiment analysis, but not in others. By removing certain stop words, you can tailor the analysis to the specific context and improve the relevance of the results.

# It's important to note that removing a stop word should be done thoughtfully and with a clear understanding of the implications for the analysis. Removing stop words can sometimes lead to overfitting or loss of information, so it's important to carefully consider the trade-offs between precision, recall, and relevance when deciding which words to remove from the stop word list.






In [55]:
nlp.Defaults.stop_words.remove('Gopikanth')
nlp.vocab['Gopikanth'].is_stop=False

In [56]:
nlp.vocab['Gopikanth'].is_stop

False

In [57]:
len(nlp.Defaults.stop_words)

326

In [58]:
nlp.Defaults.stop_words.remove('only')
nlp.vocab['only'].is_stop=False

In [59]:
nlp.vocab['only'].is_stop

False

In [60]:
len(nlp.Defaults.stop_words)

325

In [61]:
nlp.Defaults.stop_words.add('only')
nlp.vocab['only'].is_stop=True

In [62]:
nlp.vocab['only'].is_stop

True

In [63]:
print(sorted(nlp.Defaults.stop_words))

["'d", "'ll", "'m", "'re", "'s", "'ve", 'a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amount', 'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around', 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between', 'beyond', 'both', 'bottom', 'but', 'by', 'ca', 'call', 'can', 'cannot', 'could', 'did', 'do', 'does', 'doing', 'done', 'down', 'due', 'during', 'each', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty', 'first', 'five', 'for', 'former', 'formerly', 'forty', 'four', 'from', 'front', 'full', 'further', 'get', 'give', 'go', 'had', 'has', 'have', 'he', 'hence', 'her', 'here', 'he

In [64]:
# natural language toolkit

In [65]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [66]:
from nltk.corpus import stopwords
# Load NLTK's English stop-word list and show how many entries it has.
# The former bare `type(stop_words)` line was dropped: only the last
# expression of a cell is displayed, so it never produced any output.
# Run it in its own cell to inspect the type.
stop_words=stopwords.words('english')
len(stop_words)

179

In [67]:
# Stop words in scikit-learn

In [68]:
from sklearn.feature_extraction import text

In [69]:
stop_words=text.ENGLISH_STOP_WORDS
len(stop_words)

318

In [70]:
print(sorted(list(stop_words)))

['a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around', 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both', 'bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant', 'co', 'con', 'could', 'couldnt', 'cry', 'de', 'describe', 'detail', 'do', 'done', 'down', 'due', 'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly', 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'get', 'give

In [71]:
type(stop_words)

frozenset

In [72]:
# vocabulary matching

In [73]:
# Rule base matching

In [77]:
# from thinc.shims import pytorch
# spacy
# scikit-learn
# nltk
# hugging face
# tensorflow
# pytorch
# hugging face
# gensim

In [78]:
# apis

# Fasttext
# Tensorflow Hub
# GPT3

In [79]:
# what is rule based matching
# Rule-based matching is a technique used in natural language processing (NLP) to identify and extract specific patterns or phrases in text based on a set of pre-defined rules. These rules can be simple or complex and can involve matching on various criteria such as part of speech, entities, and syntactic dependencies.

# In rule-based matching, a set of rules or patterns is defined using regular expressions, grammars, or other linguistic tools. These rules are then applied to text to identify instances that match the specified patterns. This can be useful in a variety of applications, such as information extraction, sentiment analysis, and chatbots.

# For example, let's say we want to extract all the dates mentioned in a text. We could define a rule that matches any sequence of words that looks like a date, such as "January 1, 2022" or "02/14/2023". We could also define rules that match other date-related phrases, such as "next week" or "last month". By applying these rules to the text, we can extract all the relevant date information.

# Rule-based matching can be a powerful technique for certain types of NLP tasks, but it does have limitations. It requires a significant amount of upfront work to define the rules and patterns, and it may not be effective for more complex tasks where the patterns are difficult to define. Additionally, it may be less flexible than other approaches such as machine learning, which can adapt to new patterns and data.

In [80]:
# application: some apps read an OTP directly from an incoming message; that is an example of rule-based matching

In [81]:
from spacy.matcher import Matcher

In [82]:
matcher=Matcher(nlp.vocab)

In [83]:
# The next step is to define the patterns that will be used to filter similar phrases. Suppose
# we want to find the phrases 'quickbrownfox', 'quick-brown-fox', 'quick brown fox'
# or 'quick brownfox'. To do so we need to create the following four patterns:

In [84]:
# Four token patterns, one per spelling variant of "quick brown fox".
# Each dict describes a single token.

# "quickbrownfox" written as one token
p1=[dict(LOWER='quickbrownfox')]

# "quick", a punctuation token, "brown", a punctuation token, "fox"
p2=[
  dict(LOWER='quick'),
  dict(IS_PUNCT=True),
  dict(LOWER='brown'),
  dict(IS_PUNCT=True),
  dict(LOWER='fox'),
]

# "quick brown fox" as three separate tokens
p3=[dict(LOWER=part) for part in ('quick','brown','fox')]

# "quick" followed by "brownfox"
p4=[dict(LOWER='quick'),dict(LOWER='brownfox')]

In [86]:
# Register all four patterns under the key 'QBF'.
# In spaCy v3 the patterns are passed as ONE list in the second positional
# argument; the v2-style call matcher.add('QBF', None, p1, p2, p3, p4)
# (callback second, patterns as varargs) raises a TypeError.
matcher.add('QBF', [p1,p2,p3,p4])

In [87]:
# Tagging

In [89]:
doc=nlp('The quick brown fox jumped over dog\'s back')
# Aligned table: token text, coarse POS tag, fine-grained tag (each padded
# to 10 characters) followed by the explanation of the fine-grained tag.
for tok in doc:
  tag_help=str(spacy.explain(tok.tag_))
  print(tok.text.ljust(10),tok.pos_.ljust(10),tok.tag_.ljust(10),tag_help)

The        DET        DT         determiner
quick      ADJ        JJ         adjective (English), other noun-modifier (Chinese)
brown      ADJ        JJ         adjective (English), other noun-modifier (Chinese)
fox        NOUN       NN         noun, singular or mass
jumped     VERB       VBD        verb, past tense
over       ADP        IN         conjunction, subordinating or preposition
dog        NOUN       NN         noun, singular or mass
's         PART       POS        possessive ending
back       ADV        RB         adverb


In [90]:
# counting POS tags

In [91]:
pos_counts=doc.count_by(spacy.attrs.POS)

In [92]:
pos_counts

{90: 1, 84: 2, 92: 2, 100: 1, 85: 1, 94: 1, 86: 1}

In [93]:
# pos_counts is keyed by numeric ids; look each id up in the vocab to get
# the tag name, then print the name with its explanation.
for pos_name in (doc.vocab[pos_id].text for pos_id in pos_counts):
  print(pos_name,str(spacy.explain(pos_name)))

DET determiner
ADJ adjective
NOUN noun
VERB verb
ADP adposition
PART particle
ADV adverb


In [94]:
# Named Entity Recognition

In [96]:
doc=nlp('Apple is looking at U.K. startup for buying at $6 millions')
# Print every named entity with its label and the label's explanation.
for entity in doc.ents:
  label_help=str(spacy.explain(entity.label_))
  print(''.join([entity.text,'----->',entity.label_,'------>',label_help]))

Apple----->ORG------>Companies, agencies, institutions, etc.
U.K.----->GPE------>Countries, cities, states
$6 millions----->MONEY------>Monetary values, including unit


In [97]:
def show_entities(doc):
  """Print the named entities of ``doc``, one per line.

  Args:
    doc: object with an ``ents`` collection whose items expose ``text``
      and ``label_`` (the notebook passes a spaCy ``Doc``).

  Each line is the entity text, its label and ``spacy.explain`` of that
  label, joined by arrow separators. When ``doc.ents`` is empty the
  message 'no entities found' is printed instead. Nothing is returned.
  """
  if not doc.ents:
    print('no entities found')
    return
  for entity in doc.ents:
    label_help=str(spacy.explain(entity.label_))
    print(''.join((entity.text,'------->',entity.label_,'------>',label_help)))

In [98]:
show_entities(doc)

Apple------->ORG------>Companies, agencies, institutions, etc.
U.K.------->GPE------>Countries, cities, states
$6 millions------->MONEY------>Monetary values, including unit


In [99]:
doc=nlp('May I go to washington, DC next May to see the Washington Monument')

In [100]:
show_entities(doc)

washington------->GPE------>Countries, cities, states
DC------->GPE------>Countries, cities, states
next May------->DATE------>Absolute or relative dates or periods
the Washington Monument------->ORG------>Companies, agencies, institutions, etc.


In [101]:
# ent.text : The original entity text
# ent.label : The entity type's hash value
# ent.label_: The entity type's string description
# ent.start : The token span's start index position in the Doc
# ent.end : The token span's end index position in the Doc (exclusive)

In [112]:
# Per entity: text, token start, character start, token end, character end,
# numeric label id and label string.
for span in doc.ents:
  fields=(span.text,span.start,span.start_char,span.end,span.end_char,span.label,span.label_)
  print(*fields)

washington 4 12 5 22 384 GPE
DC 6 24 7 26 384 GPE
next May 7 27 9 35 391 DATE
the Washington Monument 11 43 14 66 383 ORG


In [113]:
# Adding a named entity to a span

In [119]:
doc=nlp('welcome to my socialbook')
show_entities(doc)

no entities found


In [121]:
doc[3]

socialbook

In [122]:
from spacy.tokens import Span
org=doc.vocab.strings['ORG']

In [123]:
org

383

In [126]:
new_ent=Span(doc,3,4,label=org)

In [128]:
# Span(doc,start_index,end_index,label=org)

In [129]:
new_ent

socialbook

In [130]:
doc.ents=list(doc.ents)+[new_ent]

In [131]:
show_entities(doc)

socialbook------->ORG------>Companies, agencies, institutions, etc.


In [132]:
displacy.render(doc,style='ent',jupyter=True)

In [133]:
doc1=nlp("Apple is planning to release a new iPhone next month. Elon Musk, the CEO of Tesla, announced that they will build a new factory in Germany. The United Nations held a summit to discuss climate change and sustainable development. Shakespeare was born in Stratford-upon-Avon on April 23, 1564. Harry Potter and the Philosopher's Stone is a novel written by J.K. Rowling.")
# Re-create the longer example document and list its named entities as
# "text - LABEL - explanation of the label".
for entity in doc1.ents:
  label_help=str(spacy.explain(entity.label_))
  print(" - ".join([entity.text,entity.label_,label_help]))

Apple - ORG - Companies, agencies, institutions, etc.
iPhone - ORG - Companies, agencies, institutions, etc.
next month - DATE - Absolute or relative dates or periods
Elon Musk - PERSON - People, including fictional
Tesla - ORG - Companies, agencies, institutions, etc.
Germany - GPE - Countries, cities, states
The United Nations - ORG - Companies, agencies, institutions, etc.
Shakespeare - PERSON - People, including fictional
Stratford - ORG - Companies, agencies, institutions, etc.
April 23, 1564 - DATE - Absolute or relative dates or periods
Harry Potter - PERSON - People, including fictional
the Philosopher's Stone - ORG - Companies, agencies, institutions, etc.
J.K. Rowling - PERSON - People, including fictional


In [134]:
displacy.render(doc1,style='ent',jupyter=True)

In [138]:
# Settings dict handed to displacy.render(..., options=options); its 'ents'
# entry holds the entity labels ORG and DATE.
options=dict(ents={'DATE','ORG'})

In [139]:
displacy.render(doc1,style='ent',jupyter=True,options=options)

In [140]:
# we can also change the colors of the above entities by using some parameters

In [141]:
# BAG OF WORDS

In [142]:
#  what is bag of words


# The "bag of words" (BoW) is a simple text representation technique used in natural language processing and information retrieval. It involves converting a piece of text into a collection (or "bag") of individual words, ignoring grammar and word order, and counting the frequency of each word in the text.

# In other words, the BoW model represents a text document as a set of words without any consideration for the order in which they appear. The order is not taken into account, only the frequency of each word is recorded. This method can be useful for text classification, sentiment analysis, and information retrieval.

# For example, given the sentence "The cat in the hat", the corresponding BoW representation would be:
# {"the": 2, "cat": 1, "in": 1, "hat": 1}

# As you can see, the BoW model represents the sentence as a collection of words with their corresponding frequencies.

In [143]:
# Here's an example of how the bag of words technique can be used:

# Let's say we have a collection of three documents:

# Document 1: "The quick brown fox jumps over the lazy dog"
# Document 2: "The lazy dog sleeps all day"
# Document 3: "The quick brown fox and the lazy dog"

# To represent these documents using the bag of words model, we first create a vocabulary of all the unique words in the collection, ignoring stop words like "the", "and", and "all". In this case, the vocabulary would be:

# Vocabulary: quick, brown, fox, jumps, over, lazy, dog, sleeps, day

# Next, we count the frequency of each word in each document, and represent the documents as vectors of word frequencies:

# Document 1: {quick: 1, brown: 1, fox: 1, jumps: 1, over: 1, lazy: 1, dog: 1, sleeps: 0, day: 0}
# Document 2: {quick: 0, brown: 0, fox: 0, jumps: 0, over: 0, lazy: 1, dog: 1, sleeps: 1, day: 1}
# Document 3: {quick: 1, brown: 1, fox: 1, jumps: 0, over: 0, lazy: 1, dog: 1, sleeps: 0, day: 0}

# As you can see, each document is represented as a vector of word frequencies, with the index of each element corresponding to the position of the word in the vocabulary. This allows us to compare and analyze the documents based on their word frequencies, without considering the order of the words.

In [146]:
# Count Vectorizer
# TF-IDF Vectorizer : Term Frequency - Inverse Document Frequency

In [147]:
# Count vectorization and tf-idf vectorization are two common methods used to convert text data into numerical vectors that can be used as input for machine learning algorithms.

# Count vectorization simply counts the frequency of each word in a document and represents the document as a vector of these word frequencies. This is the basic bag of words representation we discussed earlier. For example, if we have a sentence "The quick brown fox jumps over the lazy dog", the corresponding count vector would be {the: 2, quick: 1, brown: 1, fox: 1, jumps: 1, over: 1, lazy: 1, dog: 1}. Count vectorization is a simple and effective method, but it does not account for the relative importance of words in a document.

# TF-IDF vectorization (term frequency-inverse document frequency) is a method that takes into account both the frequency of a word in a document and the frequency of the word in the corpus (i.e., collection of documents). The idea behind TF-IDF is that words that appear frequently in a document but rarely in the corpus are more important in describing the content of the document. Conversely, words that appear frequently in the corpus but rarely in a document are less important.

# To compute the TF-IDF score for each word in a document, we multiply the term frequency (TF) of the word (i.e., the frequency of the word in the document) by the inverse document frequency (IDF) of the word, which is a measure of how rare the word is in the corpus. The IDF of a word is computed as the logarithm of the total number of documents in the corpus divided by the number of documents that contain the word. The resulting TF-IDF score gives a measure of the importance of each word in the document.

# Overall, TF-IDF vectorization can be a more effective way to represent text data than count vectorization, as it takes into account the relative importance of words in a document and the corpus.