# Text classification using spaCy NLP package.

- Author : Manu Nellutla
- Date   : Aug 16,2020

We will be using SPACY package to classify text and also do a sentiment analysis.

In [2]:
# install the required packages
!pip install spacy



## Normal split of text - 

**using programmatical split.**

In [4]:
# text to analyze. and lets split it into words.

text ="Neuro-linguistic programming was developed in the 1970's at the University of California, Santa Cruz. Its primary founders are John Grinder, a linguist, and Richard Bandler, an information scientist and mathematician. Judith DeLozier and Leslie Cameron-Bandler also contributed significantly to the field, as did David Gordon and Robert Dilts."

text.split()

['Neuro-linguistic',
 'programming',
 'was',
 'developed',
 'in',
 'the',
 "1970's",
 'at',
 'the',
 'University',
 'of',
 'California,',
 'Santa',
 'Cruz.',
 'Its',
 'primary',
 'founders',
 'are',
 'John',
 'Grinder,',
 'a',
 'linguist,',
 'and',
 'Richard',
 'Bandler,',
 'an',
 'information',
 'scientist',
 'and',
 'mathematician.',
 'Judith',
 'DeLozier',
 'and',
 'Leslie',
 'Cameron-Bandler',
 'also',
 'contributed',
 'significantly',
 'to',
 'the',
 'field,',
 'as',
 'did',
 'David',
 'Gordon',
 'and',
 'Robert',
 'Dilts.']

When you see above you can see apostrophe and commas are included in the words and split was based on blanks between the words.

## lets check how spaCy does it.

**using NLP englis spit**


In [65]:
import spacy
import sys
!{sys.executable} -m spacy download en

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/anaconda3/lib/python3.7/site-packages/en_core_web_sm -->
/usr/local/anaconda3/lib/python3.7/site-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [66]:
#import necessary packages

from spacy.lang.en import English

nlp_lang = spacy.load('en')

#spacy will use English language annotaitons to split the text

nlp_split = nlp_lang(text)

# Create list of word tokens
token_list = []
for token in nlp_split:
    token_list.append(token.text)

display(len(token_list))
token_list


61

['Neuro',
 '-',
 'linguistic',
 'programming',
 'was',
 'developed',
 'in',
 'the',
 '1970',
 "'s",
 'at',
 'the',
 'University',
 'of',
 'California',
 ',',
 'Santa',
 'Cruz',
 '.',
 'Its',
 'primary',
 'founders',
 'are',
 'John',
 'Grinder',
 ',',
 'a',
 'linguist',
 ',',
 'and',
 'Richard',
 'Bandler',
 ',',
 'an',
 'information',
 'scientist',
 'and',
 'mathematician',
 '.',
 'Judith',
 'DeLozier',
 'and',
 'Leslie',
 'Cameron',
 '-',
 'Bandler',
 'also',
 'contributed',
 'significantly',
 'to',
 'the',
 'field',
 ',',
 'as',
 'did',
 'David',
 'Gordon',
 'and',
 'Robert',
 'Dilts',
 '.']

The split is completely different to what we saw in the normal split. Before using this... 

## You can also split by sentences

**using sentenceTokenizer**


In [23]:
nlp_lang = English()
#nlp_lang.remove_pipe("sentencizer")
sbd = nlp_lang.create_pipe('sentencizer')

# Add the component to the pipeline
nlp_lang.add_pipe(sbd, last=True)

nlp_split = nlp_lang(text)

# Create list of word tokens
token_list = []
for token in nlp_split.sents: #----> nlp_split.sents does split by sentences
    token_list.append(token.text)
print(token_list)


["Neuro-linguistic programming was developed in the 1970's at the University of California, Santa Cruz.", 'Its primary founders are John Grinder, a linguist, and Richard Bandler, an information scientist and mathematician.', 'Judith DeLozier and Leslie Cameron-Bandler also contributed significantly to the field, as did David Gordon and Robert Dilts.']


Now that we have the ability to split we need to remove words that doesn't provide context. These are called stop words.

## Stopwords 



In [25]:
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

#Printing the total number of stop words:
print('Number of stop words: %d' % len(spacy_stopwords))

#Printing first ten stop words:
print('First ten stop words: %s' % list(spacy_stopwords)[:10])

Number of stop words: 326
First ten stop words: ['other', 'towards', 'elsewhere', 'not', 'whereafter', 'used', 'same', 'above', 'twelve', 'full']


lets filter all the stop words from our text. 

use **is_stop == false** to remove all the words that are not 
    

In [67]:
#filter stop words from our text.
display(f"number of words before filtering : {len(nlp_split)}")
words_no_stop = [a for a in nlp_split if a.is_stop == False]

display(f"number of words after filtering : {len(words_no_stop)}")

display(words_no_stop)

'number of words before filtering : 61'

'number of words after filtering : 41'

[Neuro,
 -,
 linguistic,
 programming,
 developed,
 1970,
 University,
 California,
 ,,
 Santa,
 Cruz,
 .,
 primary,
 founders,
 John,
 Grinder,
 ,,
 linguist,
 ,,
 Richard,
 Bandler,
 ,,
 information,
 scientist,
 mathematician,
 .,
 Judith,
 DeLozier,
 Leslie,
 Cameron,
 -,
 Bandler,
 contributed,
 significantly,
 field,
 ,,
 David,
 Gordon,
 Robert,
 Dilts,
 .]

## Lemmatize the words.

**using '.lemma_**

In [68]:
words_no_stop_lemma = {ab : [ab.lemma_, ab.pos_, ab.dep_] for ab in words_no_stop}

words_no_stop_lemma

{Neuro: ['Neuro', 'PROPN', 'npadvmod'],
 -: ['-', 'PUNCT', 'punct'],
 linguistic: ['linguistic', 'ADJ', 'amod'],
 programming: ['programming', 'NOUN', 'nsubjpass'],
 developed: ['develop', 'VERB', 'ROOT'],
 1970: ['1970', 'NUM', 'nummod'],
 University: ['University', 'PROPN', 'pobj'],
 California: ['California', 'PROPN', 'pobj'],
 ,: [',', 'PUNCT', 'punct'],
 Santa: ['Santa', 'PROPN', 'compound'],
 Cruz: ['Cruz', 'PROPN', 'pobj'],
 .: ['.', 'PUNCT', 'punct'],
 primary: ['primary', 'ADJ', 'amod'],
 founders: ['founder', 'NOUN', 'nsubj'],
 John: ['John', 'PROPN', 'compound'],
 Grinder: ['Grinder', 'PROPN', 'attr'],
 ,: [',', 'PUNCT', 'punct'],
 linguist: ['linguist', 'ADJ', 'appos'],
 ,: [',', 'PUNCT', 'punct'],
 Richard: ['Richard', 'PROPN', 'compound'],
 Bandler: ['Bandler', 'PROPN', 'conj'],
 ,: [',', 'PUNCT', 'punct'],
 information: ['information', 'NOUN', 'compound'],
 scientist: ['scientist', 'NOUN', 'appos'],
 mathematician: ['mathematician', 'NOUN', 'conj'],
 .: ['.', 'PUNCT', 'p

## Lets do Entity Detection

understanding 'person' date etc....

In [70]:
#identifying entities in text
entities=[(i, i.label_, i.label) for i in nlp_split.ents]
entities

[(Neuro, 'ORG', 383),
 (1970, 'DATE', 391),
 (the University of California, Santa Cruz, 'ORG', 383),
 (John Grinder, 'PERSON', 380),
 (Richard Bandler, 'PERSON', 380),
 (Judith DeLozier, 'PERSON', 380),
 (Leslie Cameron-Bandler, 'PERSON', 380),
 (David Gordon, 'PERSON', 380),
 (Robert Dilts, 'PERSON', 380)]

### Displacy - package helps highlite entities

In [71]:
#import displacy
from spacy import displacy

displacy.render(nlp_split, style = "ent",jupyter = True)