## Introduction to Natural Language Processing
<br>
<br>

In [9]:
import nltk
import nltk.corpus

In [19]:
text = "Natural language processing is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data."

### Tokenization

In [20]:
from nltk.tokenize import word_tokenize
tokens = word_tokenize(text)
print(tokens)


['Natural', 'language', 'processing', 'is', 'a', 'subfield', 'of', 'linguistics', ',', 'computer', 'science', ',', 'and', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', 'language', ',', 'in', 'particular', 'how', 'to', 'program', 'computers', 'to', 'process', 'and', 'analyze', 'large', 'amounts', 'of', 'natural', 'language', 'data', '.']


In [21]:
type(tokens), len(tokens)

(list, 42)

### Frequency

In [22]:
from nltk.probability import FreqDist
fdist = FreqDist()
for i in tokens:
    fdist[i]=fdist[i]+1
fdist

FreqDist({'language': 3, ',': 3, 'and': 3, 'of': 2, 'computers': 2, 'to': 2, 'Natural': 1, 'processing': 1, 'is': 1, 'a': 1, ...})

In [24]:
top5=fdist.most_common(5)
top5

[('language', 3), (',', 3), ('and', 3), ('of', 2), ('computers', 2)]

### Bigram

In [29]:
print(tokens[:15])
list(nltk.bigrams(tokens[:15]))

['Natural', 'language', 'processing', 'is', 'a', 'subfield', 'of', 'linguistics', ',', 'computer', 'science', ',', 'and', 'artificial', 'intelligence']


[('Natural', 'language'),
 ('language', 'processing'),
 ('processing', 'is'),
 ('is', 'a'),
 ('a', 'subfield'),
 ('subfield', 'of'),
 ('of', 'linguistics'),
 ('linguistics', ','),
 (',', 'computer'),
 ('computer', 'science'),
 ('science', ','),
 (',', 'and'),
 ('and', 'artificial'),
 ('artificial', 'intelligence')]

In [31]:
list(nltk.trigrams(tokens[:15]))

[('Natural', 'language', 'processing'),
 ('language', 'processing', 'is'),
 ('processing', 'is', 'a'),
 ('is', 'a', 'subfield'),
 ('a', 'subfield', 'of'),
 ('subfield', 'of', 'linguistics'),
 ('of', 'linguistics', ','),
 ('linguistics', ',', 'computer'),
 (',', 'computer', 'science'),
 ('computer', 'science', ','),
 ('science', ',', 'and'),
 (',', 'and', 'artificial'),
 ('and', 'artificial', 'intelligence')]

In [32]:
list(nltk.ngrams(tokens[:15],4))

[('Natural', 'language', 'processing', 'is'),
 ('language', 'processing', 'is', 'a'),
 ('processing', 'is', 'a', 'subfield'),
 ('is', 'a', 'subfield', 'of'),
 ('a', 'subfield', 'of', 'linguistics'),
 ('subfield', 'of', 'linguistics', ','),
 ('of', 'linguistics', ',', 'computer'),
 ('linguistics', ',', 'computer', 'science'),
 (',', 'computer', 'science', ','),
 ('computer', 'science', ',', 'and'),
 ('science', ',', 'and', 'artificial'),
 (',', 'and', 'artificial', 'intelligence')]

### Stemming 

Stemming is the process of reducing a word to it's word stem by cutting off the beginning or the end

In [33]:
from nltk.stem import PorterStemmer
pst=PorterStemmer()

In [34]:
pst.stem("winning"),pst.stem("studies"),pst.stem("buying")

('win', 'studi', 'buy')

### Lemmatization

Lemmatization is the process of grouping together the different inflected forms of a word so they can be analyzed as a single item

In [35]:
from nltk.stem import wordnet, WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [42]:
words= ["widget","flowers","geese"]

In [43]:
for i in words:
    print(i + " : " + lemmatizer.lemmatize(i))

widget : widget
flowers : flower
geese : goose


### Part of Speech Tagging 

In [44]:
text = "What are you trying to say! We are all here to learn"
for i in word_tokenize(text) :
    print(nltk.pos_tag([i]))

[('What', 'WP')]
[('are', 'VBP')]
[('you', 'PRP')]
[('trying', 'VBG')]
[('to', 'TO')]
[('say', 'VB')]
[('!', '.')]
[('We', 'PRP')]
[('are', 'VBP')]
[('all', 'DT')]
[('here', 'RB')]
[('to', 'TO')]
[('learn', 'NN')]


### Named Entity Recognition

Named Entity Recognition is the process of taking a string of text as input and identify relevant nouns that are mentioned in that string

In [50]:
from nltk import ne_chunk

In [53]:
text = "John lives in New York"
text_ner = ne_chunk(nltk.pos_tag(word_tokenize(text)))
print(text_ner)

(S (PERSON John/NNP) lives/VBZ in/IN (GPE New/NNP York/NNP))


### Spacy

In [54]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [55]:
doc = nlp("this is our corpus.")
for token in doc :
    print(token.text)

this
is
our
corpus
.


In [48]:
for token in doc :
    print(token.i,token.text)

0 this
1 is
2 our
3 corpus
4 .


In [61]:
doc = nlp("An average cup of coffee costs 2.70$")
doc

An average cup of coffee costs 2.70$

In [62]:
for token in doc : 
    print(token.i, token.text,token.pos_)

0 An DET
1 average ADJ
2 cup NOUN
3 of ADP
4 coffee NOUN
5 costs VERB
6 2.70 NUM
7 $ SYM


In [63]:
for ent in doc.ents:
    print(ent.text, ent.label_)

2.70$ MONEY


####  Matcher

In [64]:
from spacy.matcher import Matcher

In [65]:
doc = nlp("John likes to share his knowledge")


In [66]:
pattern = [{'LEMMA':'share'},{'ORTH':'his'}]
matcher = Matcher(nlp.vocab)
matcher.add('white_Pattern',[pattern])
matches = matcher(doc)

In [67]:
for _,start,end in matches:
    span = doc[start:end]
    print(span.text)

share his


In [68]:
doc = nlp("2018 FIFA world cup : France won!!!")


In [69]:
pattern = [{'IS_DIGIT':True},{'LOWER':'fifa'},{'LOWER':'world'},{'LOWER':'cup'}]
matcher = Matcher(nlp.vocab)
matcher.add('white_Pattern',[pattern])
matches = matcher(doc)

In [70]:
for _,start,end in matches:
    span = doc[start:end]
    print(span.text)

2018 FIFA world cup
