In [25]:
!pip install nltk



In [1]:
paragraph='Natural language processing (NLP) is an interdisciplinary subfield of computer science and information retrieval. It is primarily concerned with giving computers the ability to support and manipulate human language. It involves processing natural language datasets, such as text corpora or speech corpora, using either rule-based or probabilistic (i.e. statistical and, most recently, neural network-based) machine learning approaches. The goal is a computer capable of "understanding"[citation needed] the contents of documents, including the contextual nuances of the language within them. To this end, natural language processing often borrows ideas from theoretical linguistics. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.'

In [2]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [3]:
#tokenization --- convert paragraph into sentences-words
nltk.download('punkt')
sentences=nltk.sent_tokenize(paragraph)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
print(sentences)

['Natural language processing (NLP) is an interdisciplinary subfield of computer science and information retrieval.', 'It is primarily concerned with giving computers the ability to support and manipulate human language.', 'It involves processing natural language datasets, such as text corpora or speech corpora, using either rule-based or probabilistic (i.e.', 'statistical and, most recently, neural network-based) machine learning approaches.', 'The goal is a computer capable of "understanding"[citation needed] the contents of documents, including the contextual nuances of the language within them.', 'To this end, natural language processing often borrows ideas from theoretical linguistics.', 'The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.']


In [5]:
stemmer=PorterStemmer()

In [6]:
stemmer.stem('thinking')

'think'

In [22]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [28]:
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [25]:
import re
from nltk.corpus import wordnet

corpus=[]
for i in range(len(sentences)):
  review=re.sub('[^a-zA-Z]',' ',sentences[i])
  review=review.lower()
  review=review.split()
  review=[lemmatizer.lemmatize(word) for word in review]
  review =" ".join(review)
  corpus.append(review)

In [29]:
corpus

['natural language processing nlp is an interdisciplinary subfield of computer science and information retrieval',
 'it is primarily concerned with giving computer the ability to support and manipulate human language',
 'it involves processing natural language datasets such a text corpus or speech corpus using either rule based or probabilistic i e',
 'statistical and most recently neural network based machine learning approach',
 'the goal is a computer capable of understanding citation needed the content of document including the contextual nuance of the language within them',
 'to this end natural language processing often borrows idea from theoretical linguistics',
 'the technology can then accurately extract information and insight contained in the document a well a categorize and organize the document themselves']

In [30]:
for i in corpus:
  words=nltk.word_tokenize(i)
  for word in words:
    if word not in set(stopwords.words('english')):
      print(stemmer.stem(word))

natur
languag
process
nlp
interdisciplinari
subfield
comput
scienc
inform
retriev
primarili
concern
give
comput
abil
support
manipul
human
languag
involv
process
natur
languag
dataset
text
corpu
speech
corpu
use
either
rule
base
probabilist
e
statist
recent
neural
network
base
machin
learn
approach
goal
comput
capabl
understand
citat
need
content
document
includ
contextu
nuanc
languag
within
end
natur
languag
process
often
borrow
idea
theoret
linguist
technolog
accur
extract
inform
insight
contain
document
well
categor
organ
document


In [32]:
for i in corpus:
  words=nltk.word_tokenize(i)
  for word in words:
    if word not in set(stopwords.words('english')):
      print(lemmatizer.lemmatize(word))

natural
language
processing
nlp
interdisciplinary
subfield
computer
science
information
retrieval
primarily
concerned
giving
computer
ability
support
manipulate
human
language
involves
processing
natural
language
datasets
text
corpus
speech
corpus
using
either
rule
based
probabilistic
e
statistical
recently
neural
network
based
machine
learning
approach
goal
computer
capable
understanding
citation
needed
content
document
including
contextual
nuance
language
within
end
natural
language
processing
often
borrows
idea
theoretical
linguistics
technology
accurately
extract
information
insight
contained
document
well
categorize
organize
document


In [33]:
## Bag of words
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(binary=True,ngram_range=(2,3))

In [34]:
X=cv.fit_transform(corpus)

In [35]:
cv.vocabulary_

{'natural language': 107,
 'language processing': 95,
 'processing nlp': 139,
 'nlp is': 116,
 'is an': 83,
 'an interdisciplinary': 4,
 'interdisciplinary subfield': 79,
 'subfield of': 153,
 'of computer': 120,
 'computer science': 32,
 'science and': 147,
 'and information': 6,
 'information retrieval': 76,
 'natural language processing': 109,
 'language processing nlp': 96,
 'processing nlp is': 140,
 'nlp is an': 117,
 'is an interdisciplinary': 84,
 'an interdisciplinary subfield': 5,
 'interdisciplinary subfield of': 80,
 'subfield of computer': 154,
 'of computer science': 121,
 'computer science and': 33,
 'science and information': 148,
 'and information retrieval': 7,
 'it is': 91,
 'is primarily': 87,
 'primarily concerned': 135,
 'concerned with': 36,
 'with giving': 193,
 'giving computer': 63,
 'computer the': 34,
 'the ability': 163,
 'ability to': 0,
 'to support': 183,
 'support and': 157,
 'and manipulate': 10,
 'manipulate human': 103,
 'human language': 67,
 'it is

In [36]:
## TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer
cv=TfidfVectorizer()
X=cv.fit_transform(corpus)

In [37]:
corpus[0]

'natural language processing nlp is an interdisciplinary subfield of computer science and information retrieval'

In [39]:
X[0].toarray()

array([[0.        , 0.        , 0.31524446, 0.19419671, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.22367537, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.26167998,
        0.        , 0.31524446, 0.        , 0.22367537, 0.        ,
        0.17011088, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.22367537, 0.        , 0.        , 0.        ,
        0.31524446, 0.        , 0.26167998, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.22367537, 0.        ,
        0.31524446, 0.        , 0.31524446, 0.        , 0.        ,
        0.31524446, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  