# Natural Language Processing With Python's NLTK Package

### Getting Started With Python’s NLTK

In [None]:
#pip install nltk==3.5

In [1]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### Tokenizing

In [4]:
example_string = """Muad'Dib learned rapidly because his first training was in how to learn.
And the first lesson of all was the basic trust that he could learn.
It's shocking to find how many people do not believe they can learn,
and how many more believe learning to be difficult."""

#### Tokenizing by Sentence

In [6]:
from nltk.tokenize import sent_tokenize  

sentences=sent_tokenize(example_string)
print(sentences)

["Muad'Dib learned rapidly because his first training was in how to learn.", 'And the first lesson of all was the basic trust that he could learn.', "It's shocking to find how many people do not believe they can learn,\nand how many more believe learning to be difficult."]


In [7]:
for s in sentences:
  print(s)

Muad'Dib learned rapidly because his first training was in how to learn.
And the first lesson of all was the basic trust that he could learn.
It's shocking to find how many people do not believe they can learn,
and how many more believe learning to be difficult.


#### Tokenizing by word

In [18]:
from nltk.tokenize import word_tokenize  

words=word_tokenize(example_string)
print(words)
for w in words:
  print(w)



["Muad'Dib", 'learned', 'rapidly', 'because', 'his', 'first', 'training', 'was', 'in', 'how', 'to', 'learn', '.', 'And', 'the', 'first', 'lesson', 'of', 'all', 'was', 'the', 'basic', 'trust', 'that', 'he', 'could', 'learn', '.', 'It', "'s", 'shocking', 'to', 'find', 'how', 'many', 'people', 'do', 'not', 'believe', 'they', 'can', 'learn', ',', 'and', 'how', 'many', 'more', 'believe', 'learning', 'to', 'be', 'difficult', '.']
Muad'Dib
learned
rapidly
because
his
first
training
was
in
how
to
learn
.
And
the
first
lesson
of
all
was
the
basic
trust
that
he
could
learn
.
It
's
shocking
to
find
how
many
people
do
not
believe
they
can
learn
,
and
how
many
more
believe
learning
to
be
difficult
.


In [20]:
data=[]
for s in sentences:
  data.append(word_tokenize(s))
print(s) 

It's shocking to find how many people do not believe they can learn,
and how many more believe learning to be difficult.


### Filtering Stop Words

Stop words are words that you want to ignore, so you filter them out of your text when you’re processing it. Very common words like 'in', 'is', and 'an' are often used as stop words since they don’t add a lot of meaning to a text in and of themselves.

In [21]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [22]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
#special character removal
#['Sir', 'protest', 'merry', 'man']

In [23]:
worf_quote = "Sir, I protest. I am not a merry man!"

In [24]:
words_in_quote = word_tokenize(worf_quote)
words_in_quote


['Sir', ',', 'I', 'protest', '.', 'I', 'am', 'not', 'a', 'merry', 'man', '!']

In [25]:
stop_words = set(stopwords.words("english"))

In [None]:
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [26]:
filtered_list = []

In [27]:
for word in words_in_quote:
    if word.casefold() not in stop_words:
        filtered_list.append(word)

In [28]:
filtered_list = [
    word for word in words_in_quote if word.casefold() not in stop_words
]

In [29]:
filtered_list

['Sir', ',', 'protest', '.', 'merry', 'man', '!']

### Stemming

In [30]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
# PorterStemmer is an algorithm

In [31]:
stemmer = PorterStemmer()  #steamer is an abject of porterstemmer

In [32]:
string_for_stemming = """
The crew of the USS Discovery discovered many discoveries.
Discovering is what explorers do."""

In [33]:
words = word_tokenize(string_for_stemming)

In [None]:
words

['The',
 'crew',
 'of',
 'the',
 'USS',
 'Discovery',
 'discovered',
 'many',
 'discoveries',
 '.',
 'Discovering',
 'is',
 'what',
 'explorers',
 'do',
 '.']

In [34]:
stemmed_words = [stemmer.stem(word) for word in words]

In [35]:
stemmed_words

['the',
 'crew',
 'of',
 'the',
 'uss',
 'discoveri',
 'discov',
 'mani',
 'discoveri',
 '.',
 'discov',
 'is',
 'what',
 'explor',
 'do',
 '.']

## Tagging Parts of Speech

In [36]:
sagan_quote = """
If you wish to make an apple pie from scratch,
you must first invent the universe."""

In [37]:
words_in_sagan_quote = word_tokenize(sagan_quote)

In [38]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [39]:
nltk.pos_tag(words_in_sagan_quote)

[('If', 'IN'),
 ('you', 'PRP'),
 ('wish', 'VBP'),
 ('to', 'TO'),
 ('make', 'VB'),
 ('an', 'DT'),
 ('apple', 'NN'),
 ('pie', 'NN'),
 ('from', 'IN'),
 ('scratch', 'NN'),
 (',', ','),
 ('you', 'PRP'),
 ('must', 'MD'),
 ('first', 'VB'),
 ('invent', 'VB'),
 ('the', 'DT'),
 ('universe', 'NN'),
 ('.', '.')]

In [40]:
nltk.download('tagsets')

nltk.help.upenn_tagset()

[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Unzipping help/tagsets.zip.
$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je j

In [41]:
jabberwocky_excerpt = """
'Twas brillig, and the slithy toves did gyre and gimble in the wabe:
all mimsy were the borogoves, and the mome raths outgrabe."""

In [42]:
words_in_excerpt = word_tokenize(jabberwocky_excerpt)

In [43]:
nltk.pos_tag(words_in_excerpt)

[("'Twas", 'CD'),
 ('brillig', 'NN'),
 (',', ','),
 ('and', 'CC'),
 ('the', 'DT'),
 ('slithy', 'JJ'),
 ('toves', 'NNS'),
 ('did', 'VBD'),
 ('gyre', 'NN'),
 ('and', 'CC'),
 ('gimble', 'JJ'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('wabe', 'NN'),
 (':', ':'),
 ('all', 'DT'),
 ('mimsy', 'NNS'),
 ('were', 'VBD'),
 ('the', 'DT'),
 ('borogoves', 'NNS'),
 (',', ','),
 ('and', 'CC'),
 ('the', 'DT'),
 ('mome', 'JJ'),
 ('raths', 'NNS'),
 ('outgrabe', 'RB'),
 ('.', '.')]

## Lemmatizing

In [44]:
from nltk.stem import WordNetLemmatizer

In [45]:
lemmatizer = WordNetLemmatizer()

In [46]:
 nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [47]:
lemmatizer.lemmatize("scarves")

'scarf'

In [48]:
string_for_lemmatizing = "The friends of DeSoto love scarves."

In [49]:
words = word_tokenize(string_for_lemmatizing)

In [50]:
words

['The', 'friends', 'of', 'DeSoto', 'love', 'scarves', '.']

In [51]:
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

In [52]:
lemmatized_words

['The', 'friend', 'of', 'DeSoto', 'love', 'scarf', '.']

In [53]:
lemmatizer.lemmatize("worst")

'worst'

In [54]:
lemmatizer.lemmatize("worst", pos="a")

'bad'

# Text Feature Extraction

## N-grams

N-grams are the combination of multiple words used together. Ngrams with N=1 are called unigrams. Similarly, bigrams (N=2), trigrams (N=3) and so on can also be used.

In [55]:
from textblob import TextBlob

In [56]:
example_string = """Muad'Dib learned rapidly because his first training was in how to learn.
And the first lesson of all was the basic trust that he could learn.
It's shocking to find how many people do not believe they can learn,
and how many more believe learning to be difficult."""

In [58]:
TextBlob(example_string).ngrams(3)

[WordList(["Muad'Dib", 'learned', 'rapidly']),
 WordList(['learned', 'rapidly', 'because']),
 WordList(['rapidly', 'because', 'his']),
 WordList(['because', 'his', 'first']),
 WordList(['his', 'first', 'training']),
 WordList(['first', 'training', 'was']),
 WordList(['training', 'was', 'in']),
 WordList(['was', 'in', 'how']),
 WordList(['in', 'how', 'to']),
 WordList(['how', 'to', 'learn']),
 WordList(['to', 'learn', 'And']),
 WordList(['learn', 'And', 'the']),
 WordList(['And', 'the', 'first']),
 WordList(['the', 'first', 'lesson']),
 WordList(['first', 'lesson', 'of']),
 WordList(['lesson', 'of', 'all']),
 WordList(['of', 'all', 'was']),
 WordList(['all', 'was', 'the']),
 WordList(['was', 'the', 'basic']),
 WordList(['the', 'basic', 'trust']),
 WordList(['basic', 'trust', 'that']),
 WordList(['trust', 'that', 'he']),
 WordList(['that', 'he', 'could']),
 WordList(['he', 'could', 'learn']),
 WordList(['could', 'learn', 'It']),
 WordList(['learn', 'It', "'s"]),
 WordList(['It', "'s", 's

## Bag of Words(BOW) model

Bag of Words (BoW) refers to the representation of text which describes the presence of words within the text data

In [59]:
import pandas as pd

In [60]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

In [61]:
text = ["They love NLP",
        "NLP is future",
        "They will learn in two months"]
vectorizer = CountVectorizer()
count_matrix = vectorizer.fit_transform(text)
count_array = count_matrix.toarray()
df = pd.DataFrame(data=count_array,columns = vectorizer.get_feature_names())
print(df)

   future  in  is  learn  love  months  nlp  they  two  will
0       0   0   0      0     1       0    1     1    0     0
1       1   0   1      0     0       0    1     0    0     0
2       0   1   0      1     0       1    0     1    1     1




In [62]:
text2 = ['They love NLP but can not learn in two months']
vectorizer.transform(text2).toarray()

array([[0, 1, 0, 1, 1, 1, 1, 1, 1, 0]])

In [63]:
text = ["food was not bad","I am not feeling bad"]
vectorizer = CountVectorizer(ngram_range = (1,2))
count_matrix = vectorizer.fit_transform(text)
count_array = count_matrix.toarray()
df = pd.DataFrame(data=count_array,columns = vectorizer.get_feature_names())
print(df)

   am  am not  bad  feeling  feeling bad  food  food was  not  not bad  \
0   0       0    1        0            0     1         1    1        1   
1   1       1    1        1            1     0         0    1        0   

   not feeling  was  was not  
0            0    1        1  
1            1    0        0  




## Term Frequency – Inverse Document Frequency (TF-IDF)

In [64]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [66]:
text = ["i love the NLP",
        "NLP is the future",
        "i will learn the NLP"]
vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(text)
count_array = matrix.toarray()
df = pd.DataFrame(data=count_array,columns = vectorizer.get_feature_names())
print(df)

     future        is     learn      love       nlp       the      will
0  0.000000  0.000000  0.000000  0.767495  0.453295  0.453295  0.000000
1  0.608845  0.608845  0.000000  0.000000  0.359594  0.359594  0.000000
2  0.000000  0.000000  0.608845  0.000000  0.359594  0.359594  0.608845




## Word Embeddings

Word Embedding is the representation of text in the form of vectors. The underlying idea here is that similar words will have a minimum distance between their vectors.

In [67]:
#pip install gensim

In [70]:
#convert text into the word2vec format.

from gensim.scripts.glove2word2vec import glove2word2vec

glove_input_file = 'glove.6B.50d.txt'

word2vec_output_file = 'glove.6B.50d.txt.word2vec'

glove2word2vec(glove_input_file, word2vec_output_file)

(4898, 50)

Now, we can load the above word2vec file as a model.

In [72]:
from gensim.models import KeyedVectors # load the Stanford GloVe model

filename = 'glove.6B.50d.txt.word2vec'

model = KeyedVectors.load_word2vec_format(filename, binary=False)

ValueError: ignored

Let’s say our tweet contains a text saying ‘go away’. We can easily obtain it’s word vector using the above model:

In [None]:
model['go']

array([ 1.4828e-01,  1.7761e-01,  4.2346e-01, -3.1489e-01,  3.2273e-01,
       -7.2413e-01, -7.8955e-01,  4.9214e-01, -2.0693e-01, -5.5088e-04,
       -4.7877e-01,  2.8853e-01, -5.7376e-01,  2.7217e-01,  1.1129e+00,
        5.7808e-01,  6.9321e-01, -2.8652e-01, -5.4545e-02, -6.1826e-01,
        1.7227e-01,  2.9263e-01,  3.8184e-01,  6.2186e-01,  5.5461e-01,
       -1.7411e+00, -2.8802e-01, -1.7140e-01,  7.4743e-01, -1.0135e+00,
        3.3596e+00,  1.1370e+00, -1.0028e+00,  1.7685e-01, -6.1795e-03,
       -6.3491e-02,  1.9077e-01,  4.4046e-02,  3.8228e-01, -4.1607e-01,
       -5.0359e-01, -8.3803e-02,  1.7508e-01,  4.0420e-01,  7.7324e-02,
        1.7415e-01,  1.2541e-01, -2.1820e-01,  1.2971e-01,  3.2953e-01],
      dtype=float32)

In [None]:
model['away']

array([ 0.34176  , -0.32715  ,  0.66209  , -0.71138  ,  0.28488  ,
       -0.19242  , -0.85185  ,  0.56403  , -0.13852  , -0.06717  ,
       -0.42702  , -0.20546  , -0.70012  , -0.13799  ,  0.29457  ,
        0.1881   ,  0.50458  , -0.14432  , -0.73977  , -0.63253  ,
        0.06105  ,  0.55907  ,  0.45083  ,  0.16689  ,  0.55929  ,
       -1.924    ,  0.48437  ,  0.66656  ,  0.89432  , -1.0412   ,
        3.1784   ,  1.0617   , -0.15902  ,  0.0067243, -0.35329  ,
        0.39728  , -0.44211  ,  0.41718  ,  0.38365  , -0.39747  ,
       -0.15511  ,  0.21717  ,  0.047058 ,  0.3904   , -0.20639  ,
        0.075575 ,  0.09143  , -1.0418   ,  0.24466  , -1.1117   ],
      dtype=float32)

In [None]:
(model['go'] + model['away'])/2

array([ 0.24502   , -0.07477   ,  0.54277503, -0.513135  ,  0.303805  ,
       -0.458275  , -0.8207    ,  0.528085  , -0.17272499, -0.03386044,
       -0.452895  ,  0.041535  , -0.63694   ,  0.06709   ,  0.703735  ,
        0.38309   ,  0.598895  , -0.21542001, -0.3971575 , -0.625395  ,
        0.11666   ,  0.42584997,  0.416335  ,  0.39437503,  0.55695   ,
       -1.83255   ,  0.09817499,  0.24757999,  0.82087505, -1.02735   ,
        3.269     ,  1.09935   , -0.58090997,  0.09178715, -0.17973475,
        0.1668945 , -0.12567   ,  0.230613  ,  0.382965  , -0.40677   ,
       -0.32935   ,  0.0666835 ,  0.111069  ,  0.3973    , -0.064533  ,
        0.12486251,  0.10842   , -0.63      ,  0.187185  , -0.39108503],
      dtype=float32)

## Sentiment Analysis

In [None]:
import pandas as pd

train = pd.read_csv('train_E6oV3lV.csv')

In [None]:
#the sentiment of the first few tweets.

train['tweet'][:5].apply(lambda x: TextBlob(x).sentiment)

0         (-0.5, 1.0)
1          (0.2, 0.2)
2          (0.0, 0.0)
3    (0.9765625, 0.6)
4          (0.0, 0.0)
Name: tweet, dtype: object

In [None]:
train['sentiment'] = train['tweet'].apply(lambda x: TextBlob(x).sentiment[0] )
train[['tweet','sentiment']].head()

Unnamed: 0,tweet,sentiment
0,@user when a father is dysfunctional and is s...,-0.5
1,@user @user thanks for #lyft credit i can't us...,0.2
2,bihday your majesty,0.0
3,#model i love u take with u all the time in ...,0.976562
4,factsguide: society now #motivation,0.0
