In [3]:
import nltk

# Break into sentences

In [7]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/luba/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
mytext = """In the previous chapter, we saw examples of some common NLP
applications that we might encounter in everyday life. If we were asked to
build such an application, think about how we would approach doing so at our
organization. We would normally walk through the requirements and break the
problem down into several sub-problems, then try to develop a step-by-step
procedure to solve them. Since language processing is involved, we would also
list all the forms of text processing needed at each step. This step-by-step
processing of text is known as pipeline. It is the series of steps involved in
building any NLP model. These steps are common in every NLP project, so it
makes sense to study them in this chapter. Understanding some common procedures
in any NLP pipeline will enable us to get started on any NLP problem encountered
in the workplace. Laying out and developing a text-processing pipeline is seen
as a starting point for any NLP application development process. In this
chapter, we will learn about the various steps involved and how they play
important roles in solving the NLP problem and we’ll see a few guidelines
about when and how to use which step. In later chapters, we’ll discuss
specific pipelines for various NLP tasks (e.g., Chapters 4–7)."""


In [8]:
# this will break our text into sentences
my_sentences = nltk.sent_tokenize(mytext)

In [9]:
my_sentences

['In the previous chapter, we saw examples of some common NLP\napplications that we might encounter in everyday life.',
 'If we were asked to\nbuild such an application, think about how we would approach doing so at our\norganization.',
 'We would normally walk through the requirements and break the\nproblem down into several sub-problems, then try to develop a step-by-step\nprocedure to solve them.',
 'Since language processing is involved, we would also\nlist all the forms of text processing needed at each step.',
 'This step-by-step\nprocessing of text is known as pipeline.',
 'It is the series of steps involved in\nbuilding any NLP model.',
 'These steps are common in every NLP project, so it\nmakes sense to study them in this chapter.',
 'Understanding some common procedures\nin any NLP pipeline will enable us to get started on any NLP problem encountered\nin the workplace.',
 'Laying out and developing a text-processing pipeline is seen\nas a starting point for any NLP applicatio

# Break into tokens (tokenization)

In [11]:
nltk.word_tokenize(my_sentences[1])

['If',
 'we',
 'were',
 'asked',
 'to',
 'build',
 'such',
 'an',
 'application',
 ',',
 'think',
 'about',
 'how',
 'we',
 'would',
 'approach',
 'doing',
 'so',
 'at',
 'our',
 'organization',
 '.']

# Removing stop words, digits, punctuation and lowercase

In [14]:
from nltk.corpus import stopwords
from string import punctuation

In [25]:
def preprocess_corpus(texts):
    mystopwords = set(stopwords.words("english"))

    def remove_stop_digits(tokens):
        return [
            token.lower()
            for token in tokens
            if token not in mystopwords
            and not token.isdigit()
            and token not in punctuation
        ]

    return [remove_stop_digits(nltk.word_tokenize(text)) for text in texts]


In [26]:
text_test = "Oh sheat, look at this"

In [27]:
preprocess_corpus(text_test)

[['o'],
 ['h'],
 [],
 [],
 ['h'],
 ['e'],
 [],
 [],
 [],
 [],
 ['l'],
 [],
 [],
 ['k'],
 [],
 [],
 [],
 [],
 [],
 ['h'],
 [],
 []]

# Stemming

In [28]:
from nltk.stem.porter import PorterStemmer

In [29]:
stemmer = PorterStemmer()
word1, word2 = "cards", "revolutions"
print(stemmer.stem(word1), stemmer.stem(word2))

card revolut


# Lemmatization

Look at this lemmatizer based on WordNet from NLTK

In [32]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("better", pos="a"))  # a is for adjective

good


In [2]:
# using spacy
import spacy
sp = spacy.load("en_core_web_sm")
token = sp(u'better')
for word in token:
    print(word.text, word.lemma_)

better well


# POS Tagging

In [3]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp(
    u"Charles Spencer Chaplin was born on 16 april 1889 toHannah Chaplin born Hannah Harriet Pedlingham Hill and Charles Chaplin Sr"
)
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.shape_, token.is_alpha, token.is_stop)


Charles Charles PROPN Xxxxx True False
Spencer Spencer PROPN Xxxxx True False
Chaplin Chaplin PROPN Xxxxx True False
was be AUX xxx True True
born bear VERB xxxx True False
on on ADP xx True True
16 16 NUM dd False False
april april PROPN xxxx True False
1889 1889 NUM dddd False False
toHannah toHannah PROPN xxXxxxx True False
Chaplin Chaplin PROPN Xxxxx True False
born bear VERB xxxx True False
Hannah Hannah PROPN Xxxxx True False
Harriet Harriet PROPN Xxxxx True False
Pedlingham Pedlingham PROPN Xxxxx True False
Hill Hill PROPN Xxxx True False
and and CCONJ xxx True True
Charles Charles PROPN Xxxxx True False
Chaplin Chaplin PROPN Xxxxx True False
Sr Sr PROPN Xx True False
