[View in Colaboratory](https://colab.research.google.com/github/gmum/natural-language-processing-classes/blob/master/lab-2-preprocessing/notebook.ipynb)

# Lecture 2 - Text preprocessing

## Example of preprocessing

(from [article](https://www.kdnuggets.com/2017/12/general-approach-preprocessing-text-data.html) by Matthew Mayo)

Beyond the standard Python libraries, we are also using the following:

- [NLTK](http://www.nltk.org/) - The Natural Language ToolKit is one of the best-known and most-used NLP libraries in the Python ecosystem, useful for all sorts of tasks from tokenization, to stemming, to part of speech tagging, and beyond
- [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) - BeautifulSoup is a useful library for extracting data from HTML and XML documents
- [Inflect](https://pypi.org/project/inflect/) - This is a simple library for accomplishing the natural language related tasks of generating plurals, singular nouns, ordinals, and indefinite articles, and converting numbers to words

In [3]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting joblib
  Downloading joblib-1.3.2-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.2/302.2 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
Installing collected packages: joblib, nltk
Successfully installed joblib-1.3.2 nltk-3.8.1


In [4]:
import re, string, unicodedata
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('gutenberg')
nltk.download('averaged_perceptron_tagger')
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize, sent_tokenize, TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, PorterStemmer,WordNetLemmatizer
!pip install inflect
import inflect

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/krystynawaniova/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/krystynawaniova/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/krystynawaniova/nltk_data...
[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/krystynawaniova/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/krystynawaniova/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Collecting inflect
  Downloading inflect-7.0.0-py3-none-any.whl (34 kB)
Installing collected packages: inflect
Successfully installed inflect-7.0.0


In [118]:
from typing import List

def sortWithIndexes(arr):
  data =  [(i, number) for i, number in zip(range(len(arr)), arr)]
  data = sorted(data, key=lambda temp: temp[1])
  return [x[1] for x in data], [x[0] for x in data]

def twoSum(nums: List[int], target: int) -> List[int]:
    # n * O(log n)
    i = target
    nums, indexes = sortWithIndexes(nums)
    result = []
    while i > 0:
      try:
        idx = bisearch_idx(nums, i)
        if idx >= 0:
          remaining = target - i
          try:
            remaining_idx = bisearch_idx(nums[:idx], remaining)
          except Exception as X:
            print(X)
          if remaining_idx >= 0:
            result = [remaining_idx, idx]
            break
      except Exception as X:
        print()
      i = i - 1
    # print(f'result: {result}')
    return [indexes[result[0]], indexes[result[1]]]



def bisearch_idx(arr, target):
    idx = int(len(arr) / 2)
    # print(idx)
    if idx >= len(arr):
      raise Exception('Index out of bounds')
    middle = arr[idx]
    if middle < target:
      return idx + 1 + bisearch_idx(arr[(idx + 1):], target)
    elif middle > target:
      return bisearch_idx(arr[:idx], target)
    elif middle == target:
      return idx
    raise Exception('Index out of bounds')


# def bisearch(arr, target):
#   idx = int(len(arr) / 2)
#   if idx >= len(arr):
#     return False
#   middle = arr[idx]
#   if middle < target:
#     return bisearch(arr[(idx + 1):], target)
#   elif middle > target:
#     return bisearch(arr[:idx], target)
#   elif middle == target:
#     return idx
#   return False



# print(twoSum([1,2,3,5,6,7,8], 13))
print(twoSum([2,7,11,15], 9))
# print(bisearch_idx([1,2,3,4,5,6,7,8,9], 12))


# assert bisearch_idx([1,2,3,4,5,6,7,8,9], 6) == 5
# assert bisearch_idx([1,2,3,4,5,6,7,8,9], 7) == 6
# assert bisearch_idx([1,2,3,4,5,6,7,8,9], 3) == 2
# assert bisearch_idx([1,2,3,4,5,6,7,8,9], 1) == 0
# assert bisearch([1,2,3,4,5,6,7,8,9], 7) == True
# assert bisearch([1,2,3,4,5,6,7,8,9], 8) == True
# assert bisearch([1,2,3,4,5,6,7,8,9], 9) == True
# assert bisearch([1,2,3,4,5,6,7,8,9], 10) == False
# assert bisearch_idx([1,2,3,4,5,6,7,8,9], 12) == -1

assert twoSum([1,2,3], 3) == [0,1]
assert twoSum([1,2,3,5,6,7,8], 13) == [3,6]
assert twoSum([2,7,11,15], 9) == [0,1]
assert twoSum([3,2,4], 6) == [1,2]
assert twoSum([1,2,3,5,6,7,8], 13) == [3,6]
# assert(twoSum([1,2,3], 3), [0,1])



[0, 1]
Index out of bounds

















In [194]:
class Solution:
    def sortWithIndexes(self, arr):
        data =  [(i, number) for i, number in zip(range(len(arr)), arr)]
        data = sorted(data, key=lambda temp: temp[1])
        return [x[1] for x in data], [x[0] for x in data]

    # def bisearch_idx(self, arr, target, idx):
    #     idx = int(len(arr) / 2)
    #     if idx >= len(arr):
    #         raise Exception('Index out of bounds')
    #     middle = arr[idx]
    #     if middle < target:
    #         return idx + 1 + self.bisearch_idx(arr[(idx + 1):], target)
    #     elif middle > target:
    #         return self.bisearch_idx(arr[:idx], target)
    #     elif middle == target:
    #         return idx
    #     else:
    #         raise Exception('Index out of bounds')

    def bisearch_idx(self, arr, target, index = 0):
        idx = int(len(arr) / 2)
        if idx >= len(arr):
            raise Exception('Index out of bounds')
        middle = arr[idx]
        if middle < target:
            return self.bisearch_idx(arr[(idx + 1):], target, index + idx + 1)
        elif middle > target:
            return self.bisearch_idx(arr[:idx], target, index )
        elif middle == target:
            return index + idx
        else:
            raise Exception('Index out of bounds')


    def twoSum(self,  nums: List[int], target: int) -> List[int]:
        # n * O(log n)
        index = len(nums) - 1
        nums, indexes = self.sortWithIndexes(nums)
        result = []
        while index >= 0:
            i = nums[index]
            try:
                idx = self.bisearch_idx(nums, i)
                while len(nums) > idx + 1 and nums[idx + 1] == i:
                    idx = idx + 1
                remaining = target - i
                remaining_idx = self.bisearch_idx(nums[:idx], remaining)

                if remaining_idx >= 0:
                    result = [remaining_idx, index]
                    break
            except Exception as X:
                print('Error: ', X)
            index = index - 1
        return [indexes[result[0]], indexes[result[1]]]



solution = Solution()

# assert solution.bisearch_idx([1,2,3,4,5,6,7,8,9], 6) == 5
# assert solution.bisearch_idx([1,2,3,4,5,6,7,8,9], 7) == 6
# assert solution.bisearch_idx([1,2,3,4,5,6,7,8,9], 3) == 2
# assert solution.bisearch_idx([1,2,3,4,5,6,7,8,9], 1) == 0

print(solution.twoSum([0,4,3,0], 0))
assert solution.twoSum([3,2,3], 6) == [0,2]
assert solution.twoSum([0,4,3,0], 0) == [0,3]
assert solution.twoSum([-3,4,3,90], 0) == [0,2]

assert solution.twoSum([1,2,3], 3) == [0,1]
assert solution.twoSum([1,2,3,5,6,7,8], 13) == [3,6]
assert solution.twoSum([2,7,11,15], 9) == [0,1]
assert solution.twoSum([3,2,4], 6) == [1,2]
assert solution.twoSum([1,2,3,5,6,7,8], 13) == [3,6]

Error:  Index out of bounds
Error:  Index out of bounds
[0, 3]
Error:  Index out of bounds
Error:  Index out of bounds
Error:  Index out of bounds
Error:  Index out of bounds
Error:  Index out of bounds
Error:  Index out of bounds
Error:  Index out of bounds


In [213]:
class Solution:
    def timeRequiredToBuy(self, tickets: List[int], k: int) -> int:
        less = [n for n in tickets if n < tickets[k]]
        more = [n for n in tickets if n >= tickets[k]]
        bigger_behind = [n for n in tickets[k + 1:] if n >= tickets[k]]
        # print(less)
        # print(more)
        print(bigger_behind)
        return tickets[k] * len(more) + sum(less) - len(bigger_behind)

solution = Solution()

print(solution.timeRequiredToBuy([15,66,3,47,71,27,54,43,97,34,94,33,54,26,15,52,20,71,88,42,50,6,66,88,36,99,27,82,7,72], 18))


# assert solution.timeRequiredToBuy([2,3,4,5,6], 2) == 14
# assert solution.timeRequiredToBuy([5,1,1,1], 0) == 8
assert solution.timeRequiredToBuy([84,49,5,24,70,77,87,8], 3) == 154
# assert solution.timeRequiredToBuy([15,66,3,47,71,27,54,43,97,34,94,33,54,26,15,52,20,71,88,42,50,6,66,88,36,99,27,82,7,72], 18) == 1457

[88, 99]
1457
[70, 77, 87]


In [216]:

class Solution:
    def timeRequiredToBuy(self, tickets: List[int], k: int) -> int:
        less = list(filter(lambda x: x < tickets[k], tickets))
        more = list(filter(lambda x: x >= tickets[k], tickets))
        bigger_behind = list(filter(lambda x: x >= tickets[k], tickets[k + 1:]))

        return tickets[k] * len(more) + sum(less) - len(bigger_behind)

solution = Solution()

assert solution.timeRequiredToBuy([84,49,5,24,70,77,87,8], 3) == 154


We need some sample text. We'll start with something very small and artificial in order to easily see the results of what we are doing step by step.

In [5]:
sample = """<h1>Title Goes Here</h1>

<b>Bolded Text</b>
<i>Italicized Text</i>

<img src="this should all be gone"/>

<a href="this will be gone, too">But this will still be here!</a>

I run. He ran. She is running. Will they stop running?

I talked. She was talking. They talked to them about running. Who ran to the talking runner?

[Some text we don't want to keep is in here]

¡Sebastián, Nicolás, Alejandro and Jéronimo are going to the store tomorrow morning!

something... is! wrong() with.,; this :: sentence.

I can't do this anymore. I didn't know them. Why couldn't you have dinner at the restaurant?

My favorite movie franchises, in order: Indiana Jones; Marvel Cinematic Universe; Star Wars; Back to the Future; Harry Potter.

Don't do it.... Just don't. Billy! I know what you're doing. This is a great little house you've got here.

[This is some other unwanted text]

John: "Well, well, well."
James: "There, there. There, there."

&nbsp;&nbsp;

There are a lot of reasons not to do this. There are 101 reasons not to do it. 1000000 reasons, actually.
I have to go get 2 tutus from 2 different stores, too.

22    45   1067   445

{{Here is some stuff inside of double curly braces.}}
{Here is more stuff in single curly braces.}

[DELETE]

</body>
</html>"""

A toy dataset indeed, but make no mistake; the steps we are taking here to preprocessing this data are fully transferable.

The text data preprocessing framework:

![](https://www.kdnuggets.com/wp-content/uploads/text-preprocessing-framework-2.png)



### Noise Removal

Let's loosely define noise removal as text-specific normalization tasks which often take place prior to tokenization. Some would argue that, while the other 2 major steps of the preprocessing framework (tokenization and normalization) are basically task-independent, noise removal is much more task-specific.

Sample noise removal tasks could include:

- removing text file headers, footers
- removing HTML, XML, etc. markup and metadata
- extracting valuable data from other formats, such as JSON

As you can imagine, the boundary between noise removal and data collection and assembly, on the one hand, is a fuzzy one, while the line between noise removal and normalization is blurred on the other. Given its close relationship with specific texts and their collection and assembly, many denoising tasks, such as parsing a JSON structure, would obviously need to be implemented prior to tokenization.

In our data preprocessing pipeline, we will strip away HTML markup with the help of the BeautifulSoup library, and use regular expressions to remove open and close double brackets and anything in between them (we assume this is necessary based on our sample text).

In [6]:
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def remove_between_square_brackets(text):
    return re.sub(r'\[[^]]*\]', '', text)

def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text

sample = denoise_text(sample)
print(sample)

Title Goes Here
Bolded Text
Italicized Text

But this will still be here!

I run. He ran. She is running. Will they stop running?

I talked. She was talking. They talked to them about running. Who ran to the talking runner?



¡Sebastián, Nicolás, Alejandro and Jéronimo are going to the store tomorrow morning!

something... is! wrong() with.,; this :: sentence.

I can't do this anymore. I didn't know them. Why couldn't you have dinner at the restaurant?

My favorite movie franchises, in order: Indiana Jones; Marvel Cinematic Universe; Star Wars; Back to the Future; Harry Potter.

Don't do it.... Just don't. Billy! I know what you're doing. This is a great little house you've got here.



John: "Well, well, well."
James: "There, there. There, there."

  

There are a lot of reasons not to do this. There are 101 reasons not to do it. 1000000 reasons, actually.
I have to go get 2 tutus from 2 different stores, too.

22    45   1067   445

{{Here is some stuff inside of double curly braces

### Tokenization

Tokenization is a step which splits longer strings of text into smaller pieces, or tokens. Larger chunks of text can be tokenized into sentences, sentences can be tokenized into words, etc. Further processing is generally performed after a piece of text has been appropriately tokenized. Tokenization is also referred to as text segmentation or lexical analysis. Sometimes segmentation is used to refer to the breakdown of a large chunk of text into pieces larger than words (e.g. paragraphs or sentences), while tokenization is reserved for the breakdown process which results exclusively in words.

For our task, we will tokenize our sample text into a list of words. This is done using NTLK's word_tokenize() function.



In [7]:
words = nltk.word_tokenize(sample)
print(words)

['Title', 'Goes', 'Here', 'Bolded', 'Text', 'Italicized', 'Text', 'But', 'this', 'will', 'still', 'be', 'here', '!', 'I', 'run', '.', 'He', 'ran', '.', 'She', 'is', 'running', '.', 'Will', 'they', 'stop', 'running', '?', 'I', 'talked', '.', 'She', 'was', 'talking', '.', 'They', 'talked', 'to', 'them', 'about', 'running', '.', 'Who', 'ran', 'to', 'the', 'talking', 'runner', '?', '¡Sebastián', ',', 'Nicolás', ',', 'Alejandro', 'and', 'Jéronimo', 'are', 'going', 'to', 'the', 'store', 'tomorrow', 'morning', '!', 'something', '...', 'is', '!', 'wrong', '(', ')', 'with.', ',', ';', 'this', ':', ':', 'sentence', '.', 'I', 'ca', "n't", 'do', 'this', 'anymore', '.', 'I', 'did', "n't", 'know', 'them', '.', 'Why', 'could', "n't", 'you', 'have', 'dinner', 'at', 'the', 'restaurant', '?', 'My', 'favorite', 'movie', 'franchises', ',', 'in', 'order', ':', 'Indiana', 'Jones', ';', 'Marvel', 'Cinematic', 'Universe', ';', 'Star', 'Wars', ';', 'Back', 'to', 'the', 'Future', ';', 'Harry', 'Potter', '.', 'D

### Normalization
 
Normalization generally refers to a series of related tasks meant to put all text on a level playing field: converting all text to the same case (upper or lower), removing punctuation, converting numbers to their word equivalents, and so on. Normalization puts all words on equal footing, and allows processing to proceed uniformly.

Normalizing text can mean performing a number of tasks, but for our framework we will approach normalization in 3 distinct steps: 
- stemming, 
- lemmatization,
- everything else. 

For specifics on what these distinct steps may be, [see this post](https://www.kdnuggets.com/2017/12/general-approach-preprocessing-text-data.html).

Remember, after tokenization, we are no longer working at a text level, but now at a word level. Our normalization functions, shown below, reflect this. Function names and comments should provide the necessary insight into what each does.

In [8]:
def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def replace_numbers(words):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words

def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words

def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemmatize_verbs(words):
    """Lemmatize list of tokenized words as verbs"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

def normalize(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    words = replace_numbers(words)
    words = remove_stopwords(words)
    return words

words = normalize(words)
print(words)

['title', 'goes', 'bolded', 'text', 'italicized', 'text', 'still', 'run', 'ran', 'running', 'stop', 'running', 'talked', 'talking', 'talked', 'running', 'ran', 'talking', 'runner', 'sebastian', 'nicolas', 'alejandro', 'jeronimo', 'going', 'store', 'tomorrow', 'morning', 'something', 'wrong', 'sentence', 'ca', 'nt', 'anymore', 'nt', 'know', 'could', 'nt', 'dinner', 'restaurant', 'favorite', 'movie', 'franchises', 'order', 'indiana', 'jones', 'marvel', 'cinematic', 'universe', 'star', 'wars', 'back', 'future', 'harry', 'potter', 'nt', 'nt', 'billy', 'know', 'great', 'little', 'house', 'got', 'john', 'well', 'well', 'well', 'james', 'lot', 'reasons', 'one hundred and one', 'reasons', 'one million', 'reasons', 'actually', 'go', 'get', 'two', 'tutus', 'two', 'different', 'stores', 'twenty-two', 'forty-five', 'one thousand and sixty-seven', 'four hundred and forty-five', 'stuff', 'inside', 'double', 'curly', 'braces', 'stuff', 'single', 'curly', 'braces']


Calling the stemming and lemming functions are done as below:

In [9]:

# While stemming involves chopping off prefixes or suffixes from words to obtain a common root,
# lemmatization aims for a valid base form through linguistic analysis.
# Lemmatization tends to be more accurate but can be computationally more expensive than stemming.

def stem_and_lemmatize(words):
    stems = stem_words(words)
    lemmas = lemmatize_verbs(words)
    return stems, lemmas

stems, lemmas = stem_and_lemmatize(words)
print('Stemmed:\n', stems)
print('\nLemmatized:\n', lemmas)

Stemmed:
 ['titl', 'goe', 'bold', 'text', 'it', 'text', 'stil', 'run', 'ran', 'run', 'stop', 'run', 'talk', 'talk', 'talk', 'run', 'ran', 'talk', 'run', 'sebast', 'nicola', 'alejandro', 'jeronimo', 'going', 'stor', 'tomorrow', 'morn', 'someth', 'wrong', 'sent', 'ca', 'nt', 'anym', 'nt', 'know', 'could', 'nt', 'din', 'resta', 'favorit', 'movy', 'franch', 'ord', 'indian', 'jon', 'marvel', 'cinem', 'univers', 'star', 'war', 'back', 'fut', 'harry', 'pot', 'nt', 'nt', 'bil', 'know', 'gre', 'littl', 'hous', 'got', 'john', 'wel', 'wel', 'wel', 'jam', 'lot', 'reason', 'one hundred and on', 'reason', 'one million', 'reason', 'act', 'go', 'get', 'two', 'tut', 'two', 'diff', 'stor', 'twenty-two', 'forty-five', 'one thousand and sixty-seven', 'four hundred and forty-five', 'stuff', 'insid', 'doubl', 'cur', 'brac', 'stuff', 'singl', 'cur', 'brac']

Lemmatized:
 ['title', 'go', 'bolded', 'text', 'italicize', 'text', 'still', 'run', 'run', 'run', 'stop', 'run', 'talk', 'talk', 'talk', 'run', 'run', '

Depending on your NLP task or preference, one of these may be more appropriate than the other. See here for a [discussion on lemmatization vs stemming](https://blog.bitext.com/what-is-the-difference-between-stemming-and-lemmatization/).

In order to resolve ambiguous cases, lemmatization usually requires tokens to be accompanied by part-of-speech tags. For example, the word lemma for rose depends on whether it is used as a noun or a verb:

In [10]:
lemmer = WordNetLemmatizer()
print(f"noun lemmatization: {lemmer.lemmatize('rose', 'n')}")
print(f"verb lemmatization: {lemmer.lemmatize('rose', 'v')}")

noun lemmatization: rose
verb lemmatization: rise


## Exercise 1.

In [18]:
text_1 = """Now the Children of Ilu´vatar are Elves and Men, the Firstborn
and the Followers. And amid all the splendours of the
World, its vast halls and spaces, and its wheeling fires, Ilu´vatar
chose a place for their habitation in the Deeps of Time
and in the midst of the innumerable stars. And this habitation
might seem a little thing to those who consider only the
majesty of the Ainur, and not their terrible sharpness; as who
should take the whole field of Arda for the foundation of a
pillar and so raise it until the cone of its summit were more
bitter than a needle; or who consider only the immeasurable
vastness of the World, which still the Ainur are shaping, and
not the minute precision to which they shape all things
therein. But when the Ainur had beheld this habitation in a
vision and had seen the Children of Ilu´vatar arise therein,
then many of the most mighty among them bent all their
thought and their desire towards that place. And of these
Melkor was the chief, even as he was in the beginning the
greatest of the Ainur who took part in the Music. And he
feigned, even to himself at first, that he desired to go thither
and order all things for the good of the Children of Ilu´vatar,
controlling the turmoils of the heat and the cold that had
come to pass through him. But he desired rather to subdue
to his will both Elves and Men, envying the gifts with which
Ilu´vatar promised to endow them; and he wished himself to
have subjects and servants, and to be called Lord, and to be
a master over other wills.
But the other Ainur looked upon this habitation set within
the vast spaces of the World, which the Elves call Arda,
the Earth; and their hearts rejoiced in light, and their eyes
beholding many colours were filled with gladness; but
because of the roaring of the sea they felt a great unquiet.
And they observed the winds and the air, and the matters of
which Arda was made, of iron and stone and silver and gold
and many substances: but of all these water they most greatly
praised. And it is said by the Eldar that in water there lives
yet the echo of the Music of the Ainur more than in any
substance else that is in this Earth; and many of the Children
of Ilu´vatar hearken still unsated to the voices of the Sea, and
yet know not for what they listen.
Now to water had that Ainu whom the Elves call Ulmo
turned his thought, and of all most deeply was he instructed
by Ilu´vatar in music. But of the airs and winds Manwe¨ most
had pondered, who is the noblest of the Ainur. Of the fabric
of Earth had Aule¨ thought, to whom Ilu´vatar had given skill
and knowledge scare less than to Melkor; but the delight and
pride of Aule¨ is in the deed of making, and in the thing made,
and neither in possession nor in his own mastery; wherefore
he gives and hoards not, and is free from care, passing ever
on to some new work.
And Ilu´vatar spoke to Ulmo, and said: ‘Seest thou not how
here in this little realm in the Deeps of Time Melkor hath
made war upon thy province? He hath bethought him of
bitter cold immoderate, and yet hath not destroyed the beauty
of thy fountains, nor of thy clear pools. Behold the snow,
and the cunning work of frost! Melkor hath devised heats
and fire without restraint, and hath not dried up thy desire
nor utterly quelled the music of the sea. Behold rather the
height and glory of the clouds, and the everchanging mists;
and listen to the fall of rain upon the Earth! And in these
clouds thou art drawn nearer to Manwe¨, thy friend, whom
thou lovest.’
Then Ulmo answered: ‘Truly, Water is become now fairer
than my heart imagined, neither had my secret thought conceived
the snowflake, nor in all my music was contained the
falling of the rain. I will seek Manwe¨, that he and I may make
melodies for ever to thy delight!’ And Manwe¨ and Ulmo have
from the beginning been allied, and in all things have served
most faithfully the purpose of Ilu´vatar.
But even as Ulmo spoke, and while the Ainur were yet
gazing upon this vision, it was taken away and hidden from
their sight; and it seemed to them that in that moment they
perceived a new thing, Darkness, which they had not known
before except in thought. But they had become enamoured
of the beauty of the vision and engrossed in the unfolding
of the World which came there to being, and their minds
were filled with it; for the history was incomplete and the
circles of time not full-wrought when the vision was taken
away. And some have said that the vision ceased ere the
fulfilment of the Dominion of Men and the fading of the
Firstborn; wherefore, though the Music is over all, the Valar
have not seen as with sight the Later Ages or the ending of
the World.
Then there was unrest among the Ainur; but Ilu´vatar called
to them, and said: ‘I know the desire of your minds that what
ye have seen should verily be, not only in your thought, but
even as ye yourselves are, and yet other. Therefore I say: Ea¨!
Let these things Be! And I will send forth into the Void the
Flame Imperishable, and it shall be at the heart of the World,
and the World shall Be; and those of you that will may go
down into it.’ And suddenly the Ainur saw afar off a light,
as it were a cloud with a living heart of flame; and they knew
that this was no vision only, but that Ilu´vatar had made a
new thing: Ea¨, the World that Is.""".replace("\n", " ")



1. Make a vocabulary that for each token contains the number of its occurencies in above text. Store the vocabulary as a list of tuples. Sort this vocabulary by the number of occurences, from biggest to smallest. Return the second most popular word from the dictionary. Use word_tokenize from nltk.

2. Repeat this process, but this time also convert all tokens to lowercase and lemmatize all tokens as verbs.

3. Use sentence tokenizer (nltk.sent_tokenize) to find the longest sentence (with respect to number of characters) in the text and return the number of words in this sentence (excluding punctuation!)

4. Read about [different tokenizers](https://www.nltk.org/api/nltk.tokenize.html) from NLTK. Give example of sentence, that would be tokenized better by TweetTokenizer().tokenize() and a sentence that would be better after word_tokenize().


In [64]:
def create_dictionary(words):
    word_freq = {}
    for word in words:
        if word not in word_freq:
            word_freq[word] = 0
        word_freq[word] += 1
    return word_freq

words = nltk.word_tokenize(text_1)


In [65]:
# 1. Tokenize the text into words
print(words)
print(len(words))
print(create_dictionary(words))
list_of_words_1 = [(word, freq) for word, freq in create_dictionary(words).items()]
sorted_list_1 = sorted(list_of_words_1, key=lambda x: x[1], reverse=True)
print(sorted_list_1)
print(len(sorted_list_1))

['Now', 'the', 'Children', 'of', 'Ilu´vatar', 'are', 'Elves', 'and', 'Men', ',', 'the', 'Firstborn', 'and', 'the', 'Followers', '.', 'And', 'amid', 'all', 'the', 'splendours', 'of', 'the', 'World', ',', 'its', 'vast', 'halls', 'and', 'spaces', ',', 'and', 'its', 'wheeling', 'fires', ',', 'Ilu´vatar', 'chose', 'a', 'place', 'for', 'their', 'habitation', 'in', 'the', 'Deeps', 'of', 'Time', 'and', 'in', 'the', 'midst', 'of', 'the', 'innumerable', 'stars', '.', 'And', 'this', 'habitation', 'might', 'seem', 'a', 'little', 'thing', 'to', 'those', 'who', 'consider', 'only', 'the', 'majesty', 'of', 'the', 'Ainur', ',', 'and', 'not', 'their', 'terrible', 'sharpness', ';', 'as', 'who', 'should', 'take', 'the', 'whole', 'field', 'of', 'Arda', 'for', 'the', 'foundation', 'of', 'a', 'pillar', 'and', 'so', 'raise', 'it', 'until', 'the', 'cone', 'of', 'its', 'summit', 'were', 'more', 'bitter', 'than', 'a', 'needle', ';', 'or', 'who', 'consider', 'only', 'the', 'immeasurable', 'vastness', 'of', 'the',

In [66]:
# 2. Tokenize the text into words, lemmatize and lowercase
words_2 = to_lowercase(words)
words_2 = lemmatize_verbs(words_2)

list_of_words_2 = [(word, freq) for word, freq in create_dictionary(words_2).items()]
sorted_list_2 = sorted(list_of_words_2, key=lambda x: x[1], reverse=True)
print(sorted_list_2)
print(len(sorted_list_2))

[('the', 99), ('and', 71), (',', 57), ('of', 56), ('be', 34), ('in', 23), ('to', 23), ('.', 22), ('have', 17), (';', 15), ('that', 15), ('ilu´vatar', 12), ('a', 12), ('but', 12), ('not', 11), ('ainur', 10), ('all', 9), ('they', 9), ('he', 9), ('world', 8), ('their', 8), ('this', 7), ('it', 7), ('which', 7), ('for', 6), ('as', 6), ('vision', 6), ('think', 6), ('music', 6), ('make', 6), (':', 6), ('thy', 6), ('who', 5), ('most', 5), ('desire', 5), ('will', 5), ('with', 5), ('say', 5), ('yet', 5), ('ulmo', 5), ('hath', 5), ('!', 5), ('i', 5), ('children', 4), ('elves', 4), ('habitation', 4), ('thing', 4), ('only', 4), ('take', 4), ('than', 4), ('things', 4), ('behold', 4), ('many', 4), ('them', 4), ('these', 4), ('melkor', 4), ('even', 4), ('call', 4), ('upon', 4), ('earth', 4), ('water', 4), ('know', 4), ('manwe¨', 4), ('nor', 4), ('now', 3), ('men', 3), ('its', 3), ('time', 3), ('arda', 3), ('see', 3), ('then', 3), ('his', 3), ('other', 3), ('sea', 3), ('there', 3), ('whom', 3), ('from'

In [68]:
sentenses =nltk.sent_tokenize(text_1)
words_in_sentenses = [nltk.word_tokenize(sentense) for sentense in sentenses]
words_in_sentenses_without_punctuation = [remove_punctuation(sentense) for sentense in words_in_sentenses]
list_with_sentense_len = sorted([len(sentense) for sentense in words_in_sentenses_without_punctuation], reverse=True)
print(len(sentenses))
print(list_with_sentense_len)

27
[82, 79, 66, 56, 52, 49, 48, 48, 46, 45, 43, 42, 38, 38, 37, 36, 28, 26, 25, 23, 23, 23, 17, 14, 9, 4, 4]


In [69]:
# 1
vocab_1 = sorted_list_1
second_most_popular_1 = sorted_list_1[1]
# 2
vocab_2 = sorted_list_2
second_most_popular_2 = sorted_list_2[1]

# 3
num_tokens = list_with_sentense_len[0]


assert len(vocab_1) == 379
assert len(vocab_2) == 336
assert num_tokens == 82
print(second_most_popular_1)
print(second_most_popular_2)

('and', 59)
('and', 71)


## Exercise 2.



In [70]:
# You can download texts using NLTK :)

raw = nltk.corpus.gutenberg.raw("burgess-busterbrown.txt")
# print(raw[:500])
words = nltk.corpus.gutenberg.words("burgess-busterbrown.txt")
# print(words[:20])
sents = nltk.corpus.gutenberg.sents("burgess-busterbrown.txt")
# print(sents[:5])

Using "burgess-busterbrown.txt", do the following:

1. Count the number of sentences containing word "the" (case insensitive)
2. Compute the average token length in the above corpus.
3. (Stemming) Read about [Porter](http://snowball.tartarus.org/algorithms/english/stemmer.html) and [Lancaster](https://www.nltk.org/_modules/nltk/stem/lancaster.html) stemmers and (after lowercasing the tokens) find words from above file that have different stemmer outputs.
4. (Lemmatization) Perform lemmatization on above corpus. Use POS tagger (defined below) to write a lemmatizer that uses a specific POS tag for each word. Give an example of sentences (from corpus) where using lemmatizer with POS tags is working better than lemmatize_verbs(), that assumes that each word is a verb.

In [80]:
#solution 1.
sentenses = nltk.sent_tokenize(raw)
print(len(sentenses))
sentenses_to_words = [to_lowercase(nltk.word_tokenize(sentense)) for sentense in sentenses]
print(len(sentenses_to_words))
result = list(filter(lambda x: "the" in x, sentenses_to_words))
len(result)

1001
1001


369

In [87]:
import functools
import statistics

#solution 2.

words = nltk.word_tokenize(raw)
print(len(words))
words_len = [len(word) for word in words]
sum = functools.reduce(lambda x, y: x + y, words_len)
print("Result: " ,sum/len(words_len))
result = statistics.mean(words_len)
result

18542
Result:  3.6145507496494447


3.6145507496494447

In [94]:
# solution 3.
def stem_words_lanc(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def stem_words_porter(words):
    """Stem words in list of tokenized words"""
    stemmer = PorterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

words = nltk.word_tokenize(raw)
words = to_lowercase(words)
words = remove_punctuation(words)
words_dic = create_dictionary(words)
words = list(words_dic.keys())
words_lanc = stem_words_lanc(words)
words_porter = stem_words_porter(words)


print(len(words_lanc))
print(len(words_porter))
print(words_porter)
print(words_lanc)

different_word_pairs = []

for l,p in zip(words_lanc, words_porter):
    if l != p:
        different_word_pairs.append((l,p))
print(different_word_pairs)
len(different_word_pairs)

1538
1538
['the', 'adventur', 'of', 'buster', 'bear', 'by', 'thornton', 'w', 'burgess', '1920', 'i', 'goe', 'fish', 'yawn', 'as', 'he', 'lay', 'on', 'hi', 'comfort', 'bed', 'leav', 'and', 'watch', 'first', 'earli', 'morn', 'sunbeam', 'creep', 'through', 'green', 'forest', 'to', 'chase', 'out', 'black', 'shadow', 'onc', 'more', 'slowli', 'got', 'feet', 'shook', 'himself', 'then', 'walk', 'over', 'a', 'big', 'pinetre', 'stood', 'up', 'hind', 'leg', 'reach', 'high', 'trunk', 'tree', 'could', 'scratch', 'bark', 'with', 'great', 'claw', 'after', 'that', 'until', 'it', 'seem', 'if', 'jaw', 'would', 'crack', 'sat', 'down', 'think', 'what', 'want', 'for', 'breakfast', 'while', 'there', 'tri', 'make', 'mind', 'tast', 'best', 'wa', 'listen', 'sound', 'told', 'wake', 'all', 'littl', 'peopl', 'who', 'live', 'in', 'heard', 'sammi', 'jay', 'way', 'off', 'distanc', 'scream', 'thief', 'grin', 'wonder', 'thought', 'some', 'one', 'ha', 'stolen', 's', 'or', 'els', 'probabl', 'is', 'chatter', 'red', 'squi

560

In [None]:
# for 4.


def pt_to_wn(pos):
    """
    Takes a Penn Treebank tag and converts it to an
    appropriate WordNet equivalent for lemmatization.

    A list of Penn Treebank tags is available at:
    https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
    """

    from nltk.corpus.reader.wordnet import NOUN, VERB, ADJ, ADV

    pos = pos.lower()

    if pos.startswith('jj'):
        tag = ADJ
    elif pos == 'md':
        # Modal auxiliary verbs
        tag = VERB
    elif pos.startswith('rb'):
        tag = ADV
    elif pos.startswith('vb'):
        tag = VERB
    elif pos == 'wrb':
        # Wh-adverb (how, however, whence, whenever...)
        tag = ADV
    else:
        # default to VERB
        # This is not strictly correct, but it is good
        # enough for lemmatization.
        tag = VERB

    return tag


def nltk_pos_tagger(tokens):
    """
    Takes a list of tokens and returns a list of
    tuples [(token, wordnet_tag), ..]
    """

    # Tag tokens with part-of-speech:
    tagged = nltk.pos_tag(tokens)

    # Convert our Treebank-style tags to WordNet-style tags.
    tagged = [(word, pt_to_wn(tag))
                     for (word, tag) in tagged]
    return tagged


def lemmatizer(tokens):
    tagged = nltk_pos_tagger(tokens)

    #write code to lemmatize tokens using taggs from nltk_pos_tagger

    pass
