### The text corpus, file.txt

Follow these steps to implement this activity:

1. Import the necessary libraries.

2. Load the text corpus to a variable.

3. Apply the tokenization process to the text corpus and print the first 20 tokens.

4. Apply spelling correction on each token and print the initial 20 corrected tokens
as well as the corrected text corpus.

5. Apply PoS tags to each of the corrected tokens and print them.

6. Remove stop words from the corrected token list and print the initial 20 tokens.

7. Apply stemming and lemmatization to the corrected token list and then print the
initial 20 tokens.

8. Detect the sentence boundaries in the given text corpus and print the total
number of sentences.

In [20]:
# (1) Import the necessary libraries
import string
import pandas as pd
import numpy as np
from nltk import stem
from nltk import ne_chunk
from nltk.wsd import lesk
from autocorrect import Speller
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag, download

In [21]:
download('stopwords')
download(['punkt','averaged_perceptron_tagger','stopwords'])

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/josephitopa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/josephitopa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/josephitopa/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/josephitopa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [22]:
# (2)  Load the text corpus to a variable
sentence = open("file.txt", 'r').read()
sentence

'The reader of this course should have a basic knowledge of the Python programming lenguage.\nHe/she must have knowldge of data types in Python.He should be able to write functions,\nand also have the ability to import and use libraries and packages in Python. Familiarity\nwith basic linguistics and probability is assumed although not required to fully\ncomplete this course.\n'

In [23]:
# (3) apply tokenization process
sentence_token = word_tokenize(sentence)
sentence_token[:20]

['The',
 'reader',
 'of',
 'this',
 'course',
 'should',
 'have',
 'a',
 'basic',
 'knowledge',
 'of',
 'the',
 'Python',
 'programming',
 'lenguage',
 '.',
 'He/she',
 'must',
 'have',
 'knowldge']

In [24]:
# (4) Apply spelling correction on each token and print the initial 20 corrected tokens as well as the corrected text corpus.
spell = Speller(lang = 'en')
sentence_corrected = ' '.join([spell(word) for word in sentence_token])

In [25]:
# (5)  Correct sentence and words
spell = Speller(lang = 'en')

def correct_sentence(words):
    corrected_sentence = ""
    corrected_word_list = []
    for wd in words:
        if wd not in string.punctuation:
            wd_c = spell(wd)
            if wd_c != wd:
                print(wd+" has been corrected to: "+wd_c)
                corrected_sentence = corrected_sentence+" "+wd_c
                corrected_word_list.append(wd_c)
            else:
                corrected_sentence = corrected_sentence+" "+wd
                corrected_word_list.append(wd)
        else:
            corrected_sentence = corrected_sentence + wd
            corrected_word_list.append(wd)
    return corrected_sentence, corrected_word_list

In [26]:
corrected_sentence, corrected_word_list = correct_sentence(sentence_token)

lenguage has been corrected to: language
knowldge has been corrected to: knowledge


In [27]:
corrected_sentence

' The reader of this course should have a basic knowledge of the Python programming language. He/she must have knowledge of data types in Python.He should be able to write functions, and also have the ability to import and use libraries and packages in Python. Familiarity with basic linguistics and probability is assumed although not required to fully complete this course.'

In [28]:
print(corrected_word_list[0:20])

['The', 'reader', 'of', 'this', 'course', 'should', 'have', 'a', 'basic', 'knowledge', 'of', 'the', 'Python', 'programming', 'language', '.', 'He/she', 'must', 'have', 'knowledge']


In [29]:
# (6)  Apply PoS tags
print(pos_tag(corrected_word_list))

[('The', 'DT'), ('reader', 'NN'), ('of', 'IN'), ('this', 'DT'), ('course', 'NN'), ('should', 'MD'), ('have', 'VB'), ('a', 'DT'), ('basic', 'JJ'), ('knowledge', 'NN'), ('of', 'IN'), ('the', 'DT'), ('Python', 'NNP'), ('programming', 'NN'), ('language', 'NN'), ('.', '.'), ('He/she', 'NNP'), ('must', 'MD'), ('have', 'VB'), ('knowledge', 'NN'), ('of', 'IN'), ('data', 'NNS'), ('types', 'NNS'), ('in', 'IN'), ('Python.He', 'NNP'), ('should', 'MD'), ('be', 'VB'), ('able', 'JJ'), ('to', 'TO'), ('write', 'VB'), ('functions', 'NNS'), (',', ','), ('and', 'CC'), ('also', 'RB'), ('have', 'VBP'), ('the', 'DT'), ('ability', 'NN'), ('to', 'TO'), ('import', 'NN'), ('and', 'CC'), ('use', 'NN'), ('libraries', 'NNS'), ('and', 'CC'), ('packages', 'NNS'), ('in', 'IN'), ('Python', 'NNP'), ('.', '.'), ('Familiarity', 'NN'), ('with', 'IN'), ('basic', 'JJ'), ('linguistics', 'NNS'), ('and', 'CC'), ('probability', 'NN'), ('is', 'VBZ'), ('assumed', 'VBN'), ('although', 'IN'), ('not', 'RB'), ('required', 'VBN'), ('to

In [31]:
# (7) remove stop words
stop_words = stopwords.words('english')
def remove_stop_words(word_list):
    corrected_word_list_without_stopwords = []
    for wd in word_list:
        if wd not in stop_words:
            corrected_word_list_without_stopwords.append(wd)
    return corrected_word_list_without_stopwords

In [32]:
corrected_word_list_without_stopwords = remove_stop_words(corrected_word_list)
corrected_word_list_without_stopwords[:20]

['The',
 'reader',
 'course',
 'basic',
 'knowledge',
 'Python',
 'programming',
 'language',
 '.',
 'He/she',
 'must',
 'knowledge',
 'data',
 'types',
 'Python.He',
 'able',
 'write',
 'functions',
 ',',
 'also']

In [33]:
# (8) Apply stemming and lemmatization
stemmer = stem.PorterStemmer()
def get_stems(word_list):
    corrected_word_list_without_stopwords_stemmed = []
    for wd in word_list:
        corrected_word_list_without_stopwords_stemmed\
        .append(stemmer.stem(wd))
    return corrected_word_list_without_stopwords_stemmed

In [34]:
corrected_word_list_without_stopwords_stemmed = \
get_stems(corrected_word_list_without_stopwords)
corrected_word_list_without_stopwords_stemmed[:20]

['the',
 'reader',
 'cours',
 'basic',
 'knowledg',
 'python',
 'program',
 'languag',
 '.',
 'he/sh',
 'must',
 'knowledg',
 'data',
 'type',
 'python.h',
 'abl',
 'write',
 'function',
 ',',
 'also']

In [35]:
lemmatizer = WordNetLemmatizer()
def get_lemma(word_list):
    corrected_word_list_without_stopwords_lemmatized = []
    for wd in word_list:
        corrected_word_list_without_stopwords_lemmatized\
        .append(lemmatizer.lemmatize(wd))
    return corrected_word_list_without_stopwords_lemmatized

In [36]:
corrected_word_list_without_stopwords_lemmatized = \
get_lemma(corrected_word_list_without_stopwords_stemmed)
corrected_word_list_without_stopwords_lemmatized[:20]

['the',
 'reader',
 'cours',
 'basic',
 'knowledg',
 'python',
 'program',
 'languag',
 '.',
 'he/sh',
 'must',
 'knowledg',
 'data',
 'type',
 'python.h',
 'abl',
 'write',
 'function',
 ',',
 'also']

In [38]:
# (8) Detect the sentence boundaries in the given text corpus 
print(sent_tokenize(corrected_sentence))

[' The reader of this course should have a basic knowledge of the Python programming language.', 'He/she must have knowledge of data types in Python.He should be able to write functions, and also have the ability to import and use libraries and packages in Python.', 'Familiarity with basic linguistics and probability is assumed although not required to fully complete this course.']
