# Module 12 Text Processing

- Demo shows basic text processing tasks.
- Normalization
- Stemming
- Lemmatization
- String Methods

### Create a text sample.

In [1]:
s = "Text mining is a subfield of data mining. It also uses machine learning algorithms."

In [2]:
# import the sentence tokenizer.
from nltk.tokenize import sent_tokenize
import nltk

In [3]:
# Run once (uncomment) to download the Punkt data the NLTK tokenizers below rely on:
# nltk.download('punkt_tab')

### `sent_tokenize` returns a list of two sentences.

In [4]:
# sent_tokenize splits the text into a list of sentence strings (two here).
sent_tokenize(s)

['Text mining is a subfield of data mining.',
 'It also uses machine learning algorithms.']

### Import the word tokenizer.

In [5]:
from nltk.tokenize import word_tokenize

### Apply word tokenizer on one sentence.

In [6]:
word_tokenize("Text mining is a subfield of data mining.")

['Text', 'mining', 'is', 'a', 'subfield', 'of', 'data', 'mining', '.']

In [7]:
import string

### Show punctuation marks.

In [8]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

### Add more characters if needed (concatenation builds a new string; `string.punctuation` itself is unchanged).

In [9]:
string.punctuation+'p'

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~p'

In [10]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [11]:
# Count the punctuation characters straight from the constant instead of a
# pasted copy of its value, so the count always matches string.punctuation.
len(string.punctuation)

32

### Remove punctuation.

In [12]:
# Sample sentence that ends with a full stop.
s = "Text mining is a subfield of data mining."

# The third argument of maketrans lists the characters to delete. translate()
# returns a new string, so s itself still contains the full stop afterwards.
punctuation_remover = str.maketrans('', '', string.punctuation)
s.translate(punctuation_remover)

'Text mining is a subfield of data mining'

In [13]:
# The translate() call above was only displayed, not assigned, so s is
# rebound here to the punctuation-free text used by the following cell.
s = 'Text mining is a subfield of data mining'

### split() breaks a string into a list of individual words (splitting on whitespace by default).

In [14]:
s.split()

['Text', 'mining', 'is', 'a', 'subfield', 'of', 'data', 'mining']

### join() combines a list of words into a single string, placing the separator between them.

In [15]:
# Words to glue back together.
l = ['love', 'text', 'mining']

# join() is called on the separator string; here it is a single space.
separator = " "
separator.join(l)

'love text mining'

### nltk has a stopwords list.

In [16]:
from nltk.corpus import stopwords

# Keep the English stop words in a set so membership checks are fast.
english_stops = set(stopwords.words('english')) 

In [17]:
english_stops

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [18]:
len(english_stops)

179

In [19]:
print(english_stops)

{'is', 'was', 'from', 'on', 'then', 'their', 'her', 'he', 'both', 'of', 'themselves', 'if', "shouldn't", 'my', "don't", 'over', 'ma', 'here', "that'll", 'for', 'are', 'while', 'aren', 'as', 'your', 'have', 'at', "aren't", 'hers', 'own', "couldn't", 'hadn', 'these', 'why', 'm', 'same', "hasn't", 'an', 'not', "you'd", 'doing', 'yours', 'himself', 'ourselves', 'me', 'most', 'had', 'between', "doesn't", 'nor', 'further', 'shouldn', 'until', 'against', 'it', 'we', 'haven', "shan't", 'up', 'again', 'out', 'a', 'itself', 'them', "it's", 'than', 'won', 've', 'myself', 'how', "isn't", 'shan', 'no', 'when', 'didn', 'our', 'that', 'yourself', 'y', "you'll", 'whom', 'has', 'the', "you've", 'did', 'should', 'once', 'having', 'll', "needn't", "wouldn't", "she's", 'been', 'they', 'more', 'isn', 't', 'with', "didn't", "should've", 'yourselves', 'herself', 'does', 'couldn', 'off', 'being', 'all', 'by', 'were', 'or', 'can', "hadn't", 'am', 'below', 'weren', "you're", 'be', 're', 'some', 'which', "weren'

## Stemming

### Create a list of various forms of kick.

In [20]:
tokens = ['kick', 'kicks', 'kicked', 'kicking']

### Load the nltk porter stemmer. 

In [21]:
import nltk

porter = nltk.PorterStemmer() 

In [22]:
# Print the Porter stem of every token, one per line.
for w in tokens:
    stemmed = porter.stem(w)
    print(stemmed)

kick
kick
kick
kick


## Lemmatization

### Load the nltk WordNet lemmatizer

In [23]:
wnl = nltk.WordNetLemmatizer()

In [24]:
print(tokens)

['kick', 'kicks', 'kicked', 'kicking']


In [25]:
# Print the WordNet lemma of every token, one per line.
# NOTE: lemmatize() is called without a part-of-speech argument, and the
# recorded output leaves 'kicked' and 'kicking' unchanged. NLTK treats words
# as nouns by default; pass pos='v' to reduce verb forms to 'kick'.
for w in tokens:
    lemma = wnl.lemmatize(w)
    print(lemma)

kick
kick
kicked
kicking


## Normalization

In [26]:
# lower() returns a lower-cased copy; s itself is not modified.
s = 'Hello'
lowered = s.lower()
print(lowered)

hello


In [27]:
# upper() returns an upper-cased copy; s itself is not modified.
s = 'hello'
shouted = s.upper()
print(shouted)

HELLO


In [28]:
# capitalize() upper-cases the first character and lower-cases the rest.
s = 'hello'
capitalized = s.capitalize()
print(capitalized)

Hello


In [29]:
# title() upper-cases the first letter of every word.
s = 'hello world'
titled = s.title()
print(titled)

Hello World


## Useful string methods

In [30]:
# isalpha() is True only for a non-empty string made up entirely of letters.
s = 'Sun'
only_letters = s.isalpha()
print(only_letters)

True
