# Module 12 Text Processing

- Demo shows basic text processing tasks.
- Normalization
- Stemming
- Lemmatization
- String Methods

### Create a text sample.

In [1]:
s = "Text mining is a subfield of data mining. It also uses machine learning algorithms."

In [2]:
from nltk.tokenize import sent_tokenize
import nltk

In [3]:
# Uncomment and run once if the Punkt tokenizer data is not installed yet:
# nltk.download('punkt_tab')

### A list of two sentences is returned.

In [4]:
sent_tokenize(s)

['Text mining is a subfield of data mining.',
 'It also uses machine learning algorithms.']

### Import the word tokenizer.

In [5]:
from nltk.tokenize import word_tokenize

### Apply the word tokenizer to one sentence.

In [6]:
word_tokenize("Text mining is a subfield of data mining.")

['Text', 'mining', 'is', 'a', 'subfield', 'of', 'data', 'mining', '.']

In [7]:
import string

### Show punctuation marks.

In [8]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

### Add more marks if needed. Concatenation builds a new string; string.punctuation itself is unchanged.

In [9]:
string.punctuation+'p'

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~p'

In [10]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [11]:
# Count the marks straight from the constant instead of re-typing the literal,
# so the result cannot drift from string.punctuation.
len(string.punctuation)

32

### Remove punctuation.

In [12]:
# The third argument of str.maketrans lists the characters to delete, so
# translating with this table strips every punctuation mark from the sentence.
s = "Text mining is a subfield of data mining."
remove_punct = str.maketrans('', '', string.punctuation)

s.translate(remove_punct)

'Text mining is a subfield of data mining'

In [13]:
s1 ='I love this, that, and all of those!'

In [14]:
s1.translate(str.maketrans('','', string.punctuation))

'I love this that and all of those'

In [15]:
s = 'Text mining is a subfield of data mining.'

### split() breaks a string into a list of words on whitespace; punctuation stays attached to its word.

In [16]:
s.split()

['Text', 'mining', 'is', 'a', 'subfield', 'of', 'data', 'mining.']

### join() concatenates a list of words into one string, placing the separator between them.

In [17]:
l = ['I','love', 'text', 'mining']

" ".join(l)

'I love text mining'

In [18]:
result =  " ".join(l)

In [19]:
result + "."

'I love text mining.'

### nltk has a stopwords list.

In [20]:
from nltk.corpus import stopwords

# set() removes any duplicate entries; the order of the resulting list is arbitrary.
english_stops = list(set(stopwords.words('english'))) 

In [21]:
english_stops

['with',
 'too',
 'above',
 'weren',
 'from',
 'both',
 'if',
 'below',
 'shouldn',
 'were',
 'him',
 "couldn't",
 'over',
 'up',
 'haven',
 "hadn't",
 'd',
 'will',
 'ain',
 'into',
 'can',
 'll',
 'that',
 'same',
 'down',
 'to',
 'aren',
 'ma',
 'on',
 'who',
 'during',
 "mightn't",
 "wouldn't",
 'herself',
 'after',
 'very',
 "she's",
 'been',
 'have',
 'such',
 'did',
 'your',
 'in',
 'she',
 'our',
 'i',
 'we',
 't',
 'here',
 'any',
 'my',
 'hers',
 'before',
 'was',
 "shan't",
 'the',
 'didn',
 'ours',
 'o',
 "doesn't",
 'do',
 'wasn',
 'be',
 'until',
 "won't",
 'had',
 'you',
 'is',
 "you'll",
 'how',
 'for',
 'more',
 'through',
 "it's",
 "shouldn't",
 'it',
 'only',
 'he',
 'won',
 'or',
 'has',
 'while',
 'further',
 'by',
 "isn't",
 "you'd",
 'couldn',
 'does',
 'himself',
 'about',
 'those',
 'don',
 "should've",
 'themselves',
 'of',
 "you've",
 "aren't",
 'against',
 'not',
 'are',
 'doing',
 'these',
 'few',
 'should',
 'just',
 "needn't",
 'its',
 'there',
 'hadn',
 

In [22]:
english_stops[:10]

['with',
 'too',
 'above',
 'weren',
 'from',
 'both',
 'if',
 'below',
 'shouldn',
 'were']

In [23]:
len(english_stops)

179

In [24]:
print(english_stops)

['with', 'too', 'above', 'weren', 'from', 'both', 'if', 'below', 'shouldn', 'were', 'him', "couldn't", 'over', 'up', 'haven', "hadn't", 'd', 'will', 'ain', 'into', 'can', 'll', 'that', 'same', 'down', 'to', 'aren', 'ma', 'on', 'who', 'during', "mightn't", "wouldn't", 'herself', 'after', 'very', "she's", 'been', 'have', 'such', 'did', 'your', 'in', 'she', 'our', 'i', 'we', 't', 'here', 'any', 'my', 'hers', 'before', 'was', "shan't", 'the', 'didn', 'ours', 'o', "doesn't", 'do', 'wasn', 'be', 'until', "won't", 'had', 'you', 'is', "you'll", 'how', 'for', 'more', 'through', "it's", "shouldn't", 'it', 'only', 'he', 'won', 'or', 'has', 'while', 'further', 'by', "isn't", "you'd", 'couldn', 'does', 'himself', 'about', 'those', 'don', "should've", 'themselves', 'of', "you've", "aren't", 'against', 'not', 'are', 'doing', 'these', 'few', 'should', 'just', "needn't", 'its', 'there', 'hadn', 'being', 's', 'yours', 'wouldn', 'where', 'a', 'm', 'theirs', 'own', 'their', 'mustn', 'me', 'off', 'yourselv

## Stemming

### Create a list of various forms of kick.

In [25]:
tokens = ['kick', 'kicks', 'kicked', 'kicking']

### Load the nltk porter stemmer. 

In [26]:
import nltk

porter = nltk.PorterStemmer() 

In [27]:
# Run every token through the Porter stemmer and print one stem per line.
for stem in map(porter.stem, tokens):
    print(stem)

kick
kick
kick
kick


## Lemmatization

### Load the nltk WordNet lemmatizer

In [28]:
wnl = nltk.WordNetLemmatizer()

In [29]:
print(tokens)

['kick', 'kicks', 'kicked', 'kicking']


In [30]:
# lemmatize() is called without a part-of-speech tag, so every token is looked
# up as a noun (the default); that is why the verb forms 'kicked' and 'kicking'
# come back unchanged while the plural 'kicks' is reduced to 'kick'.
for lemma in map(wnl.lemmatize, tokens):
    print(lemma)

kick
kick
kicked
kicking


## Normalization

In [31]:
# lower() returns a copy of the string with every cased character in lowercase.
s = 'Hello'
print(s.lower())

hello


In [32]:
# upper() returns a copy of the string with every cased character in uppercase.
s = 'hello'
print(s.upper())

HELLO


In [33]:
# capitalize() uppercases the first character and lowercases the rest.
s = 'hello'
print(s.capitalize())

Hello


In [34]:
# title() capitalizes the first letter of each word in the string.
s = 'hello world'
print(s.title())

Hello World


## Useful string methods

In [35]:
# isalpha() is True when the string is non-empty and every character is a letter.
s = 'Sun'
print(s.isalpha())

True


In [36]:
# isdigit() requires every character to be a digit, so a word of letters gives False.
s = 'Sun'
print(s.isdigit())

False


In [37]:
# A non-empty string made up entirely of digits passes isdigit().
s = '123'
print(s.isdigit())

True


In [38]:
# isalnum() accepts any mix of letters and digits, but no spaces or punctuation.
s = 'fsun123'
print(s.isalnum())

True
