In [68]:
##### manipulating text data

In [1]:
import numpy as np
import pandas as pd

In [2]:
import re
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\herma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# convert the whole sentence to lower case in one step
text = "Yes, that is a duplicate catalog category. The catalog number is C1357-A.".lower()
text

'yes, that is a duplicate catalog category. the catalog number is c1357-a.'

In [5]:
# substitute every (case-sensitive) occurrence of the literal "cat" with "#"
regex = re.compile(r"cat")
text = regex.sub(
    "#",
    "Yes, that is a duplicate Catalog category. The Catalog number is C1357-A.",
)
text

'Yes, that is a dupli#e Catalog #egory. The Catalog number is C1357-A.'

In [6]:
# a character class matches any ONE of its members, so [cat] hits every
# single c, a or t (upper or lower case because of IGNORECASE)
regex = re.compile(r"[cat]", flags=re.IGNORECASE)
text = regex.sub(
    "#",
    "Yes, that is a duplicate catalog category. The catalog number is C1357-A.",
)
text

'Yes, #h## is # dupli###e ####log ###egory. #he ####log number is #1357-#.'

In [7]:
# same substitution as with [cat]: the order of the characters inside a
# character class is irrelevant, so [tac] matches exactly the same characters
regex = re.compile(r"[tac]", flags=re.IGNORECASE)
text = regex.sub(
    "#",
    "Yes, that is a duplicate catalog category. The catalog number is C1357-A.",
)
text

'Yes, #h## is # dupli###e ####log ###egory. #he ####log number is #1357-#.'

In [8]:
# build the character class from a variable with %-formatting, then
# replace each c, a or t (any case) with "#"
pattern = "cat"
regex = re.compile(r"[%s]" % pattern, flags=re.IGNORECASE)
text = regex.sub(
    "#",
    "Yes, that is a duplicate catalog category. The catalog number is C1357-A.",
)
text

'Yes, #h## is # dupli###e ####log ###egory. #he ####log number is #1357-#.'

In [9]:
# mask every digit: the range 0-9 inside a character class matches any single digit
regex = re.compile(r"[0-9]")
text = regex.sub(
    "#",
    "Yes, that is a duplicate catalog category. The catalog number is C1357-A.",
)
text

'Yes, that is a duplicate catalog category. The catalog number is C####-A.'

In [10]:
# display string.digits: the ten decimal digit characters as a single string
string.digits

'0123456789'

In [11]:
# mask every digit, this time building the character class from string.digits
# with %-formatting instead of spelling out the range by hand
pattern = string.digits
regex = re.compile(r"[%s]" % pattern)
text = regex.sub(
    "#",
    "Yes, that is a duplicate catalog category. The catalog number is C1357-A.",
)
text

'Yes, that is a duplicate catalog category. The catalog number is C####-A.'

In [12]:
# display string.punctuation; note in the output that it contains regex-special
# characters such as the backslash, square brackets, ^ and -
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [13]:
# similarly, we can replace punctuation with another string or character:
# here we'll create a character class of punctuation using string formatting.
# string.punctuation must be passed through re.escape first: it contains the
# sequence \] which, unescaped, is read as an escaped "]" - so the backslash
# itself would silently never be matched.
text = "Yes, that is a duplicate catalog category. The catalog number is C1357-A."
pattern = re.escape(string.punctuation)
regex = re.compile(r"[%s]" % pattern)
text = regex.sub("#", text)
text

'Yes# that is a duplicate catalog category# The catalog number is C1357#A#'

In [14]:
# similarly, we can replace digits AND punctuation with another string or character:
# here we'll create a character class of digits and punctuation using string formatting.
# The punctuation is run through re.escape so that regex-special characters
# (notably the backslash, which would otherwise just escape the following "]")
# are all matched literally.
text = "Yes, that is a duplicate catalog category. The catalog number is C1357-A."
pattern1 = string.digits
pattern2 = re.escape(string.punctuation)
regex = re.compile(r"[%s%s]" % (pattern1, pattern2))
text = regex.sub("#", text)
text

'Yes# that is a duplicate catalog category# The catalog number is C#####A#'

In [15]:
# or, as is done more typically, we can get rid of digits and punctuation by
# replacing them with the empty string.
# re.escape keeps every punctuation character literal inside the class; without
# it the backslash in string.punctuation only escapes the "]" that follows it
# and is never removed from the text.
text = "Yes, that is a duplicate catalog category. The catalog number is C1357-A."
pattern1 = string.digits
pattern2 = re.escape(string.punctuation)
regex = re.compile(r"[%s%s]" % (pattern1, pattern2))
text = regex.sub("", text)
text

'Yes that is a duplicate catalog category The catalog number is CA'

In [16]:
# collapse every run of white-space (spaces, tabs, newlines) into one space;
# the result is only displayed, `text` itself is left untouched
regex = re.compile(r"\s+")
text = "Yes, that is a duplicate   catalog \t category. The   catalog    number is C1357-A.\n"
regex.sub(' ', text)

'Yes, that is a duplicate catalog category. The catalog number is C1357-A. '

In [17]:
# drop stop words from a sentence:
# split into tokens, keep the tokens whose lower-cased form is not a stop word,
# then glue the survivors back together with single spaces

sw = ['the', 'is', 'a', 'that', 'in']
sen = "Yes, that is a duplicate catalog category. The catalog number is C1357-A."

senlst = sen.split()
senclean = ' '.join(filter(lambda token: token.lower() not in sw, senlst))
senclean

'Yes, duplicate catalog category. catalog number C1357-A.'

In [85]:
# text (from BB)
# Multi-line sample paragraph bound to `text`. The literal keeps its line
# breaks (and the space before each of them), as the displayed repr shows.
text = """Python is an interpreted, high-level, general-purpose programming 
language. Created by Guido van Rossum and first released in 1991, Python has a 
design philosophy that emphasizes code readability, notably using significant 
whitespace. It provides constructs that enable clear programming on both small 
and large scales.[26] Van Rossum led the language community until stepping 
down as leader in July 2018.[27][28] Python features a dynamic type system 
and automatic memory management. It supports multiple programming paradigms, 
including object-oriented, imperative, functional and procedural, and has a 
large and comprehensive standard library.[29] Python interpreters are 
available for many operating systems. CPython, the reference implementation of 
Python, is open source software[30] and has a community-based development 
model, as do nearly all of Python's other implementations. Python and CPython 
are managed by the non-profit Python Software Foundation."""
text

"Python is an interpreted, high-level, general-purpose programming \nlanguage. Created by Guido van Rossum and first released in 1991, Python has a \ndesign philosophy that emphasizes code readability, notably using significant \nwhitespace. It provides constructs that enable clear programming on both small \nand large scales.[26] Van Rossum led the language community until stepping \ndown as leader in July 2018.[27][28] Python features a dynamic type system \nand automatic memory management. It supports multiple programming paradigms, \nincluding object-oriented, imperative, functional and procedural, and has a \nlarge and comprehensive standard library.[29] Python interpreters are \navailable for many operating systems. CPython, the reference implementation of \nPython, is open source software[30] and has a community-based development \nmodel, as do nearly all of Python's other implementations. Python and CPython \nare managed by the non-profit Python Software Foundation."

In [86]:
# preprocess text - lower case, remove digits, remove punctuation, remove extra white-spaces,
# remove stop words

# all English stopwords; `sw` stays a list, `sw_set` is a set copy used for
# fast membership tests in the filter below
sw = stopwords.words('english')
sw_set = set(sw)

# lower case
text = text.lower()
# replace digits and punctuation with a space.
# re.escape is required: unescaped, the "\]" inside string.punctuation is read
# as an escaped "]" and the backslash itself would never be matched.
regex = re.compile(r"[%s%s]" % (re.escape(string.punctuation), string.digits))
text = regex.sub(' ', text)
# replace one or more white-space characters with a space
regex = re.compile(r"\s+")
text = regex.sub(' ', text)
# remove stop words (tokens go in their own name so `text` is always a string)
words = text.split()
text = ' '.join([w for w in words if w not in sw_set])
text

'python interpreted high level general purpose programming language created guido van rossum first released python design philosophy emphasizes code readability notably using significant whitespace provides constructs enable clear programming small large scales van rossum led language community stepping leader july python features dynamic type system automatic memory management supports multiple programming paradigms including object oriented imperative functional procedural large comprehensive standard library python interpreters available many operating systems cpython reference implementation python open source software community based development model nearly python implementations python cpython managed non profit python software foundation'

In [87]:
# write a function preprocess(text) that pre-processes the text:
# lower case, remove digits, remove punctuation, remove extra white-spaces, remove stop words
def preprocess(text, stop_words=None):
    """Normalize a piece of text into a space-separated string of kept words.

    Steps: lower-case the text, replace every digit and punctuation character
    with a space, split on white-space (which also discards extra white-space),
    drop stop words, and join the remaining words with single spaces.

    Parameters
    ----------
    text : str
        The raw text to clean.
    stop_words : iterable of str, optional
        Words to remove (compared against the lower-cased tokens). When
        omitted, the notebook-level stop-word list `sw` is used.

    Returns
    -------
    str
        The cleaned text; an empty string if nothing survives.
    """
    if stop_words is None:
        # backward-compatible default: the global list defined earlier in the notebook
        stop_words = sw
    # a set makes each membership test O(1) instead of scanning a list
    stop_words = set(stop_words)

    # lower case
    text = text.lower()
    # replace digits and punctuation with a space.
    # re.escape is required: unescaped, the "\]" inside string.punctuation is
    # read as an escaped "]" and the backslash itself would never be matched.
    regex = re.compile(r"[%s%s]" % (re.escape(string.punctuation), string.digits))
    text = regex.sub(' ', text)
    # split() with no arguments splits on runs of white-space, so extra
    # white-space is discarded here without a separate substitution pass
    words = text.split()
    # remove stop words
    return ' '.join(w for w in words if w not in stop_words)

In [88]:
# doc corpus (from BB): four short documents, one string per document
corpus = [
    "This is a brown house. This house is big.",
    "This is a small house. This house has 1 bedroom.",
    "This dog is brown. This dog likes to play",
    "The dog is in the bedroom.",
]
corpus

['This is a brown house. This house is big.',
 'This is a small house. This house has 1 bedroom.',
 'This dog is brown. This dog likes to play',
 'The dog is in the bedroom.']

In [89]:
# run preprocess over every document in the corpus with a list comprehension
corpus = [preprocess(doc) for doc in corpus]
corpus

['brown house house big',
 'small house house bedroom',
 'dog brown dog likes play',
 'dog bedroom']

In [90]:
# turn each cleaned document (a string) into its list of words
corpus = [doc.split() for doc in corpus]
corpus

[['brown', 'house', 'house', 'big'],
 ['small', 'house', 'house', 'bedroom'],
 ['dog', 'brown', 'dog', 'likes', 'play'],
 ['dog', 'bedroom']]

In [91]:
# wrap each document's word list in a pandas Series (default integer index)
corpus = [pd.Series(words) for words in corpus]
corpus

[0    brown
 1    house
 2    house
 3      big
 dtype: object, 0      small
 1      house
 2      house
 3    bedroom
 dtype: object, 0      dog
 1    brown
 2      dog
 3    likes
 4     play
 dtype: object, 0        dog
 1    bedroom
 dtype: object]

In [92]:
# value_counts turns each word Series into a word -> frequency Series
corpus = [word_series.value_counts() for word_series in corpus]
corpus

[house    2
 brown    1
 big      1
 dtype: int64, house      2
 bedroom    1
 small      1
 dtype: int64, dog      2
 play     1
 brown    1
 likes    1
 dtype: int64, bedroom    1
 dog        1
 dtype: int64]

In [93]:
# build a DataFrame from the list of frequency Series: one row per document,
# one column per word; words absent from a document show up as NaN
df = pd.DataFrame(data=corpus)
df

Unnamed: 0,house,brown,big,bedroom,small,dog,play,likes
0,2.0,1.0,1.0,,,,,
1,2.0,,,1.0,1.0,,,
2,,1.0,,,,2.0,1.0,1.0
3,,,,1.0,,1.0,,


In [94]:
# fill missing values with 0 (a word that does not occur in a document has count 0).
# Rebind the result instead of using inplace=True: the in-place form has no
# performance benefit and cannot be chained.
df = df.fillna(value=0)
df

Unnamed: 0,house,brown,big,bedroom,small,dog,play,likes
0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,2.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,2.0,1.0,1.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
