## Text Preprocessing Practice in Python

In [2]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import re                      # Regular expressions

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [0]:
# load data
df_pd = pd.read_csv("C:/Your/files/textdata.csv", encoding = 'utf-8', header = None)
df_np = np.asarray(pd.read_csv("C:/Your/files/textdata.csv", encoding ='utf-8', header = None)

- Tokenization

In [5]:
paragraph = " Hello world! I am craving for coffee before school! "
sentences = sent_tokenize(paragraph)
print(sentences)

[' Hello world!', 'I am craving for coffee before school!']


In [0]:
def tokenization_s(sentences): # same can be achieved for words tokens
    s_new = []
    for sent in (sentences[:][0]): #For NumpY = sentences[:]
        s_token = sent_tokenize(sent)
        if s_token != '':
            s_new.append(s_token)
            
    return s_new

- Regular Expressions

In [0]:
def preprocess(text):
    clean_data = []

    for x in (text[:][0]): 
        new_text = re.sub('<.*?>', '', x)   # remove HTML tags
        new_text = re.sub(r'[^\w\s]', '', new_text) # remove punc.
        new_text = re.sub(r'\d+','',new_text)# remove numbers
        new_text = new_text.lower() # lower case, .upper() for upper          
        if new_text != '':
            clean_data.append(new_text)
            
    return clean_data

- Word Tokenization

In [18]:
sentences = " What kind of coffee do you like? "
words = word_tokenize(sentences)
print(words)

['What', 'kind', 'of', 'coffee', 'do', 'you', 'like', '?']


In [0]:
def tokenization_w(words):
    w_new = []
    for w in (words[:][0]):  # for NumPy = words[:]
        w_token = word_tokenize(w)
        if w_token != '':
            w_new.append(w_token)
            
    return w_new

In [12]:
tokenization_w(sentences)

[[]]

- Stemming

In [0]:
# use Snowball Stemmer from nltk.stem library
snowball = SnowballStemmer(language = 'english')
def stemming(words):
    new = []
    stem_words = [snowball.stem(x) for x in (words[:][0])]
    new.append(stem_words)
    return new

In [19]:
test = ['You like Boba. Why not we go for it today!']
test_pd = pd.DataFrame(test)  # makes this into a panda data frame
clean_test = preprocess(test_pd) # removes punctuation, see above
clean_words = tokenization_w(clean_test) # word tokenization
stem_test = stemming(clean_words) # stemming similar words
print(stem_test)

[['y']]


- Lemmatization

In [0]:
lemmatizer = WordNetLemmatizer()

def lemmatization(words):
    new = []
    lem_words = [lemmatizer.lemmatize(x) for x in (words[:][0])]
    new.append(lem_words)
    return new

In [17]:
lemtest = lemmatization(clean_words)
print(lemtest)

[['y']]
