# NLP tutorials

In [13]:
# Core libraries
import pandas as pd

# String handling
import string
import re

# NLP libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
nltk.download('stopwords')
nltk.download ('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from textblob import TextBlob, Word

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ayub\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ayub\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ayub\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Ayub\AppData\Roaming\nltk_data...


## Raw text

In [2]:
text = [
    "This is the NLP TASKS ARTICLE written by AYuB kHaN **",
    "IN this article I”ll be explaining various DATA-CLEANING techniques",
    "So stay tuned for FURther More &&",
    "Nah I don't think he goes to usf, he lives around"
    ]

# Convert to DataFrame
df = pd.DataFrame({'text':text})

## Data cleaning

In [3]:
# Convert to lowercase 
df['lower'] = df['text'].apply(lambda x: " ".join(x.lower()  for x in x.split()))

In [4]:
# Remove punctuation
df.text.apply(lambda x: "".join(i for i in x if i not in string.punctuation))

0    This is the NLP TASKS ARTICLE written by AYuB ...
1    IN this article I”ll be explaining various DAT...
2                      So stay tuned for FURther More 
3      Nah I dont think he goes to usf he lives around
Name: text, dtype: object

In [5]:
df.text.apply(lambda x:''.join(re.findall(r'[a-zA-Z+" "]', x)))

0    This is the NLP TASKS ARTICLE written by AYuB ...
1    IN this article Ill be explaining various DATA...
2                      So stay tuned for FURther More 
3      Nah I dont think he goes to usf he lives around
Name: text, dtype: object

In [6]:
# Remove stop words
allstopwords = stopwords.words('english')
df.lower.apply(lambda x: " ".join(i for i in x.split() if i not in allstopwords))

0               nlp tasks article written ayub khan **
1    article i”ll explaining various data-cleaning ...
2                                        stay tuned &&
3                     nah think goes usf, lives around
Name: lower, dtype: object

In [7]:
# Spelling correction
df['lower'].apply(lambda x: str(TextBlob(x).correct()))

0    this is the nap tasks article written by club ...
1    in this article i”ll be explaining various dat...
2                    so stay tuned for further more &&
3      ah i don't think he goes to us, he lives around
Name: lower, dtype: object

## Tokenization

* Split text into meaningful unit words

In [8]:
mystring = "My favorite animal is cat" 
word_tokenize(mystring)
mystring.split(" ")

['My', 'favorite', 'animal', 'is', 'cat']

## Stemming

* Convert words into their root word using a set of rules
* The root word may be truncated form of the original
* Root words generated may not have meaning

In [10]:
st = PorterStemmer()
df['text'].apply(lambda x:" ".join([st.stem(word) for word in x.split()]))

0    thi is the nlp task articl written by ayub kha...
1    in thi articl i”ll be explain variou data-clea...
2                     so stay tune for further more &&
3      nah i don't think he goe to usf, he live around
Name: text, dtype: object

## Lemmatization

* Convert words into their root word using vocabulary mapping
* Root words generated are themselves words and have meaning
* Slower than stemming

In [16]:
# Single word example
w = Word("played")

# Pass in WordNet part of speech (verb)
wl = w.lemmatize("v")  

# Lemmatized version
print(f"Lemmatization of '{w}'  ---> '{wl}'")

Lemmatization of 'played'  ---> 'play'


In [None]:
# Lemmatization of dataset
df['processed'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))