# Introduction To Natural Language Processing

# Data Preprocessing - NLP

# 1 Split by whitespace

In [1]:
# Example sentence -- watch how the contraction "who's" is handled
text = "Albert Einsten is one of the most brilliant scientist who's ever lived"
# With no separator given, splitting happens on runs of whitespace
words = str.split(text)
# Show at most the first 100 tokens
print(words[0:100])

['Albert', 'Einsten', 'is', 'one', 'of', 'the', 'most', 'brilliant', 'scientist', "who's", 'ever', 'lived']


# 2 Split by words

In [4]:
import re

# Split on runs of non-word characters.
# re.split() yields an empty string when the text starts or ends with a
# separator (e.g. a trailing "."), so drop those empty pieces and keep
# only real words.
words = [piece for piece in re.split(r'\W+', text) if piece]
print(words[:100])

['Albert', 'Einsten', 'is', 'one', 'of', 'the', 'most', 'brilliant', 'scientist', 'who', 's', 'ever', 'lived']


# 3 Normalization

In [7]:
# Split on runs of non-word characters, discarding the empty strings
# re.split() produces when the text begins or ends with punctuation
words = [piece for piece in re.split(r'\W+', text) if piece]

# Normalize case with a list comprehension so that e.g. "Albert" and
# "albert" are treated as the same word
words = [word.lower() for word in words]
print(words[:100])

['albert', 'einsten', 'is', 'one', 'of', 'the', 'most', 'brilliant', 'scientist', 'who', 's', 'ever', 'lived']


# Natural Language Toolkit (NLTK)

# 1 Split By Sentence

In [8]:
import nltk
from nltk.tokenize import sent_tokenize

# Make sure the 'punkt' resource is available before tokenizing
nltk.download('punkt')

# Break the text into sentences and print them one per line
sentences = sent_tokenize(text)
for sentence in sentences:
    print(sentence)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Albert Einsten is one of the most brilliant scientist who's ever lived


# 2 Split by words

In [9]:
from nltk.tokenize import word_tokenize

# Tokenize with NLTK: the contraction "who's" comes out as the two
# tokens "who" and "'s"
tokens = word_tokenize(text)
print(tokens[0:100])

['Albert', 'Einsten', 'is', 'one', 'of', 'the', 'most', 'brilliant', 'scientist', 'who', "'s", 'ever', 'lived']


# 3 Filtering

In [10]:
# Tokenize the sentence again so this cell starts from fresh tokens
tokens = word_tokenize(text)

# Keep only purely alphabetic tokens (this drops pieces such as "'s")
words = list(filter(str.isalpha, tokens))

print(words[0:100])

['Albert', 'Einsten', 'is', 'one', 'of', 'the', 'most', 'brilliant', 'scientist', 'who', 'ever', 'lived']


# 4 Remove stopwords

In [12]:
# Display the English stopword list that ships with NLTK
import nltk
from nltk.corpus import stopwords

# Make sure the 'stopwords' corpus is available before reading it
nltk.download('stopwords')

stop_words = stopwords.words('english')
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
# Full cleaning pipeline:
# tokenize -> lowercase -> keep alphabetic tokens -> drop stopwords

# Tokenize the raw sentence
tokens = word_tokenize(text)

# Lowercase the tokens so they match the lowercase stopword entries
tokens = [w.lower() for w in tokens]

# Discard every token that is not purely alphabetic
words = [word for word in tokens if word.isalpha()]

# Use a set for fast membership tests while filtering
stop_words = set(stopwords.words('english'))
# NOTE: the result keeps its original name `word` so any later cell that
# reads it keeps working
word = [w for w in words if w not in stop_words]
print(word[:100])

['albert', 'einsten', 'one', 'brilliant', 'scientist', 'ever', 'lived']
