In [1]:
import re
import string
import unidecode
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import gensim.downloader as api
from pycontractions import Contractions
from word2number import w2n


In [2]:
# Sample paragraph (curly quotes, an accented name, a contraction) used as the raw input for the steps below.
text = "“Everything we’re doing is about going forward,” Phoebe Philo told Vogue in 2009, shortly before showing her debut Resort collection for Céline. Although the label had garnered headlines when it was revived by Michael Kors in the late ’90s, it was Philo who truly brought the till then somewhat somnambulant luxury house to the forefront. Critics credited her with pushing fashion in a new direction, toward a more spare, stripped-down kind of sophistication. What Céline now offered women was, as the magazine put it, “a grown-up and hip way to put themselves together.”"

# Tokenization: tokenization means splitting up strings of text into smaller pieces.

In [3]:
# NLTK provides both a sentence tokenizer and a word tokenizer.
# Start with the sentence tokenizer, which splits the paragraph
# into a list of sentences.
sentences = sent_tokenize(text)
print(sentences)


['“Everything we’re doing is about going forward,” Phoebe Philo told Vogue in 2009, shortly before showing her debut Resort collection for Céline.', 'Although the label had garnered headlines when it was revived by Michael Kors in the late ’90s, it was Philo who truly brought the till then somewhat somnambulant luxury house to the forefront.', 'Critics credited her with pushing fashion in a new direction, toward a more spare, stripped-down kind of sophistication.', 'What Céline now offered women was, as the magazine put it, “a grown-up and hip way to put themselves together.”']


In [4]:
# Use the word tokenizer to split the first sentence into word tokens;
# punctuation marks come out as separate tokens.
test_sentence = sentences[0]
words = word_tokenize(test_sentence)
print(words)

['“', 'Everything', 'we', '’', 're', 'doing', 'is', 'about', 'going', 'forward', ',', '”', 'Phoebe', 'Philo', 'told', 'Vogue', 'in', '2009', ',', 'shortly', 'before', 'showing', 'her', 'debut', 'Resort', 'collection', 'for', 'Céline', '.']


In [6]:
# str.split() with no arguments breaks the string on whitespace only, so
# punctuation stays attached to the neighbouring word instead of becoming
# its own token.
# Note: this rebinds `words`, replacing the word_tokenize result.
words = test_sentence.split()
print(words)

['“Everything', 'we’re', 'doing', 'is', 'about', 'going', 'forward,”', 'Phoebe', 'Philo', 'told', 'Vogue', 'in', '2009,', 'shortly', 'before', 'showing', 'her', 'debut', 'Resort', 'collection', 'for', 'Céline.']


# removing stopwords

In [8]:
import nltk

# Download the NLTK stopword corpus that stopwords.words() reads below.
nltk.download('stopwords')

# A set gives constant-time membership checks in the filter below.
stop_words = set(stopwords.words('english'))

# The English stopword list is all lowercase, so compare each token in
# lowercase; otherwise capitalised stopwords (e.g. a sentence-initial "The")
# would slip through. The tokens themselves keep their original casing.
filtered = [
    word
    for word in word_tokenize(test_sentence)
    if word.lower() not in stop_words
]
print(filtered)


['“', 'Everything', '’', 'going', 'forward', ',', '”', 'Phoebe', 'Philo', 'told', 'Vogue', '2009', ',', 'shortly', 'showing', 'debut', 'Resort', 'collection', 'Céline', '.']


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/madhumithaganji/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# removing whitespace

In [9]:
# str.strip() only trims leading and trailing whitespace, but raw text often
# has runs of spaces in the middle as well.
# Collapse every run of consecutive space characters into a single space.
# The pattern targets the space character only; tabs and newlines are left as-is.
space_run = re.compile(' +')
test_sentence = space_run.sub(' ', test_sentence)
print(test_sentence)

“Everything we’re doing is about going forward,” Phoebe Philo told Vogue in 2009, shortly before showing her debut Resort collection for Céline.


# converting to lowercase

In [10]:
# Lowercase the whole sentence (case normalization).
test_sentence = test_sentence.lower()
print(test_sentence)

“everything we’re doing is about going forward,” phoebe philo told vogue in 2009, shortly before showing her debut resort collection for céline.


# expanding contractions

In [21]:
# Expand the one contraction present in the sample sentence by hand.
# The character class accepts either the curly apostrophe (’) or the
# straight one (').
pattern = r'we[\’\']re'
replacement = 'we are'
contraction_re = re.compile(pattern)
test_sentence = contraction_re.sub(replacement, test_sentence)
print(test_sentence)

“everything we are doing is about going forward,” phoebe philo told vogue in 2009, shortly before showing her debut resort collection for céline.


# removing punctuation

In [22]:
# Strip punctuation with a translation table built from the standard-library
# string module: every character in string.punctuation is mapped to None,
# which makes str.translate() delete it.
# string.punctuation is ASCII-only, so the curly quotes “ ” are not removed here.
punctuation_table = str.maketrans(dict.fromkeys(string.punctuation))
test_sentence = test_sentence.translate(punctuation_table)
print(test_sentence)


“everything we are doing is about going forward” phoebe philo told vogue in 2009 shortly before showing her debut resort collection for céline


In [23]:
# Show exactly which characters string.punctuation contains.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

# Accented characters and other unicode issues

In [24]:
# Removing accents helps normalize the words in the text data; otherwise
# "Céline" and "Celine" would be treated as two separate tokens.
# Before converting, check whether the sentence contains a plain ASCII double quote.
'"' in test_sentence

False

In [26]:
# unidecode transliterates non-ASCII characters to ASCII equivalents rather
# than deleting them: é becomes e and the curly quotes become straight quotes.
test_sentence = unidecode.unidecode(test_sentence)
print(test_sentence)


"everything we are doing is about going forward" phoebe philo told vogue in 2009 shortly before showing her debut resort collection for celine


In [27]:
# Check again for ASCII double quotes: the curly quotes were converted to '"',
# so punctuation removal would have to be applied again to drop them.
'"' in test_sentence

True

# converting number words to numeric values

In [28]:
# w2n.word_to_num turns a spelled-out number into its integer value.
# NOTE(review): this rebinds `text`, which previously held the sample
# paragraph; re-running earlier cells after this one will see the number
# phrase instead.
text = "five hundred twenty five thousand six hundred"
number_value = w2n.word_to_num(text)
print(number_value)


525600


# lemmatization

In [30]:
# Lemmatization converts a word to its base form, removing grammatical inflection.
# NOTE(review): WordNetLemmatizer is already imported in the first cell, so
# this import is redundant.
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# With no pos argument given, the plural 'dresses' is reduced to 'dress'.
d = lemmatizer.lemmatize('dresses')
print(d)



dress


In [31]:
# Stemming is similar to lemmatization, but it mainly chops off a prefix or
# suffix, while the lemmatizer takes parts of speech into account and is more
# sophisticated in determining the base form of a word.
stemmer = PorterStemmer()
# Note: `d` is reused here and now holds the stem instead of the lemma.
d = stemmer.stem('dresses')
print(d)

dress


In [32]:
# Compare both approaches on an irregular form: with pos='a' the lemmatizer
# maps 'better' to 'good', while the stemmer leaves the word unchanged.
better_lemmatized = lemmatizer.lemmatize('better', pos='a')
better_stemmed = stemmer.stem('better')
print(better_lemmatized, better_stemmed, sep='\n')


good
better
