**Preprocessing Natural Language**


We clean up a dataset of natural language data and use word2vec to embed the language in word vectors.

**N.B.**: Some, all or none of these preprocessing steps may be helpful to a given downstream application.

**Load dependencies**

In [None]:
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
# Fetch the NLTK resources used below: the Gutenberg corpus, the Punkt
# tokenizer models and the stopword lists.
for nltk_resource in ('gutenberg', 'punkt', 'stopwords'):
    nltk.download(nltk_resource)

import string

import gensim
from gensim.models.phrases import Phraser, Phrases
from gensim.models.word2vec import Word2Vec

import spacy # for a lemmatization example

from sklearn.manifold import TSNE

import pandas as pd
from bokeh.io import output_notebook, output_file
from bokeh.plotting import show, figure

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


 **Loading Data**

In [None]:
from nltk.corpus import gutenberg

In [None]:
# gutenberg.sents() is a convenient accessor: it handles newlines and returns the
# corpus already tokenized into sentences, each one a list of word tokens
gberg_sents = gutenberg.sents()

In [None]:
  gberg_sents[0:6]  # preview the first six tokenized sentences

[['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']'],
 ['VOLUME', 'I'],
 ['CHAPTER', 'I'],
 ['Emma',
  'Woodhouse',
  ',',
  'handsome',
  ',',
  'clever',
  ',',
  'and',
  'rich',
  ',',
  'with',
  'a',
  'comfortable',
  'home',
  'and',
  'happy',
  'disposition',
  ',',
  'seemed',
  'to',
  'unite',
  'some',
  'of',
  'the',
  'best',
  'blessings',
  'of',
  'existence',
  ';',
  'and',
  'had',
  'lived',
  'nearly',
  'twenty',
  '-',
  'one',
  'years',
  'in',
  'the',
  'world',
  'with',
  'very',
  'little',
  'to',
  'distress',
  'or',
  'vex',
  'her',
  '.'],
 ['She',
  'was',
  'the',
  'youngest',
  'of',
  'the',
  'two',
  'daughters',
  'of',
  'a',
  'most',
  'affectionate',
  ',',
  'indulgent',
  'father',
  ';',
  'and',
  'had',
  ',',
  'in',
  'consequence',
  'of',
  'her',
  'sister',
  "'",
  's',
  'marriage',
  ',',
  'been',
  'mistress',
  'of',
  'his',
  'house',
  'from',
  'a',
  'very',
  'early',
  'period',
  '.'],
 ['Her',
  'mother',
  'h

In [None]:
gberg_sents[4][14]  # token at index 14 of the fifth sentence

'father'

**Iteratively preprocess a sentence**

**Here is a tokenized sentence:**

In [None]:
gberg_sents[4]  # the fifth sentence, used as the running example below

['She',
 'was',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughters',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 'indulgent',
 'father',
 ';',
 'and',
 'had',
 ',',
 'in',
 'consequence',
 'of',
 'her',
 'sister',
 "'",
 's',
 'marriage',
 ',',
 'been',
 'mistress',
 'of',
 'his',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.']

**Now convert to lowercase:**

In [None]:
# Lowercase every token of the example sentence
list(map(str.lower, gberg_sents[4]))

['she',
 'was',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughters',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 'indulgent',
 'father',
 ';',
 'and',
 'had',
 ',',
 'in',
 'consequence',
 'of',
 'her',
 'sister',
 "'",
 's',
 'marriage',
 ',',
 'been',
 'mistress',
 'of',
 'his',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.']

**To remove stopwords and punctuation**

In [None]:
# Tokens to drop: NLTK's English stopwords followed by every ASCII punctuation character
stpwords = [*stopwords.words('english'), *string.punctuation]

In [None]:
stpwords  # inspect the combined stopword + punctuation list

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [None]:
# Lowercase each token of the fifth sentence and keep only the tokens that
# do not appear in the stopword/punctuation list
[tok for tok in (w.lower() for w in gberg_sents[4]) if tok not in stpwords]

['youngest',
 'two',
 'daughters',
 'affectionate',
 'indulgent',
 'father',
 'consequence',
 'sister',
 'marriage',
 'mistress',
 'house',
 'early',
 'period']

**Stem words:**

In [None]:
stemmer = PorterStemmer() # NLTK's implementation of the Porter stemming algorithm

In [None]:
# Lowercase, drop stopwords/punctuation, then reduce each surviving token to its Porter stem
[stemmer.stem(tok) for tok in map(str.lower, gberg_sents[4]) if tok not in stpwords]

['youngest',
 'two',
 'daughter',
 'affection',
 'indulg',
 'father',
 'consequ',
 'sister',
 'marriag',
 'mistress',
 'hous',
 'earli',
 'period']

**A lemmatization example:**

In [None]:
nlp = spacy.load('en_core_web_sm') # spaCy language models are loaded by name rather than imported
# Lemmatization with spaCy needs a trained language model; 'en_core_web_sm' is the
# small core English (web) model, and a larger variant is also available

In [None]:
gutenberg.raw()[291:477]
# Slice the raw (untokenized) Gutenberg text to get the example sentence as a plain string for spaCy

"She was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period."

In [None]:
spacy_doc = nlp(gutenberg.raw()[291:477])  # run the spaCy pipeline over the raw example sentence

In [None]:
# Lemma of every token that spaCy produced for the example sentence
[token.lemma_ for token in spacy_doc]

['-PRON-',
 'be',
 'the',
 'young',
 'of',
 'the',
 'two',
 'daughter',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 '\n',
 'indulgent',
 'father',
 ';',
 'and',
 'have',
 ',',
 'in',
 'consequence',
 'of',
 '-PRON-',
 'sister',
 "'s",
 'marriage',
 ',',
 '\n',
 'be',
 'mistress',
 'of',
 '-PRON-',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.']

**Handle bigram collocations:**

In [None]:
phrases = Phrases(gberg_sents) # train a phrase (collocation) detector on the tokenized corpus

In [None]:
bigram = Phraser(phrases) # create a more efficient Phraser object for transforming sentences with the detected phrases

In [None]:
bigram.phrasegrams # dict of all detected bigrams
# each key is a (word, word) tuple; each value is a tuple that appears to hold (count, score)

{(b'two', b'daughters'): (19, 11.966813731181546),
 (b'her', b'sister'): (195, 17.7960829227865),
 (b"'", b's'): (9781, 31.066242737744524),
 (b'very', b'early'): (24, 11.01214147275924),
 (b'Her', b'mother'): (14, 13.529425062715127),
 (b'long', b'ago'): (38, 63.22343628984788),
 (b'more', b'than'): (541, 29.023584433996874),
 (b'had', b'been'): (1256, 22.306024648925288),
 (b'an', b'excellent'): (54, 39.063874851750626),
 (b'Miss', b'Taylor'): (48, 453.75918026073305),
 (b'very', b'fond'): (28, 24.134280468850747),
 (b'passed', b'away'): (25, 12.35053642325912),
 (b'too', b'much'): (173, 31.376002029426687),
 (b'did', b'not'): (935, 11.728416217142811),
 (b'any', b'means'): (27, 14.096964108090186),
 (b'wedding', b'-'): (15, 17.4695197740113),
 (b'Her', b'father'): (18, 13.129571562488772),
 (b'after', b'dinner'): (21, 21.5285481168817),
 (b'self', b'-'): (124, 47.79018053120332),
 (b'sixteen', b'years'): (12, 107.0461671612265),
 (b'five', b'years'): (42, 40.128755673408115),
 (b'ye

In [None]:
tokenized_sentence = " I was at the New York city counsel".split()  # whitespace-tokenize a test sentence

In [None]:
tokenized_sentence  # show the resulting tokens

['I', 'was', 'at', 'the', 'New', 'York', 'city', 'counsel']

In [None]:
bigram[tokenized_sentence]
# N.B. to detect tri-grams the detector has to be run a second time; a three-word phrase
# (e.g. "New York City") can then be detected if the dataset contains it

['I', 'was', 'at', 'the', 'New_York', 'city', 'counsel']

**Preprocess the corpus**

In [None]:
# As in Maas et al. (2011):
# - Leave in stopwords ("indicative of sentiment")
# - no stemming ("model learns similar representations of words of the same stem when data suggests it")
#
# Lowercase every token and drop tokens that are a single punctuation character.
# The punctuation lookup is built once, as a set, instead of rebuilding and
# linearly scanning a list for every token in the corpus.
punctuation_chars = set(string.punctuation)
lower_sents = []
for s in gberg_sents:
  lowered = [w.lower() for w in s]
  lower_sents.append([w for w in lowered if w not in punctuation_chars])

In [None]:
lower_sents[0:5]  # first five lowercased, punctuation-free sentences

[['emma', 'by', 'jane', 'austen', '1816'],
 ['volume', 'i'],
 ['chapter', 'i'],
 ['emma',
  'woodhouse',
  'handsome',
  'clever',
  'and',
  'rich',
  'with',
  'a',
  'comfortable',
  'home',
  'and',
  'happy',
  'disposition',
  'seemed',
  'to',
  'unite',
  'some',
  'of',
  'the',
  'best',
  'blessings',
  'of',
  'existence',
  'and',
  'had',
  'lived',
  'nearly',
  'twenty',
  'one',
  'years',
  'in',
  'the',
  'world',
  'with',
  'very',
  'little',
  'to',
  'distress',
  'or',
  'vex',
  'her'],
 ['she',
  'was',
  'the',
  'youngest',
  'of',
  'the',
  'two',
  'daughters',
  'of',
  'a',
  'most',
  'affectionate',
  'indulgent',
  'father',
  'and',
  'had',
  'in',
  'consequence',
  'of',
  'her',
  'sister',
  's',
  'marriage',
  'been',
  'mistress',
  'of',
  'his',
  'house',
  'from',
  'a',
  'very',
  'early',
  'period']]

In [None]:
lower_bigram = Phraser(Phrases(lower_sents)) # Phrases() detects word pairs that are likely to be bigram collocations
# Phraser() wraps the trained detector in a more efficient object for transforming sentences

In [None]:
lower_bigram.phrasegrams # e.g. miss taylor, mr woodhouse, mr weston
# dict of all detected bigrams: each key is a (word, word) tuple,
# each value is a tuple that appears to hold (count, score)

{(b'two', b'daughters'): (19, 11.080802900992637),
 (b'her', b'sister'): (201, 16.93971298099339),
 (b'very', b'early'): (25, 10.516998773665177),
 (b'her', b'mother'): (253, 10.70812618607742),
 (b'long', b'ago'): (38, 59.226442015336005),
 (b'more', b'than'): (562, 28.529926612065935),
 (b'had', b'been'): (1260, 21.583193129694834),
 (b'an', b'excellent'): (58, 37.41859680854167),
 (b'sixteen', b'years'): (15, 131.42913000977515),
 (b'miss', b'taylor'): (48, 420.4340982546865),
 (b'mr', b'woodhouse'): (132, 104.19907841850323),
 (b'very', b'fond'): (30, 24.185726346489627),
 (b'passed', b'away'): (25, 11.751473221742694),
 (b'too', b'much'): (177, 30.36309017383541),
 (b'did', b'not'): (977, 10.846196223896685),
 (b'any', b'means'): (28, 14.294148100212627),
 (b'after', b'dinner'): (22, 18.60737125272944),
 (b'mr', b'weston'): (162, 91.63290824201266),
 (b'five', b'years'): (42, 37.66428596665674),
 (b'years', b'old'): (176, 48.599094446190286),
 (b'seven', b'years'): (53, 50.3345604

In [None]:
lower_bigram["She was at new york city".split()]  # 'new york' is written in lowercase because the detector was trained on lowercased text

['She', 'was', 'at', 'new_york', 'city']

In [None]:
# Retrain the detector with explicit min_count and threshold settings.
# NOTE: this rebinds lower_bigram, so the cells below use this detector
# rather than the one trained with default settings above.
lower_bigram = Phraser(Phrases(lower_sents, min_count=32, threshold=64))
lower_bigram.phrasegrams

{(b'afar', b'off'): (52, 108.14220347465505),
 (b'burnt', b'offering'): (184, 297.524653753951),
 (b'burnt', b'offerings'): (86, 299.15702343127646),
 (b'buster', b'bear'): (142, 479.87410772225826),
 (b'captain', b'benwick'): (56, 241.49037086312987),
 (b'captain', b'wentworth'): (196, 529.8756608388247),
 (b'charles', b'hayter'): (33, 92.03437785214481),
 (b'chief', b'priests'): (65, 116.31947753846512),
 (b'colonel', b'brandon'): (132, 1313.0078125),
 (b'couldn', b't'): (89, 171.76138536935215),
 (b'cut', b'off'): (217, 129.60290535032792),
 (b'dare', b'say'): (115, 89.94000515807346),
 (b'de', b'grey'): (77, 603.2109624246722),
 (b'didn', b't'): (180, 220.51081560283686),
 (b'doesn', b't'): (53, 106.2634985949418),
 (b'don', b't'): (830, 250.30957446808512),
 (b'dr', b'bull'): (65, 680.7870294599019),
 (b'dr', b'middleton'): (40, 162.73103819257668),
 (b'drawing', b'room'): (49, 84.91494947493561),
 (b'farmer', b'brown'): (100, 386.05179596892236),
 (b'father', b'brown'): (207, 91.

In [None]:
# Run every lowercased sentence through the bigram detector so that detected
# word pairs are merged into single underscore-joined tokens
clean_sents = [lower_bigram[sent] for sent in lower_sents]

In [None]:
clean_sents[0:9]  # first nine cleaned sentences

[['emma', 'by', 'jane', 'austen', '1816'],
 ['volume', 'i'],
 ['chapter', 'i'],
 ['emma',
  'woodhouse',
  'handsome',
  'clever',
  'and',
  'rich',
  'with',
  'a',
  'comfortable',
  'home',
  'and',
  'happy',
  'disposition',
  'seemed',
  'to',
  'unite',
  'some',
  'of',
  'the',
  'best',
  'blessings',
  'of',
  'existence',
  'and',
  'had',
  'lived',
  'nearly',
  'twenty',
  'one',
  'years',
  'in',
  'the',
  'world',
  'with',
  'very',
  'little',
  'to',
  'distress',
  'or',
  'vex',
  'her'],
 ['she',
  'was',
  'the',
  'youngest',
  'of',
  'the',
  'two',
  'daughters',
  'of',
  'a',
  'most',
  'affectionate',
  'indulgent',
  'father',
  'and',
  'had',
  'in',
  'consequence',
  'of',
  'her',
  'sister',
  's',
  'marriage',
  'been',
  'mistress',
  'of',
  'his',
  'house',
  'from',
  'a',
  'very',
  'early',
  'period'],
 ['her',
  'mother',
  'had',
  'died',
  'too',
  'long',
  'ago',
  'for',
  'her',
  'to',
  'have',
  'more',
  'than',
  'an',
 

In [None]:
clean_sents[6]  # a sentence in which several bigrams were merged into single tokens

['sixteen_years',
 'had',
 'miss_taylor',
 'been',
 'in',
 'mr_woodhouse',
 's',
 'family',
 'less',
 'as',
 'a',
 'governess',
 'than',
 'a',
 'friend',
 'very_fond',
 'of',
 'both',
 'daughters',
 'but',
 'particularly',
 'of',
 'emma']

**Run word2vec**

In [118]:
# max_vocab_size could be used instead of the raised min_count
model = Word2Vec(
    sentences=clean_sents,
    size=64,        # dimensionality of each word vector
    sg=1,           # 1 selects the skip-gram architecture
    window=10,      # context window size
    workers=4,      # number of training threads
    iter=5,         # number of passes over the corpus
    min_count=10,   # ignore tokens that occur fewer than 10 times
)
# Persist the trained model so later cells can reload it without re-training
model.save('gutenberg_model_clean.w2v')

**Explore model**

In [119]:
# Reload the saved model; start from this cell to skip re-training
model = Word2Vec.load('gutenberg_model_clean.w2v')

In [121]:
len(model.wv.vocab) # vocabulary size; would have been ~17k w/o preprocessing

10329

In [126]:
model.wv['lion']  # the learned word vector for 'lion'

array([ 0.33923522, -0.01859458,  0.48543337, -0.14419103,  0.66095763,
       -0.47023186,  0.41409552,  0.41344768, -0.57909703,  0.06661064,
       -0.16423875,  0.39893666, -0.6528274 ,  0.05688303,  0.31171143,
       -0.0509177 ,  0.34777278,  0.05182242, -0.40415594, -0.02667211,
        0.08494469, -0.36179686, -0.1886809 ,  0.13126405,  0.1425655 ,
       -0.12566085, -0.5302482 , -0.36413026, -0.1252634 ,  0.6723208 ,
        0.58930546, -0.4080293 , -0.6266837 ,  0.08644899,  0.05105847,
       -0.16920468, -0.7983046 , -0.5860473 , -0.29638   , -0.3104469 ,
        0.29827178, -0.16744144, -0.3415674 , -0.9487622 , -0.27324572,
       -0.10742486,  0.15731288,  0.20563814,  0.3077484 ,  0.59782   ,
       -0.02667256, -0.4717002 , -0.344249  , -0.08492209,  0.10019538,
        0.32844827, -0.30506483,  0.09916176, -0.02567244,  0.37232086,
       -0.62746304, -0.30099884, -0.48049322, -0.5647618 ], dtype=float32)

In [127]:
len(model.wv['lion'])  # vector length; matches the size=64 passed to Word2Vec

64

In [129]:
model.wv.most_similar('lion', topn = 4) # four most similar tokens; note there are no uppercase letters

[('lions', 0.7388654947280884),
 ('wolf', 0.724319338798523),
 ('roaring', 0.7042638063430786),
 ('whelps', 0.6571477651596069)]

In [134]:
model.wv.most_similar('dog', topn = 5)  # five tokens most similar to 'dog'

[('puppy', 0.7809305191040039),
 ('brahmin', 0.760854959487915),
 ('tiger', 0.7486941814422607),
 ('broth', 0.7285583019256592),
 ('butcher', 0.7248262763023376)]

In [135]:
  model.wv.most_similar('you', topn = 5)  # five tokens most similar to the stopword 'you'

[('yourself', 0.7714065313339233),
 ('i', 0.7677822113037109),
 ('shouldn', 0.7495094537734985),
 ('me', 0.7435998916625977),
 ('cats', 0.7381664514541626)]

In [136]:
model.wv.most_similar('every', topn = 5)  # five tokens most similar to 'every'

[('another', 0.7256044745445251),
 ('each', 0.6350995302200317),
 ('one', 0.6305866837501526),
 ('same', 0.5520212650299072),
 ('own', 0.5447354316711426)]

In [137]:
model.wv.most_similar('emma', topn = 5)  # five tokens most similar to the (lowercased) name 'emma'

[('mr_knightley', 0.9049018621444702),
 ('harriet', 0.88619065284729),
 ('mrs_weston', 0.8801501989364624),
 ('anne', 0.8647059202194214),
 ('mr_elton', 0.8583898544311523)]

In [141]:
model.wv.most_similar('ma_am', topn = 5)  # 'ma_am' is a token created by the bigram merging step

[('madam', 0.8942073583602905),
 ('mamma', 0.8788061141967773),
 ('betty', 0.8713029026985168),
 ('madman', 0.827816367149353),
 ('m_sure', 0.82740718126297)]

In [144]:
model.wv.doesnt_match("mother father sister brother lion".split())  # pick the word that fits least well with the others

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'lion'

In [148]:
model.wv.similarity('brother', 'tiger')  # similarity score for a pair of unrelated words

0.26384956

In [149]:
model.wv.similarity('brother', 'sister')  # similarity score for a pair of related words, for comparison

0.7657298

In [152]:
model.wv.most_similar(positive = ['king', 'woman'], negative = ['man'])  # word-vector arithmetic: king + woman - man

[('daughter', 0.657007098197937),
 ('sarah', 0.6354444026947021),
 ('esther', 0.6286782026290894),
 ('amnon', 0.6205866932868958),
 ('hittite', 0.6152075529098511),
 ('sister', 0.6118481159210205),
 ('rebekah', 0.610253095626831),
 ('bethuel', 0.6028515696525574),
 ('wife', 0.6021990776062012),
 ('leah', 0.6020969152450562)]

In [157]:
model.wv.most_similar(positive = ['husband', 'woman'], negative = ['man'])  # word-vector arithmetic: husband + woman - man

[('sister', 0.7464128732681274),
 ('wife', 0.7291111350059509),
 ('mother', 0.6825290322303772),
 ('daughter', 0.6785309314727783),
 ('maid', 0.6765927672386169),
 ('conceived', 0.6364141702651978),
 ('nurse', 0.6156196594238281),
 ('child', 0.6021891832351685),
 ('womb', 0.5952093601226807),
 ('widow', 0.5906627178192139)]

**Reduce word vector dimensionality with t-SNE**

In [158]:
# t-SNE is t-distributed stochastic neighbor embedding.
# The algorithm is stochastic, so random_state is fixed to make the 2-D layout
# reproducible when the notebook is re-run on the same saved model.
tsne = TSNE(n_components = 2, n_iter = 1000, random_state = 42)

In [159]:
X_2d = tsne.fit_transform(model.wv[model.wv.vocab])  # project the vector of every vocabulary token to 2-D; rows follow the iteration order of model.wv.vocab

In [160]:
# 2-D coordinates in columns 'x' and 'y', plus the token each row belongs to
# (tokens are taken from model.wv.vocab, the same mapping used to build X_2d)
coords_df = pd.DataFrame(X_2d, columns=['x', 'y']).assign(token=model.wv.vocab.keys())

In [161]:
coords_df.head()  # preview the first rows

Unnamed: 0,x,y,token
0,-40.799686,-48.408154,emma
1,-44.080421,17.327879,by
2,-42.250561,-45.791157,jane
3,-23.454355,2.906473,volume
4,-43.600342,-2.167481,i


In [165]:
coords_df.to_csv('clean_gutenberg_tsne.csv', index = False)  # save the coordinates so t-SNE does not have to be re-run

**Visualize**

In [166]:
coords_df = pd.read_csv('clean_gutenberg_tsne.csv')  # reload the saved coordinates; start from this cell to skip t-SNE

In [167]:
output_notebook()  # send Bokeh output to the notebook instead of a file

In [168]:
# Plot only a random subset of tokens to keep the figure readable.
# The sample size is capped at the number of available rows (sample() raises
# when n exceeds it without replacement), and the seed is fixed so the same
# tokens are drawn on every run.
subset_df = coords_df.sample(n = min(5000, len(coords_df)), random_state = 42)

In [169]:
# 800x800 canvas with every sampled token drawn as text at its t-SNE coordinates
p = figure(plot_width=800, plot_height=800)
_ = p.text(x=subset_df['x'], y=subset_df['y'], text=subset_df['token'])

In [170]:
  show(p)  # display the figure