## Best practices for NLP

Best way to see something works(removing stopwords, lowercasing, remove punkts) is actually experimenting with it

#### Load dependencies

In [58]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import gensim
from gensim.models.word2vec import Word2Vec
#Why do we need TSNE?
from sklearn.manifold import TSNE
import pandas as pd
#For interactive visualizations of our word vectors
from bokeh.io import output_notebook, output_file
from bokeh.plotting import show, figure
%matplotlib inline

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/mohammedsunasra/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
import string
from nltk.corpus import stopwords
from nltk.stem.porter import *
from gensim.models.phrases import Phraser, Phrases
from keras.preprocessing.text import one_hot

In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mohammedsunasra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

#### Load data

In [5]:
from nltk.corpus import gutenberg

In [6]:
gber_sents = gutenberg.sents()

#### Preprocessing sentences

In [7]:
gber_sents[4]

['She',
 'was',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughters',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 'indulgent',
 'father',
 ';',
 'and',
 'had',
 ',',
 'in',
 'consequence',
 'of',
 'her',
 'sister',
 "'",
 's',
 'marriage',
 ',',
 'been',
 'mistress',
 'of',
 'his',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.']

#### Lowercase sentences

In [8]:
[w.lower() for w in gber_sents[4]]

['she',
 'was',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughters',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 'indulgent',
 'father',
 ';',
 'and',
 'had',
 ',',
 'in',
 'consequence',
 'of',
 'her',
 'sister',
 "'",
 's',
 'marriage',
 ',',
 'been',
 'mistress',
 'of',
 'his',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.']

#### Remove stopwords and punctuation

In [16]:
stop_words = list(set(stopwords.words('english'))) + list(string.punctuation)
stop_words

['himself',
 'our',
 'below',
 've',
 'should',
 'an',
 'as',
 'during',
 "don't",
 'didn',
 'theirs',
 'which',
 'about',
 'again',
 'm',
 "needn't",
 'themselves',
 're',
 'shouldn',
 "you'll",
 'mustn',
 'then',
 'ain',
 "that'll",
 'wasn',
 'doing',
 'd',
 'more',
 'those',
 'in',
 "weren't",
 'whom',
 'on',
 'hers',
 "couldn't",
 'over',
 'have',
 'few',
 'they',
 'herself',
 'how',
 'were',
 'once',
 'been',
 'before',
 'will',
 "hadn't",
 'if',
 'when',
 'do',
 'just',
 "aren't",
 'll',
 'being',
 'any',
 'by',
 'where',
 's',
 'here',
 'did',
 'it',
 'up',
 'and',
 'hadn',
 'or',
 'hasn',
 'does',
 'him',
 "you've",
 'because',
 'against',
 'so',
 "won't",
 'no',
 'wouldn',
 'needn',
 'at',
 'its',
 "it's",
 'had',
 'your',
 'until',
 'most',
 'further',
 'too',
 'only',
 'these',
 'can',
 'ourselves',
 'her',
 'his',
 'into',
 'y',
 'ours',
 'very',
 'not',
 'for',
 "isn't",
 'is',
 'with',
 'such',
 'myself',
 'to',
 'through',
 'yours',
 'isn',
 'won',
 "mightn't",
 'o',
 'b

In [17]:
[w.lower() for w in gber_sents[4] if w not in stop_words]

['she',
 'youngest',
 'two',
 'daughters',
 'affectionate',
 'indulgent',
 'father',
 'consequence',
 'sister',
 'marriage',
 'mistress',
 'house',
 'early',
 'period']

#### Stemming Words

In [18]:
stemmer = PorterStemmer()

In [19]:
[stemmer.stem(w.lower()) for w in gber_sents[4] if w not in stop_words]

['she',
 'youngest',
 'two',
 'daughter',
 'affection',
 'indulg',
 'father',
 'consequ',
 'sister',
 'marriag',
 'mistress',
 'hous',
 'earli',
 'period']

#### Handling Bigrams

In [20]:
phrases = Phrases(gber_sents)

In [21]:
bigram = Phraser(phrases)

In [22]:
bigram.phrasegrams

{(b'two', b'daughters'): (19, 11.966813731181546),
 (b'her', b'sister'): (195, 17.7960829227865),
 (b"'", b's'): (9781, 31.066242737744524),
 (b'very', b'early'): (24, 11.01214147275924),
 (b'Her', b'mother'): (14, 13.529425062715127),
 (b'long', b'ago'): (38, 63.22343628984788),
 (b'more', b'than'): (541, 29.023584433996874),
 (b'had', b'been'): (1256, 22.306024648925288),
 (b'an', b'excellent'): (54, 39.063874851750626),
 (b'Miss', b'Taylor'): (48, 453.75918026073305),
 (b'very', b'fond'): (28, 24.134280468850747),
 (b'passed', b'away'): (25, 12.35053642325912),
 (b'too', b'much'): (173, 31.376002029426687),
 (b'did', b'not'): (935, 11.728416217142811),
 (b'any', b'means'): (27, 14.096964108090186),
 (b'wedding', b'-'): (15, 17.4695197740113),
 (b'Her', b'father'): (18, 13.129571562488772),
 (b'after', b'dinner'): (21, 21.5285481168817),
 (b'self', b'-'): (124, 47.79018053120332),
 (b'sixteen', b'years'): (12, 107.0461671612265),
 (b'five', b'years'): (42, 40.128755673408115),
 (b'ye

In [23]:
"Jon lives in New York City".split()

['Jon', 'lives', 'in', 'New', 'York', 'City']

In [28]:
bigram["Jon lives in New York City".split()]

['Jon', 'lives', 'in', 'New_York', 'City']

#### Preprocess Corpus

In [29]:
lower_sents = []
for s in gber_sents:
    lower_sents.append([w.lower() for w in s if w not in list(string.punctuation)])

In [30]:
 lower_sents[:10]

[['emma', 'by', 'jane', 'austen', '1816'],
 ['volume', 'i'],
 ['chapter', 'i'],
 ['emma',
  'woodhouse',
  'handsome',
  'clever',
  'and',
  'rich',
  'with',
  'a',
  'comfortable',
  'home',
  'and',
  'happy',
  'disposition',
  'seemed',
  'to',
  'unite',
  'some',
  'of',
  'the',
  'best',
  'blessings',
  'of',
  'existence',
  'and',
  'had',
  'lived',
  'nearly',
  'twenty',
  'one',
  'years',
  'in',
  'the',
  'world',
  'with',
  'very',
  'little',
  'to',
  'distress',
  'or',
  'vex',
  'her'],
 ['she',
  'was',
  'the',
  'youngest',
  'of',
  'the',
  'two',
  'daughters',
  'of',
  'a',
  'most',
  'affectionate',
  'indulgent',
  'father',
  'and',
  'had',
  'in',
  'consequence',
  'of',
  'her',
  'sister',
  's',
  'marriage',
  'been',
  'mistress',
  'of',
  'his',
  'house',
  'from',
  'a',
  'very',
  'early',
  'period'],
 ['her',
  'mother',
  'had',
  'died',
  'too',
  'long',
  'ago',
  'for',
  'her',
  'to',
  'have',
  'more',
  'than',
  'an',
 

In [31]:
lower_bigram = Phraser(Phrases(lower_sents))

In [33]:
lower_bigram.phrasegrams

{(b'two', b'daughters'): (19, 11.080802900992637),
 (b'her', b'sister'): (201, 16.93971298099339),
 (b'very', b'early'): (25, 10.516998773665177),
 (b'her', b'mother'): (253, 10.70812618607742),
 (b'long', b'ago'): (38, 59.226442015336005),
 (b'more', b'than'): (562, 28.529926612065935),
 (b'had', b'been'): (1260, 21.583193129694834),
 (b'an', b'excellent'): (58, 37.41859680854167),
 (b'sixteen', b'years'): (15, 131.42913000977515),
 (b'miss', b'taylor'): (48, 420.4340982546865),
 (b'mr', b'woodhouse'): (132, 104.19907841850323),
 (b'very', b'fond'): (30, 24.185726346489627),
 (b'passed', b'away'): (25, 11.751473221742694),
 (b'too', b'much'): (177, 30.36309017383541),
 (b'did', b'not'): (977, 10.846196223896685),
 (b'any', b'means'): (28, 14.294148100212627),
 (b'after', b'dinner'): (22, 18.60737125272944),
 (b'mr', b'weston'): (162, 91.63290824201266),
 (b'five', b'years'): (42, 37.66428596665674),
 (b'years', b'old'): (176, 48.599094446190286),
 (b'seven', b'years'): (53, 50.3345604

In [34]:
lower_bigram = Phraser(Phrases(lower_sents, min_count=32, threshold=64))

In [35]:
lower_bigram.phrasegrams

{(b'miss', b'taylor'): (48, 156.44059469941823),
 (b'mr', b'woodhouse'): (132, 82.04651843976633),
 (b'mr', b'weston'): (162, 75.87438262077481),
 (b'mrs', b'weston'): (249, 160.68485093258923),
 (b'great', b'deal'): (182, 93.36368125424357),
 (b'mr', b'knightley'): (277, 161.74131790625913),
 (b'miss', b'woodhouse'): (173, 229.03802722366902),
 (b'years', b'ago'): (56, 74.31594785893046),
 (b'mr', b'elton'): (214, 121.3990121932397),
 (b'dare', b'say'): (115, 89.94000515807346),
 (b'frank', b'churchill'): (151, 1316.4456593286038),
 (b'miss', b'bates'): (113, 276.39588291692513),
 (b'drawing', b'room'): (49, 84.91494947493561),
 (b'mrs', b'goddard'): (58, 143.57843432545658),
 (b'miss', b'smith'): (58, 73.03442128232508),
 (b'few', b'minutes'): (86, 204.16834974753786),
 (b'john', b'knightley'): (58, 83.03755747111268),
 (b'don', b't'): (830, 250.30957446808512),
 (b'good', b'natured'): (66, 88.69936184891343),
 (b'few', b'moments'): (43, 107.77584531675087),
 (b'thousand', b'pounds')

In [36]:
clean_sents = []
for s in lower_sents:
    clean_sents.append(lower_bigram[s])

In [38]:
clean_sents[:7]

[['emma', 'by', 'jane', 'austen', '1816'],
 ['volume', 'i'],
 ['chapter', 'i'],
 ['emma',
  'woodhouse',
  'handsome',
  'clever',
  'and',
  'rich',
  'with',
  'a',
  'comfortable',
  'home',
  'and',
  'happy',
  'disposition',
  'seemed',
  'to',
  'unite',
  'some',
  'of',
  'the',
  'best',
  'blessings',
  'of',
  'existence',
  'and',
  'had',
  'lived',
  'nearly',
  'twenty',
  'one',
  'years',
  'in',
  'the',
  'world',
  'with',
  'very',
  'little',
  'to',
  'distress',
  'or',
  'vex',
  'her'],
 ['she',
  'was',
  'the',
  'youngest',
  'of',
  'the',
  'two',
  'daughters',
  'of',
  'a',
  'most',
  'affectionate',
  'indulgent',
  'father',
  'and',
  'had',
  'in',
  'consequence',
  'of',
  'her',
  'sister',
  's',
  'marriage',
  'been',
  'mistress',
  'of',
  'his',
  'house',
  'from',
  'a',
  'very',
  'early',
  'period'],
 ['her',
  'mother',
  'had',
  'died',
  'too',
  'long',
  'ago',
  'for',
  'her',
  'to',
  'have',
  'more',
  'than',
  'an',
 

#### Run Word2vec

In [39]:
model = Word2Vec(clean_sents, sg=1, window=10, min_count=10, seed=42)

In [40]:
model.save('../model_files/clean_word2vec.w2v')

In [41]:
model = gensim.models.Word2Vec.load('../model_files/clean_word2vec.w2v')

In [42]:
len(model.wv.vocab)

10329

#### Explore model

In [43]:
model['ma_am']

  """Entry point for launching an IPython kernel.


array([-0.21365122, -0.11401936, -0.17676716,  0.41036224,  0.09619897,
        0.48350468,  0.23163974,  0.25707546,  0.60591334,  0.89126915,
       -0.04515198,  0.10589614, -0.423108  ,  0.08237524,  0.2474729 ,
        0.41331723,  0.06065598,  0.0599636 , -0.13382809, -0.16983306,
        0.5339856 , -0.22054255, -0.22334664,  0.22766311, -0.1660783 ,
       -0.12226194,  0.01537991,  0.04800711, -0.6187926 ,  0.18831447,
       -0.08592638, -0.00806423, -0.15213048,  0.44784316, -0.37994477,
        0.35566205,  0.30885974, -0.13167918,  0.19103295,  0.25501767,
        0.1846549 , -0.19736503, -0.12839311,  0.05482799,  0.2620933 ,
       -0.20355509,  0.06863803,  0.12590112, -0.21675912,  0.02793868,
        0.05482648,  0.24467334,  0.01709569,  0.14545761, -0.32066518,
       -0.34167612,  0.20415774, -0.03545104, -0.11355912,  0.40399754,
       -0.27700558, -0.11336917, -0.16498002, -0.12515575,  0.10446592,
       -0.5234462 , -0.16500714,  0.22102597,  0.41825244,  0.59

In [44]:
model.most_similar("ma_am")

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('betty', 0.8421958684921265),
 ('madam', 0.81927490234375),
 ('mamma', 0.8180132508277893),
 ('nancy', 0.808853030204773),
 ('.--"', 0.798877477645874),
 ('madman', 0.7978969812393188),
 ('bunger', 0.7935133576393127),
 ("'--", 0.7908250689506531),
 ('ay', 0.7884966135025024),
 ('frederick', 0.7855561971664429)]

In [45]:
model.most_similar(positive=['father','woman'], negative=['man'])

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('wife', 0.7032788991928101),
 ('husband', 0.6997305154800415),
 ('mother', 0.6945711374282837),
 ('daughter', 0.6922815442085266),
 ('sister', 0.6628555059432983),
 ('sarah', 0.6451992988586426),
 ('tamar', 0.6435819864273071),
 ('daughters', 0.638407289981842),
 ('amnon', 0.6359095573425293),
 ('elder', 0.6260582208633423)]

#### Reduce similarity

In [46]:
X = model[model.wv.vocab]

  """Entry point for launching an IPython kernel.


In [48]:
tsne = TSNE(n_components=2, n_iter=1000)

In [49]:
X_2d = tsne.fit_transform(X)

In [51]:
df_cleaned = pd.DataFrame(X_2d, columns=['x','y'])
df_cleaned['token'] = model.wv.vocab.keys()
df_cleaned.head()

Unnamed: 0,x,y,token
0,-62.188011,-4.949489,emma
1,0.352014,-26.590994,by
2,-61.196312,-7.449203,jane
3,-11.633715,-17.161047,volume
4,-22.090017,-32.757023,i


In [52]:
output_notebook()

In [54]:
subset_df = df_cleaned.sample(n=5000)

In [55]:
p = figure(plot_width=800, plot_height=800)
_ = p.text(x=subset_df.x, y=subset_df.y, text=subset_df.token)

In [56]:
show(p)

In [61]:
output_file(filename='word2vec_gutenberg.html')