Using word2vec to create an unsupervised neural network.

Word2vec is a shallow neural network model for converting words to vectors using distributed representation, each word is represented by many neurons, and each neuron is involved in representing many words. 

Useful for parsing requests written by people, but works well only for larger datasets (i.e. corpus that is several billion words long)

In [144]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import gutenberg, stopwords
from nltk.stem import WordNetLemmatizer

In [145]:
def text_cleaner(text):
    #text = re.sub("[\[].*?[\]]", "", text)
    text = re.sub('[^a-zA-Z]',' ',text)
    text = text.lower()
    text = text.split()
    lmz = WordNetLemmatizer()
    text = [lmz.lemmatize(word) for word in text if not word in set(stopwords.words('english'))]
    text = ' '.join(text)
    return text

In [146]:
austen = ""
for novel in ['persuasion','emma','sense']:
    work = gutenberg.raw('austen-'+novel+'.txt')
    austen += work

In [147]:
austen = re.sub(r'Chapter \d+','',austen)
austen = re.sub(r'--',' ',austen)

In [148]:
# parse the data into sentences
austen_sent = nltk.sent_tokenize(austen)
#austen_sent = nltk.word_tokenize(austen)

In [149]:
corpus = []
for sent in list(austen_sent):
    cleaned_sentence = text_cleaner(sent)
    cleaned_sentence = nltk.word_tokenize(cleaned_sentence)
    corpus.append(cleaned_sentence)

In [150]:
print(corpus[20])
print('We have {} sentences and {} tokens.'.format(len(corpus), len(austen_sent)))

['one', 'daughter', 'eldest', 'would', 'really', 'given', 'thing', 'much', 'tempted']
We have 17565 sentences and 17565 tokens.


In [151]:
"loud" in corpus

False

In [155]:
import gensim
from gensim.models import word2vec

model = word2vec.Word2Vec(
    corpus,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=6,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

In [156]:
# List of words in model.
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))
# Similarity is calculated using the cosine, so again 1 is total
# similarity and 0 is no similarity.
print("\n",model.wv.similarity('loud', 'aloud'))
print("\n",model.wv.similarity('mr', 'miss'))

# One of these things is not like the other...
print("\n",model.doesnt_match("breakfast marriage dinner lunch".split()))

[('miss', 0.5413392782211304), ('blame', 0.5381035804748535), ('shew', 0.5361624360084534), ('niece', 0.5323333144187927), ('compare', 0.49518486857414246), ('pressing', 0.4615521728992462), ('mr', 0.45491349697113037), ('handsome', 0.4526069164276123), ('friend', 0.4508165121078491), ('eligible', 0.43741124868392944)]

 0.63148165

 0.67760015

 marriage


  if np.issubdtype(vec.dtype, np.int):
  # This is added back by InteractiveShellApp.init_path()
