In [None]:
import pandas as pd
import spacy
from glob import glob
# import word2vec
# import gensim
# from gensim.test.utils import common_texts
# from gensim.models import Word2Vec
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
import json
from mpl_toolkits.mplot3d import Axes3D, proj3d
from numpy import dot
from numpy.linalg import norm
%matplotlib notebook
plt.rcParams["figure.figsize"] = (12,8)

In [None]:
# Load spaCy's large English pipeline once; every later cell reuses `nlp`
# to parse text and read `.vector` attributes.
# NOTE(review): presumably needs `python -m spacy download en_core_web_lg`
# beforehand — confirm in the workshop setup instructions.
nlp = spacy.load('en_core_web_lg')

# Advanced Topics in Word Embeddings

## NYCDH Week, February 2019

Here are some more resources for getting started: 

- [A classic primer on Word Embeddings, from Google (uses TensorFlow)](https://www.tensorflow.org/tutorials/representation/word2vec)
- [Another word2vec tutorial using TensorFlow](https://adventuresinmachinelearning.com/word2vec-tutorial-tensorflow/)
- [The original documentation of word2vec](https://code.google.com/archive/p/word2vec/)
- [spaCy docs on vector similarity](https://spacy.io/usage/vectors-similarity)
- [Gensim Docs](https://radimrehurek.com/gensim/models/keyedvectors.html)


For getting started in NLP more generally, here are two notebooks of mine: 

- [Introduction to Text Analysis](https://github.com/JonathanReeve/dataviz-workshop-2017)
- [Advanced Text Analysis Workshop](https://github.com/JonathanReeve/advanced-text-analysis-workshop-2017)

## An Example of Document Vectors: Project Gutenberg

![First 30 Books of Project Gutenberg](example-gut.png)

# Word Vector Similarity

In [None]:
# Parse three short phrases into spaCy Docs for the similarity examples below.
africanSwallow, europeanSwallow, coconut = (
    nlp(phrase) for phrase in ('African swallow', 'European swallow', 'coconut')
)

In [None]:
# Compare the two swallow phrases with spaCy's built-in Doc.similarity.
africanSwallow.similarity(europeanSwallow)

In [None]:
# Same measure against an unrelated word, for contrast with the previous cell.
africanSwallow.similarity(coconut)

In [None]:
def similarity(vecA, vecB):
    """Return the cosine similarity of two vectors.

    Parameters
    ----------
    vecA, vecB : array-like
        One-dimensional vectors of equal length.

    Returns
    -------
    float
        dot(vecA, vecB) divided by the product of their Euclidean norms.
        If either vector has zero norm the cosine is undefined; 0.0 is
        returned instead of dividing by zero (which would yield NaN and a
        RuntimeWarning).
    """
    denominator = norm(vecA, ord=2) * norm(vecB, ord=2)
    if denominator == 0:
        return 0.0
    return dot(vecA, vecB) / denominator

In [None]:
# Repeat the swallow comparison through the hand-written cosine helper,
# feeding it the raw Doc vectors.
similarity(africanSwallow.vector, europeanSwallow.vector)

In [None]:
# Element-wise difference of the two phrase vectors; used later as a query
# vector for mostSimilar.
swallowArithmetic = (africanSwallow.vector - europeanSwallow.vector)

In [None]:
def mostSimilar(vec, n=10):
    """Return the vocabulary words whose vectors are most similar to `vec`.

    Every entry of `nlp.vocab` is scored with `similarity`, and the `n`
    best-scoring words are returned. (Tracking only each new running maximum
    would report whichever words happened to break the record in iteration
    order, not the actual top matches.)

    Parameters
    ----------
    vec : array-like
        Query vector, compared against each vocabulary entry's `.vector`.
    n : int, optional
        Maximum number of results to return (default 10).

    Returns
    -------
    list of (str, float)
        Lower-cased words paired with their similarity, sorted in ascending
        order so the best match is last. Only words with a similarity
        greater than 0 are included.
    """
    if n <= 0:
        return []
    bestByWord = {}
    for w in nlp.vocab:
        sim = similarity(vec, w.vector)
        word = w.text.lower()
        # Keep one entry per lower-cased word (its best score). The baseline
        # of 0 keeps only positive similarities; a NaN score compares False
        # here and is therefore skipped as well.
        if sim > bestByWord.get(word, 0):
            bestByWord[word] = sim
    ranked = sorted(bestByWord.items(), key=lambda pair: pair[1])
    return ranked[-n:]

# Analogies (Linear Algebra)

In [None]:
# Which words sit closest to the African-minus-European swallow difference vector?
mostSimilar(swallowArithmetic)

In [None]:
# Nearest words to the vector for 'coconut'.
mostSimilar(coconut.vector)

In [None]:
# Look up one vector per word for the classic analogy example.
king, queen, woman, man = (
    nlp(word).vector for word in ('king', 'queen', 'woman', 'man')
)

In [None]:
# Analogy by vector arithmetic: start at 'king', remove 'man', add 'woman'.
# NOTE(review): presumably the result is expected to land near `queen` — the
# next cell checks which words are actually closest.
answer = king - man + woman

In [None]:
# Words closest to the king - man + woman vector.
mostSimilar(answer)

In [None]:
# Same analogy arithmetic with places: Paris - France + Germany.
paris, france, germany = (
    nlp(place).vector for place in ('Paris', 'France', 'Germany')
)
answer = (paris - france) + germany
mostSimilar(answer)

# Pride and Prejudice

In [None]:
# Read the whole novel into one string. The context manager closes the file
# handle as soon as the text is in memory instead of leaving it open.
with open('pride.txt') as prideFile:
    pride = prideFile.read()

In [None]:
# Run the spaCy pipeline over the full text of the novel.
# NOTE(review): likely slow for a book-length input — confirm expected runtime.
prideDoc = nlp(pride)

In [None]:
# Keep the first 40 noun tokens of the novel. Compare against the exact
# coarse tag: a prefix test on 'N' would also let numerals (tagged 'NUM')
# through as if they were nouns.
prideNouns = [w for w in prideDoc if w.pos_ == 'NOUN'][:40]
# Lemmas serve as the text labels in the plot below.
prideNounLabels = [w.lemma_ for w in prideNouns]

In [None]:
# One vector per selected noun token, in the same order as prideNounLabels.
prideNounVecs = [w.vector for w in prideNouns]

In [None]:
# Reduce the noun vectors to three principal components so they can be
# drawn in a 3D scatter plot.
reduced = PCA(n_components=3).fit_transform(prideNounVecs)

In [None]:
# Wrap the PCA output in a DataFrame; plotResults3D reads its columns 0, 1
# and 2 as the x, y and z coordinates.
prideDF = pd.DataFrame(reduced)

In [None]:
# Select the interactive notebook backend again (it is also set in the first
# cell) and use a slightly smaller default figure for the 3D plot.
%matplotlib notebook
plt.rcParams["figure.figsize"] = (10,8)

def plotResults3D(df, labels):
    """Draw a 3D scatter plot with a text label next to each point.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have columns 0, 1 and 2 holding the x, y and z coordinates.
    labels : sequence of str
        One label per row of `df`, in row order.

    Returns
    -------
    None
        The figure is created on the current matplotlib backend.
    """
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(df[0], df[1], df[2], marker='o')
    # Pair rows with labels by position rather than by index label, so the
    # function also works when `df` does not carry the default 0..n-1 index.
    for (x, y, z), label in zip(df[[0, 1, 2]].values, labels):
        ax.text(x, y, z, label)

In [None]:
# Plot the PCA-reduced noun vectors, each labelled with its lemma.
plotResults3D(prideDF, prideNounLabels)

In [None]:
# Redo this function with only nouns from Pride and Prejudice
def mostSimilar(vec, n=10):
    """Return the Pride and Prejudice nouns most similar to `vec`.

    This deliberately replaces the vocabulary-wide `mostSimilar` defined
    earlier in the notebook: from this cell on, candidates come from
    `prideNouns` only. Every candidate is scored and ranked, rather than
    keeping only the successive running maxima seen in iteration order.

    Parameters
    ----------
    vec : array-like
        Query vector, compared against each noun token's `.vector`.
    n : int, optional
        Maximum number of results to return (default 10).

    Returns
    -------
    list of (str, float)
        Lower-cased noun texts paired with their similarity, sorted in
        ascending order so the best match is last. Only nouns with a
        similarity greater than 0 are included; a noun that occurs several
        times in `prideNouns` is listed once.
    """
    if n <= 0:
        return []
    bestByWord = {}
    for w in prideNouns:
        sim = similarity(vec, w.vector)
        word = w.text.lower()
        # Baseline 0 keeps positive similarities only; NaN compares False
        # and is skipped.
        if sim > bestByWord.get(word, 0):
            bestByWord[word] = sim
    ranked = sorted(bestByWord.items(), key=lambda pair: pair[1])
    return ranked[-n:]

In [None]:
# Which of the selected nouns from the novel are closest to 'wife'?
mostSimilar(nlp('wife').vector)

# Senses

In [None]:
# One reference Doc per sense word; whichSense compares inputs against these.
senseDocs = list(map(nlp, ('sound', 'sight', 'touch', 'smell')))

def whichSense(word):
    """Score `word` against each of the sense words.

    Returns a dict keyed by the Doc objects in `senseDocs`, each mapped to
    the spaCy similarity between that Doc and the parsed `word`.
    """
    wordDoc = nlp(word)
    scores = {}
    for senseDoc in senseDocs:
        scores[senseDoc] = wordDoc.similarity(senseDoc)
    return scores


In [None]:
# Try it on one word: similarity of 'symphony' to each sense word.
whichSense('symphony')

In [None]:
# Switch to the inline backend from here on and widen the default figure size.
%matplotlib inline
plt.rcParams["figure.figsize"] = (14,8)

In [None]:
# Score a few test words against every sense and chart the results:
# one group of bars per test word, one bar per sense.
testWords = ['symphony', 'itchy', 'flower', 'crash']
senseScores = pd.DataFrame([whichSense(w) for w in testWords], index=testWords)
senseScores.plot(kind='bar')

# The Inaugural Address Corpus

In [None]:
# Collect the speech files in sorted filename order.
inauguralFilenames = sorted(glob('inaugural/*'))
# Drop the 10-character 'inaugural/' prefix and the last 4 characters
# (presumably a '.txt' extension — confirm against the data directory).
inauguralLabels = [fn[10:-4] for fn in inauguralFilenames]
# The first four characters of each label are parsed as the year.
inauguralDates = [int(label[:4]) for label in inauguralLabels]
# One letter per speech, used as the marker colour in the scatter plot below.
parties = 'rrrbbrrrbbbbbrrbbrrbrrrbbrrbr' # I did this manually. There are probably errors.
# Read each speech inside a context manager so its file handle is closed
# straight away instead of being left open.
inauguralRaw = []
for fn in inauguralFilenames:
    with open(fn, errors="ignore") as speechFile:
        inauguralRaw.append(speechFile.read())

In [None]:
# Sanity check: peek at the label, year and opening characters of the
# first four speeches.
for idx in range(4):
    label, year, text = inauguralLabels[idx], inauguralDates[idx], inauguralRaw[idx]
    print(label[:30], year, text[:30])

In [None]:
# Parse every speech with the spaCy pipeline: one Doc per address.
inauguralDocs = [nlp(text) for text in inauguralRaw]

In [None]:
# One document-level vector per speech, in the same order as inauguralLabels.
inauguralVecs = [doc.vector for doc in inauguralDocs]

In [None]:
# Compute a similarity matrix: row i holds the similarity of speech i to
# every speech (itself included).
# Note: this is not very efficient — every pair is scored twice.
similarities = [
    [doc.similarity(other) for other in inauguralDocs]
    for doc in inauguralDocs
]

In [None]:
# Label both axes of the similarity matrix with the speech labels so results
# can be read by name.
df = pd.DataFrame(similarities, columns=inauguralLabels, index=inauguralLabels)

In [None]:
# For each speech, find the label of its most similar speech. Entries that
# are not below 1 are blanked to NaN first so idxmax skips them
# (presumably this is how each speech's match with itself is excluded).
df.where(df < 1).idxmax()

In [None]:
# Project the document vectors onto two principal components for a 2D plot.
embedded = PCA(n_components=2).fit_transform(inauguralVecs)

In [None]:
# One marker per speech: coloured by its party letter, sized by the number
# of years after 1900, and annotated with the speech label.
# NOTE(review): the size is zero or negative for any year <= 1900 — confirm
# the corpus only contains later speeches.
xs, ys = embedded[:, 0], embedded[:, 1]
for i, (x, y) in enumerate(zip(xs, ys)):
    plt.scatter(x, y, c=parties[i], s=inauguralDates[i] - 1900)
    plt.annotate(inauguralLabels[i], (x, y))

# Detective Novels

In [None]:
# Load the corpus. The context manager closes the file once it has been
# parsed instead of leaving the handle open.
with open('detectives.json') as detectiveJSON:
    detectivesData = json.load(detectiveJSON)
detectivesData = detectivesData[1:] # Chop off the first entry, which is actually a duplicate

In [None]:
# Pull the full text out of each book record (each record has a 'text' key).
detectiveTexts = [book['text'] for book in detectivesData]

In [None]:
# Length of each book in characters. The books are displayed here because
# they are cut down to a common length in the next cell.
detectiveLengths = [len(text) for text in detectiveTexts] # How long are they? We might want to cut them down
detectiveLengths

In [None]:
# Cut every book down to the length of the shortest one so all texts are the
# same size. The minimum is computed once rather than once per book;
# `default=0` keeps an empty corpus from raising and yields an empty list.
shortestLength = min(detectiveLengths, default=0)
detectiveTextsTruncated = [t[:shortestLength] for t in detectiveTexts]

In [None]:
# Parse each truncated book with the spaCy pipeline: one Doc per book.
detectiveDocs = [nlp(book) for book in detectiveTextsTruncated] # This should take a while

In [None]:
# A few single words to plot alongside the novels as reference points.
extraWords = ['gun', 'knife', 'snake', 'diamond']
extraDocs = list(map(nlp, extraWords))
extraVecs = [extraDoc.vector for extraDoc in extraDocs]

In [None]:
# One document vector per parsed book.
detectiveVecs = [bookDoc.vector for bookDoc in detectiveDocs]

# Label each book as "<author text before the first comma>-<first 20
# characters of the title>".
detectiveLabels = []
for book in detectivesData:
    authorHead = book['author'].split(',')[0]
    detectiveLabels.append(authorHead + '-' + book['title'][:20])

In [None]:
# Display the generated labels to check that they read sensibly.
detectiveLabels

In [None]:
# Reduce the book vectors plus the reference-word vectors to 10 principal
# components, then lay them out in 2D with t-SNE.
pcaOut = PCA(n_components=10).fit_transform(detectiveVecs + extraVecs)
# t-SNE is stochastic, so pin the seed to get the same layout on every run.
# Its perplexity must stay below the number of samples; the default of 30 is
# capped for a corpus this small and is unchanged for larger ones.
tsneOut = TSNE(
    n_components=2,
    perplexity=min(30, len(pcaOut) - 1),
    random_state=42,
).fit_transform(pcaOut)

In [None]:
# Plot every t-SNE point with its label. Labels follow the order of the rows
# fed to PCA: the books first, then the reference words.
xs, ys = tsneOut[:, 0], tsneOut[:, 1]
pointLabels = detectiveLabels + extraWords
for i, (x, y) in enumerate(zip(xs, ys)):
    plt.scatter(x, y)
    plt.annotate(pointLabels[i], (x, y))

In [None]:
# Sanity check: count how often each reference word occurs (by lemma) in
# each book.
counts = {}
for label in detectiveLabels:
    counts[label] = dict.fromkeys(extraWords, 0)

for bookIndex, bookDoc in enumerate(detectiveDocs):
    tally = counts[detectiveLabels[bookIndex]]
    for token in bookDoc:
        lemma = token.lemma_
        if lemma in extraWords:
            tally[lemma] += 1

In [None]:
# Transpose so each book is a row, then draw one group of bars per book
# with one bar per reference word.
pd.DataFrame(counts).T.plot(kind='bar')

# Train Your Own

In [None]:
# Requires Gensim. Its imports are commented out in the first cell so the
# rest of the notebook runs without it; import here so this cell does not
# fail with a NameError.
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

# Train a small model on Gensim's bundled toy corpus.
# NOTE: gensim >= 4.0 renamed the `size` argument to `vector_size`; change
# the keyword below if you are on a newer release.
model = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4)

In [None]:
# Take the trained vectors from the model's `wv` attribute.
word_vectors = model.wv

In [None]:
# List the public (non-underscore) attributes and methods of the
# word-vector object, to see what can be done with it.
[w for w in dir(word_vectors) if not w.startswith('_')]