# Solutions VI: Word Embeddings

In [1]:
import spacy
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Load the spaCy pipeline "en_core_web_md"; the cells below rely on its
# word vectors through nlp.vocab.vectors and Doc.vector.
nlp = spacy.load("en_core_web_md")

In [3]:
def most_similar(text, n=5):
    """Look up the vocabulary entries whose vectors are closest to `text`.

    Args:
        text: Text to embed; the vector of its Doc (``nlp(text).vector``)
            is used as the query.
        n: Number of nearest entries to request (default 5).

    Returns:
        dict mapping the text of each matched vocabulary entry to its
        similarity score, in the order returned by the vector table.
    """
    # The vector table is queried with a batch of vectors, so wrap the
    # single query vector in a 2-D array with one row.
    queries = np.array([nlp(text).vector])

    # The result is a (keys, best_rows, scores) triple with one row per query.
    keys, _, scores = nlp.vocab.vectors.most_similar(queries, n=n)

    # Translate each key of the first (and only) query back into its word.
    matches = {}
    for key, score in zip(keys[0], scores[0]):
        matches[nlp.vocab[key].text] = score
    return matches

### Word Vectors

In [4]:
def similarity(text_left, text_right):
    """Return the similarity score of two texts.

    Both texts are turned into spaCy Docs with ``nlp()`` and compared with
    ``Doc.similarity`` (cosine similarity of the Doc vectors by default,
    per the spaCy documentation).

    Args:
        text_left: First text.
        text_right: Second text.

    Returns:
        The value returned by ``Doc.similarity`` for the two Docs.
    """
    left_doc, right_doc = nlp(text_left), nlp(text_right)
    return left_doc.similarity(right_doc)

In [None]:
# Compute similarity between "money" and "cash".
# Note: Very high similarity between both.
similarity("money", "cash")

In [None]:
# Compute similarity between "money" and "dollar".
# Note: Similar, but lower than "money" vs. "cash" -- possibly because
# "dollar" is only one of many currencies.
similarity("money", "dollar")

In [None]:
# Compute similarity between "cash" and "dash".
# Note: Very low similarity, as you would expect from their meanings.
# Note: By contrast, the Levenshtein (edit) distance between the two would be
# minimal: the strings differ by a single character.
similarity("cash", "dash")

In [None]:
# Show the 10 vocabulary entries most similar to "cash".
# Note: Some "interesting" results...
most_similar("cash", 10)

### Doc Vectors

In [None]:
# Create the Doc for "money talks"; its .vector attribute is the document vector.
money_talks = nlp("money talks")

# Length (number of components) of the document vector.
len(money_talks.vector)

In [None]:
# Look at the first 5 components of the document vector, rounded to 2 decimals.
money_talks.vector[0:5].round(2)

In [None]:
# First 5 components (rounded to 2 decimals) of the vector for the single word "money".
nlp("money").vector[0:5].round(2)

In [None]:
# First 5 components (rounded to 2 decimals) of the vector for the single word "talks".
nlp("talks").vector[0:5].round(2)

In [None]:
# The doc vector is the average of the word vectors: average the vectors
# for "money" and "talks" element-wise.
average = (nlp("money").vector + nlp("talks").vector) / 2

# Round to 2 decimals so the output is directly comparable with the rounded
# slice of the "money talks" document vector displayed above.
average[0:5].round(2)

### Model Performance

In [None]:
# Load the IMDB review dataset from "imdb_reviews_small.csv" (read as a
# zip-compressed CSV) and display its (rows, columns) shape.
# NOTE(review): the file name suggests a reduced subset rather than the full
# dataset -- confirm which one is intended.
reviews = pd.read_csv("../../0_data/imdb/imdb_reviews_small.csv", compression="zip")
reviews.shape

In [14]:
# Boolean masks selecting the rows that belong to each split.
train_mask = reviews["dataset"] == "Train"
test_mask = reviews["dataset"] == "Test"

# Training set: review contents (X) and their labels (y).
X_train = reviews.loc[train_mask, "content"]
y_train = reviews.loc[train_mask, "label"]

# Test set: review contents (X) and their labels (y).
X_test = reviews.loc[test_mask, "content"]
y_test = reviews.loc[test_mask, "label"]

In [15]:
def nlp_vectors(dataset):
    """Compute one document vector per text in `dataset`.

    Args:
        dataset: Iterable of texts; it is passed straight to ``nlp.pipe``.

    Returns:
        list with the ``Doc.vector`` of every processed text, in the order
        the Docs are yielded by ``nlp.pipe``.
    """
    # Only Doc.vector is read below, so these pipeline components are
    # switched off while processing.
    skipped_components = [
        'tagger',
        'parser',
        'senter',
        'attribute_ruler',
        'lemmatizer',
        'ner',
    ]

    # n_process=-1 lets spaCy choose the number of worker processes
    # (all available CPUs, per the Language.pipe documentation).
    vectors = []
    for doc in nlp.pipe(dataset, n_process=-1, disable=skipped_components):
        vectors.append(doc.vector)
    return vectors

In [16]:
# Get document vectors for all reviews: one list of vectors for the
# training texts and one for the test texts.
Xt_train = nlp_vectors(X_train)
Xt_test = nlp_vectors(X_test)

In [None]:
# Create and fit a RandomForest classifier on the training document vectors.
# n_jobs=-1 uses all available processors. random_state is fixed so the fitted
# forest, and the accuracy computed from it, is the same on every
# Restart & Run All instead of changing from run to run.
model = RandomForestClassifier(n_jobs=-1, random_state=42)
model.fit(Xt_train, y_train)

In [None]:
# Accuracy on the test set: compare the model's predictions for the test
# document vectors with the true test labels.
accuracy_score(y_test, model.predict(Xt_test))