In [None]:
import spacy
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Load the `en_core_web_sm` spaCy pipeline and bind it to `nlp`.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Show the component names of the loaded pipeline. A bare last expression is
# rendered as the cell's output, so no print() is needed.
nlp.component_names

## Similarity

In [None]:
# Re-load the `en_core_web_sm` pipeline so this section starts from a known `nlp`
# (the name is rebound to a different model further down in the notebook).
nlp = spacy.load("en_core_web_sm")

In [None]:
# Define target to match to: the single word every document below is compared against.
target = "certain"

In [None]:
# Documents to match: each entry is scored against the target.
docs = ["certain", "sure", "uncertain"]

In [None]:
# Bag-of-words vectorizer: create it, then learn its vocabulary from the documents.
vectorizer = CountVectorizer()
vectorizer.fit(docs)
# Display the learned token -> column-index mapping.
vectorizer.vocabulary_

In [None]:
# Transform the target: encode it as a count vector over the vocabulary fitted above.
# The target is wrapped in a one-element list so it is passed as a single document.
target_vector = vectorizer.transform([target])
# Dense view for display only; the result is not stored.
target_vector.todense()

In [None]:
# Transform the documents: one count-vector row per entry of `docs`,
# using the same fitted vocabulary as the target.
doc_vectors = vectorizer.transform(docs)
# Dense view for display only; the result is not stored.
doc_vectors.todense()

In [None]:
# Compute similarities: a single pairwise call scores the target against every
# document row; row 0 of the result holds the target's score per document.
similarities = cosine_similarity(target_vector, doc_vectors)[0]
for doc, similarity in zip(docs, similarities):
    print(f"Similarity {target} - {doc:10s} {similarity:.2f}")

In [None]:
# Transform the target: take the spaCy Doc vector and add a leading axis so it
# becomes a single row, the 2-D layout passed to `cosine_similarity` below.
# NOTE(review): `nlp` is the `en_core_web_sm` pipeline at this point — confirm
# its vectors are suitable for similarity comparisons.
target_vector = nlp(target).vector[np.newaxis, :]
target_vector

In [None]:
# Transform the documents: one single-row vector per document, in the order of `docs`.
doc_vectors = []
for doc in docs:
    doc_vectors.append(nlp(doc).vector.reshape(1, -1))

In [None]:
# Compute similarities: stack the per-document rows into one matrix and score
# them against the target in a single pairwise call.
similarities = cosine_similarity(target_vector, np.vstack(doc_vectors))[0]
for doc, similarity in zip(docs, similarities):
    print(f"Similarity {target} - {doc:10s} {similarity:.2f}")

In [None]:
# Run the three words through the pipeline as one text so individual tokens
# can be compared with each other in the next cell.
# NOTE(review): `tmp` is a throwaway name; a descriptive one would read better.
tmp = nlp("certain sure uncertain")

In [None]:
# Similarity between the first and third tokens of `tmp` — "certain" and
# "uncertain", assuming each word is a single token (TODO confirm tokenization).
tmp[0].similarity(tmp[2])

### Vector model

In [None]:
import spacy
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Load the IMDB review dataset.
# `compression="zip"` is passed explicitly, presumably because the file is a
# zip archive despite its `.csv` name — verify against the data directory.
reviews = pd.read_csv("../0_data/imdb/imdb_reviews_small.csv", compression="zip")
# (rows, columns) of the loaded frame.
reviews.shape

In [None]:
# List the column names; later cells use "dataset", "label" and "content".
reviews.columns

In [None]:
# Count reviews per (dataset, label) combination, without ordering by frequency.
reviews.value_counts(subset=["dataset", "label"], sort=False)

In [None]:
# Rebind `nlp` to the `en_core_web_md` pipeline; `nlp_vectors` below reads this
# global, so it runs with this model.
nlp = spacy.load("en_core_web_md")
# Show the pipeline's component names.
nlp.component_names

In [None]:
# Split on the "dataset" column: the review text ("content") is the feature,
# "label" is the target.
X_train = reviews.loc[reviews["dataset"] == "Train", "content"]
y_train = reviews.loc[reviews["dataset"] == "Train", "label"]

X_test = reviews.loc[reviews["dataset"] == "Test", "content"]
y_test = reviews.loc[reviews["dataset"] == "Test", "label"]

In [None]:
def nlp_vectors(dataset, n_process=3):
    """Convert an iterable of texts into a list of spaCy document vectors.

    Uses the module-level `nlp` pipeline, which must be loaded before calling.

    Parameters
    ----------
    dataset : iterable of str
        Texts to process (e.g. a pandas Series of review texts).
    n_process : int, default 3
        Number of worker processes passed to `nlp.pipe`. The default keeps the
        previous hard-coded value; lower it on machines with fewer cores.

    Returns
    -------
    list
        `doc.vector` for every Doc produced by `nlp.pipe(dataset)`.
    """
    # Only `doc.vector` is read afterwards, so these annotation components
    # are switched off for the duration of the call.
    skipped_components = [
        'tagger',
        'parser',
        'senter',
        'attribute_ruler',
        'lemmatizer',
        'ner',
    ]

    # Convert to Docs (lazily; the generator is consumed by the comprehension).
    docs = nlp.pipe(
        dataset,
        n_process=n_process,
        disable=skipped_components,
    )

    return [doc.vector for doc in docs]


In [None]:
# Turn the review texts of each split into spaCy document vectors.
# Kept as two separate statements so the training result is already bound
# if the second (also long-running) call fails.
Xt_train = nlp_vectors(X_train)
Xt_test = nlp_vectors(X_test)

In [None]:
# Random forest for the spaCy document vectors, using 3 parallel jobs.
# The seed is fixed so the fitted forest, and the accuracies reported below,
# are the same on every Restart & Run All.
model = RandomForestClassifier(n_jobs=3, random_state=42)

In [None]:
# Fit the forest on the training document vectors and their labels.
model.fit(Xt_train, y_train)

In [None]:
# Accuracy on the training split, i.e. the data the model was fitted on.
accuracy_score(y_train, model.predict(Xt_train))

In [None]:
# Accuracy on the test split, which was not used for fitting.
accuracy_score(y_test, model.predict(Xt_test))

In [None]:
# Bag-of-words alternative to the spaCy vectors. This rebinds `vectorizer`
# (previously fitted on the toy documents) to a fresh CountVectorizer that uses
# the built-in English stop-word list and min_df=15 (minimum document frequency
# for a term to be kept).
vectorizer = CountVectorizer(stop_words="english", min_df=15)

In [None]:
# Learn the vocabulary on the training texts only, then encode the test texts
# with that same fitted vocabulary.
Xv_train = vectorizer.fit_transform(X_train)
Xv_test = vectorizer.transform(X_test)

In [None]:
# Random forest for the bag-of-words features; n_jobs=-1 uses all available cores.
# The seed is fixed so the fitted forest, and the accuracy reported below,
# are the same on every Restart & Run All.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)

In [None]:
# Fit the forest on the training count features and their labels.
rf.fit(Xv_train, y_train)

In [None]:
# Test-split accuracy of the bag-of-words forest.
accuracy_score(y_test, rf.predict(Xv_test))

### Positional encoding

In [None]:
def pos_encode(pos, embed, n_embed, scale=10_000):
    """Sinusoidal positional encoding of `pos` for one embedding dimension.

    Even dimensions use a sine, odd dimensions a cosine. An odd dimension uses
    the exponent of the even dimension directly before it, so each
    (even, odd) pair shares one frequency.

    Parameters
    ----------
    pos : scalar or numpy.ndarray
        Position(s) to encode; arrays are encoded element-wise.
    embed : int
        Index of the embedding dimension.
    n_embed : int
        Total number of embedding dimensions (divisor of the exponent).
    scale : number, default 10_000
        Base that is raised to the dimension-dependent exponent.

    Returns
    -------
    numpy scalar or numpy.ndarray
        Encoded value(s), with the same shape as `pos`.
    """
    if embed % 2:
        # Odd dimension: cosine, with the exponent of the preceding even dimension.
        return np.cos(pos / scale ** ((embed - 1) / n_embed))
    # Even dimension: sine.
    return np.sin(pos / scale ** (embed / n_embed))

In [None]:
n_pos = 6     # number of integer positions marked on each curve
s_pos = 2     # position highlighted (vertical line + annotated value) in every panel
n_embed = 4   # number of embedding dimensions; one panel per dimension
scale = 2     # base passed to pos_encode (overrides its 10_000 default)


# Dense grid for the smooth curves, integer positions for the markers.
x_draw = np.linspace(0, n_pos)
x_points = np.arange(n_pos)

# squeeze=False always yields an array of Axes, so the loop also works for
# n_embed == 1; ravel() keeps `axes` one-dimensional as before.
fig, axes = plt.subplots(
    1, n_embed, figsize=(n_embed * 1.2, 2), sharex=True, sharey=True, squeeze=False
)
axes = axes.ravel()
for i, ax in enumerate(axes):

    y_draw = pos_encode(x_draw, i, n_embed, scale)
    y_points = pos_encode(x_points, i, n_embed, scale)

    ax.plot(x_draw, y_draw)
    ax.scatter(x_points, y_points, s=10)
    ax.axvline(s_pos, color="orange")
    # Print the encoded value at the highlighted position along the bottom of the panel.
    ax.annotate(f"{y_points[s_pos]:.2f}", (s_pos, -1))
    ax.set_yticks([-1, 0, 1])

# Figure-level axis labels, left disabled.
# NOTE(review): supxlabel/supylabel need a recent matplotlib — confirm the
# target version before enabling.
# fig.supylabel('Embedding')
# fig.supxlabel('Word Position')

fig.tight_layout()