# Notes

[Text Similarities and Dimension Reduction Visualizations for Embeddings](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/11.Text_Similarities_and_dimension_reduction_visualizations_for_Embeddings.ipynb)

Let's use doc2vec to get embeddings for all documents in our sample dataset.

We'll then use a variety of dimensionality reduction techniques to plot 2D and 3D representations of the dataset.

This will help inform our attempts at document clustering.

In [13]:
%matplotlib inline
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.decomposition import *
from sklearn.manifold import *
from sklearn.cluster import KMeans, DBSCAN, AffinityPropagation, SpectralClustering
from sparknlp.base import *
from sparknlp.annotator import *
import sparknlp

Set `m1=False` if you are not running on macOS with Apple Silicon

In [14]:
# Create the Spark NLP session used by every cell below.
# m1=True targets macOS on Apple Silicon; memory='16G' is forwarded to
# sparknlp.start (presumably the Spark driver memory -- confirm against the
# installed sparknlp version).
spark = sparknlp.start(m1=True, memory='16G')

In [15]:
# Stage 1: read the raw "text" column and emit it as the "document" annotation.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: derive "sentence" annotations from "document".
sentence_detector = (
    SentenceDetector()
    .setInputCols("document")
    .setOutputCol("sentence")
)

# Stage 3: derive "token" annotations from "sentence".
tokenizer = (
    Tokenizer()
    .setInputCols("sentence")
    .setOutputCol("token")
)

# Stage 4: Doc2Vec consumes "token" and writes its vectors to "embeddings".
embeddings = (
    Doc2VecApproach()
    .setInputCols("token")
    .setOutputCol("embeddings")
)

# Chain the four stages in dependency order; nothing runs until fit() is called.
pipeline = Pipeline().setStages(
    [
        documentAssembler,
        sentence_detector,
        tokenizer,
        embeddings,
    ]
)

In [16]:
# Location of the JSON export that holds the sample documents.
KNOWLEDGE = '../data/x_b6ec60d2-207c-43d5-bccd-a353fccda5af.json'

dataset = spark.read.json(KNOWLEDGE)
dataset.printSchema()

# 95% / 5% split. The seed is fixed so that the training split -- and with it
# the fitted embeddings, plots and cluster counts further down -- can be
# reproduced from one run to the next instead of changing on every execution.
train, test = dataset.randomSplit([0.95, 0.05], seed=42)

root
 |-- access_link: string (nullable = true)
 |-- associated_project: string (nullable = true)
 |-- associated_project_id: string (nullable = true)
 |-- content_size: long (nullable = true)
 |-- date_accessed: string (nullable = true)
 |-- date_created: string (nullable = true)
 |-- description: string (nullable = true)
 |-- flagged: string (nullable = true)
 |-- id: string (nullable = true)
 |-- ingest_type: string (nullable = true)
 |-- mime_type: string (nullable = true)
 |-- text: string (nullable = true)
 |-- title: string (nullable = true)
 |-- topics: string (nullable = true)



In [None]:
# Fit and Transform dataset
# The pipeline is fitted on the training split and the fitted model is then
# applied to that same split, which appends the annotation columns to it.
pipelineModel = pipeline.fit(train)
preds = pipelineModel.transform(train)
# Preview the transformed rows.
preds.show()

In [18]:
# Print the schema of the transformed frame to see which columns the pipeline added.
preds.printSchema()

root
 |-- access_link: string (nullable = true)
 |-- associated_project: string (nullable = true)
 |-- associated_project_id: string (nullable = true)
 |-- content_size: long (nullable = true)
 |-- date_accessed: string (nullable = true)
 |-- date_created: string (nullable = true)
 |-- description: string (nullable = true)
 |-- flagged: string (nullable = true)
 |-- id: string (nullable = true)
 |-- ingest_type: string (nullable = true)
 |-- mime_type: string (nullable = true)
 |-- text: string (nullable = true)
 |-- title: string (nullable = true)
 |-- topics: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueCo

In [19]:
# Collect each document's title and embedding payload into pandas and drop
# rows where either value is missing.
#
# reset_index(drop=True) restores a contiguous 0..n-1 index after dropna().
# The clustering cells assign `df.title` onto freshly built frames, and pandas
# aligns such assignments by index label; the gaps left by dropna() would
# otherwise attach titles to the wrong points (or leave them NaN).
df = (
    preds.select(preds.title, preds.embeddings.embeddings)
    .toPandas()
    .dropna()
    .reset_index(drop=True)
)
df.columns = ['title', 'embeddings']

                                                                                

In [20]:
# Replace each row's embeddings value with its first element
# (presumably the single document-level vector -- confirm against the pipeline output).
df["embeddings"] = df["embeddings"].map(lambda vectors: vectors[0])

In [21]:
# Stack the per-document vectors into one array (one row per document).
# `X` is the name the reduction cell below reads; `v` is kept as an alias.
X = v = np.stack(df["embeddings"].tolist())

Uncomment any of the `X_emb = <model>` lines below. The corresponding model will be fit, reduced, and plotted.

Make sure only one line is uncommented, otherwise only the last uncommented model will be used (and you will increase execution time by needlessly fitting an unused model).

In [99]:
# Dimensionality reduction
#
# Exactly one `X_emb = ...` line should be active at a time. Every option
# reduces X to three components: the plot below reads columns 0-2, and the
# clustering cells label exactly three coordinate columns (x, y, z), so keep
# n_components=3 and always use fit_transform (plain fit returns the estimator,
# not the reduced array).


# --- Manifold learning algorithms ---

# Locally Linear Embedding (LLE) https://scikit-learn.org/stable/modules/generated/sklearn.manifold.LocallyLinearEmbedding.html#sklearn.manifold.LocallyLinearEmbedding
# X_emb, err = locally_linear_embedding(n_components=3, n_neighbors=7, X=X)

# Spectral Embedding https://scikit-learn.org/stable/modules/generated/sklearn.manifold.SpectralEmbedding.html#sklearn.manifold.SpectralEmbedding
# X_emb = SpectralEmbedding(n_components=3).fit_transform(X)

# Isomap https://scikit-learn.org/stable/modules/generated/sklearn.manifold.Isomap.html#sklearn.manifold.Isomap
X_emb = Isomap(n_components=3).fit_transform(X)

# MDS (multidimensional scaling) https://scikit-learn.org/stable/modules/generated/sklearn.manifold.MDS.html#sklearn.manifold.MDS
# X_emb = MDS(n_components=3).fit_transform(X)


# --- Matrix decomposition algorithms ---

# Dictionary Learning https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.DictionaryLearning.html#sklearn.decomposition.DictionaryLearning
# X_emb = DictionaryLearning(n_components=3).fit_transform(X)

# Factor Analysis https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.FactorAnalysis.html#sklearn.decomposition.FactorAnalysis
# X_emb = FactorAnalysis(n_components=3, random_state=0).fit_transform(X)

# FastICA https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.FastICA.html#sklearn.decomposition.FastICA
# X_emb = FastICA(n_components=3, random_state=0).fit_transform(X)

# Incremental PCA https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.IncrementalPCA.html#sklearn.decomposition.IncrementalPCA
# X_emb = IncrementalPCA(n_components=3, batch_size=200).fit_transform(X)

# Kernel PCA https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.KernelPCA.html#sklearn.decomposition.KernelPCA
# X_emb = KernelPCA(n_components=3, kernel='linear').fit_transform(X)

# PCA https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#sklearn.decomposition.PCA
# X_emb = PCA(n_components=3).fit_transform(X)

# Sparse PCA https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.SparsePCA.html#sklearn.decomposition.SparsePCA
# X_emb = SparsePCA(n_components=3).fit_transform(X)

# TruncatedSVD https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html#sklearn.decomposition.TruncatedSVD
# X_emb = TruncatedSVD(n_components=3).fit_transform(X)

# LDA and NMF only accept non-negative data, hence the np.abs(X) below.

# Latent Dirichlet Allocation (LDA) https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html#sklearn.decomposition.LatentDirichletAllocation
# X_emb = LatentDirichletAllocation(n_components=3, random_state=0).fit_transform(np.abs(X))

# NMF https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html#sklearn.decomposition.NMF
# X_emb = NMF(n_components=3, init='random', random_state=0).fit_transform(np.abs(X))


# Plot the three reduced components. `x` and `y` are reused by the 2-D plot
# in the next cell.
x, y, z = X_emb[:, 0], X_emb[:, 1], X_emb[:, 2]
d = pd.DataFrame({'x': x, 'y': y, 'z': z})
fig = px.scatter_3d(d, x='x', y='y', z='z')
fig.show()


The number of connected components of the neighbors graph is 12 > 1. Completing the graph to fit Isomap might be slow. Increase the number of neighbors to avoid this issue.


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix

In [100]:
# 2-D view: first reduced component against the second.
d = pd.DataFrame(dict(x=x, y=y))
fig = px.scatter(d, x="x", y="y")
fig.show()

# Clustering

## DBSCAN

In [101]:
# Density-based clustering of the reduced points (one integer label per row of X_emb).
clustering = DBSCAN(eps=0.25, min_samples=5, metric='euclidean').fit_predict(X_emb)

# One row per document: the three reduced coordinates, the cluster label and the title.
X_df = pd.DataFrame(X_emb, columns=['x', 'y', 'z'])
X_df['cluster'] = clustering
# Attach titles by position rather than by index label: `df` was filtered with
# dropna(), so its index may contain gaps, and a label-aligned assignment would
# pair points with the wrong titles (or NaN).
X_df['title'] = df.title.to_numpy()

print('Cluster counts:')
print(X_df.cluster.value_counts())
fig = px.scatter(X_df, x='x', y='y', hover_name='title', color='cluster')
fig.show()
fig = px.scatter_3d(X_df, x='x', y='y', z='z', hover_name='title', color='cluster')
fig.show()

Cluster counts:
 0     501
 7      43
-1      29
 4      19
 9      11
 3       9
 5       8
 1       7
 2       6
 6       6
 8       6
 10      6
 11      6
Name: cluster, dtype: int64


One interesting observation is that many of the sources from GitHub show up in the cluster around (1.5, -0.5).

## KMeans

In [102]:
# K-means clustering of the reduced points with scikit-learn's default settings
# (one integer label per row of X_emb).
clustering = KMeans().fit_predict(X_emb)

# One row per document: the three reduced coordinates, the cluster label and the title.
X_df = pd.DataFrame(X_emb, columns=['x', 'y', 'z'])
X_df['cluster'] = clustering
# Attach titles by position rather than by index label: `df` was filtered with
# dropna(), so its index may contain gaps, and a label-aligned assignment would
# pair points with the wrong titles (or NaN).
X_df['title'] = df.title.to_numpy()

print('Cluster counts:')
print(X_df.cluster.value_counts())
fig = px.scatter(X_df, x='x', y='y', hover_name='title', color='cluster')
fig.show()
fig = px.scatter_3d(X_df, x='x', y='y', z='z', hover_name='title', color='cluster')
fig.show()

Cluster counts:
0    182
3    149
6    115
1     93
5     50
2     49
4     13
7      6
Name: cluster, dtype: int64


## Affinity Propagation

In [103]:
# Affinity-propagation clustering of the reduced points with scikit-learn's
# default settings (one integer label per row of X_emb).
clustering = AffinityPropagation().fit_predict(X_emb)

# One row per document: the three reduced coordinates, the cluster label and the title.
X_df = pd.DataFrame(X_emb, columns=['x', 'y', 'z'])
X_df['cluster'] = clustering
# Attach titles by position rather than by index label: `df` was filtered with
# dropna(), so its index may contain gaps, and a label-aligned assignment would
# pair points with the wrong titles (or NaN).
X_df['title'] = df.title.to_numpy()

print('Cluster counts:')
print(X_df.cluster.value_counts())
fig = px.scatter(X_df, x='x', y='y', hover_name='title', color='cluster')
fig.show()
fig = px.scatter_3d(X_df, x='x', y='y', z='z', hover_name='title', color='cluster')
fig.show()

Cluster counts:
28    58
23    55
26    53
17    44
22    32
15    31
18    31
3     28
8     27
11    26
2     24
12    24
19    21
6     20
13    20
10    20
14    18
1     17
0     13
27    12
21    11
5      9
7      8
16     7
9      6
24     6
25     6
20     4
36     2
50     2
4      2
30     1
41     1
49     1
48     1
47     1
46     1
45     1
44     1
43     1
42     1
40     1
29     1
39     1
38     1
37     1
35     1
34     1
32     1
31     1
33     1
Name: cluster, dtype: int64



Affinity propagation did not converge, this model may return degenerate cluster centers and labels.



In [104]:
# Spectral clustering of the reduced points with scikit-learn's default
# settings (one integer label per row of X_emb).
clustering = SpectralClustering().fit_predict(X_emb)

# One row per document: the three reduced coordinates, the cluster label and the title.
X_df = pd.DataFrame(X_emb, columns=['x', 'y', 'z'])
X_df['cluster'] = clustering
# Attach titles by position rather than by index label: `df` was filtered with
# dropna(), so its index may contain gaps, and a label-aligned assignment would
# pair points with the wrong titles (or NaN).
X_df['title'] = df.title.to_numpy()

print('Cluster counts:')
print(X_df.cluster.value_counts())
fig = px.scatter(X_df, x='x', y='y', hover_name='title', color='cluster')
fig.show()
fig = px.scatter_3d(X_df, x='x', y='y', z='z', hover_name='title', color='cluster')
fig.show()

Cluster counts:
0    469
7     80
4     48
5     39
6      8
3      7
1      4
2      2
Name: cluster, dtype: int64
