In [1]:
# Common imports
import pandas as pd 
import numpy as np 
# Single seed for the notebook: it seeds NumPy's global RNG here and is
# reused as `random_state` when sampling article bodies further down.
seed = 42069
np.random.seed(seed)
import cupy as cp
import altair as alt 
import plotly.express as px
from matplotlib import pyplot as plt 
import seaborn as sns
import re
from pprint import pprint
from functools import reduce

# Custom imports
import utils

# ML imports
import scispacy
import spacy
import en_core_sci_lg # Biomedical word embeddings
from spacy_langdetect import LanguageDetector
from sklearn.decomposition import PCA
from sklearn.manifold import LocallyLinearEmbedding, MDS, Isomap
from hdbscan import HDBSCAN
from umap import UMAP
from spacy.lang.en import English
import nltk
from nltk.corpus import stopwords
# nltk.download('stopwords')

# Re-import the local helper module so that edits to utils.py are picked up
# without restarting the kernel. Kept as the last expression, so the cell
# displays the reloaded module object.
from importlib import reload
reload(utils)

<module 'utils' from '/home/jonavin/inf368-exercise-3/code/utils.py'>

In [2]:
# Load the CORD-19 article table (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='data/cord-19-data.csv')

In [3]:
# List the available columns; `body_text` is the one used for tokenisation below.
print(df.columns)

Index(['paper_id', 'abstract', 'body_text', 'cord_uid', 'source', 'title',
       'doi', 'pmcid', 'pubmed_id', 'license', 'publish_time', 'authors',
       'journal', 'Microsoft Academic Paper ID', 'WHO #Covidence',
       'has_full_text', 'full_text_file', 'url', 'language'],
      dtype='object')


In [4]:
# Build the spaCy pipeline from the installed `en_core_sci_lg` model package;
# its token vectors are what gets clustered and projected later on.
nlp = spacy.load(name='en_core_sci_lg')

In [5]:
%%time
texts = df.body_text.sample(n=10, random_state=seed)
tokens = nlp(utils.clean_text(reduce(str.__add__, texts)))

CPU times: user 13.1 s, sys: 1.67 s, total: 14.8 s
Wall time: 5.59 s


In [6]:
# Collect the embedding vector of every token into a single array
# (one entry per token, in document order).
X = np.array([tok.vector for tok in tokens])
# Keep every second token only (positions 0, 2, 4, ...) to cut the cost of
# the clustering and projection steps that follow.
X_sub = X[0::2]

In [7]:
clusterer = HDBSCAN(min_cluster_size=20)
# Fit on every other token (X_sub) to save time.
# Thought: maybe remove duplicate tokens before clustering.
clusterer.fit(X_sub)

HDBSCAN(algorithm='best', allow_single_cluster=False, alpha=1.0,
        approx_min_span_tree=True, cluster_selection_epsilon=0.0,
        cluster_selection_method='eom', core_dist_n_jobs=4,
        gen_min_span_tree=False, leaf_size=40,
        match_reference_implementation=False, memory=Memory(location=None),
        metric='euclidean', min_cluster_size=20, min_samples=None, p=None,
        prediction_data=False)

In [8]:
# Project the sub-sampled token vectors to 2-D for plotting.
# Both estimators are given the notebook's `seed` explicitly: without a
# `random_state` they fall back to NumPy's global RNG, so the layout would
# depend on how many random draws earlier cells happened to consume
# (e.g. when cells are re-run or executed out of order).
# For PCA the seed only matters if scikit-learn selects a randomized SVD solver.
X_pca = PCA(n_components=2, random_state=seed).fit_transform(X_sub)
X_umap = UMAP(n_neighbors=69, random_state=seed).fit_transform(X_sub)


[1m
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.
[1m
File "../../anaconda3/lib/python3.7/site-packages/umap/rp_tree.py", line 135:[0m
[1m@numba.njit(fastmath=True, nogil=True, parallel=True)
[1mdef euclidean_random_projection_split(data, indices, rng_state):
[0m[1m^[0m[0m
[0m


[1m[1m
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.
[1m
File "../../anaconda3/lib/python3.7/site-packages/umap/utils.py", line 409:[0m
[1m@numba.njit(parallel=True)
[1mdef build_candidates(current_graph, n_vertices, n_neighbors, max_candidates, rng_state):
[0m[1m^[0m[0m
[0m[0m


[1m
T

In [9]:
# Assemble the plotting table in one go: the two PCA axes, the two UMAP axes,
# the HDBSCAN label of each point and the token text it corresponds to.
# The words are sliced with the same stride of 2 that produced X_sub, so
# row i of the table describes row i of X_sub.
df_viz = pd.DataFrame({
    'PCA1': X_pca[:, 0],
    'PCA2': X_pca[:, 1],
    'UMAP1': X_umap[:, 0],
    'UMAP2': X_umap[:, 1],
    'Cluster': clusterer.labels_,
    'Word': list(map(str, tokens))[::2],
})

In [10]:
# Interactive scatter of the PCA projection, coloured by cluster label;
# hovering over a point shows the underlying token.
px.scatter(
    data_frame=df_viz,
    x='PCA1',
    y='PCA2',
    color='Cluster',
    hover_name='Word',
)

In [11]:
# Interactive scatter of the UMAP projection, coloured by cluster label;
# hovering over a point shows the underlying token.
px.scatter(
    data_frame=df_viz,
    x='UMAP1',
    y='UMAP2',
    color='Cluster',
    hover_name='Word',
)