# Building the Extended AI Dictionary
This notebook requires a running instance of the neo4j Graph Database with all the data from the steps before loaded and a built core dictionary. It will extend the core dictionary with the data from the graph database and save it as a new dictionary.

In [4]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
from sentence_transformers import SentenceTransformer

import logging
logging.basicConfig(level=logging.ERROR)

# Import plotly
import plotly.express as px

from tqdm.auto import tqdm
# register tqdm with pandas
tqdm.pandas()

import numpy as np
import ast

from hdbscan import HDBSCAN

from helper.keyword_helper import get_tsne_coordinates, representation_generator

In [2]:
DICT_PATH = "data/dictionaries"

In [3]:
# Load the core &  extended dictionary that was produced by the 03b_Extended_Dictionary_Helper.py script
dict_df = pd.read_csv(f"{DICT_PATH}/dictionary.csv").sample(25000, random_state=42)
print(f"Combined dictionary size: {dict_df.shape}")

Combined dictionary size: (25000, 5)


In [5]:
print(f"Number of cso_core key phrases: {len(dict_df[(dict_df['dict'] == 'core') & (dict_df['source'] == 'cso')])}")
print(f"Number of cso_extended key phrases: {len(dict_df[(dict_df['dict'] == 'extended') & (dict_df['source'] == 'cso')])}")
print(f"Number of cso key phrases in total: {len(dict_df[dict_df['source'] == 'cso'])}")

print(f"Number of method_core key phrases: {len(dict_df[(dict_df['dict'] == 'core') & (dict_df['source'] == 'method')])}")
print(f"Number of method_extended key phrases: {len(dict_df[(dict_df['dict'] == 'extended') & (dict_df['source'] == 'method')])}")
print(f"Number of method key phrases in total: {len(dict_df[dict_df['source'] == 'method'])}")

print(f"Number of task_core key phrases: {len(dict_df[(dict_df['dict'] == 'core') & (dict_df['source'] == 'task')])}")
print(f"Number of task_extended key phrases: {len(dict_df[(dict_df['dict'] == 'extended') & (dict_df['source'] == 'task')])}")
print(f"Number of task key phrases in total: {len(dict_df[dict_df['source'] == 'task'])}")

print(f"Number of dataset_core key phrases: {len(dict_df[(dict_df['dict'] == 'core') & (dict_df['source'] == 'dataset')])}")
print(f"Number of dataset_extended key phrases: {len(dict_df[(dict_df['dict'] == 'extended') & (dict_df['source'] == 'dataset')])}")
print(f"Number of dataset key phrases in total: {len(dict_df[dict_df['source'] == 'dataset'])}")

Number of cso_core key phrases: 76
Number of cso_extended key phrases: 3461
Number of cso key phrases in total: 3537
Number of method_core key phrases: 289
Number of method_extended key phrases: 10982
Number of method key phrases in total: 11271
Number of task_core key phrases: 149
Number of task_extended key phrases: 2482
Number of task key phrases in total: 2631
Number of dataset_core key phrases: 1008
Number of dataset_extended key phrases: 6553
Number of dataset key phrases in total: 7561


In [6]:
tsne_dict_df = dict_df.copy()

tsne_dict_df["embedding"] = tsne_dict_df["embedding"].progress_apply(ast.literal_eval)
x = np.array(tsne_dict_df["embedding"].tolist())
print(f"Shape of x_core: {x.shape}")
print(f"Shape of example from x_core: {x[0].shape}")

keyword_embeddings_tsne_2d = get_tsne_coordinates(x)
print(f"Shape of keyword_embeddings_tsne_2d_core: {keyword_embeddings_tsne_2d.shape}")

# Add the TSNE coordinates to the dataframe
tsne_dict_df["tsne_x"] = keyword_embeddings_tsne_2d[:,0]
tsne_dict_df["tsne_y"] = keyword_embeddings_tsne_2d[:,1]

  0%|          | 0/25000 [00:00<?, ?it/s]

Shape of x_core: (25000, 768)
Shape of example from x_core: (768,)
===> Finding 600 nearest neighbors using Annoy approximate search using cosine distance...
   --> Time elapsed: 41.46 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 3.60 seconds
===> Calculating PCA-based initialization...
   --> Time elapsed: 0.77 seconds
--------------------------------------------------------------------------------
TSNE(early_exaggeration=12, n_jobs=8, verbose=True)
--------------------------------------------------------------------------------
===> Running optimization with exaggeration=12.00, lr=2083.33 for 250 iterations...
Iteration   50, KL divergence 4.8853, 50 iterations in 1.5375 sec
Iteration  100, KL divergence 4.9391, 50 iterations in 1.5100 sec
Iteration  150, KL divergence 4.9395, 50 iterations in 1.5217 sec
Iteration  200, KL divergence 4.9394, 50 iterations in 1.6486 sec
Iteration  250, KL divergence 4.9394, 50 iterations in 1.6470 sec
   --> Time elapsed: 7.87 seco

In [7]:
# Clustering
keyword_embeddings_hdbscan = HDBSCAN(
    min_cluster_size = 50,
    min_samples = 5,
    metric="euclidean",
    # cluster_selection_method="eom",
    cluster_selection_method="leaf",
    prediction_data=True,
    core_dist_n_jobs=8,
).fit(keyword_embeddings_tsne_2d)
print(f"Got {len(set(keyword_embeddings_hdbscan.labels_))} clusters")

# Add the cluster labels to the dataframe
tsne_dict_df["cluster"] = keyword_embeddings_hdbscan.labels_

Got 164 clusters


In [8]:
# Generate representations for each cluster
cluster_keywords = tsne_dict_df.groupby('cluster')['keyword'].apply(list).reset_index()
display(cluster_keywords.head())

# Apply the function representation_generator to the list of keywords for each cluster
keywords_list = cluster_keywords['keyword'].tolist()
cluster_keywords['representation'] = representation_generator(keywords_list)
display(cluster_keywords.head())

# Assign each cluster its representation
tsne_dict_df = tsne_dict_df.merge(cluster_keywords[['cluster', 'representation']], on='cluster', how='left')
display(tsne_dict_df.head())

Unnamed: 0,cluster,keyword
0,-1,"[fewer prior information, pretrained residual ..."
1,0,"[similar ensemble approach, original ensemble,..."
2,1,"[purely transformer-based model, enhanced tran..."
3,2,"[aware loss function, enhanced loss function, ..."
4,3,"[one-shot network, few-shot classification tas..."


Generating representations...:   0%|          | 0/164 [00:00<?, ?it/s]



Unnamed: 0,cluster,keyword,representation
0,-1,"[fewer prior information, pretrained residual ...",computational learning model
1,0,"[similar ensemble approach, original ensemble,...",ubc machine learning
2,1,"[purely transformer-based model, enhanced tran...",learning models
3,2,"[aware loss function, enhanced loss function, ...",open-source computational recursive neural
4,3,"[one-shot network, few-shot classification tas...",Artificial intelligence


Unnamed: 0,keyword,source,embedding,dict,trie_id,tsne_x,tsne_y,cluster,representation
0,standard ultrasound transducer,dataset,"[0.5186449289321899, -0.09616893529891968, 0.5...",extended,446922,-13.350599,35.724459,124,open-source covid
1,meaningful clustering measure,task,"[0.055283308029174805, 0.021428877487778664, 0...",extended,318547,-61.618585,47.160939,11,classification and classification problems
2,fewer prior information,cso,"[0.16202954947948456, -0.02446877583861351, -0...",extended,61882,0.868736,-61.612423,-1,computational learning model
3,proper dataset,task,"[-0.03730485215783119, -0.32432374358177185, -...",extended,321393,12.513974,-19.342143,143,artificial intelligence in applications
4,limiting distribution,task,"[-0.3170706033706665, -0.2181326448917389, -0....",extended,291323,-14.498732,-60.521299,35,human-machine interfaces


In [9]:
# Rename all rows with cluster = -1 to noise in column representation
tsne_dict_df.loc[tsne_dict_df['cluster'] == -1, 'representation'] = 'noise'

# Make a plotly scatter plot of all keywords. Color the keywords by their cluster label and group them by their source dictionary
fig = px.scatter(
    tsne_dict_df,
    x="tsne_x",
    y="tsne_y",
    color="representation",
    hover_name="keyword",
    symbol="dict",
    symbol_map={
        "core": "circle",
        "extended": "cross",
    },
    hover_data=["keyword", "cluster", "representation", "dict", "source"])
# Save the plot
fig.write_html("plots/keyword_embeddings.html")