# Building the Extended AI Dictionary
This notebook requires a running instance of the Neo4j graph database loaded with all the data from the previous steps, as well as a previously built core dictionary. It extends the core dictionary with data from the graph database and saves the result as a new dictionary.

In [None]:
import logging
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

from helper.keyword_helper import get_tsne_coordinates, representation_generator

pd.options.mode.chained_assignment = None  # default='warn'

logging.basicConfig(level=logging.ERROR)

# register tqdm with pandas
tqdm.pandas()

In [None]:
# Directory containing the dictionary CSV files read by the loading cell below.
# The path is relative, so it resolves against the notebook's working directory.
DICT_PATH = "data/dictionaries"

In [None]:
# Read the three keyword tables from DICT_PATH: the core dictionary, the extended
# dictionary (produced by the 03b_Extended_Dictionary_Helper.py script) and the
# negative keywords. The files are read in this order.
core_dict_df, ext_dict_df, neg_keywords_df = (
    pd.read_csv(DICT_PATH + csv_suffix)
    for csv_suffix in (
        "/core_dictionary.csv",
        "/extended_dictionary.csv",
        "/negative_keywords.csv",
    )
)

# Report (rows, columns) of each table as a quick sanity check.
print("Core dictionary size: ", core_dict_df.shape)
print("Extended dictionary size: ", ext_dict_df.shape)
print("Negative keywords size: ", neg_keywords_df.shape)

In [None]:
# Tag every row with the dictionary it originates from. The column is added to
# the source frames themselves, so it stays available on them afterwards.
core_dict_df["dict"] = "core"
ext_dict_df["dict"] = "extended"

# Stack both dictionaries into a single frame, keeping only the columns they
# share: "keyword", "source", "embedding" and the new "dict" tag. The result
# gets a fresh 0..n-1 index.
dict_df = pd.concat(
    [
        frame[["keyword", "source", "embedding", "dict"]]
        for frame in (core_dict_df, ext_dict_df)
    ],
    ignore_index=True,
).reset_index(drop=True)
display(dict_df.head())

In [None]:
# Build one representation per distinct value of the "source" column.
# NOTE(review): the original comments describe the output as a word cloud;
# what representation_generator actually renders is not visible here -- confirm
# against helper/keyword_helper.py.
# NOTE(review): the clustering cell further down calls the same helper with a
# single argument (a keyword list); confirm its signature supports both forms.
sources = dict_df["source"].unique().tolist()

for source in sources:
    # Rows belonging to the current source only.
    rows_for_source = dict_df[dict_df["source"] == source]
    representation_generator(
        rows_for_source["keyword"].tolist(),    # keywords of this source
        rows_for_source["embedding"].tolist(),  # their embeddings, same order
        rows_for_source["dict"].tolist(),       # "core" / "extended" tag per keyword
        source,
    )

In [None]:
# t-SNE projection of the keyword embeddings (core + extended dictionary)


def _embedding_to_vector(value):
    """Return a single embedding as a 1-D numeric array.

    The dictionaries are loaded with ``pd.read_csv``, which returns list- or
    array-valued cells as text. Both the list spelling ("[0.1, 0.2]") and the
    whitespace-separated NumPy spelling ("[0.1 0.2]") are parsed. Values that
    are not strings are converted with ``np.asarray`` unchanged.

    Raises:
        ValueError: if a text cell contains a token that is not a number
            (for example the "..." of a truncated array repr).
    """
    if isinstance(value, str):
        tokens = value.strip().strip("[]").replace(",", " ").split()
        return np.array(tokens, dtype=float)
    return np.asarray(value)


# Stack all embeddings into one (n_keywords, embedding_dim) matrix.
# NOTE(review): assumes get_tsne_coordinates expects a numeric 2-D array --
# confirm against helper/keyword_helper.py.
x = np.array([_embedding_to_vector(value) for value in dict_df["embedding"]])
print(f"Shape of x: {x.shape}")
print(f"Shape of example from x: {x[0].shape}")

keyword_embeddings_tsne_2d = get_tsne_coordinates(x)
print(f"Shape of keyword_embeddings_tsne_2d: {keyword_embeddings_tsne_2d.shape}")

# Store the first two projected coordinates next to each keyword.
dict_df["tsne_x"] = keyword_embeddings_tsne_2d[:, 0]
dict_df["tsne_y"] = keyword_embeddings_tsne_2d[:, 1]

In [None]:
# Cluster the keywords with HDBSCAN. The clustering runs on the 2-D t-SNE
# coordinates, not on the original embeddings.
keyword_embeddings_hdbscan = HDBSCAN(
    min_cluster_size=30,  # smallest group of points that counts as a cluster
    min_samples=5,        # neighbourhood size used for the core-distance estimate
    metric="euclidean",
    # "leaf" selects fine-grained, homogeneous clusters; the library default
    # "eom" would typically return fewer, larger clusters.
    cluster_selection_method="leaf",
    prediction_data=True,  # keep the extra data needed to label new points later
    core_dist_n_jobs=8,
).fit(keyword_embeddings_tsne_2d)

# HDBSCAN assigns the label -1 to noise points; that label is not a cluster and
# must not be included in the cluster count.
cluster_labels = keyword_embeddings_hdbscan.labels_
n_clusters = len(set(cluster_labels) - {-1})
n_noise = int((cluster_labels == -1).sum())
print(f"Got {n_clusters} clusters ({n_noise} keywords labelled as noise)")

# Add the cluster labels to the dataframe (noise keeps the label -1)
dict_df["cluster"] = cluster_labels

In [None]:
# Collect the keywords of every cluster label into one list per label.
# The noise label -1, if present, is grouped like any other label.
cluster_keywords = dict_df.groupby("cluster")["keyword"].apply(list).reset_index()

# Generate one representation per cluster from its keyword list
# (progress_apply shows a tqdm progress bar while doing so).
cluster_keywords["representation"] = cluster_keywords["keyword"].progress_apply(
    representation_generator
)

display(cluster_keywords.head())

# Attach each cluster's representation to its keywords. A "representation"
# column left over from a previous run of this cell is dropped first; otherwise
# the merge would produce "representation_x" / "representation_y" columns and
# the cell could not be re-run safely.
dict_df = dict_df.drop(columns=["representation"], errors="ignore").merge(
    cluster_keywords[["cluster", "representation"]], on="cluster", how="left"
)
display(dict_df.head())

In [None]:
# Scatter plot of all keywords in t-SNE space. Colour encodes the cluster label,
# the marker symbol encodes the source dictionary ("core" / "extended").
fig = px.scatter(
    dict_df,
    x="tsne_x",
    y="tsne_y",
    color="cluster",
    symbol="dict",
    hover_data=["keyword", "cluster", "dict"],
)

# Save the plot. write_html does not create missing directories, so make sure
# the output folder exists first (e.g. on a fresh checkout).
plot_path = Path("plots/keyword_embeddings.html")
plot_path.parent.mkdir(parents=True, exist_ok=True)
fig.write_html(str(plot_path))