# Building the Extended AI Dictionary
This notebook requires a running instance of the Neo4j graph database loaded with all the data from the previous steps, as well as an already built core dictionary. It extends the core dictionary with data from the graph database and saves the result as a new dictionary.

In [None]:
import pandas as pd
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'
from sentence_transformers import SentenceTransformer

import logging
# Ask for ERROR-level root logging; basicConfig is a no-op if the root logger
# already has handlers configured.
logging.basicConfig(level=logging.ERROR)

from tqdm.auto import tqdm
# register tqdm with pandas (adds the `progress_apply` family of methods)
tqdm.pandas()

In [None]:
# Directory holding the dictionary CSV files this notebook reads and writes.
# The path is relative, so it is resolved against the kernel's current working directory.
DICT_PATH = "data/dictionaries"

In [None]:
# Read the three keyword tables from DICT_PATH: the core dictionary, the extended
# dictionary (produced by the 03b_Extended_Dictionary_Helper.py script) and the
# negative keywords.
core_dict_df = pd.read_csv(DICT_PATH + "/core_dictionary.csv")
ext_dict_df = pd.read_csv(DICT_PATH + "/extended_dictionary.csv")
neg_keywords_df = pd.read_csv(DICT_PATH + "/negative_keywords.csv")

# Report (rows, columns) for each table as a quick sanity check of the load.
for size_label, loaded_df in (
    ("Core dictionary size: ", core_dict_df),
    ("Extended dictionary size: ", ext_dict_df),
    ("Negative keywords size: ", neg_keywords_df),
):
    print(size_label, loaded_df.shape)

In [None]:
def _strip_known_keywords(candidates_df, known_df, known_label):
    """Return a copy of ``candidates_df`` without rows whose keyword is in ``known_df``.

    Matching uses ``Series.isin`` on the ``keyword`` columns, i.e. exact value
    equality (no case or whitespace normalisation).

    Args:
        candidates_df: DataFrame with a ``keyword`` column to be filtered.
        known_df: DataFrame whose ``keyword`` column lists the values to remove.
        known_label: Name of the known set; only used in the printed summary.

    Returns:
        A new, independent DataFrame with the surviving rows (index labels kept).
    """
    is_known = candidates_df["keyword"].isin(known_df["keyword"])
    # .copy() makes the result an independent frame, so adding the embedding
    # column below does not depend on the chained-assignment warning being
    # silenced globally.
    remaining_df = candidates_df[~is_known].copy()
    removed_count = len(candidates_df) - len(remaining_df)
    print(f"Stripped {removed_count} keywords from extended dictionary that were already in {known_label}.")
    return remaining_df


# Strip the extended dictionary of any keywords that are already in the core dictionary
ext_dict_df = _strip_known_keywords(ext_dict_df, core_dict_df, "core dictionary")
# Strip the extended dictionary of any keywords that are already in the negative keywords
ext_dict_df = _strip_known_keywords(ext_dict_df, neg_keywords_df, "negative keywords")

# Calculate embeddings for all extended keywords
print("Embedding all extended keywords...")
print("")
print("Loading Sentence Transformer model...")
MODEL_SCINCL = SentenceTransformer('malteos/SciNCL')
print("Done.")
print("Embedding extended keywords...")
keyword_embeddings = MODEL_SCINCL.encode(ext_dict_df['keyword'].tolist(), show_progress_bar=True)
# Store one embedding per row as a plain Python list; the list is assigned
# positionally, so the non-contiguous index left by the filtering is not an issue.
ext_dict_df['embedding'] = keyword_embeddings.tolist()
print("Done.")

In [None]:
# Persist the filtered extended dictionary, including its embedding column,
# without the row index.
# NOTE: this is the same path the extended dictionary was loaded from, so the
# input file is overwritten.
extended_dict_csv = DICT_PATH + "/extended_dictionary.csv"
ext_dict_df.to_csv(extended_dict_csv, index=False)