# Building the Extended AI Dictionary
This notebook requires a running Neo4j graph database instance that has been loaded with all the data from the previous steps, as well as an already built core dictionary. It extends the core dictionary with keywords from the graph database and saves the result as a new dictionary.

In [4]:
from neo4j import GraphDatabase
import pandas as pd
import pickle, os
pd.options.mode.chained_assignment = None  # default='warn'
from sentence_transformers import SentenceTransformer

import logging
logging.basicConfig(level=logging.ERROR)

from nltk.stem import WordNetLemmatizer

import spacy
# nlp = spacy.load("en_core_web_trf") # For better accuracy
nlp = spacy.load("en_core_web_sm") # For better efficiency

import numpy as np

from tqdm.auto import tqdm
# register tqdm with pandas
tqdm.pandas()

from helper.keyword_helper import process_keywords_for_papers, make_aho_automation

In [5]:
# Number of worker threads used by the similarity search (ThreadPoolExecutor max_workers).
NUM_CORES = 20

# Passed to process_keywords_for_papers.
# NOTE(review): presumably the (min, max) keyword frequency to keep - confirm in helper.keyword_helper.
KEYWORD_FREQ_RANGE = (5,5000)
# A candidate keyword is accepted when its cosine similarity to a core keyword is above this value.
COS_THRESHOLD = 0.95

# Folder that holds the dictionaries read and written by this notebook.
DICT_PATH = "data/dictionaries"

# Connection settings are read from the environment so that real credentials do not
# have to be committed with the notebook; the previous hard-coded values remain the
# fallbacks, so an unconfigured local setup behaves exactly as before.
NEO4J_URL = os.environ.get("NEO4J_URL", "bolt://localhost:37687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "neo4jpassword")
driver = GraphDatabase.driver(NEO4J_URL, auth=(NEO4J_USER, NEO4J_PASSWORD))

def fetch_data(query):
    """Run ``query`` in a new driver session and return all records as a DataFrame.

    Each record becomes one row; the column names are the keys reported by the
    query result.
    """
    with driver.session() as session:
        result = session.run(query)
        # Consume the records first, then ask the result for its column names,
        # all while the session is still open.
        rows = [record.values() for record in result]
        column_names = result.keys()
    return pd.DataFrame(rows, columns=column_names)

In [6]:
# Get all keywords related to papers - takes around 2 minutes
# The pattern matches any relationship type from a Paper node to a Keyword node and
# returns one row per match, so a keyword appears once for every paper it is linked to.
query = """
MATCH (p:Paper)-[r]->(k:Keyword)
WITH k.keyword AS keyword, p.title AS paper_title, p.id AS paper_id
RETURN keyword, paper_title, paper_id
"""
print("Fetching data...")
paper_keywords = fetch_data(query)
print("Done.")
# Note: this is the number of returned (paper, keyword) rows, not the number of distinct keywords.
print(f"Number of keywords: {len(paper_keywords)}")

Fetching data...
Done.
Number of keywords: 3316130


In [7]:
# Hand the raw (paper, keyword) rows and the configured frequency range to the project helper.
# NOTE(review): presumably this normalises, de-duplicates and frequency-filters the keywords -
# confirm against helper.keyword_helper.process_keywords_for_papers.
dedupe_keywords_f = process_keywords_for_papers(paper_keywords, KEYWORD_FREQ_RANGE)
# Show the first rows as a sanity check.
display(dedupe_keywords_f.head())

  0%|          | 0/3043337 [00:00<?, ?it/s]

  0%|          | 0/56427 [00:00<?, ?it/s]

Unnamed: 0,keyword,paper_title,paper_id,frequency,paper_ids,paper_counts
0,reinforcement learning,Learning from Outside the Viability Kernel: Wh...,86954928390154250184510152301458594669,4980,"[86954928390154250184510152301458594669, 31546...",4980
1,video,Task-Relevant Object Discovery and Categorizat...,212168751528671519369758360408197454323,4800,"[212168751528671519369758360408197454323, 1775...",4800
2,accuracy,Cross-modality image synthesis from unpaired d...,114413421609101646152816570603996432946,4666,"[114413421609101646152816570603996432946, 4333...",4666
3,recent year,Fast Kernelized Correlation Filters without Bo...,149673711123358820267766216923619565226,4554,"[149673711123358820267766216923619565226, 9289...",4554
4,user,Discovering Latent Patterns of Urban Cultural ...,196468073673052363069926166181151783702,4375,"[196468073673052363069926166181151783702, 3230...",4375


In [8]:
# Load the sentence-transformer model; it is reused later for the extended keywords.
print("Loading Sentence Transformer model...")
model_scincl = SentenceTransformer('malteos/SciNCL')
print("Done.")
print("Embedding all keywords...")
# Encode every keyword string and convert the result with .tolist() so that each row of
# the 'embedding' column holds the embedding of that row's keyword as a plain Python list.
dedupe_keywords_f['embedding'] = model_scincl.encode(dedupe_keywords_f['keyword'].tolist(), show_progress_bar=True).tolist()

Loading Sentence Transformer model...
Done.
Embedding all keywords...


Batches:   0%|          | 0/1335 [00:00<?, ?it/s]

In [9]:
# Load the core keywords with their embeddings into a df
import ast

core_keywords = pd.read_csv(f"{DICT_PATH}/core_keywords.csv")
# The CSV stores each embedding as the text of a Python list literal. ast.literal_eval
# parses such literals without executing arbitrary expressions, which eval() would do
# for whatever text happens to be in the file.
core_keywords['embedding'] = core_keywords['embedding'].apply(ast.literal_eval)

In [11]:
from sklearn.metrics.pairwise import cosine_similarity
from concurrent.futures import ThreadPoolExecutor

# Split the core dictionary into one frame per value of its 'source' column.
_core_source = core_keywords['source']
core_keywords_cso = core_keywords.loc[_core_source.eq('cso')]
core_keywords_method = core_keywords.loc[_core_source.eq('method')]
core_keywords_task = core_keywords.loc[_core_source.eq('task')]
core_keywords_dataset = core_keywords.loc[_core_source.eq('dataset')]

# Assumes core_keywords and dedupe_keywords_f have already been defined by the cells above

def get_keywords_above_threshold(core_embedding, extended_embeddings, cos_threshold=0.1):
    """Return the positions of the extended embeddings that are similar to one core embedding.

    Args:
        core_embedding: A single embedding vector.
        extended_embeddings: The embeddings to compare against, one per row.
        cos_threshold: Only positions whose cosine similarity is strictly greater
            than this value are returned.

    Returns:
        Array of row positions in ``extended_embeddings`` that exceed the threshold.
    """
    # cosine_similarity works on 2-D inputs, so wrap the single vector in a list and
    # take the only row of the result.
    similarity_row = cosine_similarity([core_embedding], extended_embeddings)[0]
    above_threshold = similarity_row > cos_threshold
    (positions,) = np.where(above_threshold)
    return positions

def batch_process_embeddings(core_embeddings, extended_embeddings, cos_threshold):
    """Run the threshold search for every core embedding on a thread pool.

    Args:
        core_embeddings: Iterable of embedding vectors to search for.
        extended_embeddings: The embeddings each core embedding is compared against.
        cos_threshold: Similarity threshold forwarded to get_keywords_above_threshold.

    Returns:
        A list with one result of get_keywords_above_threshold per core embedding,
        in the same order as ``core_embeddings``.
    """
    def _search_one(core_embedding):
        return get_keywords_above_threshold(core_embedding, extended_embeddings, cos_threshold)

    # The pool uses NUM_CORES worker threads; map() keeps the input order.
    with ThreadPoolExecutor(max_workers=NUM_CORES) as pool:
        return list(pool.map(_search_one, core_embeddings))

def process_keywords(df, dedupe_keywords, cos_threshold, source):
    """Collect the candidate keywords that are similar to any core keyword in ``df``.

    Args:
        df: Core keywords of one source; needs the columns 'keyword' and 'embedding'.
            The frame is only read, never modified.
        dedupe_keywords: Candidate keywords with the columns 'keyword' and 'embedding'.
        cos_threshold: A candidate is kept when its cosine similarity to at least one
            core keyword is strictly greater than this value.
        source: Label written to the 'source' column of the result.

    Returns:
        DataFrame with the columns 'keyword' and 'source', holding one row per distinct
        candidate keyword that matched and is not itself a keyword in ``df``. Rows follow
        the order of ``dedupe_keywords``, so repeated runs give the same frame.
    """
    # Built once instead of once per core keyword.
    core_terms = set(df['keyword'].tolist())
    core_embeddings = df['embedding'].tolist()
    candidate_embeddings = np.array(dedupe_keywords['embedding'].tolist())

    new_keywords = []
    # With no core keywords or no candidates there is nothing to compare.
    if core_embeddings and len(candidate_embeddings):
        matches = batch_process_embeddings(core_embeddings, candidate_embeddings, cos_threshold)
        # Merge the per-core-keyword hits into one sorted array of unique row positions.
        hit_positions = np.unique(np.concatenate(matches))
        matched = dedupe_keywords['keyword'].iloc[hit_positions].tolist()
        # dict.fromkeys removes repeated keyword strings while keeping their first position.
        new_keywords = [kw for kw in dict.fromkeys(matched) if kw not in core_terms]

    df_all = pd.DataFrame(new_keywords, columns=['keyword'])
    df_all['source'] = source
    return df_all

def _report_extended(source, extended):
    """Print how many extended keywords were found for ``source`` and their share of all candidates."""
    share = len(extended) / len(dedupe_keywords_f) * 100
    print(f"Got {len(extended)} unique extended {source} keywords after processing core keywords ({share:.2f}%)")


print("Finding similar keywords...")
print("")

# One search per core-keyword source, each followed by its summary line.
extended_keywords_cso = process_keywords(core_keywords_cso, dedupe_keywords_f, COS_THRESHOLD, "cso")
_report_extended("cso", extended_keywords_cso)

extended_keywords_method = process_keywords(core_keywords_method, dedupe_keywords_f, COS_THRESHOLD, "method")
_report_extended("method", extended_keywords_method)

extended_keywords_task = process_keywords(core_keywords_task, dedupe_keywords_f, COS_THRESHOLD, "task")
_report_extended("task", extended_keywords_task)

extended_keywords_dataset = process_keywords(core_keywords_dataset, dedupe_keywords_f, COS_THRESHOLD, "dataset")
_report_extended("dataset", extended_keywords_dataset)

print("")
# Stack the four per-source frames and renumber the rows from 0.
extended_keywords = pd.concat(
    [
        extended_keywords_cso,
        extended_keywords_method,
        extended_keywords_task,
        extended_keywords_dataset,
    ]
).reset_index(drop=True)


Finding similar keywords...

Got 4411 unique extended cso keywords after processing core keywords (10.33%)
Got 2167 unique extended method keywords after processing core keywords (5.07%)
Got 5769 unique extended task keywords after processing core keywords (13.51%)
Got 5763 unique extended dataset keywords after processing core keywords (13.49%)

Done saving.


In [None]:
# Make a new folder in the DICT_PATH for the ahocorasick dumps.
# makedirs(exist_ok=True) also creates DICT_PATH itself when it is missing and does not
# fail when the folder already exists, unlike the previous exists()/mkdir() pair.
aho_dump_dir = f"{DICT_PATH}/extended_aho_automation"
os.makedirs(aho_dump_dir, exist_ok=True)


def _build_and_save_automation(extended_df, source):
    """Build the automation for one source's keywords, save it, and return it.

    The keywords come from the 'keyword' column of ``extended_df``; the dump is written
    to ``<aho_dump_dir>/<source>_aho_automation.pkl`` with pickle.dumps as the serializer.
    NOTE(review): make_aho_automation is a project helper - presumably it returns a
    pyahocorasick Automaton; confirm in helper.keyword_helper.
    """
    automation = make_aho_automation(extended_df['keyword'].tolist())
    automation.save(f"{aho_dump_dir}/{source}_aho_automation.pkl", pickle.dumps)
    return automation


extended_keywords_cso_automation = _build_and_save_automation(extended_keywords_cso, "cso")
extended_keywords_method_automation = _build_and_save_automation(extended_keywords_method, "method")
extended_keywords_task_automation = _build_and_save_automation(extended_keywords_task, "task")
extended_keywords_dataset_automation = _build_and_save_automation(extended_keywords_dataset, "dataset")

In [None]:
# Calculate embeddings for all extended keywords
# Uses the sentence-transformer model loaded in an earlier cell (model_scincl).
print("Embedding all extended keywords...")
# .tolist() turns the encoded result into plain Python lists, one per row of the frame.
extended_keywords['embedding'] = model_scincl.encode(extended_keywords['keyword'].tolist(), show_progress_bar=True).tolist()
print("Done.")

In [None]:

# Save the extended keywords to a csv inside DICT_PATH, the same folder the other
# dictionary files of this notebook live in (index=False keeps the row numbers out of the file).
extended_keywords.to_csv(f"{DICT_PATH}/extended_keywords.csv", index=False)
print("Done saving.")