# Building the Extended AI Dictionary
This notebook requires a running instance of the Neo4j graph database that has been loaded with all the data from the previous steps, as well as an already built core dictionary. It extends the core dictionary with keywords from the graph database and saves the result as a new dictionary.

In [1]:
import pandas as pd
import pickle, os
pd.options.mode.chained_assignment = None  # default='warn'
from sentence_transformers import SentenceTransformer

import logging
logging.basicConfig(level=logging.ERROR)
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity
from concurrent.futures import ThreadPoolExecutor

from tqdm.auto import tqdm
# register tqdm with pandas
tqdm.pandas()

from helper.keyword_helper import process_keywords_for_papers, make_aho_automation, neo4j_fetch_data

In [2]:
# Number of worker threads used by the similarity search
NUM_CORES = 20

KEYWORD_FREQ_RANGE = (4,5000)
# Cosine similarity a keyword must exceed to be added to the extended dictionary
COS_THRESHOLD = 0.95

DICT_PATH = "data/dictionaries"

# Create a dict of neo4j credentials
# The values are read from the environment (NEO4J_URL, NEO4J_USER, NEO4J_PASSWORD)
# so real credentials do not have to be stored in the notebook. The fallbacks are
# the previous hard-coded local development values.
NEO4J_CREDENTIALS = {
    "url": os.environ.get("NEO4J_URL", "bolt://localhost:37687"),
    "user": os.environ.get("NEO4J_USER", "neo4j"),
    "password": os.environ.get("NEO4J_PASSWORD", "neo4jpassword"),
}

In [3]:
# Get all keywords related to papers - takes around 2 minutes
# The query yields one row per matched (paper, keyword) pair, so the count
# printed below is the number of rows, not the number of distinct keywords.
query = """
MATCH (p:Paper)-[r]->(k:Keyword)
WITH k.keyword AS keyword, p.title AS paper_title, p.id AS paper_id
RETURN keyword, paper_title, paper_id
"""
print("Fetching data...")
paper_keywords = neo4j_fetch_data(query, NEO4J_CREDENTIALS)
print("Done.")
fetched_row_count = len(paper_keywords)
print(f"Number of keywords: {fetched_row_count}")

Fetching data...
Done.
Number of keywords: 4429807


In [12]:
# Collapse the (paper, keyword) rows into one row per distinct keyword,
# together with how many rows carried that keyword.
# NOTE(review): KEYWORD_FREQ_RANGE from the config cell is not applied here -
# confirm whether a frequency filter is intended.
keyword_frequencies = (paper_keywords
                       .value_counts(subset=['keyword'])
                       .reset_index(name='frequency')
                       .sort_values(by='frequency', ascending=False))

# Scale the counts so that the most frequent keyword gets the value 1.0
highest_frequency = keyword_frequencies['frequency'].max()
dedupe_keywords_f = keyword_frequencies.assign(
    frequency_normalized=keyword_frequencies['frequency'] / highest_frequency)

print(f"Got {len(dedupe_keywords_f)} keywords after deduplication.")
display(dedupe_keywords_f.head(50))

Number of occurences of the keyword 'model': 144068
Got 887067 keywords after deduplication.


Unnamed: 0,keyword,frequency,frequency_normalized
0,model,76353,1.0
1,data,65387,0.856378
2,method,44093,0.577489
3,task,41994,0.549998
4,paper,30000,0.392912
5,image,29576,0.387359
6,approach,25300,0.331356
7,deep,23491,0.307663
8,work,22360,0.29285
9,algorithm,20691,0.270991


In [13]:
# Load the SciNCL sentence-transformer and embed every deduplicated keyword
print("Loading Sentence Transformer model...")
model_scincl = SentenceTransformer('malteos/SciNCL')
print("Done.")
print("Embedding all keywords...")
keyword_texts = dedupe_keywords_f['keyword'].tolist()
# .tolist() turns the encoder output into plain Python lists, one list per keyword row
dedupe_keywords_f['embedding'] = model_scincl.encode(keyword_texts, show_progress_bar=True).tolist()

Loading Sentence Transformer model...
Done.
Embedding all keywords...


Batches:   0%|          | 0/27721 [00:00<?, ?it/s]

In [None]:
# Load the core keywords with their embeddings into a df
from ast import literal_eval  # imported here so this cell works on its own

core_keywords = pd.read_csv(f"{DICT_PATH}/core_keywords.csv")
# The embedding column comes back from the CSV as text (presumably a stringified
# list of floats - confirm against the notebook that writes the file).
# SECURITY: this used to be eval(), which executes arbitrary code found in the
# file. literal_eval only accepts Python literals and raises on anything else.
core_keywords['embedding'] = core_keywords['embedding'].apply(literal_eval)

In [None]:
# Calculating similarity - this step will take a while (around 1h)

# Split the core dictionary into one frame per value of the 'source' column
source_labels = core_keywords['source']
core_keywords_cso = core_keywords[source_labels == 'cso']
core_keywords_method = core_keywords[source_labels == 'method']
core_keywords_task = core_keywords[source_labels == 'task']
core_keywords_dataset = core_keywords[source_labels == 'dataset']

# Assuming core_keywords and dedupe_keywords_f are already defined by the cells above

def get_keywords_above_threshold(core_embedding, extended_embeddings, cos_threshold=0.1):
    """Find the extended embeddings that are similar to one core embedding.

    Args:
        core_embedding: A single embedding vector.
        extended_embeddings: Collection of embedding vectors to compare against.
        cos_threshold: Similarity that must be strictly exceeded.

    Returns:
        Array with the positions in ``extended_embeddings`` whose cosine
        similarity to ``core_embedding`` is greater than ``cos_threshold``.
    """
    # cosine_similarity returns a matrix; with a single query vector only row 0 exists
    similarity_row = cosine_similarity([core_embedding], extended_embeddings)[0]
    (matching_positions,) = np.where(similarity_row > cos_threshold)
    return matching_positions

def batch_process_embeddings(core_embeddings, extended_embeddings, cos_threshold, chunk_size=64):
    """Find, for every core embedding, the extended embeddings above a similarity threshold.

    The extended embeddings are L2-normalised once and then compared against
    the core embeddings with chunked matrix products. The previous version
    called ``cosine_similarity`` once per core embedding, which normalised
    the complete extended matrix again on every call.

    Args:
        core_embeddings: Sequence of embedding vectors (one per core keyword).
        extended_embeddings: 2-D array-like of embedding vectors to search in.
        cos_threshold: Cosine similarity that must be strictly exceeded.
        chunk_size: Number of core embeddings compared per matrix product.
            Bounds the temporary similarity matrix to
            ``chunk_size x len(extended_embeddings)`` values.

    Returns:
        A list with one integer index array per core embedding, holding the
        row positions in ``extended_embeddings`` whose cosine similarity
        exceeds ``cos_threshold``.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    core_matrix = np.asarray(core_embeddings, dtype=float)
    extended_matrix = np.asarray(extended_embeddings, dtype=float)

    # No core keywords: nothing to compare
    if core_matrix.size == 0:
        return []
    # No candidates: every core keyword gets an empty result
    if extended_matrix.size == 0:
        return [np.array([], dtype=np.intp) for _ in range(len(core_matrix))]

    def _unit_rows(matrix):
        # Scale every row to unit length; all-zero rows are left as zeros
        # (norm treated as 1) so they yield a similarity of 0 instead of NaN.
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        return matrix / norms

    core_unit = _unit_rows(core_matrix)
    extended_unit_t = _unit_rows(extended_matrix).T

    results = []
    for start in range(0, len(core_unit), chunk_size):
        # Dot products of unit vectors are cosine similarities
        similarities = core_unit[start:start + chunk_size] @ extended_unit_t
        results.extend(np.where(row > cos_threshold)[0] for row in similarities)
    return results

def process_keywords(df, dedupe_keywords, cos_threshold, source):
    """Collect the candidate keywords that are similar to any core keyword in ``df``.

    Args:
        df: Core keywords; the 'keyword' and 'embedding' columns are read.
            The frame is not modified.
        dedupe_keywords: Candidate keywords; the 'keyword' and 'embedding'
            columns are read.
        cos_threshold: Similarity cut-off handed to ``batch_process_embeddings``.
        source: Value written to the 'source' column of the result.

    Returns:
        DataFrame with the columns 'keyword' and 'source' holding one row per
        unique matched keyword, sorted by keyword so the result is the same
        on every run. Keywords already present in ``df['keyword']`` are
        left out.
    """
    # Build the exclusion set once instead of once per core keyword
    core_terms = set(df['keyword'].tolist())
    candidate_terms = dedupe_keywords['keyword'].to_numpy()
    candidate_matrix = np.array(dedupe_keywords['embedding'].tolist())

    # One array of positions into dedupe_keywords per core keyword
    matches_per_core = batch_process_embeddings(df['embedding'].tolist(), candidate_matrix, cos_threshold)

    extended_terms = set()
    for positions in matches_per_core:
        extended_terms.update(candidate_terms[positions])
    extended_terms -= core_terms

    # key=str keeps the sort from failing should a non-string value slip in
    df_all = pd.DataFrame({'keyword': sorted(extended_terms, key=str)})
    df_all['source'] = source
    return df_all

print("Finding similar keywords...")
print("")

# Core keyword subsets keyed by the label stored in the result's 'source' column
core_keywords_by_source = {
    "cso": core_keywords_cso,
    "method": core_keywords_method,
    "task": core_keywords_task,
    "dataset": core_keywords_dataset,
}

extended_by_source = {}
for source_label, core_subset in core_keywords_by_source.items():
    print(f"Processing {source_label} keywords...")
    extended_subset = process_keywords(core_subset, dedupe_keywords_f, COS_THRESHOLD, source_label)
    # Share of all deduplicated keywords that ended up in this extended set
    share = len(extended_subset) / len(dedupe_keywords_f) * 100
    print(f"Got {len(extended_subset)} unique extended {source_label} keywords after processing core keywords ({share:.2f}%)")
    extended_by_source[source_label] = extended_subset

print("")
print("Done.")

# Keep the per-source names, the following cells use them
extended_keywords_cso = extended_by_source["cso"]
extended_keywords_method = extended_by_source["method"]
extended_keywords_task = extended_by_source["task"]
extended_keywords_dataset = extended_by_source["dataset"]

# Stack the per-source results in the order cso, method, task, dataset with a fresh index
extended_keywords = pd.concat(list(extended_by_source.values()), ignore_index=True)

Finding similar keywords...

Got 9182 unique extended cso keywords after processing core keywords (10.98%)
Got 4548 unique extended method keywords after processing core keywords (5.44%)
Got 13013 unique extended task keywords after processing core keywords (15.56%)
Got 11364 unique extended dataset keywords after processing core keywords (13.59%)



In [None]:
# Make a new folder in the DICT_PATH for the ahocorasick dumps
# makedirs with exist_ok=True also creates missing parent folders and does not
# fail when the folder already exists.
aho_dump_dir = f"{DICT_PATH}/extended_aho_automation"
os.makedirs(aho_dump_dir, exist_ok=True)


def _build_and_save_automation(keywords_df, source):
    """Build the automation for one keyword frame and save it to the dump folder.

    Args:
        keywords_df: Frame whose 'keyword' column is passed to ``make_aho_automation``.
        source: Prefix of the dump file name (``<source>_aho_automation.pkl``).

    Returns:
        The object returned by ``make_aho_automation``.
    """
    automation = make_aho_automation(keywords_df['keyword'].tolist())
    # pickle.dumps is passed through as the second argument of save()
    # (presumably the value serializer - confirm against the helper's return type)
    automation.save(f"{aho_dump_dir}/{source}_aho_automation.pkl", pickle.dumps)
    return automation


extended_keywords_cso_automation = _build_and_save_automation(extended_keywords_cso, "cso")
extended_keywords_method_automation = _build_and_save_automation(extended_keywords_method, "method")
extended_keywords_task_automation = _build_and_save_automation(extended_keywords_task, "task")
extended_keywords_dataset_automation = _build_and_save_automation(extended_keywords_dataset, "dataset")

In [None]:
# Calculate embeddings for all extended keywords
# Uses model_scincl, the sentence-transformer loaded in an earlier cell
print("Embedding all extended keywords...")
extended_keyword_texts = extended_keywords['keyword'].tolist()
extended_keywords['embedding'] = model_scincl.encode(extended_keyword_texts, show_progress_bar=True).tolist()
print("Done.")

Embedding all extended keywords...


Batches:   0%|          | 0/1191 [00:00<?, ?it/s]

Done.


In [None]:

# Save the extended keywords to a csv
# The target folder comes from DICT_PATH, like the ahocorasick dumps, instead of
# repeating the literal path. index=False leaves the row index out of the file.
extended_keywords.to_csv(f"{DICT_PATH}/extended_keywords.csv", index=False)
print("Done saving.")

Done saving.
