# Validation of the AI Dictionary
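TBD — this notebook loads the core and extended AI keyword dictionaries together with their Aho-Corasick automatons, fetches paper fulltexts from Neo4j, and records the dictionary matches (and their counts) found in each fulltext.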

In [None]:
import pandas as pd
import numpy as np
import pickle

from tqdm.auto import tqdm
# register tqdm with pandas
tqdm.pandas()

import ahocorasick

from helper.keyword_helper import get_clean_keywords, neo4j_fetch_data

In [None]:
# Connection settings for the Neo4j instance queried by this notebook.
# NOTE(review): credentials are hard-coded — confirm this is only a local dev setup.
NEO4J_CREDENTIALS = dict(
    url="bolt://localhost:37687",
    user="neo4j",
    password="neo4jpassword",
)

# Root folder of the dictionary files; paths below are built by appending to it.
DICT_DIRECTORY = "data/dictionaries/"

In [None]:
# Import the core and extended dictionaries.
# Paths are built from DICT_DIRECTORY so the CSVs and the automatons share one root.
core_dict = pd.read_csv(DICT_DIRECTORY + "core_keywords.csv")
extended_dict = pd.read_csv(DICT_DIRECTORY + "extended_keywords.csv")
extended_dict_neg = pd.read_csv(DICT_DIRECTORY + "extended_keywords_neg.csv")


def _load_automaton(variant, category):
    """Load one pickled Aho-Corasick automaton from below DICT_DIRECTORY.

    Args:
        variant: Folder prefix of the dictionary variant
            ("core", "extended" or "extended_neg").
        category: File prefix of the keyword category
            ("cso", "dataset", "method" or "task").

    Returns:
        The automaton object returned by ``ahocorasick.load``.
    """
    path = DICT_DIRECTORY + variant + "_aho_automation/" + category + "_aho_automation.pkl"
    # NOTE(security): stored values are deserialized with pickle.loads, so only
    # load automaton files that come from a trusted source.
    return ahocorasick.load(path, pickle.loads)


# Load Aho-Corasick automatons from dictionaries
cso_automation = _load_automaton("core", "cso")
dataset_automation = _load_automaton("core", "dataset")
method_automation = _load_automaton("core", "method")
task_automation = _load_automaton("core", "task")

cso_extended_automation = _load_automaton("extended", "cso")
dataset_extended_automation = _load_automaton("extended", "dataset")
method_extended_automation = _load_automaton("extended", "method")
task_extended_automation = _load_automaton("extended", "task")

cso_extended_automation_neg = _load_automaton("extended_neg", "cso")
dataset_extended_automation_neg = _load_automaton("extended_neg", "dataset")
method_extended_automation_neg = _load_automaton("extended_neg", "method")
task_extended_automation_neg = _load_automaton("extended_neg", "task")

In [None]:
# Get fulltexts from papers in neo4j.
# Each returned row carries the paper's id, title and abstract plus the text of
# its linked Fulltext node; LIMIT 100 caps the number of rows fetched.
# The id is projected as `p.id AS id` (no trailing dot after the property name;
# a dangling `.` there makes the RETURN clause invalid Cypher).
query = """
MATCH (p:Paper)-[:HAS_FULLTEXT]->(f:Fulltext)
RETURN p.id AS id, p.title AS title, p.abstract AS abstract, f.text AS fulltext
LIMIT 100
"""
print("Fetching data...")
# The later cells index the result by column name ("fulltext") and call
# pandas methods on it, i.e. they treat it as a DataFrame.
pwc_oalex_ids = neo4j_fetch_data(query, NEO4J_CREDENTIALS)
print("Done.")

In [None]:
# Run every Aho-Corasick automaton over each fulltext. For each automaton two
# columns are written: "<name>" with the extracted match values and
# "<name>_length" with how many matches were found.
print("Applying automations...")

# (target column, automaton) pairs, listed in the order the columns are created.
_automatons_by_column = (
    ("cso_core_aho", cso_automation),
    ("dataset_core_aho", dataset_automation),
    ("method_core_aho", method_automation),
    ("task_core_aho", task_automation),
    ("cso_extended_aho", cso_extended_automation),
    ("dataset_extended_aho", dataset_extended_automation),
    ("method_extended_aho", method_extended_automation),
    ("task_extended_aho", task_extended_automation),
    ("cso_extended_aho_neg", cso_extended_automation_neg),
    ("dataset_extended_aho_neg", dataset_extended_automation_neg),
    ("method_extended_aho_neg", method_extended_automation_neg),
    ("task_extended_aho_neg", task_extended_automation_neg),
)

for column_name, automaton in _automatons_by_column:
    # Collect the raw matches of this automaton for every fulltext (with a progress bar).
    raw_hits = pwc_oalex_ids["fulltext"].progress_apply(
        lambda text: list(automaton.iter(text))
    )
    pwc_oalex_ids[column_name] = raw_hits
    # Keep only element [1][1] of each match, i.e. the second entry of the
    # value stored in the automaton (presumably the keyword string — confirm
    # against the code that built the automatons).
    pwc_oalex_ids[column_name] = pwc_oalex_ids[column_name].apply(
        lambda found: [hit[1][1] for hit in found]
    )
    # Number of matches per fulltext.
    pwc_oalex_ids[column_name + "_length"] = pwc_oalex_ids[column_name].apply(len)

print("Done.")

