# Building a negative Dictionary for Keyword Extraction
This notebook requires a running instance of a OpenAlex Postgres Database, a neo4j Graph Database as well as core and extended AI dictionaries built. It will create a negative dictionary sampled from the OpenAlex database. The negative dictionary will be used to filter out keywords that are not relevant to the AI domain.

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

import logging
logging.basicConfig(level=logging.ERROR)

from sqlalchemy import create_engine, URL, text

from tqdm.auto import tqdm
# register tqdm with pandas
tqdm.pandas()

import ahocorasick
import pickle, re

from helper.keyword_helper import get_clean_keywords, neo4j_fetch_data, make_aho_automation

In [2]:
AHO_LENGTH = 5
OALEX_SAMPLE_LIMIT = 100000 # Remember, around 60% will be filtered out...
# OALEX_SAMPLE_FRAC = 0.01
KEYWORD_FREQ_RANGE = (2,6000)

DICT_PATH = "data/dictionaries"

url_object = URL.create(
    drivername='postgresql+psycopg2',
    username='tie',
    password='TIE%2023!tuhh',
    host='134.28.58.100',
    # host='tie-workstation.tail6716.ts.net',
    # host='localhost',
    port=45432,
    database='openalex_db',
)
engine = create_engine(url_object)
engine.connect()

# Create a dict of neo4j credentials
NEO4J_CREDENTIALS = {"url": "bolt://localhost:37687", "user": "neo4j", "password": "neo4jpassword"}

In [3]:
# Get a Sample of around 0.01% of the data
# sql_query = f'''
#     SELECT *
#     FROM openalex.works
#     TABLESAMPLE SYSTEM ({OALEX_SAMPLE_FRAC})
#     WHERE abstract_inverted_index IS NOT NULL
# '''

sql_query = f'''
    SELECT *
    FROM openalex.works
    WHERE abstract_inverted_index IS NOT NULL
    LIMIT {OALEX_SAMPLE_LIMIT}
'''

with engine.connect() as conn:
    openalex = pd.read_sql(sql=text(sql_query), con=conn)
    print(f"Got {len(openalex)} OpenAlex Works")

Got 100000 OpenAlex Works


In [3]:
# Get all openalex_ids from papers for a negative list
query = """
MATCH (p:Paper)
RETURN p.id_openalex as openalex_id
"""
print("Fetching data...")
pwc_oalex_ids = neo4j_fetch_data(query, NEO4J_CREDENTIALS)
print("Done.")
print(f"Got {len(pwc_oalex_ids)} OpenAlex IDs from PwC Papers.")
pwc_oalex_ids.dropna(inplace=True)
pwc_oalex_ids.reset_index(drop=True, inplace=True)
print(f"Got {len(pwc_oalex_ids)} OpenAlex IDs from PwC Papers after dropping NA.")

Fetching data...
Done.
Got 256386 OpenAlex IDs from PwC Papers.
Got 256386 OpenAlex IDs from PwC Papers after dropping NA.


In [5]:
# Remove all openalex rows, where the id is in the pwc_oalex_ids df
print(f"Number of openalex sample abstracts before removing PwC papers: {len(openalex)}")
openalex_id_filtered = openalex[~openalex['id'].isin(pwc_oalex_ids['openalex_id'])]
print(f"Number of openalex sample abstracts after removing PwC papers: {len(openalex_id_filtered)} ({round(len(openalex_id_filtered)/len(openalex), 3) * 100}% remain)")

Number of openalex sample abstracts before removing PwC papers: 100000
Number of openalex sample abstracts after removing PwC papers: 99718 (99.7% remain)


In [6]:
# Some cleaning
openalex_sample = openalex_id_filtered.copy()
openalex_sample['abstract_inverted_index'] = openalex_sample['abstract_inverted_index'].apply(lambda x: x['InvertedIndex'])
openalex_sample['abstract'] = [" ".join(list(d.keys())) if d else None for d in openalex_sample['abstract_inverted_index']]
openalex_sample.drop(columns=['abstract_inverted_index'], inplace=True)
openalex_sample['abstract'] = (openalex_sample['abstract']
                                .str.replace(r"[^a-zA-Z0-9 ]", " ", regex=True)
                                .str.replace(r"abstract", "", flags=re.IGNORECASE)
                                .str.strip()
                                .str.replace(r"\s+", " ", regex=True)
                                .astype(str)
                                .str.lower()
                                )
openalex_sample['abstract'] = openalex_sample['abstract'].apply(lambda x: x if len(x.split()) > 10 else None)
openalex_sample = openalex_sample[openalex_sample['abstract'].notna()]
openalex_sample.reset_index(drop=True, inplace=True)
print(f"Number of rows after cleaning: {len(openalex_sample)} ({round((len(openalex_sample))/len(openalex_id_filtered), 2) * 100}% remain)")

Number of rows after cleaning: 99291 (100.0% remain)


In [7]:
# Load the aho automation for the core keywords
print("Loading aho automation...")
cso_aho_automation = ahocorasick.load('data/dictionaries/core_aho_automation/cso_aho_automation.pkl', pickle.loads)

openalex_sample_aho = openalex_sample.copy()

# Apply the automation on the abstracts and make a new column with all the results
print("Applying aho automation...")
openalex_sample_aho['aho_results'] = openalex_sample_aho['abstract'].progress_apply(lambda x: list(cso_aho_automation.iter_long(x)))
# Extract only the keywords, not the positions
openalex_sample_aho['aho_results'] = openalex_sample_aho['aho_results'].apply(lambda x: [y[1][1] for y in x])
# Make a new column aho_length with the length of the results
openalex_sample_aho['aho_length'] = openalex_sample_aho['aho_results'].apply(lambda x: len(x))
# Sort by aho_length descending
openalex_sample_aho.sort_values(by='aho_length', ascending=False, inplace=True)
# Reset the index
openalex_sample_aho.reset_index(drop=True, inplace=True)

# Remove all rows with an aho_length of 5 or more
print(f"Found all core ai keywords in abstracts, will now remove all rows with an aho_length of {AHO_LENGTH} or more.")
openalex_sample_aho = openalex_sample_aho[openalex_sample_aho['aho_length'] < AHO_LENGTH]
print(f"Number of rows after removing all rows with an aho_length of {AHO_LENGTH} or more: {len(openalex_sample_aho)} ({round(len(openalex_sample_aho)/len(openalex_sample), 3) * 100}% remain)")

Loading aho automation...
Applying aho automation...


  0%|          | 0/99291 [00:00<?, ?it/s]

Found all core ai keywords in abstracts, will now remove all rows with an aho_length of 5 or more.
Number of rows after removing all rows with an aho_length of 5 or more: 34946 (35.199999999999996% remain)


In [8]:
# Get the keywords (this will take some time)
negative_keywords_df = get_clean_keywords(openalex_sample_aho, ["title", "abstract"])

Generating keywords for title...


  0%|          | 0/34946 [00:00<?, ?it/s]




Extracted keywords for title
Filtered out keywords with less than 4 characters.
Ended up with 34946 keywords.
Generating keywords for abstract...


  0%|          | 0/34946 [00:00<?, ?it/s]

Extracted keywords for abstract
Filtered out keywords with less than 4 characters.
Ended up with 34946 keywords.


In [9]:
negative_title_keywords = negative_keywords_df.title_keywords.tolist()
negative_title_keywords = [keyword for keywords in negative_title_keywords for keyword, score in keywords]
negative_abstract_keywords = negative_keywords_df.abstract_keywords.tolist()
negative_abstract_keywords = [keyword for keywords in negative_abstract_keywords for keyword, score in keywords]

# Make a list with all keywords
all_negative_keywords = negative_title_keywords + negative_abstract_keywords
print(f"Got {len(all_negative_keywords)} keywords including title and abstract keywords.")

# Make a df out of it
all_negative_keywords_df = pd.DataFrame(all_negative_keywords, columns=['keyword'])
# I want you to convert the all_negative_keywords_df so that only unique keywords remain. But I need the number of occurences for each keyword in the original df. Therefore the unique df has a new column "frequency" that contains the number of occurences of the keyword in the original df.
all_negative_keywords_df['frequency'] = all_negative_keywords_df['keyword'].map(all_negative_keywords_df['keyword'].value_counts())
# Drop duplicates
all_negative_keywords_df.drop_duplicates(inplace=True)
# Drop all rows where the frequency is not in KEYWORD_FREQ_RANGE
all_negative_keywords_df = all_negative_keywords_df[all_negative_keywords_df['frequency'].between(KEYWORD_FREQ_RANGE[0], KEYWORD_FREQ_RANGE[1])]
# Reset index
all_negative_keywords_df.reset_index(drop=True, inplace=True)
# Make a list out of it
all_negative_keywords_dedupe = all_negative_keywords_df.keyword.tolist()

print(f"Got {len(all_negative_keywords_dedupe)} unique keywords in the list named all_negative_keywords_dedupe ({round(len(all_negative_keywords_dedupe)/len(all_negative_keywords), 3) * 100}% remain).")

Got 349731 keywords including title and abstract keywords.
Got 31315 unique keywords in the list named all_negative_keywords_dedupe (9.0% remain).


In [10]:
# Subtract the all_negative_keywords_dedupe from each ext dict

# Make a new folder if it doesn't exist "data/dictionaries/extended_keywords/"
import os
if not os.path.exists('data/dictionaries/extended_keywords_neg/'):
    os.makedirs('data/dictionaries/extended_keywords_neg/')

# Load the extended dictionaries
print("Loading extended dictionaries...")
extended_keywords = pd.read_csv('data/dictionaries/extended_keywords.csv')
cso_ext = extended_keywords[extended_keywords['source'] == 'cso']
method_ext = extended_keywords[extended_keywords['source'] == 'method']
task_ext = extended_keywords[extended_keywords['source'] == 'task']
dataset_ext = extended_keywords[extended_keywords['source'] == 'dataset']
print("Done.")

# Subtract the all_negative_keywords_dedupe from each ext dict
print("Subtracting all_negative_keywords_dedupe from each ext dict...")

all_keywords = pd.DataFrame(columns=['keyword', 'source'])

for ext, name in [(cso_ext, 'cso'), (method_ext, 'method'), (task_ext, 'task'), (dataset_ext, 'dataset')]:
    initial_length_ext = len(ext)
    removed_keywords = ext[ext['keyword'].isin(all_negative_keywords_dedupe)]
    ext = ext[~ext['keyword'].isin(all_negative_keywords_dedupe)]
    ext.dropna(inplace=True)
    ext.reset_index(drop=True, inplace=True)
    print(f"Number of rows in {name}_ext after removing all_negative_keywords_dedupe: {len(ext)} out of {initial_length_ext} ({round(len(ext)/initial_length_ext, 3) * 100}% remain)")
    # Save the removed keywords to a CSV file
    removed_keywords.to_csv(f'data/dictionaries/extended_keywords_neg/removed_keywords_extended_{name}.csv', index=False)
    # Make a new column "source" with the name of the dictionary
    ext['source'] = name
    # Concatenate this df with all_keywords
    all_keywords = pd.concat([all_keywords, ext], ignore_index=True)
print("Done.")

Loading extended dictionaries...
Done.
Subtracting all_negative_keywords_dedupe from each ext dict...
Number of rows in cso_ext after removing all_negative_keywords_dedupe: 4013 out of 4411 (91.0% remain)
Number of rows in method_ext after removing all_negative_keywords_dedupe: 1889 out of 2167 (87.2% remain)
Number of rows in task_ext after removing all_negative_keywords_dedupe: 5209 out of 5769 (90.3% remain)
Number of rows in dataset_ext after removing all_negative_keywords_dedupe: 4990 out of 5763 (86.6% remain)
Done.
Saving extended dictionary...
Done.


: 

In [None]:
# Make a new folder in the DICT_PATH for the ahocorasick dumps
if not os.path.exists(DICT_PATH + "/extended_neg_aho_automation"):
    os.mkdir(DICT_PATH + "/extended_neg_aho_automation")
    
extended_neg_keywords_cso_automation = make_aho_automation(all_keywords[all_keywords['source'] == 'cso'].keyword.tolist())
extended_neg_keywords_cso_automation.save(DICT_PATH + "/extended_neg_aho_automation/cso_aho_automation.pkl")

extended_neg_keywords_method_automation = make_aho_automation(all_keywords[all_keywords['source'] == 'method'].keyword.tolist())
extended_neg_keywords_method_automation.save(DICT_PATH + "/extended_neg_aho_automation/method_aho_automation.pkl")

extended_neg_keywords_task_automation = make_aho_automation(all_keywords[all_keywords['source'] == 'task'].keyword.tolist())
extended_neg_keywords_task_automation.save(DICT_PATH + "/extended_neg_aho_automation/task_aho_automation.pkl")

extended_neg_keywords_dataset_automation = make_aho_automation(all_keywords[all_keywords['source'] == 'dataset'].keyword.tolist())
extended_neg_keywords_dataset_automation.save(DICT_PATH + "/extended_neg_aho_automation/dataset_aho_automation.pkl")

In [None]:
# Save the extended dictionaries
print("Saving extended dictionary...")
all_keywords.to_csv(DICT_PATH + 'extended_keywords_neg.csv', index=False)
print("Done.")