In [13]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
from sentence_transformers import SentenceTransformer

import logging
logging.basicConfig(level=logging.ERROR)

# Import plotly
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots

from tqdm.auto import tqdm
# register tqdm with pandas
tqdm.pandas()

import re

from sqlalchemy import create_engine, URL, text

from helper.keyword_helper import neo4j_fetch_data, clean_abstracts, make_weight_wordtrie
from helper.wordtrie_builder import WordTrie

In [14]:
SAMPLE_LIMIT = 300000
NEO4J_CREDENTIALS = {"url": "bolt://localhost:37687", "user": "neo4j", "password": "neo4jpassword"}
AHO_LENGTH = 3 # Paper Abstract has to contain AHO_LENGTH or more core keywords to be labeled as ai and deleted from neg sample

MAKE_ALL_PLOTS = False

url_object = URL.create(
    drivername='postgresql+psycopg2', 
    username='tie',
    password='TIE%2023!tuhh',
    host='134.28.58.100',
    # host='tie-workstation.tail6716.ts.net',
    # host='localhost',
    port=45432,
    database='openalex_db',
)
engine = create_engine(url_object)
engine.connect()

<sqlalchemy.engine.base.Connection at 0x7f4af78e3610>

In [16]:
# Load papers from neo4j
query = f"""
MATCH (p:Paper)
WHERE p.abstract IS NOT NULL
RETURN p.id AS id, p.abstract AS abstract, p.title AS title
ORDER BY rand()
LIMIT {SAMPLE_LIMIT + int(SAMPLE_LIMIT * 0.1)}
"""
print("Fetching data...")
papers_origin = neo4j_fetch_data(query, NEO4J_CREDENTIALS)
print("Done.")
print(f"Got {len(papers_origin)} papers.")
print("")

query2 = f"""
MATCH (p:Paper)
WHERE p.id_openalex IS NOT NULL
RETURN p.id_openalex AS openalex_id
"""
print("Fetching data...")
pwc_oalex = neo4j_fetch_data(query2, NEO4J_CREDENTIALS)
print("Done.")
print(f"Got {len(pwc_oalex)} OpenAlex IDs from PwC.")

Fetching data...
Done.
Got 330000 papers.

Fetching data...
Done.
Got 255908 OpenAlex IDs from PwC.


In [17]:
sql_query = f'''
    SELECT id, title, abstract, abstract_inverted_index
    FROM openalex.works TABLESAMPLE BERNOULLI(10)
    WHERE abstract_inverted_index IS NOT NULL
    LIMIT {SAMPLE_LIMIT + int(SAMPLE_LIMIT * 0.2)}
'''

with engine.connect() as conn:
    print("Fetching data from Postgres...")
    openalex = pd.read_sql(sql=text(sql_query), con=conn)
    print(f"Got {len(openalex)} OpenAlex Works")

Fetching data from Postgres...
Got 360000 OpenAlex Works


In [18]:
sql_query3 = f'''
    SELECT wc.work_id
    FROM openalex.works_concepts wc
    JOIN openalex.concepts c ON wc.concept_id = c.id
    WHERE c.display_name = 'Artificial intelligence' OR c.display_name = 'Machine learning' OR c.display_name = 'Natural language processing' AND wc.score > 0.2
'''

with engine.connect() as conn:
    print("Fetching data from Postgres...")
    openalex_ids_with_AI = pd.read_sql(sql=text(sql_query3), con=conn)
    print(f"Got {len(openalex_ids_with_AI)} OpenAlex Work IDs with AI")

Fetching data from Postgres...
Got 18116999 OpenAlex Work IDs with AI


In [19]:
# Remove all openalex rows where the id is in the pwc_oalex df
openalex_id_filtered = openalex[~openalex['id'].isin(pwc_oalex['openalex_id'])]
print(f"Got {len(openalex_id_filtered)} OpenAlex Works after removing PwC papers")
# Remove all openalex rows where the id is in the openalex_ids_with_AI df
openalex_id_filtered = openalex_id_filtered[~openalex_id_filtered['id'].isin(openalex_ids_with_AI['work_id'])]
print(f"Got {len(openalex_id_filtered)} OpenAlex Works after removing AI papers")

# Perform cleaning operations
# def clean_abstracts(df, inv_index=False, lang='en'):
#     if inv_index:
#         # Extract and join keys from abstract_inverted_index
#         print("Extracting abstract from abstract_inverted_index...")
#         df['abstract'] = df['abstract_inverted_index'].progress_apply(lambda x: " ".join(x['InvertedIndex'].keys()) if x else None)
    
#     # Regular expressions and string operations
#     df['abstract'] = df['abstract'].str.replace(r"[^a-zA-Z0-9 ]", " ", regex=True)\
#                                     .str.replace(r"abstract", "", flags=re.IGNORECASE)\
#                                     .str.strip()\
#                                     .str.replace(r"\s+", " ", regex=True)\
#                                     .str.lower()
    
#     # Filter by abstract length
#     df = df[df['abstract'].str.split().str.len() > 10]

#     if lang:
#         try:
#             # Detect language and keep only English abstracts
#             import gcld3
#             detector = gcld3.NNetLanguageIdentifier(min_num_bytes=0, max_num_bytes=1000)
#             print(f"Detecting language {lang}...")
#             # Sample output 
#             # print(detector.FindLanguage(text="This is a test").language)
#             df['abstract_lang'] = df['abstract'].progress_apply(lambda x: detector.FindLanguage(text=x).language if pd.notna(x) else None)
#             df = df[df['abstract_lang'] == lang]
#         except:
#             print("Language detection failed. Keeping all abstracts.")
#             pass

#     # Calculate abstract length by counting the words
#     df['abstract_length'] = df['abstract'].str.split().str.len()

#     return df[df['abstract_length'] > 10]

# Clean both DataFrames
openalex_sample = clean_abstracts(openalex_id_filtered.copy(), "abstract", inv_index=True, lang='en')
papers = clean_abstracts(papers_origin.copy(), "abstract", inv_index=False, lang='en')

# Reset index and apply sample limit if necessary
openalex_sample.reset_index(drop=True, inplace=True)
if len(openalex_sample) > SAMPLE_LIMIT:
    openalex_sample = openalex_sample.sample(n=SAMPLE_LIMIT, random_state=42)

# Drop unnecessary columns
openalex_sample.drop(columns=['abstract_inverted_index', 'abstract_lang'], inplace=True)
papers.drop(columns=['abstract_lang'], inplace=True)

# Prints for confirmation
print(f"Got {len(openalex_id_filtered)} OpenAlex Works after removing PwC papers")
print(f"Number of rows in OpenAlex Sample after cleaning: {len(openalex_sample)} ({round((len(openalex_sample))/len(openalex_id_filtered), 2) * 100}% remain)")
if len(openalex_sample) > SAMPLE_LIMIT:
    print(f"Cut down to {len(openalex_sample)} OpenAlex Works")

Got 358823 OpenAlex Works after removing PwC papers
Got 323203 OpenAlex Works after removing AI papers
Extracting abstract from abstract_inverted_index...


  0%|          | 0/323203 [00:00<?, ?it/s]

Detecting language en...


  0%|          | 0/311350 [00:00<?, ?it/s]

Detecting language en...


  0%|          | 0/329764 [00:00<?, ?it/s]

Got 323203 OpenAlex Works after removing PwC papers
Number of rows in OpenAlex Sample after cleaning: 282702 (87.0% remain)


In [20]:
# Import the dictionary.csv
dict_df = pd.read_csv("data/dictionaries/dictionary_kl.csv")
print(f"Got {len(dict_df)} rows from dictionary.csv")
dict_df["trie_id"] = dict_df.index

Got 315245 rows from dictionary.csv


In [21]:
wordtrie = WordTrie(word_filter=True, text_filter=True, show_progress_bar=True, weights=True)

trie_dict = {
    "core_all": wordtrie.from_json("data/dictionaries/tries/core_trie.json"),
    "core_cso": wordtrie.from_json("data/dictionaries/tries/core_cso_trie.json"),
    "core_method": wordtrie.from_json("data/dictionaries/tries/core_method_trie.json"),
    "core_task": wordtrie.from_json("data/dictionaries/tries/core_task_trie.json"),
    "core_dataset": wordtrie.from_json("data/dictionaries/tries/core_dataset_trie.json"),
    "extended_all": wordtrie.from_json("data/dictionaries/tries/extended_trie.json"),
    "extended_cso": wordtrie.from_json("data/dictionaries/tries/extended_cso_trie.json"),
    "extended_method": wordtrie.from_json("data/dictionaries/tries/extended_method_trie.json"),
    "extended_task": wordtrie.from_json("data/dictionaries/tries/extended_task_trie.json"),
    "extended_dataset": wordtrie.from_json("data/dictionaries/tries/extended_dataset_trie.json"),
    "all": wordtrie.from_json("data/dictionaries/tries/all_trie.json"),
    "cso": wordtrie.from_json("data/dictionaries/tries/all_cso_trie.json"),
    "method": wordtrie.from_json("data/dictionaries/tries/all_method_trie.json"),
    "task": wordtrie.from_json("data/dictionaries/tries/all_task_trie.json"),
    "dataset": wordtrie.from_json("data/dictionaries/tries/all_dataset_trie.json"),
}

Weights are enabled.


100%|██████████| 315245/315245 [00:04<00:00, 72355.99it/s] 


Weights are enabled.


100%|██████████| 63414/63414 [00:00<00:00, 101205.66it/s]


Weights are enabled.


100%|██████████| 204243/204243 [00:01<00:00, 103172.32it/s]


Weights are enabled.


100%|██████████| 47588/47588 [00:00<00:00, 102583.54it/s]


Weights are enabled.


100%|██████████| 9390/9390 [00:00<00:00, 105731.94it/s]


Weights are enabled.


100%|██████████| 1632/1632 [00:00<00:00, 118259.63it/s]


Weights are enabled.


100%|██████████| 5129/5129 [00:00<00:00, 102815.92it/s]


Weights are enabled.


100%|██████████| 2629/2629 [00:00<00:00, 112074.90it/s]


In [23]:
if MAKE_ALL_PLOTS:
    # Make a plot for each trie. Remember that it is 3 columns and five rows, e.g. key 0 is in one row with key 5 and key 10
    keys = list(trie_dict.keys())
    keys_split = [keys[i:i + 2] for i in range(0, len(keys), 2)]
    subplot_titles = [f"{key1} vs {key2}" for key1, key2 in zip(*keys_split)]
    print(subplot_titles)

    def process_key(key, df):
        wordtrie = trie_dict[key]
        print(f"Processing {key}")
        print(f"Number of nodes: {wordtrie.count_nodes()}")
        
        metadata = df['abstract'].progress_apply(lambda x: wordtrie.aggregate_search_info(x))
        df[f"trie_abs_{key}"] = metadata.apply(lambda x: x[0])
        df[f"trie_ratio_{key}"] = metadata.apply(lambda x: x[1])
        df[f"trie_score_{key}"] = metadata.apply(lambda x: x[2])
        df[f"trie_abs_score_{key}"] = metadata.apply(lambda x: x[3])

    for key in keys:
        process_key(key, openalex_sample)
        process_key(key, papers)
        
    combined_figs = [make_subplots(rows=5, cols=3, subplot_titles=subplot_titles) for _ in range(len(keys))]

    def add_histograms(figs, df, key, row, col, type, hist_color):
        for fig, column in zip(figs, [f'trie_abs_{key}', f'trie_ratio_{key}', f'trie_score_{key}', f'trie_abs_score_{key}']):
            fig.add_trace(go.Histogram(x=df[column], opacity=0.75, name=f'{type}: {key}', marker=dict(color=hist_color)), row=row, col=col)
            fig.update_yaxes(type='log', row=row, col=col)

    for i, (key1, key2) in enumerate(zip(*keys_split)):
        add_histograms(combined_figs, openalex_sample, key1, i+1, 1, "openalex", "red")
        add_histograms(combined_figs, papers, key1, i+1, 1, "papers", "blue")
        add_histograms(combined_figs, openalex_sample, key2, i+1, 2, "openalex", "red")
        add_histograms(combined_figs, papers, key2, i+1, 2, "papers", "blue")

    for fig, title in zip(combined_figs, ["Trie Search Absolute", "Trie Search Ratio (len(Words) - len(Abstract)", "Trie Search KL Divergence Mean", "Trie Search Absolute times KL Divergence Mean"]):
        fig.update_layout(height=2000, width=1800, yaxis_type="log", title_text=f"Histograms of {title}")
        fig.write_html(f"plots/trie_histograms_{title.replace(' ', '_').lower()}.html")

    print("Saved the figures")

In [24]:
dict_df_wo_dataset = dict_df.copy()
dict_df_wo_dataset_trie = make_wordtrie(dict_df_wo_dataset["keyword"].tolist(), dict_df_wo_dataset["trie_id"].tolist(), dict_df_wo_dataset["kl_divergence_normalized"].tolist())

openalex_metadata = openalex_sample['abstract'].progress_apply(lambda x: dict_df_wo_dataset_trie.aggregate_search_info(x))
openalex_sample[f"test_trie_abs"] = openalex_metadata.apply(lambda x: x[0])
openalex_sample[f"test_trie_ratio"] = openalex_metadata.apply(lambda x: x[1])
openalex_sample[f"test_trie_score"] = openalex_metadata.apply(lambda x: x[2])
openalex_sample[f"test_trie_abs_score"] = openalex_metadata.apply(lambda x: x[3])

papers_metadata = papers['abstract'].progress_apply(lambda x: dict_df_wo_dataset_trie.aggregate_search_info(x))
papers[f"test_trie_abs"] = papers_metadata.apply(lambda x: x[0])
papers[f"test_trie_ratio"] = papers_metadata.apply(lambda x: x[1])
papers[f"test_trie_score"] = papers_metadata.apply(lambda x: x[2])
papers[f"test_trie_abs_score"] = papers_metadata.apply(lambda x: x[3])

Weights are enabled.


100%|██████████| 315245/315245 [00:03<00:00, 96474.77it/s] 


  0%|          | 0/282702 [00:00<?, ?it/s]

  0%|          | 0/329072 [00:00<?, ?it/s]

In [25]:
openalex_sample_plot = openalex_sample[openalex_sample["test_trie_abs_score"] > 0.01]
papers_plot = papers[papers["test_trie_abs_score"] > 0.01]
openalex_sample_plot = openalex_sample_plot[openalex_sample_plot["test_trie_score"] < 0.9]
papers_plot = papers_plot[papers_plot["test_trie_score"] < 0.9]


# Make a subplot with 4 histograms (2x2) 
fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=["Trie Search Absolute Count", "Trie Search Ratio (len(Words) - len(Abstract)", "Trie Search KL Divergence Mean", "Trie Search Absolute times KL Divergence Mean"]
)

# Add histograms
fig.add_trace(go.Histogram(x=openalex_sample_plot["test_trie_abs"], opacity=0.75, name="negative sample", marker=dict(color="red")), row=1, col=1)
fig.add_trace(go.Histogram(x=papers_plot["test_trie_abs"], opacity=0.75, name="AI Papers", marker=dict(color="blue")), row=1, col=1)
fig.add_trace(go.Histogram(x=openalex_sample_plot["test_trie_ratio"], opacity=0.75, name="negative sample", marker=dict(color="red")), row=1, col=2)
fig.add_trace(go.Histogram(x=papers_plot["test_trie_ratio"], opacity=0.75, name="AI Papers", marker=dict(color="blue")), row=1, col=2)
fig.add_trace(go.Histogram(x=openalex_sample_plot["test_trie_score"], opacity=0.75, name="negative sample", marker=dict(color="red")), row=2, col=1)
fig.add_trace(go.Histogram(x=papers_plot["test_trie_score"], opacity=0.75, name="AI Papers", marker=dict(color="blue")), row=2, col=1)
fig.add_trace(go.Histogram(x=openalex_sample_plot["test_trie_abs_score"], opacity=0.75, name="negative sample", marker=dict(color="red")), row=2, col=2)
fig.add_trace(go.Histogram(x=papers_plot["test_trie_abs_score"], opacity=0.75, name="AI Papers", marker=dict(color="blue")), row=2, col=2)

# Update yaxis to log scale
# fig.update_yaxes(type='log', row=1, col=1)
# fig.update_yaxes(type='log', row=1, col=2)
# fig.update_yaxes(type='log', row=2, col=1)
# fig.update_yaxes(type='log', row=2, col=2)

# Update title and height
fig.update_layout(title_text="Histograms of Trie Search Results")
# fig.update_layout(
#     annotations=[
#         dict(
#             x=0.5,  # x-coordinate position of the text box
#             y=-0.15,  # y-coordinate position of the text box
#             xref="paper",
#             yref="paper",
#             text="Where:<br>D = Dictionary of words<br>T = Analyzed text",
#             showarrow=False
#         )
#     ]
# )
fig.write_html("plots/test_histograms.html")

# Calculate mean and standard deviation of all 4 columns
print("OpenAlex Negative Sample")
print(f"Mean of test_trie_abs: {openalex_sample_plot['test_trie_abs'].mean()}")
print(f"Mean of test_trie_ratio: {openalex_sample_plot['test_trie_ratio'].mean()}")
print(f"Mean of test_trie_score: {openalex_sample_plot['test_trie_score'].mean()}")
print(f"Mean of test_trie_abs_score: {openalex_sample_plot['test_trie_abs_score'].mean()}")
print("")
print(f"Std of test_trie_abs: {openalex_sample_plot['test_trie_abs'].std()}")
print(f"Std of test_trie_ratio: {openalex_sample_plot['test_trie_ratio'].std()}")
print(f"Std of test_trie_score: {openalex_sample_plot['test_trie_score'].std()}")
print(f"Std of test_trie_abs_score: {openalex_sample_plot['test_trie_abs_score'].std()}")
print("")
print("PwC AI Papers")
print(f"Mean of test_trie_abs: {papers_plot['test_trie_abs'].mean()}")
print(f"Mean of test_trie_ratio: {papers_plot['test_trie_ratio'].mean()}")
print(f"Mean of test_trie_score: {papers_plot['test_trie_score'].mean()}")
print(f"Mean of test_trie_abs_score: {papers_plot['test_trie_abs_score'].mean()}")
print("")
print(f"Std of test_trie_abs: {papers_plot['test_trie_abs'].std()}")
print(f"Std of test_trie_ratio: {papers_plot['test_trie_ratio'].std()}")
print(f"Std of test_trie_score: {papers_plot['test_trie_score'].std()}")
print(f"Std of test_trie_abs_score: {papers_plot['test_trie_abs_score'].std()}")

# Make boxplots of all 4 columns
fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=["Trie Search Absolute Count", "Trie Search Ratio (len(Words) div len(Abstract)", "Trie Search KL Divergence Mean", "Trie Search Absolute mul KL Divergence Mean"],
    
)

# Add boxplots
fig.add_trace(go.Box(y=openalex_sample_plot["test_trie_abs"], name="negative sample", marker=dict(color="red")), row=1, col=1)
fig.add_trace(go.Box(y=papers_plot["test_trie_abs"], name="AI Papers", marker=dict(color="blue")), row=1, col=1)
fig.add_trace(go.Box(y=openalex_sample_plot["test_trie_ratio"], name="negative sample", marker=dict(color="red")), row=1, col=2)
fig.add_trace(go.Box(y=papers_plot["test_trie_ratio"], name="AI Papers", marker=dict(color="blue")), row=1, col=2)

fig.add_trace(go.Box(y=openalex_sample_plot["test_trie_score"], name="negative sample", marker=dict(color="red")), row=2, col=1)
fig.add_trace(go.Box(y=papers_plot["test_trie_score"], name="AI Papers", marker=dict(color="blue")), row=2, col=1)
fig.add_trace(go.Box(y=openalex_sample_plot["test_trie_abs_score"], name="negative sample", marker=dict(color="red")), row=2, col=2)
fig.add_trace(go.Box(y=papers_plot["test_trie_abs_score"], name="AI Papers", marker=dict(color="blue")), row=2, col=2)

# Update title and height
fig.update_layout(title_text="Boxplots of Trie Search Results")

fig.write_html("plots/test_boxplots.html")

OpenAlex Negative Sample
Mean of test_trie_abs: 20.59784064177574
Mean of test_trie_ratio: 0.18722097120086176
Mean of test_trie_score: 0.848923369680318
Mean of test_trie_abs_score: 17.600327356907243

Std of test_trie_abs: 13.212154134605399
Std of test_trie_ratio: 0.06695226268371167
Std of test_trie_score: 0.04836695337161977
Std of test_trie_abs_score: 11.569862026961538

PwC AI Papers
Mean of test_trie_abs: 50.020033587620176
Mean of test_trie_ratio: 0.29103495689170605
Mean of test_trie_score: 0.8421660745998888
Mean of test_trie_abs_score: 42.21400654403401

Std of test_trie_abs: 17.238764537451022
Std of test_trie_ratio: 0.06052088305293432
Std of test_trie_score: 0.040199481367433475
Std of test_trie_abs_score: 14.916351180201836
