In [1]:
# Data handling.
import pandas as pd
# Silence pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'

# Configure the root logger so only ERROR (and above) records are emitted.
import logging
logging.basicConfig(level=logging.ERROR)

# Plotly: trace objects (go) and multi-panel figure layout (make_subplots).
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# tqdm.pandas() registers `progress_apply` on pandas objects; it is used
# further down when the tries are run over the sample texts.
from tqdm.auto import tqdm
tqdm.pandas()

# Path handling plus the project-local keyword / word-trie helpers.
import os
from helper.keyword_helper import make_weight_wordtrie
from helper.wordtrie_builder import WordTrie

In [2]:
# Input folders, relative to the notebook's working directory.
SAMPLE_PATH = "data/samples"       # positive_sample.parquet / negative_sample.parquet
DICT_PATH = "data/dictionaries"    # dictionary_weights.csv

In [3]:
# Load the prepared positive / negative samples from SAMPLE_PATH.
print("Using prepared samples...")
try:
    positive_sample = pd.read_parquet(os.path.join(SAMPLE_PATH, "positive_sample.parquet"))
    negative_sample = pd.read_parquet(os.path.join(SAMPLE_PATH, "negative_sample.parquet"))
except FileNotFoundError:
    # Only a missing file means "samples not prepared yet". Any other failure
    # (missing parquet engine, corrupt file, user interrupt) is no longer
    # swallowed and keeps its own traceback.
    print("Samples not found. Try running the helper function first.")
    # Re-raise: every later cell needs both frames, so continuing would only
    # fail later with a less helpful NameError.
    raise
else:
    print("Done.")

Using prepared samples...
Done.


In [4]:
# Load the weighted keyword dictionary from DICT_PATH.
print("Using the weighted dictionary...")
try:
    dict_df = pd.read_csv(os.path.join(DICT_PATH, "dictionary_weights.csv"))
except FileNotFoundError:
    # Only a missing file is reported as "not found"; parse errors and other
    # failures propagate with their real traceback instead of being swallowed.
    print("Dictionary not found. Try running the helper function first.")
    # Re-raise: later cells need dict_df, so continuing would only fail later
    # with a less helpful NameError.
    raise
else:
    # NOTE(review): the message names dictionary.csv although the file read
    # above is dictionary_weights.csv; text kept unchanged on purpose.
    print(f"Got {len(dict_df)} rows from dictionary.csv")
    # Use the row index as the id handed to the trie builder further down.
    dict_df["trie_id"] = dict_df.index
    print("Done.")

Using the weighted dictionary...
Got 52747 rows from dictionary.csv
Done.


In [5]:
# WordTrie instance whose from_json() is used to load every serialized trie.
wordtrie = WordTrie(
    word_filter=True,
    text_filter=True,
    show_progress_bar=True,
    weights=True,
)

# Label -> JSON file of the serialized trie, in loading order.
_trie_json_by_label = {
    "core_all": "data/dictionaries/tries/core_trie.json",
    "core_cso": "data/dictionaries/tries/core_cso_trie.json",
    "core_method": "data/dictionaries/tries/core_method_trie.json",
    "core_task": "data/dictionaries/tries/core_task_trie.json",
    "extended_all": "data/dictionaries/tries/extended_trie.json",
    "extended_cso": "data/dictionaries/tries/extended_cso_trie.json",
    "extended_method": "data/dictionaries/tries/extended_method_trie.json",
    "extended_task": "data/dictionaries/tries/extended_task_trie.json",
    "all": "data/dictionaries/tries/all_trie.json",
    "cso": "data/dictionaries/tries/all_cso_trie.json",
    "method": "data/dictionaries/tries/all_method_trie.json",
    "task": "data/dictionaries/tries/all_task_trie.json",
}

# NOTE(review): all entries are produced by the same `wordtrie` object; this
# assumes from_json() returns an independent trie per call -- TODO confirm.
trie_dict = {
    label: wordtrie.from_json(json_path)
    for label, json_path in _trie_json_by_label.items()
}

In [6]:
# Column suffixes for positions 0..3 of the value returned by the trie's
# aggregate_search_info(); presumably absolute hit count, hit ratio, weighted
# mean and absolute weighted score (per the plot titles below) -- TODO confirm.
_TRIE_METRIC_SUFFIXES = ("abs", "ratio", "score", "abs_score")


def _add_trie_metrics(sample, trie, prefix="test_trie"):
    """Run `trie` over sample["text"] and store the results as columns.

    Calls `trie.aggregate_search_info(text)` once per row (with a tqdm
    progress bar) and writes element i of each result into the column
    f"{prefix}_{_TRIE_METRIC_SUFFIXES[i]}" of `sample`.

    Args:
        sample: DataFrame with a "text" column; modified in place.
        trie: object exposing `aggregate_search_info(text)`, whose result is
            indexable at positions 0 to 3.
        prefix: common prefix of the four created columns.

    Returns:
        The Series of raw `aggregate_search_info` results, one per row.
    """
    # Keep the lambda wrapper: the trie type is project-defined, so handing
    # its bound method straight to progress_apply is not known to be safe.
    search_info = sample["text"].progress_apply(lambda text: trie.aggregate_search_info(text))
    for position, suffix in enumerate(_TRIE_METRIC_SUFFIXES):
        # `position=position` freezes the loop value inside the lambda.
        sample[f"{prefix}_{suffix}"] = search_info.apply(lambda info, position=position: info[position])
    return search_info


# Build one weighted trie from the whole dictionary (keyword, id, weight).
dict_df_process = dict_df.copy()
dict_df_process_trie = make_weight_wordtrie(
    dict_df_process["keyword"].tolist(),
    dict_df_process["trie_id"].tolist(),
    dict_df_process["weight"].tolist(),
)

# Same pipeline for both samples; negative first, as before.
negative_sample_metadata = _add_trie_metrics(negative_sample, dict_df_process_trie)
positive_sample_metadata = _add_trie_metrics(positive_sample, dict_df_process_trie)

Weights are enabled.


100%|██████████| 52747/52747 [00:01<00:00, 32931.52it/s]


  0%|          | 0/592158 [00:00<?, ?it/s]

  0%|          | 0/402501 [00:00<?, ?it/s]

In [7]:
# ---------------------------------------------------------------------------
# Compare the four trie metrics between the negative and positive sample:
# histograms and boxplots are written as HTML files, mean/std are printed.
# ---------------------------------------------------------------------------

# Rows are kept only if test_trie_abs_score is above MIN_ABS_SCORE and
# test_trie_score is below MAX_SCORE.
MIN_ABS_SCORE = 0.01
MAX_SCORE = 0.9

# One subplot per metric column; titles and grid cells are in matching order.
METRIC_COLUMNS = ["test_trie_abs", "test_trie_ratio", "test_trie_score", "test_trie_abs_score"]
SUBPLOT_TITLES = [
    "Trie Search Absolute Count",
    "Trie Search Ratio (len(Words) div len(Abstract))",
    "Trie Search Weighted Mean",
    "Trie Search Absolute weighted",
]
SUBPLOT_CELLS = [(1, 1), (1, 2), (2, 1), (2, 2)]


def _filter_for_plot(sample):
    """Return the rows of `sample` that pass both score thresholds."""
    keep = (sample["test_trie_abs_score"] > MIN_ABS_SCORE) & (sample["test_trie_score"] < MAX_SCORE)
    return sample[keep]


def _build_comparison_figure(groups, make_trace, title):
    """Build a 2x2 figure with one subplot per metric column.

    For every metric column, one trace per entry of `groups` is added to that
    column's subplot, in the order of `groups`.

    Args:
        groups: list of (legend name, marker colour, DataFrame, heading).
        make_trace: callable (values, legend name, colour) -> plotly trace.
        title: figure title.

    Returns:
        The populated plotly figure.
    """
    figure = make_subplots(rows=2, cols=2, subplot_titles=SUBPLOT_TITLES)
    for column, (row, col) in zip(METRIC_COLUMNS, SUBPLOT_CELLS):
        for legend_name, colour, sample, _heading in groups:
            figure.add_trace(make_trace(sample[column], legend_name, colour), row=row, col=col)
    figure.update_layout(title_text=title)
    return figure


def _write_figure(figure, path):
    """Write `figure` to `path` as HTML, creating the target folder if needed."""
    # write_html does not create missing directories; without this a fresh
    # checkout without the output folder fails with FileNotFoundError.
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    figure.write_html(path)


def _print_summary(groups):
    """Print mean, then standard deviation, of every metric column per group."""
    for position, (_legend_name, _colour, sample, heading) in enumerate(groups):
        if position:
            print("")  # blank line between two groups
        print(heading)
        for column in METRIC_COLUMNS:
            print(f"Mean of {column}: {sample[column].mean()}")
        print("")
        for column in METRIC_COLUMNS:
            print(f"Std of {column}: {sample[column].std()}")


negative_sample_plot = _filter_for_plot(negative_sample)
positive_sample_plot = _filter_for_plot(positive_sample)

# (legend name, marker colour, filtered frame, heading used in the printout)
_sample_groups = [
    ("Negative Sample", "red", negative_sample_plot, "OpenAlex Negative Sample"),
    ("Positive Sample", "blue", positive_sample_plot, "PwC AI Papers"),
]

# Histograms of all 4 columns.
# NOTE(review): opacity=0.75 suggests overlaid histograms were intended, but
# no barmode is set on the layout -- confirm the desired look.
# For a log-scaled count axis, call fig.update_yaxes(type="log") before writing.
fig = _build_comparison_figure(
    _sample_groups,
    lambda values, legend_name, colour: go.Histogram(
        x=values, opacity=0.75, name=legend_name, marker=dict(color=colour)
    ),
    "Histograms of Trie Search Results",
)
_write_figure(fig, "plots/test_histograms.html")

# Mean and standard deviation of all 4 columns, per sample.
_print_summary(_sample_groups)

# Boxplots of all 4 columns.
fig = _build_comparison_figure(
    _sample_groups,
    lambda values, legend_name, colour: go.Box(
        y=values, name=legend_name, marker=dict(color=colour)
    ),
    "Boxplots of Trie Search Results",
)
_write_figure(fig, "plots/test_boxplots.html")

OpenAlex Negative Sample
Mean of test_trie_abs: 8.059498145270998
Mean of test_trie_ratio: 0.10291973550721949
Mean of test_trie_score: 0.041713259812606415
Mean of test_trie_abs_score: 0.34729673517150805

Std of test_trie_abs: 6.949749714426807
Std of test_trie_ratio: 0.05341637378328754
Std of test_trie_score: 0.026344940550138637
Std of test_trie_abs_score: 0.3926009669424632

PwC AI Papers
Mean of test_trie_abs: 30.537694606383162
Mean of test_trie_ratio: 0.2978932872416225
Mean of test_trie_score: 0.10257388759599163
Mean of test_trie_abs_score: 3.217836503334107

Std of test_trie_abs: 12.828419072255468
Std of test_trie_ratio: 0.08909906925816666
Std of test_trie_score: 0.03735244684172729
Std of test_trie_abs_score: 1.8440119099162733
