In [None]:
# Reload imported modules automatically before each cell executes, so edits to
# the project's own modules (e.g. under utils/) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

# Suppress every UserWarning for the rest of the session (process-wide filter).
import warnings
warnings.simplefilter("ignore", UserWarning)

# Set before the project/library imports in the following cell are executed.
# NOTE(review): presumably read by the Hugging Face `tokenizers` library to keep
# its internal parallelism enabled; later cells also fork worker processes via
# multiprocessing -- confirm this combination is intended.
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

In [None]:
from utils.get_paper import get_papers_to_keyword
from utils.ai_tools import add_keywords
from utils.data_tools import flatten

import pandas as pd
import multiprocessing as mp
import pickle
from tqdm.notebook import tqdm, trange
# Register tqdm's pandas integration (adds the `progress_*` methods to pandas objects).
# NOTE(review): nothing in the visible cells calls them directly; presumably
# used inside the utils helpers -- confirm.
tqdm.pandas()

# Seed search term from which the keyword crawl starts.
keyword = 'Pervasive Computing' 

In [None]:
# --- Seed step of the crawl ---------------------------------------------------
# Fetch the papers for the starting keyword and let `add_keywords` annotate
# them. The result is used directly: concatenating it onto a throw-away empty
# DataFrame adds nothing and relies on pandas' special-casing of empty inputs.
seed_papers = get_papers_to_keyword(keyword=keyword, logging=True)
all_papers = add_keywords(seed_papers, logging=True)

# The seed keyword is the only term that has been queried so far.
keywords_searched = {keyword}

# First frontier: every distinct keyword attached to the seed papers, in
# first-seen order, excluding the seed itself (it was just searched).
# `dict.fromkeys` drops duplicates so no keyword is queried more than once.
new_keywords = [
    kw
    for kw in dict.fromkeys(flatten(all_papers['keywords'].to_list()))
    if kw not in keywords_searched
]

# Register the frontier as handled right away. Later frontier computations
# filter on `keywords_searched`, so without this the level-0 keywords would be
# queued (and searched) a second time.
keywords_searched.update(new_keywords)

In [None]:
# Number of breadth-first expansion levels to crawl.
n_levels = 4

# Create the checkpoint directory up front; otherwise the first pickle write
# fails only after a whole (expensive) level has already been crawled.
os.makedirs('./data', exist_ok=True)

# Normalise the incoming frontier: drop duplicate keywords (order preserved)
# and register them in `keywords_searched`. The next-frontier filter below
# relies on that set, so an unregistered initial frontier would be searched
# again on the following level.
new_keywords = list(dict.fromkeys(new_keywords))
keywords_searched.update(new_keywords)

for i in trange(n_levels):
    # Frontier exhausted: stop instead of letting pd.concat([]) raise
    # "ValueError: No objects to concatenate".
    if not new_keywords:
        print(f"Level {i}: no new keywords left, stopping early.")
        break

    # A fresh worker pool is created for every level and torn down as soon as
    # the (blocking) map call has returned all per-keyword result frames.
    with mp.Pool(processes=mp.cpu_count()) as pool:
        results = pool.map(get_papers_to_keyword, new_keywords)

    # Merge the per-keyword frames into one frame with a clean 0..n-1 index.
    all_results = pd.concat(results).reset_index(drop=True)
    print(f"Level {i}: {len(all_results)} papers found.")

    # Annotate the new papers with keywords and append them to the corpus.
    all_results = add_keywords(all_results, logging=True)
    all_papers = pd.concat([all_papers, all_results])

    # Next frontier: keywords seen anywhere in the corpus that have not been
    # searched or queued yet; register them immediately as queued.
    new_keywords = [kw for kw in set(flatten(all_papers['keywords'].to_list())) if kw not in keywords_searched]
    keywords_searched.update(new_keywords)

    # Checkpoint the crawl state after every level (metadata + full corpus).
    meta_data = {'keywords_searched': keywords_searched, 'new_keywords': new_keywords}
    with open(f'./data/keywords_{i}_meta.pkl', 'wb') as f:
        pickle.dump(meta_data, f)
    with open(f'./data/keywords_{i}_papers.pkl', 'wb') as f:
        pickle.dump(all_papers, f)
    