In [None]:
import pandas as pd    
import glob
from tqdm import tqdm
import spacy.lang.en
from collections import defaultdict
from spacy.matcher import PhraseMatcher
import os
from mair import papers_processing_utils
import json
tqdm.pandas()

In [None]:
# Glob pattern selecting the S2ORC metadata shards (JSON-lines files) to scan.
S2_ORC_INPUT_PATHS = '../data/s2orc/metadata/comp_sci/*.jsonl'
# Output: JSON file mapping each shard file name to the ids of its AI-related papers.
INDEX_OUT_PATH = '../data/s2orc/ai_papers_ids.json'
# Output: CSV file with the metadata rows of every AI-related paper.
AI_PAPERS_OUT_PATH = '../data/s2orc/ai_papers.csv'

# glob.glob returns matches in arbitrary (filesystem-dependent) order; sorting makes
# the processing order, and therefore the row order of the final CSV, reproducible.
s2orc_paths = sorted(glob.glob(S2_ORC_INPUT_PATHS))

# English language class instantiated directly, i.e. without loading a trained model.
en = spacy.lang.en.English()
# Alternative kept for reference: a trained pipeline with the parser and NER disabled.
# en = spacy.load('en_core_web_sm')
# en.disable_pipes(['parser', 'ner'])

# Shard file name -> list of ids of the papers flagged as AI-related.
ai_papers_index = {}


In [None]:
def _text_or_empty(value):
    """Return ``str(value)``, or an empty string when the value is missing (None / NaN)."""
    # NaN is the only float that compares unequal to itself.
    if value is None or (isinstance(value, float) and value != value):
        return ''
    return str(value)


def find_ai_papers(df, matcher):
    """Return the ids of the papers whose title or abstract matches ``matcher``.

    For every row the title and the abstract (newlines replaced by spaces) are
    joined with a space, lower-cased, processed with the module-level ``en``
    pipeline and handed to ``matcher``; a row is kept when the matcher reports
    at least one match.

    Args:
        df: DataFrame with ``paper_id``, ``title`` and ``abstract`` columns.
            It is only read; no column is added to it.
        matcher: Callable that takes the object returned by ``en(text)`` and
            returns a sized collection of matches (e.g. a spaCy PhraseMatcher).

    Returns:
        list: ``paper_id`` values of the matching rows, in row order.

    Notes:
        Missing titles/abstracts (None or NaN) are treated as empty text, so a
        column made up entirely of missing values, or an empty frame, is
        handled without error.
    """
    ai_papers_ids = []
    for paper_id, title, abstract in zip(df['paper_id'], df['title'], df['abstract']):
        cleaned_abstract = _text_or_empty(abstract).replace('\n', ' ')
        text_to_search = f"{_text_or_empty(title)} {cleaned_abstract}".lower()
        # The matcher returns one entry per hit; any hit marks the paper as AI-related.
        if len(matcher(en(text_to_search))) != 0:
            ai_papers_ids.append(paper_id)
    return ai_papers_ids

def prepare_matcher(patterns=None):
    """Build a spaCy ``PhraseMatcher`` that recognises AI-related phrases.

    Each phrase is registered under its own text as the match key and is
    tokenized with the module-level ``en`` pipeline. Matching is done on the
    ``NORM`` token attribute.

    Args:
        patterns: Optional iterable of phrases to register. When omitted, the
            default list of AI-related terms below is used. The defaults are
            lower-case; ``find_ai_papers`` lower-cases the text before matching.

    Returns:
        PhraseMatcher: matcher with one rule per phrase.
    """
    if patterns is None:
        patterns = [
            "ai",
            "artificial intelligence",
            "machine learning",
            "classifier",
            "neural network",
            "deep learning",
            "data science",
            "nlp",
            "machine-learning",
            "computer vision",
        ]
    matcher = PhraseMatcher(en.vocab, attr="NORM")  # TODO: change to lemma
    for pattern in patterns:
        # Documented call form (spaCy >= 2.2.2 and v3): the patterns are passed as a
        # list in the second argument; the positional ``None`` callback is the legacy form.
        matcher.add(pattern, [en(pattern)])
    return matcher

In [None]:
# Build the phrase matcher once; it is reused for every shard.
matcher = prepare_matcher()

for path in tqdm(s2orc_paths):
    filename = os.path.basename(path)

    # Guard clause: shards already present in the index are not processed again.
    if filename in ai_papers_index:
        print(f"Already filtered ({filename}), skipping...", flush=True)
        continue

    # Load the shard (one JSON record per line) and record the ids of its AI papers.
    df = pd.read_json(path, lines=True)
    ai_papers_ids = find_ai_papers(df, matcher)
    ai_papers_index[filename] = ai_papers_ids

In [None]:
# Persist the index. The context manager flushes and closes the file deterministically
# instead of leaving the handle open until it happens to be garbage-collected.
with open(INDEX_OUT_PATH, 'w') as index_file:
    json.dump(ai_papers_index, index_file)

In [None]:
# Reload the saved index (shard file name -> paper ids); the context manager closes
# the file handle as soon as the JSON has been parsed.
with open(INDEX_OUT_PATH, 'r') as index_file:
    ai_papers_index = json.load(index_file)

In [None]:
# Collect the matching rows of every shard and concatenate them once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# re-copies all previously collected rows on every iteration.
filtered_frames = []
for path in tqdm(s2orc_paths):
    filename = os.path.basename(path)
    df2 = pd.read_json(path, lines=True)
    # Keep only the rows whose id was recorded for this shard in the index.
    df_filtered = df2[df2['paper_id'].isin(ai_papers_index[filename])]
    filtered_frames.append(df_filtered)

# pd.concat raises on an empty list, so fall back to an empty frame when no shard was found.
ai_papers = pd.concat(filtered_frames) if filtered_frames else pd.DataFrame()

In [None]:
# Write the collected AI papers to CSV. to_csv also writes the DataFrame index by default;
# NOTE(review): the index presumably still holds the per-shard row labels (it is never
# reset while the frames are combined) — confirm before relying on that column downstream.
ai_papers.to_csv(AI_PAPERS_OUT_PATH)