In [1]:
import pandas as pd

In [2]:
# Load the OpenAlex papers export. Missing values are replaced with empty
# strings and the row index is renumbered from 0.
df = (
    pd.read_csv("./openalex_papers5.csv")
    .fillna("")
    .reset_index(drop=True)
)

In [3]:
# Quick sanity check of the freshly loaded frame: (rows, columns).
df.shape

(46191, 5)

In [4]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict, Counter
import json

In [5]:
# English stop-word set from NLTK; tokenize() uses it to discard stop words.
# NOTE(review): the original note says non-English records were dropped; that
# filtering is not visible in this part of the notebook — confirm upstream.
stop_words = set(stopwords.words("english"))

In [6]:
def tokenize(text):
    """Split ``text`` into lowercase word tokens.

    The text is lowercased, tokenized with NLTK's ``word_tokenize``, and then
    filtered: only purely alphabetic tokens that are not in the module-level
    ``stop_words`` set are kept. Token order and duplicates are preserved.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the surviving tokens, in their original order.
    """
    kept = []
    for candidate in word_tokenize(text.lower()):
        # Drop numbers, punctuation and mixed tokens such as "covid-19".
        if not candidate.isalpha():
            continue
        if candidate in stop_words:
            continue
        kept.append(candidate)
    return kept

In [7]:
def build_inverted_index(text):
    """Build a positional inverted index for a single string.

    The text is passed through ``tokenize`` and every resulting token is
    mapped to the list of positions at which it occurs. Positions are
    0-based indices into the filtered token list returned by ``tokenize``,
    not offsets into the raw text.

    Args:
        text: The string to index. Any falsy value (e.g. ``""`` or ``None``)
            yields an empty index.

    Returns:
        A plain ``dict`` mapping each token to an ascending list of positions;
        keys appear in order of first occurrence.
    """
    if not text:
        return {}

    positions_by_token = {}
    for position, term in enumerate(tokenize(text)):
        # setdefault creates the position list the first time a term is seen.
        positions_by_token.setdefault(term, []).append(position)
    return positions_by_token

In [8]:
# Keep only the rows that actually have an abstract (neither NaN nor "").
# The explicit .copy() makes the filtered result an independent frame, so the
# column assignments in the following cells do not write into a slice of the
# loaded frame (which can raise SettingWithCopyWarning).
df = df[
    df["abstract_text"].notna() & (df["abstract_text"] != "")
].copy()

In [9]:
# Build a per-row positional inverted index (token -> list of positions) from
# each abstract by applying build_inverted_index to the "abstract_text" column.
df["abstract_inverted_index"] = df["abstract_text"].apply(build_inverted_index)

In [10]:
# Expand each inverted index back into a flat token list:
# - every term is emitted once per recorded position (i.e. per occurrence)
# - every emitted token is lowercased
# Tokens come out grouped by term, in the index's key order.
df["tokens"] = df["abstract_inverted_index"].apply(
    lambda inv: [
        term.lower()
        for term, positions in inv.items()
        for _ in range(len(positions))
    ]
)

In [11]:
def _as_token_list(value):
    """Normalize one ``tokens`` cell to a list.

    Args:
        value: The cell content. May already be a list/tuple of tokens, or a
            JSON-encoded string (e.g. when the column was read from a CSV).

    Returns:
        The tokens as a list. Lists and tuples are kept (copied into a new
        list), non-blank strings are decoded with ``json.loads``, and blank
        strings or any other value become ``[]``.

    Raises:
        json.JSONDecodeError: If a non-blank string is not valid JSON.
    """
    if isinstance(value, (list, tuple)):
        # Already tokenized in this notebook: keep the tokens. The previous
        # code replaced every such list with [], wiping the whole column.
        return list(value)
    if isinstance(value, str):
        # Blank strings (e.g. produced by fillna("")) carry no tokens.
        return json.loads(value) if value.strip() else []
    return []


df["tokens"] = df["tokens"].apply(_as_token_list)

In [12]:
# Shape after dropping rows without an abstract and adding the two derived
# columns ("abstract_inverted_index" and "tokens").
df.shape

(34560, 7)