In [1]:
import pandas as pd
import re
import math
from collections import defaultdict, Counter
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import json
import numpy as np

In [None]:
df = pd.read_csv("openalex_papers4.csv").fillna('').reset_index(drop=True)

In [3]:
class BM25:
    def __init__(self, frq, sd, avgwdl, k1, b, N, n_qt):
        """
            frq: Frequency of word in document D,
            sd: size of document (in words),
            avgwdl: average documents length in corpus(/collection),
            k1: term frequency scaling,
            b: document length normalization,
            N: Total documents,
            n_qt: Number of documents containing query term
        """

        self.frq = frq
        self.sd = sd
        self.avgwdl = avgwdl
        self.k1 = k1
        self.b = b
        self.N = N
        self.n_qt = n_qt

    def Idf(self):
        upper = self.N - self.n_qt + 0.5
        lower = self.n_qt + 0.5
        return np.log(upper / lower)

    def Tf(self):
        upper = self.frq
        lower = self.frq + self.k1 * (
            1 - self.b + self.b * (self.sd / self.avgwdl)
        )
        return upper / lower


    def formula(self):
        return np.sum(self.Idf() * self.Tf(), axis=0, dtype=np.float64)


In [4]:
stop_words = set(stopwords.words("english"))

In [5]:

def tokenize(text):
    tokens = word_tokenize(text.lower())
    return [
        t for t in tokens
        if t.isalpha() and t not in stop_words
    ]


In [6]:
df["tokens"] = df["abstract_text"].fillna("").apply(tokenize)

In [7]:
df["sd"] = df["tokens"].apply(len)


In [8]:

sd = df["sd"].values.reshape(1, -1)   # (1, n_docs)
avgwdl = df["sd"].mean()
N = len(df)

In [9]:
query = "kernel ridge regression"

query_terms = tokenize(query)

In [10]:
query_terms

['kernel', 'ridge', 'regression']

In [11]:
doc_term_counts = df["tokens"].apply(Counter)

frq = np.array([
    [doc.get(term, 0) for doc in doc_term_counts]
    for term in query_terms
])

In [12]:
n_qt = np.array([
    sum(term in doc for doc in doc_term_counts)
    for term in query_terms
]).reshape(-1, 1)


In [13]:
bm25 = BM25(
    frq=frq,
    sd=sd,
    avgwdl=avgwdl,
    k1=1.5,
    b=0.75,
    N=N,
    n_qt=n_qt
)


In [14]:

scores = bm25.formula()    # (n_docs,)


In [15]:
df["bm25_score"] = scores
df = df.sort_values("bm25_score", ascending=False)


In [17]:

df[["title", "year", "bm25_score"]]


Unnamed: 0,title,year,bm25_score
117,Neural Tangent Kernel: Convergence and General...,2018,4.978045
133,Binarized Neural Networks,2016,2.779744
86,Binarized Neural Networks: Training Deep Neura...,2016,2.750516
103,Quantized Neural Networks: Training Neural Net...,2016,2.473138
226,Exploiting Unlabeled Data in CNNs by Self-Supe...,2019,2.021151
...,...,...,...
130,PoseCNN: A Convolutional Neural Network for 6D...,2018,0.000000
129,Session-Based Recommendation with Graph Neural...,2019,0.000000
128,BinaryConnect: Training Deep Neural Networks w...,2015,0.000000
127,Convolutional Neural Networks on Graphs with F...,2016,0.000000


In [22]:
len(df[df['bm25_score']!=0])

12

In [23]:
df

Unnamed: 0,id,title,abstract,year,concepts,abstract_text,tokens,sd,bm25_score
117,https://openalex.org/W2809090039,Neural Tangent Kernel: Convergence and General...,"{'At': [0], 'initialization,': [1], 'artificia...",2018,"['Mathematics', 'Kernel (algebra)', 'Artificia...","At initialization, artificial neural networks ...","[initialization, artificial, neural, networks,...",125,4.978045
133,https://openalex.org/W2267635276,Binarized Neural Networks,"{'We': [0, 28], 'introduce': [1], 'a': [2, 37,...",2016,"['MNIST database', 'Computer science', 'Kernel...",We introduce a method to train Binarized Neura...,"[introduce, method, train, binarized, neural, ...",74,2.779744
86,https://openalex.org/W2319920447,Binarized Neural Networks: Training Deep Neura...,"{'We': [0], 'introduce': [1], 'a': [2, 98], 'm...",2016,"['MNIST database', 'Computer science', 'Artifi...",We introduce a method to train Binarized Neura...,"[introduce, method, train, binarized, neural, ...",77,2.750516
103,https://openalex.org/W2524428287,Quantized Neural Networks: Training Neural Net...,"{'We': [0, 68], 'introduce': [1], 'a': [2, 58,...",2016,"['Artificial neural network', 'Computer scienc...",We introduce a method to train Quantized Neura...,"[introduce, method, train, quantized, neural, ...",109,2.473138
226,https://openalex.org/W2914913933,Exploiting Unlabeled Data in CNNs by Self-Supe...,"{'For': [0, 104], 'many': [1], 'applications':...",2019,"['Artificial intelligence', 'Computer science'...",For many applications the collection of labele...,"[many, applications, collection, labeled, data...",114,2.021151
...,...,...,...,...,...,...,...,...,...
130,https://openalex.org/W2963188159,PoseCNN: A Convolutional Neural Network for 6D...,"{'Estimating': [0], 'the': [1, 14, 22, 29, 57,...",2018,"['Artificial intelligence', 'Computer science'...",Estimating the 6D pose of known objects is imp...,"[estimating, pose, known, objects, important, ...",101,0.000000
129,https://openalex.org/W2899457523,Session-Based Recommendation with Graph Neural...,"{'The': [0], 'problem': [1], 'of': [2, 50, 61,...",2019,[],The problem of session-based recommendation ai...,"[problem, recommendation, aims, predict, user,...",97,0.000000
128,https://openalex.org/W1902934009,BinaryConnect: Training Deep Neural Networks w...,"{'Deep': [0, 80], 'Neural': [1], 'Networks': [...",2015,"['MNIST database', 'Computer science', 'Dropou...",Deep Neural Networks (DNN) have achieved state...,"[deep, neural, networks, dnn, achieved, result...",102,0.000000
127,https://openalex.org/W2468907370,Convolutional Neural Networks on Graphs with F...,"{'In': [0], 'this': [1, 103], 'work,': [2], 'w...",2016,"['MNIST database', 'Convolutional neural netwo...","In this work, we are interested in generalizin...","[work, interested, generalizing, convolutional...",72,0.000000
