In [1]:
# %pip (unlike !pip) installs into the environment of the running kernel,
# so the package is importable from this notebook afterwards.
%pip install datasets




[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Jupyter Notebook: ArXiv Journal Recommender Pipeline

# Cell 1: Imports
import joblib
import pandas as pd
import implicit
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Category and text preprocessing functions

import re

def simplify_category_list(category_string):
    """
    Reduce a space-separated string of arXiv categories to its top-level parts.

    Every token is cut at its first '.', so "math.AP" becomes "math"; tokens
    without a dot (e.g. "hep-th") are kept unchanged.

    Args:
        category_string: Space-separated categories, e.g. "math.AP cs.AI hep-th".

    Returns:
        Sorted list of unique simplified categories, e.g. ['cs', 'hep-th', 'math'].
        An empty list is returned for an empty string and for any non-string input.
    """
    # The type check must come first: truth-testing some non-string values
    # (e.g. pandas.NA or a multi-element NumPy array) raises instead of
    # yielding a bool, which would defeat the purpose of this guard.
    if not isinstance(category_string, str) or not category_string:
        return []
    # split('.')[0] returns the token itself when it contains no dot, so a
    # single expression covers both the dotted and the undotted case.
    return sorted({cat.split('.')[0] for cat in category_string.split()})

def preprocess_text(text):
    """
    Normalize free text (intended for a concatenated title and abstract).

    The text is lowercased and split on whitespace; only tokens that are
    purely alphabetic and longer than 3 characters are kept. Non-string
    input yields an empty string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Anything that is not a str (None, NaN, numbers, ...) is treated as empty
    if not isinstance(text, str):
        return ""

    kept_tokens = []
    for token in text.lower().split():
        # Drop short tokens and tokens containing digits or punctuation
        if token.isalpha() and len(token) > 3:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

def preprocess_categories_and_text_batch(batch):
    """
    Batch transform intended for datasets.map().

    Reads the 'categories', 'title' and 'abstract' columns of ``batch`` and
    writes two new columns into it:

    * 'categories_simplified_list' - simplify_category_list() of each
      'categories' entry.
    * 'text_processed' - preprocess_text() of "<title> <abstract>", where a
      falsy title or abstract is replaced by an empty string.

    Returns:
        The same ``batch`` object, with the two columns added.
    """
    # Simplified categories, one list per row
    simplified_per_row = []
    for raw_categories in batch['categories']:
        simplified_per_row.append(simplify_category_list(raw_categories))
    batch['categories_simplified_list'] = simplified_per_row

    # Title and abstract are joined with a single space before cleaning
    combined_texts = []
    for title, abstract in zip(batch['title'], batch['abstract']):
        combined_texts.append((title or "") + ' ' + (abstract or ""))
    batch['text_processed'] = list(map(preprocess_text, combined_texts))

    return batch


In [4]:

# Cell 2: Load Models and Mappings
def load_models_and_mappings(model_dir='../../models'):
    """
    Load every artifact the recommender needs from ``model_dir``.

    Args:
        model_dir: Directory containing the saved classifier, ALS model,
            category-journal matrix and id mappings.

    Returns:
        dict with the keys 'classifier', 'als_model',
        'category_journal_matrix', 'id2class_simplified',
        'class2id_simplified', 'journal_to_id' and 'id_to_journal'.
        The two reverse mappings ('class2id_simplified', 'id_to_journal')
        are derived here by inverting the loaded dictionaries.
    """
    artifacts = {}

    # Category classifier (joblib files are pickles: only load trusted files)
    artifacts['classifier'] = joblib.load(f'{model_dir}/arxiv_category_classifier_logreg.joblib')

    # An ALS instance is built first and the saved model is then loaded through it.
    # NOTE(review): the hyperparameters below may be superseded by whatever
    # load() restores from the .npz file - confirm against the implicit docs.
    als_template = implicit.als.AlternatingLeastSquares(
        factors=10,
        regularization=0.01,
        iterations=30,
        calculate_training_loss=True,
        use_gpu=implicit.gpu.HAS_CUDA,
        random_state=42
    )
    artifacts['als_model'] = als_template.load(f'{model_dir}/als_model.npz')

    # Category-journal interaction matrix
    artifacts['category_journal_matrix'] = joblib.load(f'{model_dir}/category_journal_matrix_PREDICTED.joblib')

    # Class-id -> simplified category name, plus its inverse
    id_to_class = joblib.load(f'{model_dir}/id2class_simplified.joblib')
    artifacts['id2class_simplified'] = id_to_class
    artifacts['class2id_simplified'] = {name: class_id for class_id, name in id_to_class.items()}

    # Journal name -> journal id, plus its inverse
    journal_ids = joblib.load(f'{model_dir}/journal_to_id.joblib')
    artifacts['journal_to_id'] = journal_ids
    artifacts['id_to_journal'] = {journal_id: journal for journal, journal_id in journal_ids.items()}

    return artifacts

In [24]:

# Cell 3: Recommendation Function
def recommend_journals(title, abstract, categories, models_data, N=10):
    """
    Given paper details, return top N journal recommendations.

    Categories are predicted from the title and abstract by the classifier and
    merged with any user-supplied categories; the ALS model is then queried
    once per category and the per-journal scores are summed.

    Args:
        title: Paper title (None/empty is treated as an empty string).
        abstract: Paper abstract (None/empty is treated as an empty string).
        categories: Optional space-separated arXiv category string, e.g.
            "math.PR cs.AI". A falsy value means no user categories.
        models_data: Dict providing 'classifier', 'id2class_simplified',
            'class2id_simplified', 'als_model', 'category_journal_matrix'
            and 'id_to_journal' (as returned by load_models_and_mappings()).
        N: Number of journals requested from the ALS model per category and
            maximum number of rows in the result.

    Returns:
        (df, final_cats): ``df`` is a DataFrame with columns 'Journal' and
        'Score' (rounded to 4 decimals), sorted by descending score;
        ``final_cats`` is the list of simplified categories that were used.

    Raises:
        ValueError: If neither the classifier nor the caller provides any
            category.
    """
    # Combine and preprocess text
    text = f"{title or ''} {abstract or ''}".strip()
    processed = preprocess_text(text)

    # Predict categories
    id2class = models_data['id2class_simplified']
    preds = models_data['classifier'].predict([processed])
    try:
        # Sparse output: the column index of each stored entry is a class id
        predicted_ids = list(preds.tocoo().col)
    except AttributeError:
        # fallback if preds is plain array: row 0 is read as a 0/1 indicator
        predicted_ids = [idx for idx, val in enumerate(preds[0]) if val == 1]

    # Ids without a mapping are skipped so that no None ends up in the
    # category list (which would also defeat the emptiness check below).
    predicted_cats = []
    for class_id in predicted_ids:
        cat_name = id2class.get(class_id)
        if cat_name is not None:
            predicted_cats.append(cat_name)

    # Merge user categories if provided: predicted categories first, then any
    # user category not already present (dict.fromkeys de-duplicates while
    # preserving order).
    user_cats = simplify_category_list(categories) if categories else []
    final_cats = list(dict.fromkeys(predicted_cats + user_cats))
    if not final_cats:
        raise ValueError("No categories could be determined.")

    # Score journals
    scores = {}
    als = models_data['als_model']
    matrix = models_data['category_journal_matrix']
    c2id = models_data['class2id_simplified']
    id_to_journal = models_data['id_to_journal']

    for cat in final_cats:
        cid = c2id.get(cat)
        if cid is None:
            # Category unknown to the ALS model: contributes nothing
            continue
        ids, sc = als.recommend(userid=cid,
                                user_items=matrix[cid],
                                N=N,
                                filter_already_liked_items=False)
        for jid, score in zip(ids, sc):
            journal = id_to_journal.get(jid)
            if journal:
                # A journal recommended for several categories accumulates score
                scores[journal] = scores.get(journal, 0) + score

    # Build DataFrame
    top = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:N]
    df = pd.DataFrame(top, columns=['Journal', 'Score'])
    df['Score'] = df['Score'].round(4)
    return df, final_cats

In [6]:
# Cell 4: Load Dataset
# Path to the arXiv metadata snapshot, relative to the notebook's working directory.
data_path = '../../data/arxiv-metadata-oai-snapshot.json'
# Read the JSON file with the `datasets` library; the rows are taken from its 'train' split below.
data = load_dataset('json', data_files=data_path)
# Draw a 'train' part (train_size=0.125) and a 'test' part (test_size=0.1);
# the fixed seed keeps the split reproducible between runs.
subset = data['train'].train_test_split(train_size=0.125, test_size=0.1, seed=42)
# Convert both parts to pandas DataFrames. test_df feeds the sampling cell;
# train_df is not used in the cells visible here.
train_df = pd.DataFrame(subset['train'])
test_df = pd.DataFrame(subset['test'])

In [None]:

# Cell 5: Run Recommendations on Sample
# Load all model artifacts from the default model directory ('../../models').
models = load_models_and_mappings()

# Select a few sample papers
# (5 rows of test_df drawn with a fixed random_state, re-indexed from 0)
samples = test_df.sample(5, random_state=42).reset_index(drop=True)

In [11]:
# Preview the three fields of the sampled papers that are passed to recommend_journals.
samples[['title', 'abstract', 'categories']]

Unnamed: 0,title,abstract,categories
0,Patterns in the jump-channel statistics of ope...,A continuously measured quantum system with ...,quant-ph cond-mat.stat-mech
1,New low mass ratio contact binaries in the Cat...,We present the identification and photometri...,astro-ph.SR
2,Indexed Markov Chains for financial data: test...,A new branch based on Markov processes is de...,q-fin.ST
3,Estimation for stochastic damping Hamiltonian ...,This paper is the third part of our study st...,math.PR
4,Relaxation of the Bose-condensate oscillations...,The general system is given of nonlinear equ...,cond-mat.stat-mech cond-mat.mes-hall


In [28]:
import json

# Experiment 1: recommendations WITHOUT user-supplied categories.
# Collects one result dict per successfully processed sample paper.
accumulator = []

for idx, row in samples.iterrows():
    try:
        # categories='' means recommend_journals gets no user categories and
        # has to rely on the categories predicted by the classifier.
        recs_df, final_cats = recommend_journals(
            title=row['title'],
            abstract=row['abstract'],
            categories='',
            # categories=row.get('categories', ''),
            models_data=models,
            N=10
        )

        # build one dict per paper
        paper_dict = {
            'paper_index': idx,
            'title'      : row['title'],
            'abstract'   : row['abstract'],
            'categories' : row['categories'],   # the paper's original arXiv categories, kept for reference
            'final_cats' : final_cats,          # categories actually used for scoring
            'recommendations': [
                # float() gives a plain Python float for json (scores are presumably NumPy scalars - verify)
                {'journal': j, 'score': float(s)}
                for j, s in zip(recs_df['Journal'], recs_df['Score'])
            ]
        }
        accumulator.append(paper_dict)

    except Exception as e:
        # Best-effort: report the failure and continue with the next sample;
        # a failed paper is simply absent from the output.
        print(f"Error at sample {idx}: {e}")

# dump to JSON
with open('../../data/no_cat_paper_recommendations.json', 'w') as f:
    json.dump(accumulator, f, indent=2)

# (Or, to see it on-screen:)
print(json.dumps(accumulator, indent=2))


[
  {
    "paper_index": 0,
    "title": "Patterns in the jump-channel statistics of open quantum systems",
    "abstract": "  A continuously measured quantum system with multiple jump channels gives rise\nto a stochastic process described by random jump times and random emitted\nsymbols, representing each jump channel. While much is known about the waiting\ntime distributions, very little is known about the statistics of the emitted\nsymbols. In this letter we fill in this gap. First, we provide a full\ncharacterization of the resulting stochastic process, including efficient ways\nof simulating it, as well as determining the underlying memory structure.\nSecond, we show how to unveil patterns in the stochastic evolution: Some\nsystems support closed patterns, wherein the evolution runs over a finite set\nof states, or at least recurring states. But even if neither is possible, we\nshow that one may still cluster the states approximately, based on their\nability to predict future outc

In [29]:
import json

# Experiment 2: recommendations WITH the paper's own arXiv categories passed in.
# Collects one result dict per successfully processed sample paper
# (this rebinds `accumulator`, discarding any list built by an earlier cell).
accumulator = []

for idx, row in samples.iterrows():
    try:
        # The paper's category string is handed to recommend_journals as the
        # user-supplied categories ('' if the row has no 'categories' field).
        recs_df, final_cats = recommend_journals(
            title=row['title'],
            abstract=row['abstract'],
            # categories='',
            categories=row.get('categories', ''),
            models_data=models,
            N=10
        )

        # build one dict per paper
        paper_dict = {
            'paper_index': idx,
            'title'      : row['title'],
            'abstract'   : row['abstract'],
            'categories' : row['categories'],   # the paper's original arXiv categories, kept for reference
            'final_cats' : final_cats,          # categories actually used for scoring
            'recommendations': [
                # float() gives a plain Python float for json (scores are presumably NumPy scalars - verify)
                {'journal': j, 'score': float(s)}
                for j, s in zip(recs_df['Journal'], recs_df['Score'])
            ]
        }
        accumulator.append(paper_dict)

    except Exception as e:
        # Best-effort: report the failure and continue with the next sample;
        # a failed paper is simply absent from the output.
        print(f"Error at sample {idx}: {e}")

# dump to JSON
with open('../../data/cat_paper_recommendations.json', 'w') as f:
    json.dump(accumulator, f, indent=2)

# (Or, to see it on-screen:)
print(json.dumps(accumulator, indent=2))


[
  {
    "paper_index": 0,
    "title": "Patterns in the jump-channel statistics of open quantum systems",
    "abstract": "  A continuously measured quantum system with multiple jump channels gives rise\nto a stochastic process described by random jump times and random emitted\nsymbols, representing each jump channel. While much is known about the waiting\ntime distributions, very little is known about the statistics of the emitted\nsymbols. In this letter we fill in this gap. First, we provide a full\ncharacterization of the resulting stochastic process, including efficient ways\nof simulating it, as well as determining the underlying memory structure.\nSecond, we show how to unveil patterns in the stochastic evolution: Some\nsystems support closed patterns, wherein the evolution runs over a finite set\nof states, or at least recurring states. But even if neither is possible, we\nshow that one may still cluster the states approximately, based on their\nability to predict future outc