In [1]:
import pandas as pd
import numpy as np
import datasets
import joblib
from scipy.sparse import lil_matrix, csr_matrix
import implicit
import time
from collections import defaultdict

from journal_name_preprocessor import preprocess_journal_batch
from category_preprocessor import preprocess_categories_and_text_batch, simplify_category_list

In [2]:
data_path = 'arxiv-metadata-oai-snapshot.json'
print(f"Loading data from: {data_path}")

raw_data = datasets.load_dataset(
    "json",
    data_files=data_path,
    split='train' # Load everything as train initially
)

# Select relevant columns (include original categories for now)
raw_data = raw_data.select_columns(['title', 'abstract', 'categories', 'journal-ref'])
print("Raw data loaded:")
print(raw_data)

Loading data from: arxiv-metadata-oai-snapshot.json


Generating train split: 0 examples [00:00, ? examples/s]

Raw data loaded:
Dataset({
    features: ['title', 'abstract', 'categories', 'journal-ref'],
    num_rows: 2720631
})


In [3]:
min_journal_samples = 100 # Threshold for credible journals
print(f"Applying journal preprocessing and filtering (min_samples={min_journal_samples})...")

# Apply journal cleaning using the external function via .map()
# Note: .map might be faster for large datasets than list comprehension used before
# Adjust num_proc based on available cores
data_cleaned_journals = raw_data.map(
    preprocess_journal_batch,
    batched=True,
    num_proc=4 # Adjust as needed
)

# Filter out rows where journal cleaning resulted in None
data_filtered_journals = data_cleaned_journals.filter(
    lambda x: x['journal_cleaned'] is not None,
    num_proc=4 # Adjust as needed
)
print(f"Rows after removing empty/uncleanable journal refs: {len(data_filtered_journals)}")

# Calculate journal counts and identify valid journals (requires converting a column to pandas or iterating)
# Let's do it efficiently without full pandas conversion if possible
journal_counts = defaultdict(int)
for journal in data_filtered_journals['journal_cleaned']:
    journal_counts[journal] += 1

valid_journals = [j for j, count in journal_counts.items() if count >= min_journal_samples]
valid_journal_set = set(valid_journals) # Use set for faster filtering lookup

print(f"Total unique cleaned journal names found: {len(journal_counts)}")
print(f"Number of unique journals meeting threshold (>= {min_journal_samples} papers): {len(valid_journals)}")

# Filter the dataset to keep only rows with valid journals
data_filtered = data_filtered_journals.filter(
    lambda x: x['journal_cleaned'] in valid_journal_set,
    num_proc=4 # Adjust as needed
)
print(f"Total samples after filtering by journal frequency: {len(data_filtered)}")

# Create journal mappings
journal_to_id = {journal: i for i, journal in enumerate(valid_journals)}
id_to_journal = {i: journal for journal, i in journal_to_id.items()}
num_valid_journals = len(valid_journals)

Applying journal preprocessing and filtering (min_samples=100)...


Map (num_proc=4):   0%|          | 0/2720631 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/2720631 [00:00<?, ? examples/s]

Rows after removing empty/uncleanable journal refs: 873250
Total unique cleaned journal names found: 138946
Number of unique journals meeting threshold (>= 100 papers): 549


Filter (num_proc=4):   0%|          | 0/873250 [00:00<?, ? examples/s]

Total samples after filtering by journal frequency: 577727


In [4]:
# Cell 4: Apply Category & Text Preprocessing
print("Applying category simplification and text preprocessing...")

# Use the external batch preprocessing function
data_processed = data_filtered.map(
    preprocess_categories_and_text_batch,
    batched=True,
    remove_columns=['title', 'abstract'], # Remove original text columns
    num_proc=4 # Adjust as needed
)
print("Preprocessing complete.")
print(data_processed)

# Find unique simplified categories from the *final* processed data
all_simplified_cats_lists = data_processed['categories_simplified_list']
unique_simplified_categories = sorted(list(set(cat for sublist in all_simplified_cats_lists for cat in sublist)))
num_unique_simplified_categories = len(unique_simplified_categories)

print(f"Number of unique SIMPLIFIED categories in final dataset: {num_unique_simplified_categories}")

# Create simplified category mappings
class2id_simplified = {cat: i for i, cat in enumerate(unique_simplified_categories)}
id2class_simplified = {i: cat for i, cat in enumerate(unique_simplified_categories)}

Applying category simplification and text preprocessing...


Map (num_proc=4):   0%|          | 0/577727 [00:00<?, ? examples/s]

Preprocessing complete.
Dataset({
    features: ['categories', 'journal-ref', 'journal_cleaned', 'categories_simplified_list', 'text_processed'],
    num_rows: 577727
})
Number of unique SIMPLIFIED categories in final dataset: 36


In [5]:
classifier_path = 'arxiv_category_classifier_logreg.joblib'
print(f"Loading pre-trained classifier from: {classifier_path}")
try:
    model_pipeline = joblib.load(classifier_path)
    print("Classifier loaded successfully.")
except FileNotFoundError:
    print(f"Error: Classifier file not found at {classifier_path}.")
    print("Please ensure the classifier was trained and saved separately.")
    # Handle error appropriately, maybe raise Exception
    model_pipeline = None
except Exception as e:
    print(f"Error loading classifier: {e}")
    model_pipeline = None

Loading pre-trained classifier from: arxiv_category_classifier_logreg.joblib


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Classifier loaded successfully.


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [6]:
if model_pipeline:
    print("Predicting categories for the dataset...")
    # Ensure data is in the right format (list of strings)
    texts_for_prediction = data_processed['text_processed']
    start_time = time.time()
    y_pred = model_pipeline.predict(texts_for_prediction)
    end_time = time.time()
    print(f"Prediction complete. Time taken: {end_time - start_time:.2f} seconds.")
    print(f"Predictions shape: {y_pred.shape}, Type: {type(y_pred)}")
else:
    print("Skipping prediction as classifier model was not loaded.")
    y_pred = None # Or handle error

Predicting categories for the dataset...
Prediction complete. Time taken: 19.51 seconds.
Predictions shape: (577727, 36), Type: <class 'scipy.sparse._csr.csr_matrix'>


In [7]:
if y_pred is not None:
    print("Constructing the Category-Journal interaction matrix from predictions...")

    category_journal_matrix = lil_matrix((num_unique_simplified_categories, num_valid_journals), dtype=int)

    # Get journal names corresponding to the predictions
    journal_names_processed = data_processed['journal_cleaned']

    # Iterate through predictions (efficiently using COO format)
    y_pred_coo = y_pred.tocoo()
    paper_indices = y_pred_coo.row
    predicted_cat_indices = y_pred_coo.col

    print(f"Processing {len(paper_indices)} predicted category instances...")
    for paper_idx, category_idx in zip(paper_indices, predicted_cat_indices):
        try:
            journal_name = journal_names_processed[paper_idx] # Direct index lookup
            if journal_name in journal_to_id: # Check if journal is valid (should be)
                journal_id = journal_to_id[journal_name]
                category_journal_matrix[category_idx, journal_id] += 1
            # Else: This case should ideally not happen if filtering was correct
        except IndexError:
             print(f"Error: Index {paper_idx} out of bounds for journal_names_processed (length {len(journal_names_processed)})")
             break # Stop if indexing fails

    # Convert to CSR for efficiency
    category_journal_matrix_csr = category_journal_matrix.tocsr()

    print("Category-Journal matrix construction complete.")
    print(f"Matrix shape: {category_journal_matrix_csr.shape} (Categories x Journals)")
    print(f"Number of non-zero interactions: {category_journal_matrix_csr.nnz}")

    # Save the matrix based on predictions
    joblib.dump(category_journal_matrix_csr, 'category_journal_matrix_PREDICTED.joblib')
    joblib.dump(id2class_simplified, 'id2class_simplified.joblib') # Save mappings if not done elsewhere
    joblib.dump(journal_to_id, 'journal_to_id.joblib')
else:
    print("Skipping matrix construction as predictions are unavailable.")
    category_journal_matrix_csr = None

Constructing the Category-Journal interaction matrix from predictions...
Processing 693913 predicted category instances...
Category-Journal matrix construction complete.
Matrix shape: (36, 549) (Categories x Journals)
Number of non-zero interactions: 5038


In [8]:
if category_journal_matrix_csr is not None:
    print("Preparing matrix and training ALS model...")
    # Implicit ALS: users=Categories, items=Journals
    # Input matrix: Categories x Journals
    als_input_matrix = category_journal_matrix_csr.astype(np.float32)

    # --- Parameters for ALS ---
    factors = 10
    regularization = 0.01
    iterations = 30
    calculate_training_loss = True
    use_gpu = implicit.gpu.HAS_CUDA
    # --- --- --- --- --- --- ---

    print(f"Initializing ALS model with factors={factors}, regularization={regularization}, iterations={iterations}, use_gpu={use_gpu}")
    model_als = implicit.als.AlternatingLeastSquares(
        factors=factors,
        regularization=regularization,
        iterations=iterations,
        calculate_training_loss=calculate_training_loss,
        use_gpu=use_gpu,
        random_state=42
    )

    # Train the model
    start_time = time.time()
    model_als.fit(als_input_matrix)
    end_time = time.time()
    print(f"ALS model training complete. Time taken: {end_time - start_time:.2f} seconds.")

    # Save the trained ALS model
    model_als.save('als_model.npz')
else:
    print("Skipping ALS training as interaction matrix is unavailable.")
    model_als = None

Preparing matrix and training ALS model...
Initializing ALS model with factors=10, regularization=0.01, iterations=30, use_gpu=False


  0%|          | 0/30 [00:00<?, ?it/s]

ALS model training complete. Time taken: 0.05 seconds.


In [9]:
if model_als:
    def precision_recall_at_k(model, train_matrix, k=10):
        """
        Calculate mean Precision@k and Recall@k for ALS model.
        Assumes train_matrix is the ground truth (users x items).
        """
        precisions = []
        recalls = []
        num_users = train_matrix.shape[0]

        # Get all recommendations for all users
        user_ids = np.arange(num_users)
        # This can be memory intensive for many users/items
        ids, scores = model.recommend(user_ids, train_matrix, N=k, filter_already_liked_items=False)

        for i, user_id in enumerate(user_ids):
            training_items = train_matrix[user_id].indices # Indices of items user interacted with
            if len(training_items) == 0:
                continue # Skip users with no interactions in the training data

            recommended_items = ids[i]

            # Calculate hits
            hits = np.isin(recommended_items, training_items)
            num_hits = np.sum(hits)

            precisions.append(num_hits / k)
            recalls.append(num_hits / len(training_items))

        mean_precision = np.mean(precisions) if precisions else 0.0
        mean_recall = np.mean(recalls) if recalls else 0.0
        mean_f1 = (2 * mean_precision * mean_recall) / (mean_precision + mean_recall) if (mean_precision + mean_recall) > 0 else 0.0

        return mean_precision, mean_recall, mean_f1

    def ndcg_at_k(model, train_matrix, k=10):
        """
        Calculate mean NDCG@k for ALS model.
        """
        ndcgs = []
        num_users = train_matrix.shape[0]
        user_ids = np.arange(num_users)
        ids, scores = model.recommend(user_ids, train_matrix, N=k, filter_already_liked_items=False)

        for i, user_id in enumerate(user_ids):
            training_items = train_matrix[user_id].indices
            if len(training_items) == 0:
                continue

            recommended_items = ids[i]

            # Create relevance array (1 if item was interacted with, 0 otherwise)
            relevance = np.isin(recommended_items, training_items).astype(np.float32)

            # DCG calculation
            discounts = np.log2(np.arange(len(recommended_items)) + 2)
            dcg = np.sum(relevance / discounts)

            # IDCG calculation (ideal ranking)
            ideal_relevance = np.ones_like(relevance) # Assume all interacted items are relevant=1
            ideal_dcg = np.sum(ideal_relevance[:len(training_items)] / np.log2(np.arange(min(k, len(training_items))) + 2))

            ndcgs.append(dcg / ideal_dcg if ideal_dcg > 0 else 0.0)

        return np.mean(ndcgs) if ndcgs else 0.0

    print("Evaluation functions defined.")
else:
    print("Skipping evaluation setup as ALS model is unavailable.")


Evaluation functions defined.


In [10]:
if model_als and als_input_matrix is not None:
    print("\nCalculating ALS Recommendation Metrics...")
    k_eval = 10 # Evaluate at K=10

    start_time = time.time()
    mean_precision, mean_recall, mean_f1 = precision_recall_at_k(model_als, als_input_matrix, k=k_eval)
    mean_ndcg = ndcg_at_k(model_als, als_input_matrix, k=k_eval)
    end_time = time.time()

    print(f"Evaluation complete. Time taken: {end_time - start_time:.2f} seconds.")
    print("-" * 30)
    print(f"Metrics @{k_eval}:")
    print(f"  Precision@{k_eval}: {mean_precision:.4f}")
    print(f"  Recall@{k_eval}:    {mean_recall:.4f}")
    print(f"  F1-Score@{k_eval}:  {mean_f1:.4f}")
    print(f"  NDCG@{k_eval}:      {mean_ndcg:.4f}")
    print("-" * 30)
else:
    print("Skipping metric calculation.")


Calculating ALS Recommendation Metrics...
Evaluation complete. Time taken: 0.01 seconds.
------------------------------
Metrics @10:
  Precision@10: 0.8636
  Recall@10:    0.0780
  F1-Score@10:  0.1431
  NDCG@10:      0.8642
------------------------------


In [11]:
if model_als and als_input_matrix is not None:
    print("\n--- Example Recommendations: Category -> Journals ---")
    def recommend_journals_for_category(category_name, model, user_item_matrix, N=10):
        if category_name not in class2id_simplified:
            print(f"Error: Category '{category_name}' not found.")
            return
        category_id = class2id_simplified[category_name]
        print(f"\nRecommending top {N} journals for category: '{category_name}' (ID: {category_id})")
        ids, scores = model.recommend(category_id, user_item_matrix[category_id], N=N) # Pass the category's row
        print("-" * 40)
        for journal_id, score in zip(ids, scores):
            journal_name = id_to_journal.get(journal_id, "Unknown Journal")
            print(f"- {journal_name:<30} (Score: {score:.4f})")
        print("-" * 40)

    recommend_journals_for_category('astro-ph', model_als, als_input_matrix, N=10)
    recommend_journals_for_category('cs', model_als, als_input_matrix, N=10)
    recommend_journals_for_category('math', model_als, als_input_matrix, N=10)
else:
    print("\nSkipping category->journal recommendations.")


--- Example Recommendations: Category -> Journals ---

Recommending top 10 journals for category: 'astro-ph' (ID: 3)
----------------------------------------
- PhysRevSTAccelBeams            (Score: 0.9807)
- IntModPhysA9                   (Score: 0.9802)
- PoSLattice                     (Score: 0.9088)
- LettMathPhys                   (Score: 0.7240)
- PhysicalChemistryChemicalPhysics (Score: 0.6769)
- AdvTheorMathPhys               (Score: 0.6668)
- RevMathPhys                    (Score: 0.5959)
- ACSNano                        (Score: 0.5610)
- ICLR                           (Score: 0.5238)
- symblog                        (Score: 0.5134)
----------------------------------------

Recommending top 10 journals for category: 'cs' (ID: 10)
----------------------------------------
- PhysNuclPartPhys               (Score: 0.6794)
- BrazPhys                       (Score: 0.5735)
- Nonlinearity                   (Score: 0.4479)
- ModPhysLett                    (Score: 0.4281)
- StatistPhys

In [12]:
if model_als:
    print("\n--- Example Recommendations: Journal -> Similar Journals ---")
    def find_similar_journals(journal_name, model, N=10):
        journal_name_lower = journal_name.lower()
        found_id = None
        if journal_name in journal_to_id:
             found_id = journal_to_id[journal_name]
        else: # Try partial match
             matches = [j for j in journal_to_id if journal_name_lower in j.lower()]
             if len(matches) == 1:
                 journal_name = matches[0]
                 found_id = journal_to_id[journal_name]
                 print(f"(Matched '{journal_name}')")
             elif len(matches) > 1:
                  print(f"Ambiguous journal name '{journal_name}'. Matches: {matches}")
                  return
             else:
                 print(f"Error: Journal like '{journal_name}' not found.")
                 return

        print(f"\nFinding top {N} journals similar to: '{journal_name}' (ID: {found_id})")
        ids, scores = model.similar_items(found_id, N=N+1)
        print("-" * 40)
        for other_journal_id, score in zip(ids, scores):
            if other_journal_id != found_id:
                other_journal_name = id_to_journal.get(other_journal_id, "Unknown Journal")
                print(f"- {other_journal_name:<30} (Score: {score:.4f})")
        print("-" * 40)

    find_similar_journals('PhysRevLett', model_als, N=10)
    find_similar_journals('Astrophys', model_als, N=10)
    find_similar_journals('JHEP', model_als, N=10)
else:
    print("\nSkipping similar journal recommendations.")


--- Example Recommendations: Journal -> Similar Journals ---

Finding top 10 journals similar to: 'PhysRevLett' (ID: 7)
----------------------------------------
- PhysRev                        (Score: 0.9997)
- IntModPhys                     (Score: 0.9995)
- EurPhys                        (Score: 0.9993)
- PhysicalReview                 (Score: 0.9989)
- EurophysLett                   (Score: 0.9983)
- PhysRevResearch2               (Score: 0.9970)
- NewPhys                        (Score: 0.9966)
- PhysConfSer                    (Score: 0.9964)
- EurPhysPlus                    (Score: 0.9941)
- SciPostPhys                    (Score: 0.9929)
----------------------------------------

Finding top 10 journals similar to: 'Astrophys' (ID: 3)
----------------------------------------
- MonNotRoyAstronSoc             (Score: 0.9903)
- SciChinaPhysMechAstron         (Score: 0.9901)
- AstronAstrophys                (Score: 0.9495)
- NuclInstrumMeth                (Score: 0.8429)
- Astrophysic