In [44]:
import json
import pandas as pd
from pathlib import Path

root_dir = Path("data")

# Folders you care about
target_folders = ["2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023", "2024"]

records = []

for folder in target_folders:
    folder_path = root_dir / folder

    print(f"üìÇ Scanning: {folder_path}")

    # Loop through all json files in the folder
    for json_file in folder_path.glob("*.json"):

        # Read JSON
        try:
            with open(json_file, "r") as f:
                data = json.load(f)
        except Exception as e:
            print(f"‚ö†Ô∏è Failed to read {json_file}: {e}")
            continue


        records.append(data)

# Create dataframe
df = pd.DataFrame(records)
df.head()

üìÇ Scanning: data/2012
üìÇ Scanning: data/2013
üìÇ Scanning: data/2014
üìÇ Scanning: data/2015
üìÇ Scanning: data/2016
üìÇ Scanning: data/2017
üìÇ Scanning: data/2018
üìÇ Scanning: data/2019
üìÇ Scanning: data/2020
üìÇ Scanning: data/2021
üìÇ Scanning: data/2022
üìÇ Scanning: data/2023
üìÇ Scanning: data/2024


Unnamed: 0,arxiv_id,title,authors,abstract,published_date,last_revised_date,num_revisions,pdf_url,primary_category,categories,...,citing_spaces,citing_collections,citations_by_year,citationCount,venue,citations,referenceCount,references,influentialCitationCount,embedding
0,1212.2518,Efficient Inference in Large Discrete Domains,"[{'name': 'R Sharma', 'citations_all': None, '...",In this paper we examine the problem of infere...,2012-10-19T00:00:00,2012-10-19T00:00:00,0,https://arxiv.org/pdf/1212.2518.pdf,Artificial Intelligence (cs.AI),[Artificial Intelligence (cs.AI)],...,0,0,"{'2003': 1, '2005': 1, '2006': 1, '2007': 1, '...",11.0,{'name': 'Conference on Uncertainty in Artific...,"[{'arxiv_id': None, 'referenceCount': 26, 'cit...",13,"[{'arxiv_id': None, 'referenceCount': 9, 'cita...",1,
1,1212.2511,Stochastic complexity of Bayesian networks,"[{'name': 'K Yamazaki', 'citations_all': 708, ...",Bayesian networks are now being used in enormo...,2012-10-19T00:00:00,2012-10-19T00:00:00,0,https://arxiv.org/pdf/1212.2511.pdf,Machine Learning (cs.LG),"[Machine Learning (cs.LG), Machine Learning (s...",...,0,0,"{'2003': 2, '2004': 5, '2005': 6, '2006': 5, '...",45.0,{'name': 'Conference on Uncertainty in Artific...,"[{'arxiv_id': None, 'referenceCount': 75, 'cit...",13,"[{'arxiv_id': None, 'referenceCount': 22, 'cit...",2,
2,1211.5625,A survey of computational methods for protein ...,"[{'name': 'S Srihari', 'citations_all': 3135, ...",Complexes of physically interacting proteins a...,2012-11-24T00:00:00,2012-11-24T00:00:00,0,https://arxiv.org/pdf/1211.5625.pdf,"Computational Engineering, Finance, and Scienc...","[Computational Engineering, Finance, and Scien...",...,0,0,"{'2013': 8, '2014': 11, '2015': 16, '2016': 14...",127.0,"{'name': None, 'type': None, 'ranking': None}","[{'arxiv_id': None, 'referenceCount': 91, 'cit...",77,"[{'arxiv_id': None, 'referenceCount': 27, 'cit...",4,
3,1212.248,Approximate Inference and Constrained Optimiza...,"[{'name': 'T Heskes', 'citations_all': 16337, ...",Loopy and generalized belief propagation are p...,2012-10-19T00:00:00,2012-10-19T00:00:00,0,https://arxiv.org/pdf/1212.2480.pdf,Machine Learning (cs.LG),"[Machine Learning (cs.LG), Artificial Intellig...",...,0,0,"{'2003': 3, '2004': 6, '2005': 8, '2006': 10, ...",143.0,{'name': 'Conference on Uncertainty in Artific...,"[{'arxiv_id': None, 'referenceCount': 77, 'cit...",13,"[{'arxiv_id': None, 'referenceCount': 69, 'cit...",13,
4,1212.4674,Natural Language Understanding Based on Semant...,"[{'name': 'H Kong', 'citations_all': None, 'ci...","In this paper, we define event expression over...",2012-12-19T00:00:00,2012-12-19T00:00:00,0,https://arxiv.org/pdf/1212.4674.pdf,Computation and Language (cs.CL),[Computation and Language (cs.CL)],...,0,0,{},0.0,"{'name': 'arXiv.org', 'type': None, 'ranking':...",[],5,"[{'arxiv_id': None, 'referenceCount': None, 'c...",0,


In [45]:
venue_df = pd.json_normalize(df['venue'],
                             sep='.')
df = pd.concat([df, venue_df], axis=1)
df.drop(columns=['venue'], inplace=True)
df.rename(columns={'name': 'venue_name',
                   'type': 'venue_type',
                   'ranking': 'venue_ranking'},
          inplace=True)

In [46]:
df.drop(columns=['pdf_url', 'embedding', 'venue_name'], inplace=True)

In [47]:
import re

def normalize_category(cat):
    match = re.search(r'\((.*?)\)', cat)
    return match.group(1) if match else cat.strip()

df['categories'] = df['categories'].apply(lambda lst: [normalize_category(c) for c in lst])
df['primary_category'] = df['primary_category'].apply(normalize_category)

### Remove paper with num_pages = null

In [48]:
df = df[df['num_pages'].notna()]

### Drop keywords

In [49]:
df.drop(columns='keywords', inplace=True)

### Remove paper with citations_by_year and citationCount = null

In [50]:
df = df[df['citations_by_year'].notna()]
df = df[df['citationCount'].notna()]

### Fill missing values in github_stars = 0

In [51]:
df.loc[df['github_stars'].isna(), 'github_stars'] = 0

### Fill missing (venue.type, venue.ranking) = (preprint, 0)

In [52]:
df.loc[df['venue_type'].isna(), 'venue_type'] = 'preprint'
df.loc[df['venue_ranking'].isna(), 'venue_ranking'] = 0

### Add columns 'citations_after_years{0: ..., 1: ..., 2: ..., 3: ...}'

In [57]:
df['published_date'] = pd.to_datetime(df['published_date'])
df['published_year'] = df['published_date'].dt.year

# Iterate over each row to calculate citations after X years
def calculate_citations_after_years(row):
    citations_by_year = row['citations_by_year']
    published_year = row['published_year']
    result = {}
    for year in range(published_year, 2025):
        result[year-published_year] = 0
    for year, count in citations_by_year.items():
        year = int(year)
        result[year - published_year] = count
    return result

# def reset_published_year(row):
#     # citations = row.get("citations_by_year", {})
#     # if not citations:
#     return row["published_date"].year
    
#     # cited_years = [int(y) for y, c in citations.items()]
#     # return min(cited_years)  # first year with citations
    
# # df["published_year"] = df.apply(reset_published_year, axis=1)
df['citations_after_years'] = df.apply(calculate_citations_after_years, axis=1)

### Assign outliers' venue ranking by their nearest median

In [None]:
import numpy as np

def normalize_ranking(value):
    if value in ['A*', 'A', 'B', 'C']:
        return value  # CORE ranking
    elif value in ['Q1', 'Q2', 'Q3', 'Q4']:
        return value  # Scimago quartile
    # elif  in str(value):
        # return 'National'
    # elif value in ['National', 'Multiconference', 'TBR', 'Unranked', '-']:
    #     return 'Other'
    # elif value in ['Unranked', '-']:
    #     return 'Unranked'
    else:
        return 'Other'

df['venue_ranking'] = df['venue_ranking'].apply(normalize_ranking)

venue_medians = (
    df[df['venue_ranking'] != 'Other']
    .groupby('venue_ranking')['citationCount']
    .median()
    .to_dict()
)

# print("Median citations by venue:")
# print(venue_medians)

# --- Step 2: Handle case when 'Other' is empty
other_group = df[df['venue_ranking'] == 'Other']
Q1 = np.percentile(other_group['citationCount'], 25)
Q3 = np.percentile(other_group['citationCount'], 75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

outliers_other = other_group[
    (other_group['citationCount'] < lower_bound) |
    (other_group['citationCount'] > upper_bound)
]

print(f"\nDetected {len(outliers_other)} outliers in 'Other'")

# --- Step 3: Reassign nearest median
def nearest_venue(citation, medians):
    return min(medians.keys(), key=lambda k: abs(medians[k] - citation))

# df['venue_ranking_imputed'] = df['venue_ranking']

for idx, row in outliers_other.iterrows():
    nearest = nearest_venue(row['citationCount'], venue_medians)
    df.at[idx, 'venue_ranking'] = nearest


Detected 1276 outliers in 'Other'


### Number of authors

In [None]:
def count_authors(authors):
    return len(authors)
    
df['num_authors'] = df['authors'].apply(count_authors)

### Add authors' statistics

In [None]:
import numpy as np

def extract_author_stats(authors_list):
    if not authors_list:
        return pd.Series({
            "mean_citations_all": np.nan,
            "max_citations_all": np.nan,
            "mean_h_index_all": np.nan,
            "max_h_index_all": np.nan,
            "mean_i10_index_all": np.nan,
            "max_i10_index_all": np.nan,
        })
    citations_all = [(a.get("citations_all") or 0) for a in authors_list]
    h_all = [(a.get("h_index_all") or 0) for a in authors_list]
    i10_all = [(a.get("i10_index_all") or 0) for a in authors_list]
    return pd.Series({
        "mean_citations_all": np.mean(citations_all),
        "max_citations_all": np.max(citations_all),
        "mean_h_index_all": np.mean(h_all),
        "max_h_index_all": np.max(h_all),
        "mean_i10_index_all": np.mean(i10_all),
        "max_i10_index_all": np.max(i10_all),
    })

author_features = df["authors"].apply(extract_author_stats)
df = pd.concat([df, author_features], axis=1)

### Encoding

In [None]:
df['venue_type'] = df['venue_type'].map({'preprint':0, 'conference':1, 'journal':2})
df['venue_ranking'] = df['venue_ranking'].map({'Q4':1, 'Q3':2, 'Q2':3, 'Q1':4, 'C': 1, 'B':2, 'A':3, 'A*':4, 'Other':0})

In [None]:
from sklearn.preprocessing import LabelEncoder

# Create encoder
le = LabelEncoder()
df['primary_category'] = le.fit_transform(df['primary_category'])

### Add slope of trend for each category (based on number of papers and citations in each category over time)

We don't consider the peak time (2020 in paper slope and 2023, 2024 in citations)

In [None]:
from scipy.stats import linregress
import numpy as np

def safe_slope(x, y):
    if len(x) < 2:
        return 0
    return linregress(x, y).slope

# Exclude anomaly years for papers
papers_trend = (
    df[~df['published_year'].isin([2020])]
    .groupby(['primary_category', 'published_year'])
    .size()
    .reset_index(name='num_papers')
)

# Exclude anomaly years for citations
citations_trend = (
    df[~df['published_year'].isin([2023, 2024, 2025])]
    .groupby(['primary_category', 'published_year'])['citationCount']
    .sum()
    .reset_index(name='total_citations')
)
# display(citations_trend)
# Compute slopes
slope_papers = (
    papers_trend.groupby('primary_category')
    .apply(lambda g: safe_slope(g['published_year'], g['num_papers']), include_groups=False)
    .reset_index(name='slope_papers')
)
# display(slope_papers)
slope_citations = (
    citations_trend.groupby('primary_category')
    .apply(lambda g: safe_slope(g['published_year'], g['total_citations']), include_groups=False)
    .reset_index(name='slope_citations')
)
# display(slope_citations)
# Combine and sort
trend_df = slope_papers.merge(slope_citations, on='primary_category', how='outer')
trend_df = trend_df.sort_values('slope_citations', ascending=False)
# display(trend_df)

In [None]:
df = df.merge(trend_df, on="primary_category", how="left")
numeric_df = df.select_dtypes(include=["number"])
numeric_df.columns

Index(['num_revisions', 'primary_category', 'num_pages', 'github_stars',
       'upvote', 'citing_models', 'citing_datasets', 'citing_spaces',
       'citing_collections', 'citationCount', 'referenceCount',
       'influentialCitationCount', 'venue_type', 'venue_ranking',
       'published_year', 'num_authors', 'mean_citations_all',
       'max_citations_all', 'mean_h_index_all', 'max_h_index_all',
       'mean_i10_index_all', 'max_i10_index_all', 'slope_papers',
       'slope_citations'],
      dtype='object')

### Remove citations_by_years in 2024, 2025 to avoid data leakage
citations in 2024 is the target we wanna predict.

In [None]:
def adjust_citation_count(row):
    citations = row["citations_by_year"]
    if isinstance(citations, dict):
        subtract = citations.get("2024", 0) + citations.get("2025", 0)
        return row["citationCount"] - subtract
    return row["citationCount"]

df["citationCount"] = df.apply(adjust_citation_count, axis=1)

In [None]:
df["citations_2024"] = df["citations_by_year"].apply(
    lambda x: x.get("2024", 0) if isinstance(x, dict) else 0
)

### Log transform citationCount

In [None]:
df["citationCount_log"] = np.log1p(df["citationCount"])

### Drop citationCount

In [None]:
df.drop(columns = "citationCount", inplace=True)

### Add num_years_after_publication

In [None]:
df['num_years_after_publication'] = df['published_year'].apply(lambda x: 2025 - x)

### Add statistics about citations_after_years (mean, std)

In [None]:
df['mean_citations_over_years'] = df['citations_after_years'].apply(lambda x: np.mean(list(x.values())))
df['std_citations_over_years'] = df['citations_after_years'].apply(lambda x: np.std(list(x.values())))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


In [None]:
numeric_df = df.select_dtypes(include=["number"])

In [None]:
numeric_df.drop(columns=['published_year'], inplace=True)

In [None]:
numeric_df.fillna(0, inplace=True)

In [None]:
numeric_df.to_csv("numeric_features.csv", index=False)