In [1]:
import pandas as pd

# Features come from features_{year}.csv; the labels are the citationCount
# values found for the same papers in features_{year+1}.csv.
year = 2019
train_df = pd.read_csv(f"features_{year}.csv")
test_df  = pd.read_csv(f"features_{year+1}.csv")

# Both join keys are cast to str so the merge compares like with like.
# NOTE(review): the cast happens after parsing, so if pandas infers a numeric
# dtype for arxiv_id the string is the repr of that number -- confirm that every
# table joined on arxiv_id is loaded and cast the same way.
train_df['arxiv_id'] = train_df['arxiv_id'].astype(str)
test_df['arxiv_id']  = test_df['arxiv_id'].astype(str)

# A left merge emits one output row per matching right-hand row, so a repeated
# arxiv_id in the later file would add rows and shift every following label.
# Keeping one row per key guarantees exactly one label per training row, in
# training-row order. Papers missing from the later file get NaN.
next_year_counts = (
    test_df[['arxiv_id', 'citationCount']]
    .drop_duplicates(subset='arxiv_id')
)
test_citationCount = (
    train_df[['arxiv_id']]
    .merge(next_year_counts, on='arxiv_id', how='left')['citationCount']
)
test_citationCount

0        11.0
1        45.0
2       120.0
3       142.0
4         0.0
        ...  
8416     73.0
8417     35.0
8418     41.0
8419    115.0
8420      2.0
Name: citationCount, Length: 8421, dtype: float64

In [2]:
# Years before 2020 use the table in train_bert.csv; 2020 and later use
# test_bert.csv. The join key is cast to str, as it is for the feature tables.
embed_path = "train_bert.csv" if year < 2020 else "test_bert.csv"
embed_df = pd.read_csv(embed_path).astype({'arxiv_id': str})
embed_df.columns

Index(['arxiv_id', 'text', 'year', 'topic', 'topic_growth_rate'], dtype='object')

In [3]:
# Attach the text/topic columns to every training row.
# X_train is built from this merged frame, while the labels were built from the
# pre-merge frame, so the merge must not add rows. A left merge repeats a left
# row once per matching right row, hence embed_df is reduced to one row per
# arxiv_id for the join (embed_df itself is left unchanged).
train_df = train_df.merge(
    embed_df.drop_duplicates(subset='arxiv_id'),
    how="left",
    on='arxiv_id',
)
train_df.columns

Index(['arxiv_id', 'title', 'authors', 'abstract', 'published_date',
       'last_revised_date', 'num_revisions', 'primary_category', 'categories',
       'num_pages', 'github_stars', 'upvote', 'citing_models',
       'citing_datasets', 'citing_spaces', 'citing_collections',
       'citations_by_year', 'citationCount', 'citations', 'referenceCount',
       'references', 'influentialCitationCount', 'venue_type', 'venue_ranking',
       'published_year', 'citationCount_log', 'citations_log', 'num_authors',
       'mean_citations', 'max_citations', 'mean_h_index', 'max_h_index',
       'mean_i10_index', 'max_i10_index', 'slope_papers', 'slope_citations',
       'num_years_after_publication', 'mean_citations_over_years',
       'std_citations_over_years', 'text', 'year', 'topic',
       'topic_growth_rate'],
      dtype='object')

In [4]:
# Replace every missing value, in any column, with 0 before the numeric
# feature matrix is extracted.
train_df = train_df.fillna(0)

In [5]:
# Columns that must not be used as model inputs: the citation counts and their
# log variants, plus the `year` column brought in by the embedding table.
non_feature_cols = ['citations', 'citations_log', 'citationCount_log', 'year', 'citationCount']

# Keep numeric columns only, then drop the excluded ones (this raises if any of
# them is missing).
numeric_df = train_df.select_dtypes(include='number')
features = numeric_df.columns.drop(non_feature_cols)
X_train = numeric_df.loc[:, features]

In [6]:
# Display the final feature column names as a sanity check.
X_train.columns

Index(['num_revisions', 'primary_category', 'num_pages', 'github_stars',
       'upvote', 'citing_models', 'citing_datasets', 'citing_spaces',
       'citing_collections', 'referenceCount', 'influentialCitationCount',
       'venue_type', 'venue_ranking', 'published_year', 'num_authors',
       'mean_citations', 'max_citations', 'mean_h_index', 'max_h_index',
       'mean_i10_index', 'max_i10_index', 'slope_papers', 'slope_citations',
       'num_years_after_publication', 'mean_citations_over_years',
       'std_citations_over_years', 'topic', 'topic_growth_rate'],
      dtype='object')

In [7]:
# Training target: the citationCount looked up in the following year's file,
# one value per row of the training frame (NaN where the paper was not found).
y_train = test_citationCount

In [8]:
# Write the feature matrix and the labels to disk without the index column, so
# rows in the two files correspond by position.
for frame, out_path in ((X_train, "X_train.csv"), (y_train, "y_train.csv")):
    frame.to_csv(out_path, index=False)

In [9]:
import pandas as pd

# The rows (and row order) of features_{year}.csv define the training set; the
# labels are looked up in each later file.
year = 2019
train_df = pd.read_csv(f"features_{year}.csv")
train_df['arxiv_id'] = train_df['arxiv_id'].astype(str)

# Write one label file per later year, from year+1 through 2024.
for y in range(year+1, 2025):
    test_df  = pd.read_csv(f"features_{y}.csv")
    test_df['arxiv_id']  = test_df['arxiv_id'].astype(str)

    # A left merge emits one row per matching right-hand row, so a repeated
    # arxiv_id in the later file would add rows and break the positional
    # alignment with the training rows. Keep one row per key.
    counts_for_year = (
        test_df[['arxiv_id', 'citationCount']]
        .drop_duplicates(subset='arxiv_id')
    )
    y_train = (
        train_df[['arxiv_id']]
        .merge(counts_for_year, on='arxiv_id', how='left')['citationCount']
    )
    # Papers missing from the later file are written as empty (NaN) labels.
    y_train.to_csv(f"y_train_{y}.csv", index=False)

In [10]:
import pandas as pd

# Test set: rows of features_2024.csv whose published_year is after 2019.
# NOTE: the names train_df / X_train are reused here for the test data; the
# following cell reads train_df to build the matching label files.
train_df = pd.read_csv("features_2024.csv")
train_df['arxiv_id'] = train_df['arxiv_id'].astype(str)
train_df = train_df[train_df['published_year'] > 2019]

embed_df = pd.read_csv("test_bert.csv")
embed_df['arxiv_id'] = embed_df['arxiv_id'].astype(str)

# A left merge repeats a left row once per matching right row, so the embedding
# table is reduced to one row per arxiv_id for the join; otherwise a repeated
# key would duplicate test papers.
train_df = train_df.merge(
    embed_df.drop_duplicates(subset='arxiv_id'),
    how="left",
    on='arxiv_id',
)
# Replace every missing value with 0 (non-inplace, so re-running is harmless).
train_df = train_df.fillna(0)

# Same feature selection as for the training matrix: numeric columns minus the
# citation counts, their log variants, and the embedding table's `year` column.
numeric_df = train_df.select_dtypes(include='number')
features = numeric_df.columns.drop(['citations', 'citations_log', 'citationCount_log', 'year', 'citationCount'])
X_train = numeric_df[features]
X_train.to_csv("X_test.csv", index=False)

In [11]:
# Write one test-label file per year, from year+1 through 2024.
# Relies on state from earlier cells: `year` (the base year) and `train_df`
# (at this point the filtered, merged test frame that X_test.csv came from).
for y in range(year+1, 2025):
    test_df  = pd.read_csv(f"features_{y}.csv")
    test_df['arxiv_id']  = test_df['arxiv_id'].astype(str)

    # A left merge emits one row per matching right-hand row, so a repeated
    # arxiv_id in the year file would add rows and break the positional
    # alignment with the rows of X_test.csv. Keep one row per key.
    counts_for_year = (
        test_df[['arxiv_id', 'citationCount']]
        .drop_duplicates(subset='arxiv_id')
    )
    y_train = (
        train_df[['arxiv_id']]
        .merge(counts_for_year, on='arxiv_id', how='left')['citationCount']
    )
    # Papers not present in that year's file are written as empty (NaN) labels.
    y_train.to_csv(f"y_test_{y}.csv", index=False)