In [44]:
import pandas as pd

df = pd.read_csv(f"/content/features_2024.csv")
df['text'] = df['title'] + ' ' + df['abstract']
df["year"] = pd.to_datetime(df["published_date"], errors="coerce").dt.year
df.head(5)

Unnamed: 0,arxiv_id,title,authors,abstract,published_date,last_revised_date,num_revisions,primary_category,categories,num_pages,...,max_h_index,mean_i10_index,max_i10_index,slope_papers,slope_citations,num_years_after_publication,mean_citations_over_years,std_citations_over_years,text,year
0,1212.2518,Efficient Inference in Large Discrete Domains,"[{'name': 'R Sharma', 'citations_all': None, '...",In this paper we examine the problem of infere...,2012-10-19,2012-10-19T00:00:00,0,13,['cs.AI'],8.0,...,0.0,0.0,0.0,0,0.0,12,0.384615,0.624926,Efficient Inference in Large Discrete Domains ...,2012
1,1212.2511,Stochastic complexity of Bayesian networks,"[{'name': 'K Yamazaki', 'citations_all': 708, ...",Bayesian networks are now being used in enormo...,2012-10-19,2012-10-19T00:00:00,0,35,"['cs.LG', 'stat.ML']",8.0,...,0.0,0.0,0.0,0,0.0,12,1.307692,1.065877,Stochastic complexity of Bayesian networks Bay...,2012
2,1211.5625,A survey of computational methods for protein ...,"[{'name': 'S Srihari', 'citations_all': 3135, ...",Complexes of physically interacting proteins a...,2012-11-24,2012-11-24T00:00:00,0,16,"['cs.CE', 'q-bio.MN']",27.0,...,0.0,0.0,0.0,0,0.0,12,9.307692,5.716166,A survey of computational methods for protein ...,2012
3,1212.248,Approximate Inference and Constrained Optimiza...,"[{'name': 'T Heskes', 'citations_all': 16337, ...",Loopy and generalized belief propagation are p...,2012-10-19,2012-10-19T00:00:00,0,35,"['cs.LG', 'cs.AI', 'stat.ML']",8.0,...,0.0,0.0,0.0,0,0.0,12,4.384615,1.902972,Approximate Inference and Constrained Optimiza...,2012
4,1212.4674,Natural Language Understanding Based on Semant...,"[{'name': 'H Kong', 'citations_all': None, 'ci...","In this paper, we define event expression over...",2012-12-19,2012-12-19T00:00:00,0,18,['cs.CL'],8.0,...,0.0,0.0,0.0,0,0.0,12,0.0,0.0,Natural Language Understanding Based on Semant...,2012


In [45]:
df["arxiv_id"] = df["arxiv_id"].astype(str)
paper_text_df = (
    df.sort_values(["arxiv_id"])
      .assign(text=df["title"] + " " + df["abstract"])
      .groupby("arxiv_id", as_index=False)[["text", "year"]]
      .first()
)

In [46]:
TRAIN_END = 2019
TEST_BEGIN = 2020

train_df = paper_text_df[paper_text_df['year'] <= TRAIN_END]
test_df = paper_text_df[paper_text_df['year'] >= TEST_BEGIN]
train_df, test_df

(        arxiv_id                                               text  year
 0      1211.2569  Teichmüller extremal mapping and its applicati...  2012
 1      1211.2575  A semantic cache for enhancing Web services co...  2012
 2      1211.2632  Sequential Voronoi diagram calculations using ...  2012
 3      1211.2636  A memory versus compression ratio trade-off in...  2012
 4       1211.267  A Mazing 2+eps Approximation for Unsplittable ...  2012
 ...          ...                                                ...   ...
 8416  1912.13472  Revisiting Landscape Analysis in Deep Neural N...  2019
 8417  1912.13477  Interaction laws of monads and comonads We int...  2019
 8418   1912.1348  On the Difference Between the Information Bott...  2019
 8419  1912.13487  Winding number for arbitrary integer value in ...  2019
 8420  1912.13497  Water Supply Prediction Based on Initialized A...  2019
 
 [8421 rows x 3 columns],
          arxiv_id                                               text  y

In [47]:
train_texts = train_df["text"].astype(str).tolist()
test_texts  = test_df["text"].astype(str).tolist()

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# sentence embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

topic_model = BERTopic(
    embedding_model=embedding_model,
    language="english",
    calculate_probabilities=True,
    verbose=True
)

In [48]:
train_topics, train_probs = topic_model.fit_transform(train_texts)
test_topics, test_probs = topic_model.transform(test_texts)

train_bert_df = train_df.copy()
test_bert_df = test_df.copy()
train_bert_df["topic"] = train_topics
test_bert_df["topic"] = test_topics

2025-12-14 09:20:38,192 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/264 [00:00<?, ?it/s]

2025-12-14 09:20:59,765 - BERTopic - Embedding - Completed ✓
2025-12-14 09:20:59,766 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-12-14 09:21:12,152 - BERTopic - Dimensionality - Completed ✓
2025-12-14 09:21:12,153 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-14 09:21:17,369 - BERTopic - Cluster - Completed ✓
2025-12-14 09:21:17,376 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-14 09:21:18,400 - BERTopic - Representation - Completed ✓


Batches:   0%|          | 0/171 [00:00<?, ?it/s]

2025-12-14 09:21:34,586 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-12-14 09:21:38,398 - BERTopic - Dimensionality - Completed ✓
2025-12-14 09:21:38,399 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-12-14 09:21:38,701 - BERTopic - Probabilities - Start calculation of probabilities with HDBSCAN
2025-12-14 09:21:43,304 - BERTopic - Probabilities - Completed ✓
2025-12-14 09:21:43,305 - BERTopic - Cluster - Completed ✓


In [49]:
topic_model.get_topic_info().head()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3074,-1_the_of_to_and,"[the, of, to, and, in, we, for, is, that, this]",[Data mining when each data point is a network...
1,0,232,0_graphs_graph_vertex_vertices,"[graphs, graph, vertex, vertices, edge, planar...",[On the complexity of the vector connectivity ...
2,1,163,1_word_language_text_words,"[word, language, text, words, sentence, senten...",[Language Without Words: A Pointillist Model f...
3,2,137,2_logic_semantics_programs_type,"[logic, semantics, programs, type, program, pr...",[On the Use of Underspecified Data-Type Semant...
4,3,126,3_kernel_classification_learning_classifier,"[kernel, classification, learning, classifier,...",[Kernel Transform Learning This work proposes ...


In [50]:
topics_over_time = topic_model.topics_over_time(train_texts, train_df["year"])

8it [00:05,  1.47it/s]


In [51]:
trend_df = (
    topics_over_time
    .pivot(index="Timestamp", columns="Topic", values="Frequency")
    .fillna(0)
    .sort_index()
)

topic_growth_rate = trend_df.pct_change().replace([float("inf"), -float("inf")], 0)

growth_long = (
    topic_growth_rate
    .stack()
    .rename("topic_growth_rate")
    .reset_index()
)
growth_long

Unnamed: 0,Timestamp,Topic,topic_growth_rate
0,2013,-1,0.053118
1,2013,0,0.142857
2,2013,1,-0.333333
3,2013,2,0.875000
4,2013,3,0.538462
...,...,...,...
1054,2019,155,0.000000
1055,2019,156,0.000000
1056,2019,157,0.000000
1057,2019,158,0.000000


In [54]:
train_bert_df = train_bert_df.merge(
    growth_long,
    left_on=["year", "topic"],
    right_on=["Timestamp", "Topic"],
    how="left"
)
train_bert_df.drop(columns=["Timestamp", "Topic"], inplace=True)

test_bert_df = test_bert_df.merge(
    growth_long,
    left_on=["year", "topic"],
    right_on=["Timestamp", "Topic"],
    how="left"
)

test_bert_df.drop(columns=["Timestamp", "Topic"], inplace=True)

In [55]:
train_bert_df

Unnamed: 0,arxiv_id,text,year,topic,topic_growth_rate
0,1211.2569,Teichmüller extremal mapping and its applicati...,2012,-1,
1,1211.2575,A semantic cache for enhancing Web services co...,2012,68,
2,1211.2632,Sequential Voronoi diagram calculations using ...,2012,6,
3,1211.2636,A memory versus compression ratio trade-off in...,2012,72,
4,1211.267,A Mazing 2+eps Approximation for Unsplittable ...,2012,74,
...,...,...,...,...,...
8416,1912.13472,Revisiting Landscape Analysis in Deep Neural N...,2019,34,0.000000
8417,1912.13477,Interaction laws of monads and comonads We int...,2019,-1,61.285714
8418,1912.1348,On the Difference Between the Information Bott...,2019,-1,61.285714
8419,1912.13487,Winding number for arbitrary integer value in ...,2019,-1,61.285714


In [53]:
topic_model.visualize_topics_over_time(topics_over_time)

In [56]:
train_bert_df.to_csv("train_bert.csv", index=False)
test_bert_df.to_csv("test_bert.csv", index=False)