In [1]:
from bertopic import BERTopic
import pandas as pd
import json
import numpy as np


In [3]:
data = "arxiv-metadata-oai-snapshot.json"

def get_metadata():
    with open(data, 'r') as f:
        for line in f:
            yield line


In [4]:
metadata = get_metadata()

In [8]:
metadata = get_metadata()
ids = []
titles = []
abstracts = []
categories = []

for paper in metadata:
    metaDict = json.loads(paper)
    try:
        try:
            year = int(metaDict['journal-ref'][-4:])    ### Example Format: "Phys.Rev.D76:013009,2007"
        except:
            year = int(metaDict['journal-ref'][-5:-1])    ### Example Format: "Phys.Rev.D76:013009,(2007)"
        if(year == 2020 or year == 2019 or year == 2018):
            ids.append(metaDict['id'])
            titles.append(metaDict['title'])
            abstracts.append(metaDict['abstract'])
            categories.append(metaDict['categories'])
    except:
        pass

In [9]:
df = pd.DataFrame({'id':ids,'title':titles,'abstract':abstracts,'categories':categories})

print(len(df))

90989


In [10]:
df.head()

Unnamed: 0,id,title,abstract,categories
0,708.007,Bohmian Mechanics at Space-Time Singularities....,We develop an extension of Bohmian mechanics...,quant-ph
1,709.1457,What happens to geometric phase when spin-orbi...,Spin-orbit interaction lifts accidental band...,cond-mat.other
2,710.1849,Regularity of solutions of the isoperimetric p...,In this work we consider a question in the c...,math.DG math.AP math.MG
3,712.1975,Reentrant spin glass transition in LuFe2O4,We have carried out a comprehensive investig...,cond-mat.str-el cond-mat.mtrl-sci
4,804.3104,"Teichm\""uller Structures and Dual Geometric Gi...",The Gibbs measure theory for smooth potentia...,math.DS math.CV


In [11]:
cat_list = df['categories'].unique()
print(*cat_list,sep = "\n")

quant-ph
cond-mat.other
math.DG math.AP math.MG
cond-mat.str-el cond-mat.mtrl-sci
math.DS math.CV
physics.gen-ph
math.NT
nucl-th
physics.atom-ph
cond-mat.stat-mech
gr-qc
cs.MA cs.AI q-bio.NC
math-ph math.MP nlin.SI quant-ph
physics.flu-dyn math.NA physics.comp-ph
physics.data-an physics.hist-ph physics.pop-ph
astro-ph.CO astro-ph.HE
q-bio.NC q-bio.QM
math-ph math.MP
hep-th astro-ph.CO gr-qc
physics.comp-ph
math.DG
quant-ph hep-th math-ph math.MP
astro-ph.IM astro-ph.EP
cond-mat.str-el cond-mat.mes-hall
cond-mat.stat-mech physics.atom-ph quant-ph
math.NT math.AG
math.AG math.KT
cond-mat.dis-nn cs.DM math.CO
math.DS
astro-ph.IM astro-ph.CO
astro-ph.IM astro-ph.CO cs.IT math.IT
cond-mat.supr-con
math.NT math.GM
cond-mat.stat-mech cond-mat.dis-nn quant-ph
hep-ph hep-lat hep-th
physics.gen-ph gr-qc
cond-mat.mes-hall
math.PR q-bio.QM stat.AP stat.ML
astro-ph.HE astro-ph.SR
stat.AP math.ST stat.TH
math.MG
physics.plasm-ph
math.PR
eess.SY cs.SY
physics.soc-ph cond-mat.dis-nn cs.SI
math.QA
cond

In [12]:
ml_df = df[df['categories'].str.contains("cs.")]

sentenceList = ml_df["abstract"].tolist()

In [13]:
print(len(ml_df))

27671


In [14]:
print(sentenceList[0])

  In this paper we leave the neighborhood of the singularity at the origin and
turn to the singularity at the horizon. Using nonlinear superdistributional
geometry and supergeneralized functions it seems possible to show that the
horizon singularity is not only a coordinate singularity without leaving
Schwarzschild coordinates. However the Tolman formula for the total energy $E$
of a static and asymptotically flat spacetime,gives $E=mc^2$, as it should be.
New class Colombeau solutions to Einstein field equations is obtained.New class
Colombeau solutions to Einstein field equations is obtained. The vacuum energy
density of free scalar quantum field ${\Phi}$ with a distributional background
spacetime also is considered.It has been widely believed that, except in very
extreme situations, the influence of acceleration on quantum fields should
amount to just small, sub-dominant contributions. Here we argue that this
belief is wrong by showing that in a Rindler distributional background
spa

In [15]:
samplesentenceList = sentenceList[1:2500]

In [16]:
topic_model = BERTopic(calculate_probabilities=True)
topics,prob = topic_model.fit_transform(samplesentenceList)

In [18]:
topic_model.get_topic_info(1)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,1,69,1_cell_cells_of_the,"[cell, cells, of, the, and, in, model, we, to,...",[ Bacterial colonies are abundant on living a...


In [19]:
topic_model.get_topic(1)

[('cell', 0.020806289095481927),
 ('cells', 0.018060471187736054),
 ('of', 0.01753105853595493),
 ('the', 0.015408679077212531),
 ('and', 0.013698921609343164),
 ('in', 0.013440170748293184),
 ('model', 0.012723845729518572),
 ('we', 0.01271308602938116),
 ('to', 0.01259673793780043),
 ('that', 0.011972756568916153)]

In [20]:
topic_model.visualize_topics()

In [22]:
topic_model.visualize_heatmap()

In [26]:
topic_model.visualize_hierarchy()

In [30]:
from sklearn.decomposition import PCA 
from sklearn.cluster import KMeans 


In [31]:
dim_model = PCA(n_components=5)
cluster_model = KMeans(n_clusters=50)

topic_model = BERTopic(umap_model=dim_model,embedding_model="allenai-specter",
                       hdbscan_model=cluster_model,calculate_probabilities=True)

In [32]:
topics,probabilities = topic_model.fit_transform(samplesentenceList)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.77k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/622 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/331 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/462k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [35]:
topic_model.get_topic_info(1)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,1,90,1_graphs_we_of_for,"[graphs, we, of, for, problem, that, graph, th...",[ Here we study the NP-complete $K$-SAT probl...


In [37]:
topic_model.visualize_barchart()