In [None]:
!wget -O cleaned.json https://www.dropbox.com/scl/fi/mvxh9f53z9w6zhmcb7a2g/cleaned.json?rlkey=lie5gmox0679q6q8ecqyvrxlb&dl=1

In [None]:
!wget -O scrap_data.json https://www.dropbox.com/scl/fi/8ammieu3kins0nqf0f2q8/scrap_data.json?rlkey=xfxwqcljaqfa3ip6snw0f32rg&dl=1

In [None]:
!pip install bertopic pandas matplotlib

In [2]:
import pandas as pd

In [3]:
# Read the downloaded "cleaned.json" file into a DataFrame.
df = pd.read_json("cleaned.json")

In [4]:
# Read the downloaded "scrap_data.json" file into a DataFrame.
df_scraped = pd.read_json("scrap_data.json")

In [5]:
# Pull the first "<day> <Mon> <year>" date out of each `date_delivered`
# text. `str.extract` returns only the first match per row, keeps the
# original index, and leaves NaN in rows where nothing matches.
# Unnamed groups produce columns 0 (day), 1 (month abbreviation), 2 (year).
date_parts = df_scraped["date_delivered"].str.extract(
    r"(\d{1,2}) (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) (\d{4})"
)

# Month abbreviation -> calendar month number (Jan=1 ... Dec=12).
month_numbers = {
    "Jan": 1, "Feb": 2, "Mar": 3, "Apr": 4, "May": 5, "Jun": 6,
    "Jul": 7, "Aug": 8, "Sep": 9, "Oct": 10, "Nov": 11, "Dec": 12,
}

df_scraped_dates = pd.DataFrame(
    {
        # Regex captures are text; convert the year to a number so that
        # `year` and `month` are both numeric instead of str vs. int.
        "year": pd.to_numeric(date_parts[2]),
        "month": date_parts[1].map(month_numbers),
    }
)

# Assignment aligns on the index; rows without a recognizable date get NaN
# in both columns.
df_scraped[["year", "month"]] = df_scraped_dates

In [6]:
# Bring the scraped records in line with the other frame before stacking:
# the raw date text is dropped and `citation_title` is renamed to `title`.
scraped_aligned = df_scraped.drop(columns="date_delivered").rename(
    columns={"citation_title": "title"}
)

# Stack both sources under a fresh 0..n-1 index and add the `data` column:
# title and abstract joined by a newline.
df_all = pd.concat([df, scraped_aligned], ignore_index=True).assign(
    data=lambda frame: frame["title"] + "\n" + frame["abstract"]
)

In [7]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Identifier of the sentence-embedding model; kept as a plain string because
# it is passed again when the topic model is pushed to the Hub.
embedding_model = "Alibaba-NLP/gte-large-en-v1.5"

# NOTE(review): trust_remote_code=True presumably allows code shipped with
# the model repository to run locally — confirm the repository is trusted.
sentence_model = SentenceTransformer(
    embedding_model,
    trust_remote_code=True,
)

# Disabled alternative (cuML implementations of UMAP / HDBSCAN):
#   from cuml.cluster import HDBSCAN
#   from cuml.manifold import UMAP
#   umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0)
#   hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True, prediction_data=True)
# To enable, also add umap_model=umap_model and hdbscan_model=hdbscan_model
# to the options below.
bertopic_options = {
    "embedding_model": sentence_model,
    "verbose": True,
    "calculate_probabilities": True,
}
topic_model = BERTopic(**bertopic_options)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Encode every document once, in batches of 32, with a progress bar.
# The texts are handed over as a plain list of strings (the input type the
# encoder documents) instead of a pandas Series, so the call does not depend
# on the Series' index labels happening to equal positions.
embeddings = sentence_model.encode(
    df_all["data"].tolist(),
    batch_size=32,
    show_progress_bar=True,
)

In [None]:
import numpy as np

# Write the computed embeddings to "embeddings.npy" on disk.
np.save("embeddings.npy", embeddings)

In [None]:
# Fit the topic model on the documents, reusing the precomputed embeddings.
# The documents are passed as a list of strings, and the returned
# (topics, probabilities) pair is kept instead of being thrown away —
# probabilities were explicitly requested via calculate_probabilities=True.
topics, probs = topic_model.fit_transform(df_all["data"].tolist(), embeddings)

In [None]:
from huggingface_hub import login

# Authenticate this session with the Hugging Face Hub before the model
# upload that follows. No token is passed in code, so no credential is
# stored in the notebook source.
# NOTE(review): presumably this prompts interactively for an access token —
# confirm that works in the target (possibly headless) environment.
login()

In [None]:
# Upload the topic model to the Hugging Face Hub repository "panda".
# The embedding model is passed as its identifier string (`embedding_model`),
# not as the loaded SentenceTransformer object.
hub_upload_options = {
    "repo_id": "panda",
    "save_ctfidf": True,
    "serialization": "safetensors",
    "save_embedding_model": embedding_model,
}
topic_model.push_to_hf_hub(**hub_upload_options)