In [46]:
pip install accelerate bertopic

[0mNote: you may need to restart the kernel to use updated packages.


In [None]:
from huggingface_hub import notebook_login

In [None]:
# Authenticate this notebook session with the Hugging Face Hub.
notebook_login()

In [1]:
from datasets import load_dataset

In [37]:
import torch 
import json
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm

In [3]:
# Load the "no-shorts-no-dups" configuration of the grants dataset from the Hub.
embedded_ds = load_dataset(
    path="mwarchalowski/grants",
    name="no-shorts-no-dups",
)

In [4]:
# Load the "labeled_subset" configuration of the same Hub dataset.
labeled_ds = load_dataset(
    path="mwarchalowski/grants",
    name="labeled_subset",
)

In [5]:
# Alias for the corpus used below; the train/test split is taken from `dataset`.
dataset = embedded_ds

In [6]:
# Sentence-embedding model handed to BERTopic as its embedding backend.
embedding_model = SentenceTransformer(
    model_name_or_path="mixedbread-ai/mxbai-embed-large-v1",
)

In [7]:
# Dimensionality reduction step: project embeddings down to 8 components
# using cosine distance; random_state is pinned for repeatable layouts.
umap_model = UMAP(
    metric="cosine",
    n_components=8,
    n_neighbors=10,
    min_dist=0.0,
    random_state=42,
)
umap_model

In [8]:
# Clustering step: clusters need at least 50 members; prediction_data=True
# is requested at construction time.
hdbscan_model = HDBSCAN(
    metric="euclidean",
    min_cluster_size=50,
    cluster_selection_method="eom",
    prediction_data=True,
)
hdbscan_model

In [9]:
# Bag-of-words step: unigrams and bigrams, English stop words removed,
# terms must appear in at least two documents.
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    stop_words="english",
)
vectorizer_model

In [10]:
# Assemble the BERTopic pipeline from the components configured above.
topic_model = BERTopic(
    # Hyperparameters
    verbose=True,
    top_n_words=30,
    # Pipeline models
    vectorizer_model=vectorizer_model,
    hdbscan_model=hdbscan_model,
    umap_model=umap_model,
    embedding_model=embedding_model,
)


In [11]:
# Hold out 10% of the rows. The seed is fixed so the same rows land in
# train/test on every run; without it the split (and therefore the fitted
# topics) changes each time the cell is executed.
splits = dataset["train"].train_test_split(test_size=0.1, seed=42)
splits

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'abstr', 'text_len', 'tensors'],
        num_rows: 114100
    })
    test: Dataset({
        features: ['id', 'title', 'abstr', 'text_len', 'tensors'],
        num_rows: 12678
    })
})

In [12]:
# Every "tensors" entry is a JSON string; parse each one into a NumPy array.
embeddings = [
    np.array(json.loads(serialized))
    for serialized in splits["train"]["tensors"]
]

In [13]:
# Collapse the list of per-document arrays into a single ndarray and show it.
embeddings = np.asarray(embeddings)
embeddings

array([[ 0.0584911 , -0.04004974,  0.36256891, ..., -0.23645927,
        -0.05368884,  0.59382135],
       [-0.48816812,  0.27352843, -0.33533052, ..., -0.22586484,
         0.30663136,  0.52032411],
       [-0.38498646,  0.19008571, -0.51001239, ...,  0.11207752,
         0.01410746,  0.01421699],
       ...,
       [-0.04967792,  0.61160779,  0.0434274 , ..., -0.21401489,
         0.57841116, -0.94967824],
       [ 0.12018114,  0.01216343, -0.17025027, ...,  0.03735605,
         0.42089903, -0.31298813],
       [-0.33989847,  0.09613819,  0.09444391, ..., -0.02291332,
        -0.38042527, -0.09628004]])

In [14]:
# Fit the topic model on the training abstracts, passing the precomputed
# embeddings alongside the documents.
topics, probs = topic_model.fit_transform(
    documents=splits["train"]["abstr"],
    embeddings=embeddings,
)

2024-04-09 05:19:52,981 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-04-09 05:22:17,040 - BERTopic - Dimensionality - Completed ✓
2024-04-09 05:22:17,044 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-04-09 05:22:30,278 - BERTopic - Cluster - Completed ✓
2024-04-09 05:22:30,310 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-04-09 05:24:29,071 - BERTopic - Representation - Completed ✓


In [15]:
# Lift the column-width limit so the full keyword lists are visible,
# then show size and keywords for the first 25 rows of the topic table.
pd.set_option("display.max_colwidth", None)
topic_model.get_topic_info().loc[:, ["Count", "Representation"]].iloc[:25]


Unnamed: 0,Count,Representation
0,56177,"[research, project, new, cancer, data, study, cells, high, based, development, using, use, cell, used, studies, time, develop, health, systems, patients, br, provide, different, human, work, understanding, breast, analysis, methods, specific]"
1,2366,"[software, wireless, network, br, security, systems, br gt, lt br, lt, gt, networks, code, design, performance, hardware, applications, communication, techniques, computing, distributed, communications, mobile, internet, data, requirements, algorithms, tools, users, services, service]"
2,2012,"[species, evolutionary, populations, ecological, evolution, genetic, variation, traits, selection, population, diversity, biodiversity, ecology, conservation, change, plant, habitat, reproductive, marine, environmental, climate, br, fish, speciation, natural, food, sexual, fitness, ecosystem, patterns]"
3,1878,"[alloys, materials, material, composite, mechanical, process, manufacturing, alloy, fatigue, composites, strength, properties, temperature, crack, corrosion, high, microstructure, grain, phase, steel, coating, thermal, metal, components, coatings, fracture, deformation, wear, ceramic, steels]"
4,1646,"[beta, diabetes, islet, beta cells, insulin, t1d, beta cell, cells, islets, type diabetes, cell, pancreatic, pancreas, type, nod, mice, immune, autoimmune, transplantation, insulin producing, mouse, glucose, human, diabetic, producing, nod mice, islet transplantation, blood, disease, function]"
5,1557,"[quantum, spin, magnetic, states, systems, physics, materials, topological, superconducting, atoms, quantum information, properties, superconductivity, devices, matter, entanglement, electronic, optical, quantum systems, state, superconductors, electron, qubits, field, classical, atomic, theoretical, temperature, theory, new]"
6,1167,"[br, lt br, br gt, lt, gt, theory, geometry, algebraic, algebras, spaces, geometric, equations, manifolds, groups, mathematics, algebra, differential, conjecture, problems, mathematical, lie, invariants, finite, dimensional, varieties, operators, group, topology, functions, space]"
7,1124,"[plant, plants, arabidopsis, genes, crop, gene, proteins, resistance, wheat, protein, genetic, auxin, root, molecular, seed, crops, mutants, breeding, growth, expression, cell, pathogen, fungal, stress, genome, barley, thaliana, regulation, species, rice]"
8,1115,"[bone, cartilage, tissue, oa, joint, healing, articular, collagen, fracture, periodontal, osteoporosis, cells, osteoarthritis, tooth, implant, mechanical, formation, tissues, dental, knee, articular cartilage, muscle, clinical, regeneration, bone formation, repair, ho, matrix, injuries, cell]"
9,1050,"[mantle, seismic, rocks, earth, crust, fault, magma, earthquake, tectonic, volcanic, plate, crustal, earthquakes, magmatic, subduction, deformation, deposits, continental, mineral, minerals, rock, geological, br, lt, gt, lt br, br gt, fluid, zone, evolution]"


In [34]:
from transformers import AutoTokenizer, pipeline

# Hub id of the chat model used to name topics; the tokenizer supplies the
# chat template and the pipeline performs the text generation.
evaluator = "HuggingFaceH4/zephyr-7b-beta"
tokenizer = AutoTokenizer.from_pretrained(evaluator)
pipe = pipeline(
    task="text-generation",
    model=evaluator,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]



In [35]:
def evaluate(words, max_new_tokens=256):
    """Ask the chat model for a category that fits a list of words.

    Builds a short conversation (system instruction, one worked example,
    then ``words`` as the final user turn), renders it with the module-level
    ``tokenizer``'s chat template and generates a completion with the
    module-level ``pipe``.

    Args:
        words: Text of the final user turn. The worked example uses a
            bracketed, comma-separated list such as ``"[apple, pear]"``.
        max_new_tokens: Upper bound on generated tokens, forwarded to
            ``pipe``. Defaults to 256, the previously hard-coded value.

    Returns:
        The ``"generated_text"`` field of the first result from ``pipe``.
        ``return_full_text=False`` is passed, so the prompt is not requested
        back. Sampling is enabled, so repeated calls may differ.
    """
    messages = [
        {
            "role": "system",
            "content": "You are a knowledgeable science expert. Given list of words, find category that the words fit into",
        },
        # One-shot example showing the expected input and answer format.
        {
            "role": "user",
            "content": "[apple, pear, carrot, potato, banana]",
        },
        {
            "role": "assistant",
            "content": "Fruits and Vegetables",
        },
        {
            "role": "user",
            "content": words,
        },
    ]
    # tokenize=False returns the rendered prompt as text; add_generation_prompt
    # appends the marker that cues the assistant's turn.
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    outputs = pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.3,
        top_k=50,
        top_p=0.95,
        return_full_text=False,
    )
    return outputs[0]["generated_text"]


In [None]:
# Ask the LLM for a category label for each of the first 20 topic keyword lists.
[
    evaluate("[" + ", ".join(keywords) + "]")
    for keywords in tqdm(topic_model.get_topic_info()["Representation"][:20])
]



  0%|          | 0/20 [00:00<?, ?it/s][A[A

  5%|▌         | 1/20 [00:11<03:45, 11.86s/it][A[A

 10%|█         | 2/20 [03:50<39:58, 133.23s/it][A[A

 15%|█▌        | 3/20 [06:57<44:43, 157.85s/it][A[A

 20%|██        | 4/20 [07:02<26:00, 97.56s/it] [A[A

In [41]:
evaluate("[synthesis, reactions, chiral, chemistry, reaction, compounds, synthetic, catalysts, complexes, metal, organic, catalysis, asymmetric, bond, catalytic, ligands, reactivity, molecules, ring, new, group, enantioselective, transition metal, chemical, catalyzed, natural products, products, bonds, organometallic, reagents]")

'Organic and Inorganic Chemistry, specifically focusing on Synthetic Chemistry, Catalysis, and Organometallic Chemistry.'