In [2]:
from datasets import load_dataset
from transformers import BertTokenizer
import numpy as np


dataset = load_dataset("valurank/Topic_Classification")["train"]
# Extract abstracts to train on and corresponding titles
descriptions = dataset["article_text"]
topics = dataset["topic"]

  from .autonotebook import tqdm as notebook_tqdm


In [60]:
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk

nltk.download('punkt')

sent_map = {}
sentences = []

for desc in descriptions[:50]:
    if isinstance(desc, str):
        for sentence in sent_tokenize(desc):
            sentences.append(sentence)
            sent_map[sentence] = desc

[nltk_data] Downloading package punkt to /Users/ayushjain/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [61]:
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech

# KeyBERT
keybert_model = KeyBERTInspired()

# # Part-of-Speech
# pos_model = PartOfSpeech("en_core_web_sm")

# MMR
mmr_model = MaximalMarginalRelevance(diversity=0.3)

In [62]:
representation_model = {
  "KeyBert": keybert_model,
  "MMR": mmr_model
}

In [63]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))

In [64]:
from sentence_transformers import SentenceTransformer

e_model = SentenceTransformer("avsolatorio/GIST-Embedding-v0")
embeddings = e_model.encode(sentences, show_progress_bar=True)

Batches: 100%|██████████| 39/39 [00:08<00:00,  4.41it/s]


In [65]:
from bertopic.representation import TextGeneration
from bertopic import BERTopic

topic_model = BERTopic(representation_model=representation_model, vectorizer_model=vectorizer_model, verbose=True)

In [66]:
topic_model.fit(sentences)

2024-03-14 01:11:31,619 - BERTopic - Embedding - Transforming documents to embeddings.
Batches:   5%|▌         | 2/39 [00:00<00:06,  5.48it/s]Error during conversion: ValueError('Queue is full! Please try again.')
Batches:  10%|█         | 4/39 [00:00<00:06,  5.69it/s]Exception in thread Thread-autoconversion:
Traceback (most recent call last):
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/threading.py", line 973, in _bootstrap_inner
    self.run()
  File "/Users/ayushjain/Development/DisruptionLab/jinship_sp2024/.venv/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 761, in run_closure
    _threading_Thread_run(self)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/threading.py", line 910, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/ayushjain/Development/DisruptionLab/jinship_sp2024/.venv/lib/python3.9/site-packages/transformers/safet

<bertopic._bertopic.BERTopic at 0x2c6d893a0>

In [67]:
topic_table = topic_model.get_topic_info()

In [38]:
system_prompt = """
<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant for labeling topics.
<</SYS>>
"""

In [39]:
example_prompt = """
I have a topic that contains the following documents:
- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
- Meat, but especially beef, is the word food in terms of emissions.
- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.

The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.

Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.
[/INST] Environmental impacts of eating meat
"""

In [40]:
main_prompt = """
[INST]
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: '[KEYWORDS]'.

Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.
[/INST]
"""

In [41]:
API_TOKEN = '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'

In [42]:
import requests
headers = {"Authorization": f"Bearer {API_TOKEN}"}
API_URL = "https://api-inference.huggingface.co/models/deepset/roberta-base-squad2"
def query(payload):
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()

In [43]:
data = query(
    {
        "inputs": {
            "question": "What's my name?",
            "context": "My name is Clara and I live in Berkeley.",
        }
    }
)

In [44]:
topic_table

Unnamed: 0,Topic,Count,Name,Representation,KeyBert,MMR,Representative_Docs
0,-1,3,-1_tennis_ball tension_tennis ball_tension,"[tennis, ball tension, tennis ball, tension, b...","[ball tension, tennis ball, tennis, tension, b...","[tennis, ball tension, tennis ball, tension, b...","[It’s called “Crimes of the Future,” and every..."
1,0,267,0_new_latest_news_way,"[new, latest, news, way, just, like, stars, ye...","[news, latest, stars, watching, fox news, fox,...","[new, latest, news, way, just, like, stars, ye...",[The new catalog quadruples the number of star...
2,1,28,1_car_used_charge_like,"[car, used, charge, like, fox news, fox, make,...","[car, efficiency, mph, charge, racing, cheap, ...","[car, used, charge, like, fox news, fox, make,...",[The Mercedes-Benz S-Class has always been a s...


In [52]:
topic_table['Representation'][2]

['car',
 'used',
 'charge',
 'like',
 'fox news',
 'fox',
 'make',
 '10',
 'far',
 'middle']

In [57]:
dota = query(
    {
        "inputs": {
            "question": "Based on the information about the topic, please create a short label of this topic.",
            "context": f"I have a topic that contains the following documents: {topic_table['Representative_Docs'][2]}. The topic is described by the following keywords: {topic_table['Representation'][2]}",
        }
    }
)

In [77]:
def make_label(docs, keywords):
  return query(
    {
        "inputs": {
            "question": "Based on the information about the topic, please create a completely original short label of this topic.",
            "context": f"I have a topic that contains the following documents: {docs}. The topic is described by the following keywords: {keywords}",
        }
    }
  )

In [78]:
label_names = []
for index, row in topic_table.iterrows():
  label = make_label(row['Representative_Docs'], row['KeyBert'])
  label_names.append(label['answer'])

In [79]:
label_names

["NEWYou can now listen to Fox News articles!'].",
 "['presume average",
 '["What To Watch For\\',
 '["',
 "['“",
 "released']",
 "['stars",
 "['celebrity",
 "['players transfer",
 "['",
 "['jurassic park",
 "['pete davidsons",
 "['“So I think I should get some dogs",
 "['nasa",
 "['neighbours",
 "['",
 "['For most of “Spiderhead",
 "['Getty Images for Disney Key Facts",
 "['klay thompson",
 "health']",
 'keywords',
 'blockbuster filmmaking',
 "['Topline\\",
 "['He arrived in Wales",
 'tries to work out what comes next',
 "['fox news",
 "['",
 'newsletter',
 "['",
 'Billy Gibbons',
 'nbcuniversal getty',
 "['beloved character",
 'Waiting for your permission to load the Instagram Media']