In [None]:
!pip install ctransformers[cuda]
!pip install --upgrade git+https://github.com/huggingface/transformers

In [10]:
!pip install einops

Collecting einops
  Downloading einops-0.7.0-py3-none-any.whl.metadata (13 kB)
Downloading einops-0.7.0-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.7.0


In [2]:
from ctransformers import AutoModelForCausalLM
from transformers import AutoTokenizer, pipeline

# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
model = AutoModelForCausalLM.from_pretrained(
    "TheBloke/zephyr-7B-alpha-GGUF",
    model_file="zephyr-7b-alpha.Q4_K_M.gguf",
    model_type="mistral",
    gpu_layers=0,
    hf=True
)
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-alpha")

# Pipeline
generator = pipeline(
    model=model, tokenizer=tokenizer,
    task='text-generation',
    max_new_tokens=50,
    repetition_penalty=1.1
)

  from .autonotebook import tqdm as notebook_tqdm
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 16644.06it/s]
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 15650.39it/s]


In [3]:
prompt = """<|system|>You are a helpful, respectful and honest assistant for labeling topics..</s>
<|user|>
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: '[KEYWORDS]'.

Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.</s>
<|assistant|>"""


In [4]:
from bertopic.representation import TextGeneration
from bertopic import BERTopic

# Text generation with Zephyr
zephyr = TextGeneration(generator, prompt=prompt)
representation_model = {"Zephyr": zephyr}


# Topic Modeling
topic_model = BERTopic(representation_model=representation_model, verbose=True)

In [5]:
from datasets import load_dataset
from transformers import BertTokenizer
import numpy as np


dataset = load_dataset("valurank/Topic_Classification")["train"]
# Extract abstracts to train on and corresponding titles
descriptions = dataset["article_text"]
topics = dataset["topic"]

In [6]:
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk

nltk.download('punkt')

sentences = [sent_tokenize(desc) if isinstance(desc, str) else [] for desc in descriptions[:10]]
sentences = [sentence for doc in sentences for sentence in doc]

[nltk_data] Downloading package punkt to /Users/ayushjain/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
sentences

['NEWYou can now listen to Fox News articles!',
 'The Mercedes-Benz S-Class has always been a special car.Before it started using that name in 1972, brand’s top model was known as the Sonderklasse, which is German for "Special Class," denoting its position as the flagship of the fleet.It’s been used as a showcase for the latest technologies including new engines, airbags, anti-lock brakes and traction control, and the newest "S" follows in that tradition.Not the redesigned S-Class that launched last year, but the EQS sedan that’s now in showrooms and is Mercedes-Benz’s first purpose-built electric car.',
 'The EQS is the first purpose-built electric car from Mercedes-Benz (Mercedes-Benz)The automaker has made other electric vehicles, but on platforms shared with internal combustion engine models.',
 'The EQS is the first built on a dedicated EV chassis that will spawn other lines in the years to come.The EQS starts at $103,360, and no one would call that cheap, but it is around $9,000 

In [8]:
topic_model.fit(sentences)

2024-03-07 16:46:59,671 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 10/10 [00:01<00:00,  7.98it/s]
2024-03-07 16:47:02,395 - BERTopic - Embedding - Completed ✓
2024-03-07 16:47:02,395 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-03-07 16:47:06,095 - BERTopic - Dimensionality - Completed ✓
2024-03-07 16:47:06,095 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-07 16:47:06,105 - BERTopic - Cluster - Completed ✓
2024-03-07 16:47:06,109 - BERTopic - Representation - Extracting topics from clusters using representation models.
 56%|█████▌    | 5/9 [01:40<01:26, 21.74s/it]Number of tokens (611) exceeded maximum context length (512).
Number of tokens (612) exceeded maximum context length (512).
Number of tokens (613) exceeded maximum context length (512).
Number of tokens (614) exceeded maximum context length (512).
Number of tokens (615) exceeded maximum context length (512).
Number of to

<bertopic._bertopic.BERTopic at 0x104b80af0>

In [11]:
topic_distr, _ = topic_model.approximate_distribution(sentences)

100%|██████████| 1/1 [00:00<00:00, 29.82it/s]


In [24]:
#saved topic_distr to a file 
import numpy as np

np.save('topic_distr.npy', topic_distr)

In [25]:
topic_distr = np.load('topic_distr.npy')

In [58]:
#it is of type data frame
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Zephyr,Representative_Docs
0,-1,1,-1_tennis_nothing_balls_sick,"[tennis, nothing, balls, sick, reacting, poles...","[\n""Tennis Ball Tension: Watching Reactions to...","[Call it tennis ball tension, because I am sic..."
1,0,65,0_the_of_and_to,"[the, of, and, to, jurassic, that, world, but,...","[\n""Jurassic World: Dominion - Critical Analys...","[“Jurassic World” wasn’t horrible, but it was ..."
2,1,49,1_news_you_to_the,"[news, you, to, the, fox, follow, on, here, th...","[\n""Fox News Listening & News Alert Signups"", ...","[NEWYou can now listen to Fox News articles!, ..."
3,2,48,2_the_of_stars_gaia,"[the, of, stars, gaia, asteroids, says, data, ...","[\n""Gaia's Asteroid and Star Data"", , , , , , ...","[On June 13, the mission extended that map int..."
4,3,35,3_was_the_angels_angeles,"[was, the, angels, angeles, he, fantasy, los, ...",[\nFantasy Football Controversy Involving LA A...,[The two players got into a pregame altercatio...


In [41]:
topic_model.visualize_distribution(topic_distr[1])

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed