In [1]:
import numpy as np
import json
from bertopic import BERTopic
import openai
import tiktoken
import os
from bertopic.representation import OpenAI
from umap import UMAP

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open('image_embeddings.json') as f:
   embeddings = json.load(f)

In [3]:
with open('text_list.json', 'r') as f:
    text_list = json.load(f)

In [4]:
len(text_list)

102

In [12]:
len(embeddings[1])

2

In [13]:
x = np.array(embeddings)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (32, 2) + inhomogeneous part.

In [8]:
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(embeddings=embeddings, documents=text_list)

ValueError: Make sure to input embeddings as a numpy array or scipy.sparse.csr.csr_matrix. 

In [7]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,17,-1_the_is_to_and,"[the, is, to, and, it, from, of, we, this, lan...","[@neuralink Long-term, it is possible to shunt..."
1,0,45,0_the_of_to_and,"[the, of, to, and, on, is, in, we, it, that]",[Something that makes SWE a particularly feasi...
2,1,40,1_this_the_to_is,"[this, the, to, is, so, you, can, your, bengal...",[We're announcing TacticAI: an AI assistant ca...


In [9]:
prompt = """
I have topic that contains the following documents: \n[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the above information, can you give a short label of the topic?

The topics should have optinions based on the documents and keywords. Here are some examples of topics:
- Who Will the Bengals Draft?
- Earth on a Record Hot Streak
- Senate Authorizes Controversial Surveillance Program
- Jujutsu Kaisen -  Nah, I'd Pass
- Netflix's One Piece Review: A Not-Quite Grand Line
- Humane AI Pin Reveals its Fatal Flaw
"""

In [10]:
tokenizer= tiktoken.encoding_for_model("gpt-3.5-turbo")

client = openai.OpenAI(api_key=api_key)
representation_model = OpenAI(
    client,
    model="gpt-3.5-turbo", 
    delay_in_seconds=2, 
    chat=True,
    nr_docs=4,
    doc_length=100,
    tokenizer=tokenizer,
    prompt= prompt
)

In [11]:
topic_model = BERTopic(representation_model=representation_model, verbose=True)

In [12]:
topics, probs = topic_model.fit_transform(embeddings=x, documents=text_list)

2024-04-20 18:34:34,509 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-04-20 18:34:35,470 - BERTopic - Dimensionality - Completed ✓
2024-04-20 18:34:35,471 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-04-20 18:34:35,473 - BERTopic - Cluster - Completed ✓
2024-04-20 18:34:35,474 - BERTopic - Representation - Extracting topics from clusters using representation models.
100%|██████████| 3/3 [00:08<00:00,  2.75s/it]
2024-04-20 18:34:43,737 - BERTopic - Representation - Completed ✓


In [13]:
list(topic_model.get_topic_info()['Name'])

['-1_"Exploring Language, Thoughts, and Technology: From Stroke Recovery to Open-Source Robotics"',
 '0_- Open-Source Firmware Development for GPUs',
 '1_"Exploring the Impact of AI in Sports and Home Automation with 01 Developer Preview"']

## Visualization

In [14]:
topic_distr, topic_token_distr = topic_model.approximate_distribution(text_list, calculate_tokens=True)

df = topic_model.visualize_approximate_distribution(text_list[1], topic_token_distr[1])
df

100%|██████████| 1/1 [00:00<00:00, 66.15it/s]


Unnamed: 0,RT,JoeyB,Might,as,well,win,it,all,while,we,re,at,it.1
0_- Open-Source Firmware Development for GPUs,0.0,0.0,0.0,0.0,0.0,0.0,0.161,0.273,0.403,0.587,0.426,0.314,0.184
"1_""Exploring the Impact of AI in Sports and Home Automation with 01 Developer Preview""",0.141,0.262,0.383,0.529,0.519,0.51,0.494,0.347,0.217,0.206,0.101,0.101,0.101


In [21]:
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(x)
topic_model.visualize_documents(text_list, reduced_embeddings=reduced_embeddings)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed