bertopic 

In [15]:
import re
import unicodedata
import pandas as pd
import os
import openai
import time

from bertopic import BERTopic
from hdbscan import HDBSCAN
from bertopic.representation import OpenAI


## Data Preprocessing

In [16]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Normalize unicode characters
    text = unicodedata.normalize("NFKC", text)

    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove special characters (optional)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    return text

def load_and_process(file):
  df = pd.read_csv(file, names=['id', 'entity', 'label', 'text'])
  df = df.drop_duplicates().dropna()
  df['text_processed'] = df['text'].apply(preprocess_text)

  return df

df = load_and_process("~/sentiment_analysis/archive/twitter_training.csv")
df.drop_duplicates(subset='id', keep='first', inplace=True)
df = df[df['label']=='Negative']
grouped_df = df.groupby('entity')
documents_grouped = {}
for entity, group in grouped_df: 
    documents_grouped[entity] = group['text_processed'].tolist()

## Topic Modeling

In [20]:
topic_models  = {}
for entity, documents in documents_grouped.items():
    '''
    client = openai.OpenAI()
    representation_model = OpenAI(
        client, 
        model="gpt-4o-mini", 
        chat=True, 
        nr_docs=4, 
        delay_in_seconds=30, 
        exponential_backoff=True
    )
    '''
    hdbscan_model = HDBSCAN(min_cluster_size=2)
    topic_model = BERTopic(
        #representation_model=representation_model, 
        hdbscan_model=hdbscan_model, 
        nr_topics=5
    )
    topic_model.fit(documents)
    topic_models[entity] = topic_model

## Topic Summary with Prompts Engineering

In [21]:
prompts = {}
for entity, topic_model in topic_models.items(): 
    representative_docs = topic_model.get_representative_docs()
    system_msg = '''
Your goal is to create concise, human-readable summaries of topics based on their representative documents, for the entity: {entity}. 
'''.format(
        entity=entity, 
    )
    user_msg = '''
Input Details: 
•	The input is a dictionary consists of topic number and representative documents
•	The representative documents are an array of strings 

Output Requirements: 
    1.	For each topic, provide:
        ◦	A title of 3-5 words summarizing the topic.
        ◦	A sentence explaining the topic based on its top words.
        ◦	Organize them into dictionary 
    2.  Organize the topic into array 
    3.  Output result of list as json 
Input: 
{docs}
'''.format(
        docs=str(representative_docs),
    )
    prompts[entity] = [system_msg, user_msg]

In [25]:
client = openai.OpenAI()
def get_model_response(messages, 
                       model = 'gpt-4o-mini', 
                       temperature = 0, 
                       max_tokens = 1000):
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature, 
        max_tokens=max_tokens, 
    )

    return response.choices[0].message.content

for entity, prompt in prompts.items(): 
    messages = [
        {'role': 'system', 'content': prompt[0]},
        {'role': 'user', 'content': prompt[1]},
    ]
    print(entity)
    summary = get_model_response(messages)
    print(summary)
    time.sleep(30)

Amazon
```json
{
    "topics": [
        {
            "title": "Customer Service Issues",
            "summary": "Customers express frustration over delivery failures and service problems with Amazon."
        },
        {
            "title": "Order Cancellations and Frustrations",
            "summary": "Users report dissatisfaction with order cancellations and difficulties in finding products on Amazon."
        },
        {
            "title": "Mask Shortages During Pandemic",
            "summary": "Consumers struggle to find face masks on Amazon, facing delays and price gouging."
        },
        {
            "title": "Criticism of Corporate Practices",
            "summary": "Critics highlight the ethical concerns surrounding Amazon's wealth and treatment of workers."
        }
    ]
}
```
ApexLegends
```json
{
    "topics": [
        {
            "title": "Game Issues and Complaints",
            "summary": "Players express frustration over game bugs and server issues in 

In [26]:
for entity, model in topic_models.items(): 
    print(entity) 
    print(model.get_topic_info())
    print(str(model.get_representative_docs()))
    print('----------------------------------------')

Amazon
   Topic  Count                         Name  \
0     -1     17         -1_amazon_and_not_is   
1      0     57           0_amazon_the_of_to   
2      1     10              1_to_the_are_on   
3      2      7  2_this_die_hilarious_regret   
4      3      5            3_as_the_you_fuck   

                                      Representation  \
0  [amazon, and, not, is, in, that, to, an, do, for]   
1   [amazon, the, of, to, is, my, but, for, it, and]   
2    [to, the, are, on, of, out, and, masks, it, if]   
3  [this, die, hilarious, regret, youll, it, fuck...   
4  [as, the, you, fuck, what, workers, your, are,...   

                                 Representative_Docs  
0  [jeffbezos must be feeling the pain of his div...  
1  [i fucking hate usps  yall deliverers never do...  
2  [if you were trying to find cloth face masks n...  
3  [this is hilarious ,  please dont fuck this up...  
4  [this is a crime against humanity  should be t...  
{-1: ['jeffbezos must be feeling the 