In [1]:
%%capture
!pip3 install torch==2.2.0

In [2]:
%%capture
!pip install bertopic datasets accelerate openai tiktoken

In [3]:
import pandas as pd
import numpy as np
import torch
import pickle
import openai
import bertopic
import tiktoken

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.representation import OpenAI

# 1. Load dataset

In [4]:
# Project root: the dataset is read from, and models are saved to, paths built from this prefix.
loading_dir = './drive/MyDrive/Topic Mining Project/LLM/'

In [5]:
# 1. load data
class Dataset(torch.utils.data.Dataset):
    """News-category corpus read from a JSON-lines file under ``loading_dir``.

    Indexing yields a ``(text, label, timestamp)`` triple, where ``text`` is the
    headline and the short description joined by ``' | '``. The index is handed
    straight to ``.iloc``, so an integer gives scalar values and a slice gives
    three pandas Series.
    """

    def __init__(self):
        """Read the whole JSON-lines file into ``self.raw_data``."""
        source_path = loading_dir + "data/News_Category_Dataset_v3.json"
        self.raw_data = pd.read_json(source_path, lines=True)

    def __len__(self):
        """Number of records in the loaded frame."""
        return self.raw_data.shape[0]

    def __getitem__(self, i):
        """Return ``(text, label, timestamp)`` for position(s) ``i``."""
        frame = self.raw_data
        headline = frame['headline'].iloc[i]
        summary = frame['short_description'].iloc[i]
        return headline + ' | ' + summary, frame['category'].iloc[i], frame['date'].iloc[i]


# Instantiating Dataset reads the whole JSON-lines file into memory.
print("loading data...")
dataset = Dataset()
# Last expression of the cell: the number of records loaded.
len(dataset)

loading data...


209527

In [6]:
# sample data: the texts of the first 20,000 records.
# A slice index makes Dataset.__getitem__ return pandas Series, so `documents`
# is a Series of strings rather than a plain list.
documents=dataset[:20000][0]

In [7]:
# Timestamps (element 2 of the returned triple) of the same first 20,000 records.
timestamp=dataset[:20000][2]

# 2. Load OpenAI API
- prompt
- tokenizer

In [8]:
# No custom prompt: None is handed to the OpenAI representation model as `prompt`
# (presumably BERTopic then uses its built-in default prompt — confirm).
prompt=None

In [9]:
import os
from getpass import getpass

# The API key is read from the environment (or typed in at run time) so that it is
# never stored in the notebook. A key that was ever written into a shared notebook
# must be treated as leaked and revoked.
api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

# Tokenizer matching the chat model; passed to the representation model below.
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")

# Create your representation model
client = openai.OpenAI(api_key=api_key)
openai_model = OpenAI(
    client,
    model="gpt-3.5-turbo",
    delay_in_seconds=1,  # pause between consecutive API requests
    prompt=prompt,
    chat=True,
    nr_docs=20,  # number of documents per topic sent to the API in one request
    doc_length=500,  # per-document length cap; presumably counted in tokens since a tiktoken tokenizer is given — confirm
    tokenizer=tokenizer
)

# 3. Other submodels

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
# Count vectorizer configured with English stop words; later passed to BERTopic as `vectorizer_model`.
vectorizer_model=CountVectorizer(stop_words="english")

In [11]:
from sentence_transformers import SentenceTransformer

# Pre-calculate embeddings once; they are reused for fitting the topic model
# and for the 2-D projection used in the plots.
embedding_model = SentenceTransformer("BAAI/bge-small-en")
# encode() is documented for a string or a list of strings and picks items by
# position, so hand it a plain list instead of relying on the pandas Series
# having a default 0..n-1 index.
embeddings = embedding_model.encode(list(documents), show_progress_bar=True)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

In [None]:
from umap import UMAP
# Dimensionality reduction handed to BERTopic: 5 output components, cosine metric,
# and a fixed random_state so the projection is repeatable between runs.
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

In [None]:
# Pre-reduce embeddings for visualization purposes: a separate 2-component UMAP
# is fitted on the full `embeddings` array, giving one 2-D point per document.
reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(embeddings)

In [None]:
from hdbscan import HDBSCAN
from sklearn.cluster import KMeans

# Density-based alternative. The BERTopic cell below is given `kmeans_model`,
# so this object is only kept for experimentation.
hdbscan_model = HDBSCAN(min_cluster_size=100, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Fixed number of clusters. random_state is set so the cluster assignment (and
# therefore the topics) is repeatable, matching the seeded UMAP models.
kmeans_model = KMeans(n_clusters=100, random_state=42)

# 4. Fit model

In [None]:
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance

# Keyword-style representation.
keybert = KeyBERTInspired()

# One entry per additional representation; each key shows up as a column of
# get_topic_info(). A MaximalMarginalRelevance(diversity=0.3) instance can be
# added under an "MMR" key to enable MMR as well.
representation_model = dict(KeyBERT=keybert, Openai=openai_model)

In [None]:
from bertopic import BERTopic

# Assemble the topic model from the sub-models defined in the cells above.
topic_model = BERTopic(

  # Sub-models
  vectorizer_model=vectorizer_model,
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=kmeans_model,  # the clustering slot is filled with the KMeans model, not HDBSCAN
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True,
  nr_topics=100, # target topic count; topics are only merged when clustering yields MORE than this (KMeans gives exactly 100, so the log shows "100 to 100")
)

# Train model on the sampled documents, reusing the pre-computed embeddings
topics, probs = topic_model.fit_transform(documents, embeddings)

2024-03-04 12:00:09,724 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-03-04 12:00:31,417 - BERTopic - Dimensionality - Completed ✓
2024-03-04 12:00:31,419 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-04 12:00:32,977 - BERTopic - Cluster - Completed ✓
2024-03-04 12:00:32,978 - BERTopic - Representation - Extracting topics from clusters using representation models.
100%|██████████| 100/100 [03:20<00:00,  2.01s/it]
2024-03-04 12:03:58,396 - BERTopic - Representation - Completed ✓
2024-03-04 12:03:58,397 - BERTopic - Topic reduction - Reducing number of topics
2024-03-04 12:03:58,402 - BERTopic - Topic reduction - Reduced number of topics from 100 to 100


In [None]:
# Show topics: one row per topic with its size, name and each configured representation
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,Openai,Representative_Docs
0,0,92,0_capitol_riot_jan_insurrection,"[capitol, riot, jan, insurrection, attack, rio...","[insurrectionists, rioters, insurrection, capi...",[Capitol Riot and Insurrection Aftermath],[Capitol Police Fence Off Capitol Ahead of Bid...
1,1,127,1_olympic_olympics_tennis_gold,"[olympic, olympics, tennis, gold, medal, tokyo...","[olympics, olympic, athlete, olympian, athlete...",[Olympic Updates and Controversies from Tokyo ...,[Mikaela Shiffrin Sets Fastest Time In Downhil...
2,2,187,2_police_officer_floyd_officers,"[police, officer, floyd, officers, man, trial,...","[killed, manslaughter, prosecutor, accused, ar...",[Police Officer Misconduct and Civil Rights Vi...,[Lawyer For Derek Chauvin Claims Settlement Pa...
3,3,202,3_pandemic_coronavirus_covid_19,"[pandemic, coronavirus, covid, 19, face, mask,...","[masks, pandemic, coronavirus, distancing, cov...",[Coping with COVID-19 Pandemic and Face Mask A...,[How To Clean A Fabric Coronavirus Face Mask A...
4,4,137,4_netflix_watch_movies_week,"[netflix, watch, movies, week, popular, movie,...","[netflix, movies, hbo, movie, debuts, comedies...",[Netflix's Popular Movie Releases and Trending...,[What To Watch On Netflix That’s New This Week...
...,...,...,...,...,...,...,...
95,95,56,95_funniest_tweets_parents_week,"[funniest, tweets, parents, week, kids, aug, j...","[tweets, funniest, parents, parenting, funny, ...",[Funniest Parent Tweets of the Week on Kids an...,[The Funniest Tweets From Parents This Week (A...
96,96,117,96_voting_democrats_rights_voter,"[voting, democrats, rights, voter, census, vot...","[democrats, voting, filibuster, gerrymandering...",[Voting Rights and Democrats' Legislative Effo...,[Obama: Senate Should Eliminate Filibuster To ...
97,97,83,97_workers_starbucks_tax_union,"[workers, starbucks, tax, union, pay, cuts, am...","[union, workers, starbucks, unionize, labor, s...",[Labor and Corporate Actions in the Face of Ec...,[Another Starbucks Store Votes To Unionize | T...
98,98,15,98_opioid_purdue_pharma_oxycontin,"[opioid, purdue, pharma, oxycontin, walgreens,...","[pharmacies, pharma, oxycontin, walgreens, opi...",[Opioid Crisis and Legal Actions Against Purdu...,[OxyContin Maker Purdue Pharma Files For Bankr...


# 5. Visualization

In [None]:
# Plot the documents the model was fitted on. `documents` lines up one-to-one with
# the fitted topics and with the rows of `reduced_embeddings`; passing a shorter
# slice of the dataset leaves half of the points without their document text.
topic_model.visualize_documents(list(documents), reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Topic frequency over time, with the timestamps grouped into 20 bins.
topics_over_time = topic_model.topics_over_time(documents, timestamp, nr_bins=20)
# Only the 10 top topics are drawn.
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)

In [None]:
# The class labels must line up one-to-one with `documents`, so take exactly as
# many labels as there are documents instead of a hard-coded, shorter slice.
topics_per_class = topic_model.topics_per_class(documents, classes=dataset[:len(documents)][1])
topic_model.visualize_topics_per_class(topics_per_class, top_n_topics=10)

29it [00:02, 11.26it/s]


In [None]:
# Approximate topic distribution for every document; the second return value is discarded here.
topic_distr, _ = topic_model.approximate_distribution(documents)
# Show the distribution stored at index 10 (presumably the document at position 10 — confirm).
topic_model.visualize_distribution(topic_distr[10])

100%|██████████| 10/10 [00:06<00:00,  1.63it/s]


In [None]:
# Calculate the topic distributions on a token-level
# (this recomputes `topic_distr` as well, overwriting the value from the previous cell)
topic_distr, topic_token_distr = topic_model.approximate_distribution(documents, calculate_tokens=True)

# Visualize the token-level distributions for the document at index 10
# NOTE(review): the returned object is presumably a styled table rather than a plain DataFrame — confirm before treating `df` as one
df = topic_model.visualize_approximate_distribution(documents[10], topic_token_distr[10])
df

100%|██████████| 10/10 [00:05<00:00,  1.71it/s]


Unnamed: 0,World,Cup,Captains,Want,To,Wear,Rainbow,Armbands,In,Qatar,FIFA,has,come,under,pressure,from,several,European,soccer,federations,who,want,to,support,human,rights,campaign,against,discrimination,at,the,World.1,Cup.1
50_nfl_quarterback_bowl_super,0.106,0.106,0.106,0.106,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.118,0.118,0.118,0.118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.163,0.163,0.163,0.163
58_democratic_primary_campaign_presidential,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.101,0.101,0.101,0.101,0.0,0.0,0.0,0.0,0.0,0.0
63_space_cave_nasa_moon,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.106,0.106,0.106,0.106,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
93_saudi_arabia_khashoggi_jamal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111,0.22,0.337,0.337,0.227,0.118,0.0,0.0,0.0,0.0,0.0
96_voting_democrats_rights_voter,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.103,0.103,0.207,0.207,0.104,0.104,0.0,0.0,0.0,0.0,0.0


In [None]:
from scipy.cluster import hierarchy as sch


def linkage_function(x):
    """Single-linkage clustering of `x` with optimally ordered leaves."""
    return sch.linkage(x, 'single', optimal_ordering=True)


# Hierarchical topics, built with the custom linkage and then drawn as a hierarchy plot
hierarchical_topics = topic_model.hierarchical_topics(documents, linkage_function=linkage_function)
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [None]:
# Identifier of the embedding model, stored in the saved model as a pointer so it
# can be fetched again at load time. It must be the id the SentenceTransformer was
# created with ("BAAI/bge-small-en"); prefixing it with "sentence-transformers/"
# produces an id that does not exist.
# A separate name is used so the `embedding_model` object is not replaced by a
# string, which would break re-running the embedding cell.
embedding_model_name = "BAAI/bge-small-en"
topic_model.save(loading_dir+"GPTopenai", serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model_name)

In [14]:
# Load a previously saved model from the project folder.
# NOTE(review): this reads "saved_model", not the "GPTopenai" folder written by the save cell — confirm which model is intended.
loaded_model = BERTopic.load(loading_dir+"saved_model")

In [13]:
# Topic overview of the loaded model (one row per topic).
loaded_model.get_topic_info()


Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,Llama2,Representative_Docs
0,0,926,0_autism_child_autistic_son,"[autism, child, autistic, son, syndrome, disab...","[autism, asperger, autistic, asd, parenting, t...","[Autism, , , , , , , , , ]",
1,1,2196,1_trump_comey_fbi_russia,"[trump, comey, fbi, russia, mueller, probe, ho...","[comey, trump, reportedly, bannon, wikileaks, ...","[Russia Investigation, , , , , , , , , ]",
2,2,3209,2_music_song_songs_singer,"[music, song, songs, singer, band, rapper, alb...","[music, song, musician, songs, spotify, album,...","[Music, , , , , , , , , ]",
3,3,2127,3_travel_vacation_traveling_trip,"[travel, vacation, traveling, trip, road, trav...","[traveling, travel, travelers, travels, travel...","[Travel, , , , , , , , , ]",
4,4,1437,4_abortion_court_parenthood_planned,"[abortion, court, parenthood, planned, supreme...","[abortion, abortions, scotus, republicans, jus...","[Reproductive Rights and Access to Abortion, ,...",
...,...,...,...,...,...,...,...
95,95,3123,95_cancer_health_mental_care,"[cancer, health, mental, care, patients, illne...","[cancer, health, care, medicine, patients, che...","[Cancer Care and Treatment, , , , , , , , , ]",
96,96,2411,96_church_confederate_charlottesville_god,"[church, confederate, charlottesville, god, je...","[charlottesville, christians, confederate, chr...","[Religion and Social Justice, , , , , , , , , ]",
97,97,1918,97_netflix_thrones_season_game,"[netflix, thrones, season, game, walking, trai...","[hbo, thrones, spoilers, tv, spoiler, episodes...","[Game of Thrones, , , , , , , , , ]",
98,98,1845,98_cruz_rubio_ted_marco,"[cruz, rubio, ted, marco, jeb, carson, bush, c...","[rubio, cruz, republicans, nonpartisan, gop, t...","[GOP Primary Race, , , , , , , , , ]",
