In [51]:
from bertopic import BERTopic
import pandas as pd
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, PartOfSpeech

In [52]:
data = pd.read_csv("../preprocssed_data/cleaned_data_USA.csv")    
dataFrame = data[['newsTitle']]
strings_list = [item[0] for item in dataFrame.values.tolist()]

In [53]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(strings_list, show_progress_bar=True)

Batches: 100%|██████████| 94/94 [00:08<00:00, 10.93it/s]


In [54]:
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

In [55]:
hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

In [56]:
vectorizer_model = CountVectorizer(stop_words="english")

In [57]:

# KeyBERT
keybert_model = KeyBERTInspired()

# Part-of-Speech
pos_model = PartOfSpeech("en_core_web_sm")

# MMR
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# All representation models
representation_model = {
    "KeyBERT": keybert_model,
    "MMR": mmr_model,
    "POS": pos_model
}

In [58]:
topic_model = BERTopic(
# Pipeline models
embedding_model=embedding_model,
umap_model=umap_model,
hdbscan_model=hdbscan_model,
vectorizer_model=vectorizer_model,
representation_model=representation_model,
# Hyperparameters
top_n_words=10,
verbose=True
)

# Train model
topics, probs = topic_model.fit_transform(strings_list)

# Show topics
topic_model.get_topic_info()

2024-11-26 19:32:09,739 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 94/94 [00:09<00:00, 10.06it/s]
2024-11-26 19:32:19,124 - BERTopic - Embedding - Completed ✓
2024-11-26 19:32:19,125 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-26 19:32:30,344 - BERTopic - Dimensionality - Completed ✓
2024-11-26 19:32:30,345 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-26 19:32:30,440 - BERTopic - Cluster - Completed ✓
2024-11-26 19:32:30,443 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-26 19:32:38,970 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,POS,Representative_Docs
0,-1,968,-1_new_know_watch_says,"[new, know, watch, says, day, star, video, tim...","[2017, premiere, watch, channel, live, instagr...","[watch, star, 2017, birthday, season, jenner, ...","[new, star, video, time, years, people, birthd...",[Watch 2017 Rose Bowl Parade Live Online Start...
1,0,322,0_trump_donald_trumps_obama,"[trump, donald, trumps, obama, inauguration, s...","[donald, trump, ivanka, inauguration, trumps, ...","[trumps, obama, inauguration, snl, sean, secre...","[inauguration, president, secretary, security,...",[Are we really stuck with President Donald Tru...
2,1,213,1_nfl_super_patriots_bowl,"[nfl, super, patriots, bowl, draft, packers, c...","[nfl, falcons, packers, cowboys, seahawks, red...","[nfl, patriots, packers, cowboys, seahawks, ch...","[draft, playoff, loss, playoffs, score, team, ...",[2017 NFL Playoff Odds Atlanta Falcons Vs Gree...
3,2,179,2_nba_warriors_game_lebron,"[nba, warriors, game, lebron, cavaliers, cavs,...","[cavs, nba, lebron, nbas, cavaliers, warriors,...","[nba, warriors, lebron, cavaliers, cavs, trade...","[trade, points, playoffs, win, rest, rumors, t...","[NBA schedule, Draymond flattens LeBron as War..."
4,3,173,3_basketball_ncaa_tournament_bowl,"[basketball, ncaa, tournament, bowl, state, co...","[ncaa, clemson, uconn, louisville, ucla, gonza...","[ncaa, tournament, carolina, duke, gonzaga, us...","[basketball, tournament, bowl, state, college,...",[College Football Playoff Clemson vs Ohio Stat...
5,4,106,4_day_patricks_2017_easter,"[day, patricks, 2017, easter, st, friday, chri...","[holiday, christmas, walmart, st, friday, star...","[patricks, easter, st, friday, christmas, pizz...","[day, year, hours, deals, food, open, free, qu...","[A History Of St Patricks Day In Minnesota, Ne..."
6,5,95,5_barcelona_madrid_real_fc,"[barcelona, madrid, real, fc, league, vs, cup,...","[barcelona, barcelonas, atletico, madrid, sevi...","[madrid, fc, league, liga, champions, arsenal,...","[goals, hard, nonleague, scores, win, stats, w...",[La Liga Luis Suarez double keeps FC Barcelona...
7,6,93,6_dies_dead_death_star,"[dies, dead, death, star, age, 67, actor, died...","[deaths, died, dies, death, dead, actress, cel...","[dies, dead, 67, actor, died, deaths, george, ...","[dead, death, star, actor, pop, deaths, heart,...","[Bollywood Star Vinod Khanna Dies at 70, Inter..."
8,7,65,7_globes_golden_awards_oscars,"[globes, golden, awards, oscars, oscar, wins, ...","[emmy, awards, oscar, nominations, oscarsnubbe...","[awards, oscars, oscar, 2017, grammys, denzel,...","[best, winners, tribute, speeches, actor, carp...",[Sarah Paulson Wins Best Actress in a Limited ...
9,8,57,8_trailer_movie_batman_marvel,"[trailer, movie, batman, marvel, logan, rogue,...","[trailer, logan, homecoming, spiderman, premie...","[trailer, batman, marvel, logan, rogue, homeco...","[trailer, review, new, things, footage, creepy...","[Details of the Final Bloody Logan Trailer, Ro..."


In [59]:
df = pd.DataFrame({'topic':topics, 'documents':probs})
df

Unnamed: 0,topic,documents
0,4,0.915318
1,-1,0.000000
2,14,0.538238
3,1,1.000000
4,-1,0.000000
...,...,...
2982,12,1.000000
2983,2,1.000000
2984,-1,0.000000
2985,8,0.619622


In [60]:
topic_model.visualize_topics()