In [2]:
! pip install bertopic

Collecting bertopic
  Downloading bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloa

In [34]:
from bertopic import BERTopic
import pandas as pd
from bs4 import BeautifulSoup

import plotly.io as pio
# Make "colab" the default Plotly renderer for figures displayed by this notebook.
# NOTE(review): this renderer name targets Google Colab — confirm it is still
# appropriate if the notebook is run in a different front end.
pio.renderers.default = "colab"

def html_cleaner(text):
    """Return the text content of an HTML fragment with all markup removed.

    Parameters
    ----------
    text : str
        HTML (or plain text) to strip.

    Returns
    -------
    str
        The concatenated text of the parsed document.
    """
    # Name the parser explicitly. The generic 'html' feature lets
    # BeautifulSoup choose whichever HTML parser happens to be installed
    # (lxml, html5lib or html.parser), so the cleaned text could differ from
    # one environment to another. 'html.parser' ships with Python itself.
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

In [27]:

from sklearn.feature_extraction.text import CountVectorizer

# Load the articles table from the local Parquet file.
df = pd.read_parquet('filtered_cleaned_articles.parquet')

# Vectorizer used for the topic representations: unigrams and bigrams,
# English stop words removed, terms kept only if they occur in at least
# 5 documents and in at most 95% of them.
cv = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.95,
)

# calculate_probabilities=True asks BERTopic for per-topic probabilities
# for every document, not only the assigned topic.
topic_model = BERTopic(vectorizer_model=cv, calculate_probabilities=True)


In [28]:
# Plain-text version of every article body, in DataFrame row order.
corpus = [html_cleaner(body) for body in df['body']]

In [29]:
from pathlib import Path
import numpy as np

embeds_path = Path('embeddings.npy')

# Load the cached embeddings if the file exists; otherwise mark them missing.
embeddings = np.load(embeds_path) if embeds_path.exists() else None

# Recompute when there is no cache OR when the cache has a different number of
# rows than the current corpus (for example, the input data changed since the
# file was written). A row-count check cannot detect a corpus of the same size
# with different content; delete embeddings.npy by hand in that case.
if embeddings is None or len(embeddings) != len(corpus):
    # Imported lazily so the model library is only loaded when encoding is needed.
    from sentence_transformers import SentenceTransformer

    transformer = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = transformer.encode(corpus)
    np.save(embeds_path, embeddings)

In [32]:
# Fit the topic model on the cleaned corpus, reusing the precomputed
# embeddings instead of letting BERTopic encode the documents again.
topics, probs = topic_model.fit_transform(
    documents=corpus,
    embeddings=embeddings,
)

# Persist the fitted model. NOTE(review): the pickle format is tied to the
# library versions that wrote it and must only be loaded from trusted sources.
topic_model.save(
    path='my_model.pkl',
    serialization='pickle',
)



In [35]:
# Restore the fitted model from disk. The file is a pickle, so only load a
# file that this notebook (or another trusted source) produced.
topic_model = BERTopic.load(path='my_model.pkl')

# Show the per-topic summary table as the cell output.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,406,-1_data_companies_content_google,"[data, companies, content, google, users, gove...",[ 3.13pm BST Boeing's new chief executive Or...
1,0,132,0_jobs_workers_systems_humans,"[jobs, workers, systems, humans, think, techno...","[In the winter of 1958, a 30-year-old psycholo..."
2,1,109,1_trump_election_voters_biden,"[trump, election, voters, biden, political, ca...",[“What a bunch of malarkey.” Gail Huntley reco...
3,2,89,2_summit_safety_government_sunak,"[summit, safety, government, sunak, ai safety,...",[The most advanced technology companies will a...
4,3,63,3_bst_australia_updated_australian,"[bst, australia, updated, australian, police, ...",[\n 9.30am BST \nWhat we learned; Wednesday 12...
5,4,53,4_authors_copyright_content_openai,"[authors, copyright, content, openai, books, c...",[The US comedian and author Sarah Silverman is...
6,5,52,5_bst_growth_inflation_rate,"[bst, growth, inflation, rate, bank, rates, ma...","[ 5.42pm BST UK government seeking to ""work ..."
7,6,46,6_film_movie_films_movies,"[film, movie, films, movies, scifi, love, char...",[Jane Curtin and Harriet Sansom Harris are bes...
8,7,41,7_altman_openai_board_microsoft,"[altman, openai, board, microsoft, openais, sa...",[Microsoft has hired Sam Altman as head of a n...
9,8,39,8_sunak_party_labour_election,"[sunak, party, labour, election, minister, pri...",[Britain is facing some of the most dangerous ...


In [38]:
# Bar charts of the 10 highest-scoring terms per topic.
topic_model.visualize_barchart(
    height=400,
    n_words=10,
)