In [None]:
! pip install bertopic embedding-atlas

Collecting bertopic
  Downloading bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
Collecting embedding-atlas
  Downloading embedding_atlas-0.8.0-py3-none-any.whl.metadata (2.7 kB)
Collecting fastparquet>=2024.0.0 (from embedding-atlas)
  Downloading fastparquet-2024.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting inquirer>=3.0.0 (from embedding-atlas)
  Downloading inquirer-3.4.1-py3-none-any.whl.metadata (6.8 kB)
Collecting uvloop>=0.21.0 (from embedding-atlas)
  Downloading uvloop-0.21.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Collecting blessed>=1.19.0 (from inquirer>=3.0.0->embedding-atlas)
  Downloading blessed-1.21.0-py2.py3-none-any.whl.metadata (13 kB)
Collecting editor>=1.6.0 (from inquirer>=3.0.0->embedding-atlas)
  Downloading editor-1.6.6-py3-none-any.whl.metadata (2.3 kB)
Collecting readchar>=4.2.0 (from inquirer>=3.0.0->embedding-atlas)
  Downloading readchar-4.2.1-py3-none-any.whl.metadat

In [None]:
from bertopic import BERTopic
import pandas as pd
from bs4 import BeautifulSoup

import plotly.io as pio
# Make "colab" the default Plotly renderer for every figure shown in this notebook.
pio.renderers.default = "colab"

def html_cleaner(text):
    """Strip HTML markup from ``text`` and return only the text content.

    Parameters
    ----------
    text : str or None
        Raw HTML fragment. Missing values (``None`` or a float NaN, which is
        how pandas represents an absent string) are treated as empty.

    Returns
    -------
    str
        The concatenated text of the parsed document, or ``""`` for a
        missing value.
    """
    # Missing bodies cannot be parsed; map them to an empty document instead
    # of letting one bad row abort a whole ``Series.apply``.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    # Name the parser explicitly: the generic "html" feature lets bs4 pick
    # whichever HTML parser happens to be installed, so the extracted text
    # could differ between environments. "html.parser" ships with Python.
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

In [None]:

from sklearn.feature_extraction.text import CountVectorizer

# Load the pre-filtered article table from disk.
df = pd.read_parquet('filtered_cleaned_articles.parquet')

# Term-counting settings handed to BERTopic further down: English stop words
# removed, unigrams and bigrams, terms kept only if they occur in at least
# 5 documents and in at most 95% of them.
cv = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.95,
)


In [None]:
# Strip the HTML from every article body. The plain text is kept both as a
# column on the frame and as a plain list for the embedding / topic steps.
df['corpus'] = df['body'].apply(html_cleaner)
corpus = df['corpus'].tolist()

In [None]:
from pathlib import Path
import numpy as np

embeds_path = Path('embeddings.npy')

# Reuse cached embeddings only if they still line up with the current corpus.
# A cache written for a different version of the data would otherwise pair
# documents with the wrong vectors (or fail later inside the topic model).
embeddings = None
if embeds_path.exists():
    embeddings = np.load(embeds_path)
    if len(embeddings) != len(corpus):
        print(
            f"Ignoring stale cache {embeds_path}: {len(embeddings)} rows "
            f"for {len(corpus)} documents; recomputing."
        )
        embeddings = None

if embeddings is None:
    # Imported lazily so the heavy dependency is only loaded on a cache miss.
    from sentence_transformers import SentenceTransformer

    transformer = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = transformer.encode(corpus)
    np.save(embeds_path, embeddings)

In [None]:
model_path = Path('my_model.pkl')

# Fit once and cache the model on disk; later runs just reload it.
# NOTE(review): the cache is a pickle, and unpickling executes arbitrary code,
# so only load a 'my_model.pkl' that this notebook produced itself.
if not model_path.exists():
    topic_model = BERTopic(vectorizer_model=cv, calculate_probabilities=True)
    # The precomputed embeddings are passed in so BERTopic does not re-embed.
    topic_model.fit_transform(corpus, embeddings=embeddings)
    topic_model.save(model_path, serialization='pickle')
else:
    topic_model = BERTopic.load(model_path)

In [None]:
from embedding_atlas.widget import EmbeddingAtlasWidget


In [None]:
# Look up the label for each document's assigned topic id and store it per row.
df['topic'] = [
    topic_model.topic_labels_[topic_id]
    for topic_id in topic_model.topics_
]

In [None]:
from embedding_atlas.projection import compute_text_projection

# Project the cleaned text to 2-D. The x / y / neighbors arguments name the
# columns that the widget below reads back from `df`.
compute_text_projection(
    df,
    text="corpus",
    x="projection_x",
    y="projection_y",
    neighbors="neighbors",
)

In [None]:
# Interactive atlas over the article frame, driven by the projection columns.
widget = EmbeddingAtlasWidget(
    df,
    text="corpus",
    x="projection_x",
    y="projection_y",
    neighbors="neighbors",
)
widget

EmbeddingAtlasWidget()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups

# Newsgroups used for the TF-IDF / PCA demo.
cats = ('alt.atheism', 'sci.space', 'sci.electronics', 'soc.religion.christian')

# Headers, footers and quoted replies are stripped so only message bodies remain.
ng_data = fetch_20newsgroups(
    subset='all',
    categories=cats,
    remove=('headers', 'footers', 'quotes'),
)

# Integer target -> newsgroup name.
label_lookup = dict(enumerate(ng_data.target_names))

# NOTE(review): this rebinds `df`, replacing the article frame used above.
df = pd.DataFrame({'text': ng_data['data']})
df['label'] = ng_data.target
df['label_text'] = df['label'].map(label_lookup)
df







Unnamed: 0,text,label,label_text
0,\nDo you know of the world-wide-web? This is ...,2,sci.space
1,,2,sci.space
2,A related question (which I haven't given that...,2,sci.space
3,\nRobin Lane Fox is a historian and a gardener...,3,soc.religion.christian
4,\nHello net. I have a 386sx motherboard with ...,1,sci.electronics
...,...,...,...
3762,There is a nice little tool in Lucid emacs. It...,2,sci.space
3763,\nPerhaps we have different definitions of abs...,3,soc.religion.christian
3764,\n\n\nNo. I estimate a 99 % probability the Ge...,2,sci.space
3765,\n\nIf you want parallels the best source is p...,3,soc.religion.christian


In [None]:
from sklearn.decomposition import PCA
# TF-IDF over unigrams and bigrams, with the same frequency cut-offs as the
# CountVectorizer used for the topic model.
tfidf = TfidfVectorizer(min_df=5, max_df=0.95, stop_words='english', ngram_range=(1, 2))

X = tfidf.fit_transform(df['text'])

# Reduce to two components for plotting. `random_state` is fixed because the
# 'arpack' and 'randomized' PCA solvers are seeded from it; pinning it keeps
# the 2-D layout stable across reruns whenever one of them is selected.
pca = PCA(n_components=2, random_state=0)
components = pca.fit_transform(X)

In [67]:
# Store the two PCA components as plot coordinates.
df['pca_x'] = components[:, 0]
df['pca_y'] = components[:, 1]

# No `neighbors=` here: this newsgroups frame only has text / label / PCA
# columns. A "neighbors" column was only ever produced for the earlier article
# frame, so passing that name would point the widget at a missing column.
widget = EmbeddingAtlasWidget(df, text="text", x="pca_x", y="pca_y")
widget

EmbeddingAtlasWidget()

In [None]:
# Re-display the widget. The original cell ended in a dangling "." (an
# unfinished attribute access), which is a SyntaxError on Run All.
widget

EmbeddingAtlasWidget()

In [None]:
# Retrieve whatever is currently selected in the widget (the saved output is a
# table of rows, so presumably a DataFrame - confirm against the library docs).
# NOTE(review): the result depends on manual interaction with the widget, so it
# is not reproducible from a fresh Restart & Run All.
widget.selection()

Unnamed: 0,id,type,sectionId,sectionName,webPublicationDate,webTitle,webUrl,apiUrl,tags,isHosted,...,peak,x,y,topic,corpus,projection_x,projection_y,neighbors,__row_id__,_ev_topic_id
0,business/article/2024/aug/02/uk-funding-techno...,article,business,Business,2024-08-02 10:15:31+00:00,UK shelves £1.3bn of funding for technology an...,https://www.theguardian.com/business/article/2...,https://content.guardianapis.com/business/arti...,"[{'activeSponsorships': None, 'apiUrl': 'https...",False,...,,-0.011988,-0.08537,5_bst_growth_inflation_rate,The Labour government has shelved £1.3bn of fu...,7.990232,6.286142,"{'distances': [0.0, 0.27524498479900283, 0.298...",13,6
1,business/article/2024/jul/12/british-chipmaker...,article,business,Business,2024-07-12 14:04:03+00:00,UK risks tech ‘talent drain’ to US if pension ...,https://www.theguardian.com/business/article/2...,https://content.guardianapis.com/business/arti...,"[{'activeSponsorships': None, 'apiUrl': 'https...",False,...,,0.075511,-0.130789,5_bst_growth_inflation_rate,Britain risks a tech “talent drain” to the US ...,7.36173,6.822834,"{'distances': [0.0, 0.1171433612409587, 0.4043...",389,6
2,commentisfree/article/2024/jul/10/rachel-reeve...,article,commentisfree,Opinion,2024-07-10 15:07:18+00:00,Rachel Reeves says the UK’s public finances ar...,https://www.theguardian.com/commentisfree/arti...,https://content.guardianapis.com/commentisfree...,"[{'activeSponsorships': None, 'apiUrl': 'https...",False,...,,0.009242,-0.071454,5_bst_growth_inflation_rate,A Labour government that comes to power after ...,8.006539,6.591729,"{'distances': [0.0, 0.4090235476074865, 0.4125...",473,6
3,business/article/2024/aug/04/a-simple-solution...,article,business,Business,2024-08-04 11:02:05+00:00,A simple solution to Rachel Reeves’ spending c...,https://www.theguardian.com/business/article/2...,https://content.guardianapis.com/business/arti...,"[{'activeSponsorships': None, 'apiUrl': 'https...",False,...,,-0.018159,-0.074542,5_bst_growth_inflation_rate,Well that didn’t take long. Less than a month ...,8.060617,6.605041,"{'distances': [0.0, 0.26862117355842974, 0.275...",489,6
4,business/article/2024/sep/01/germany-economy-p...,article,business,Business,2024-09-01 10:32:29+00:00,The German problem? It’s an analogue country i...,https://www.theguardian.com/business/article/2...,https://content.guardianapis.com/business/arti...,"[{'activeSponsorships': None, 'apiUrl': 'https...",False,...,,-0.036154,-0.039874,5_bst_growth_inflation_rate,Sir Keir Starmer is not the first Labour leade...,7.915723,6.636663,"{'distances': [0.0, 0.41257402072971683, 0.457...",559,6
5,business/article/2024/may/30/business-manifest...,article,business,Business,2024-05-30 14:38:46+00:00,Business manifestos: six pre-election proposal...,https://www.theguardian.com/business/article/2...,https://content.guardianapis.com/business/arti...,"[{'activeSponsorships': None, 'apiUrl': 'https...",False,...,,0.017755,0.055035,5_bst_growth_inflation_rate,Business interest groups are jostling for infl...,7.981167,6.773706,"{'distances': [0.0, 0.4848263549820053, 0.4988...",602,6
6,business/nils-pratley-on-finance/article/2024/...,article,business,Business,2024-08-05 16:45:18+00:00,Get ready for a long and messy August in the s...,https://www.theguardian.com/business/nils-prat...,https://content.guardianapis.com/business/nils...,"[{'activeSponsorships': None, 'apiUrl': 'https...",False,...,,-0.006521,-0.117359,5_bst_growth_inflation_rate,Choose the culprit behind the sudden sell-off ...,7.187605,7.641772,"{'distances': [0.0, 0.2885561586953178, 0.3118...",631,6
7,business/2024/feb/07/sainsburys-does-not-rule-...,article,business,Business,2024-02-07 15:21:35+00:00,Sainsbury’s does not rule out job cuts as it r...,https://www.theguardian.com/business/2024/feb/...,https://content.guardianapis.com/business/2024...,"[{'activeSponsorships': None, 'apiUrl': 'https...",False,...,,-0.017955,-0.112935,5_bst_growth_inflation_rate,Sainsbury’s is to use more automated tills and...,7.23476,7.07371,"{'distances': [0.0, 0.40574716495435226, 0.484...",659,6
8,business/article/2024/jul/21/uk-services-based...,article,business,Business,2024-07-21 10:09:11+00:00,Growing inflation in UK’s service-based econom...,https://www.theguardian.com/business/article/2...,https://content.guardianapis.com/business/arti...,"[{'activeSponsorships': None, 'apiUrl': 'https...",False,...,,0.010851,-0.011954,5_bst_growth_inflation_rate,I’m due a haircut and in the past week receive...,7.958661,6.815622,"{'distances': [0.0, 0.42387654768995264, 0.452...",666,6
9,technology/2023/may/20/labour-should-pledge-11...,article,technology,Technology,2023-05-20 07:00:51+00:00,Labour should pledge £11bn to build ‘BritGPT’ ...,https://www.theguardian.com/technology/2023/ma...,https://content.guardianapis.com/technology/20...,"[{'activeSponsorships': None, 'apiUrl': 'https...",False,...,,-0.036658,-0.098238,5_bst_growth_inflation_rate,Keir Starmer should pledge £11bn towards build...,7.865239,6.267584,"{'distances': [0.0, 0.2872140080004165, 0.2985...",680,6


In [None]:
# Bar charts of the top 10 words per topic from the fitted topic model.
topic_model.visualize_barchart(
    height=400,
    n_words=10,
)