# BERTopic Model

BERTopic es un algoritmo de modelado de temas que emplea embeddings y transformaciones c-TF-IDF para crear clusters que permiten interpretar temas a partir de palabras similares.

In [1]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer


In [3]:
# Load the tweet dataset from CSV. The "Filtered" column holds the text used
# as documents and the "Date" column holds the timestamp of each tweet.
input_file = "economics_TW.csv"
#--------------------------------------------
df = pd.read_csv(input_file)
docs = df["Filtered"]
timestamps = df["Date"]
print(f"@economics with {len(docs)} tweets")
# Preview the first rows as the cell output.
df.head()

@economics with 2445 tweets


Unnamed: 0.1,Unnamed: 0,Tweet,Filtered,Date
0,0,Waste-to-hydrogen technology firm Concord Blue...,Waste- to- hydrogen technology firm Concord Bl...,2022-04-30 18:29:21
1,1,Bitcoin is struggling to gain traction in El S...,Bitcoin is struggling to gain traction in El S...,2022-04-30 18:08:05
2,2,"The Star Ferry, an icon of Hong Kong and argua...","The Star Ferry, an icon of Hong Kong and argua...",2022-04-30 17:35:35
3,3,Food and energy price surges worsened by the U...,Food and energy price surges worsened by the U...,2022-04-30 17:08:03
4,4,President Joe Biden is considering forgiving a...,President Joe Biden is considering forgiving a...,2022-04-30 16:28:48


# Static Bert
Modelado de temas sin consideración temporal. <br>
Esto permite crear gráficos con representación de similitud entre temas.

In [4]:
# Vectorizer used for the topic representations: unigrams and bigrams,
# with English stop words removed.
vectorizer_model = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
)
# Build the topic model around that vectorizer.
topic_model = BERTopic(
    vectorizer_model=vectorizer_model,
    language="english",
    verbose=True,
    calculate_probabilities=True,
)
# Fit on the documents; keep the per-document topics and probabilities
# for the visualizations below.
topics, probs = topic_model.fit_transform(docs)

Batches: 100%|██████████| 77/77 [00:33<00:00,  2.32it/s]
2022-05-01 12:47:52,557 - BERTopic - Transformed documents to Embeddings
2022-05-01 12:48:05,986 - BERTopic - Reduced dimensionality
2022-05-01 12:48:06,426 - BERTopic - Clustered reduced embeddings


In [5]:
topic_model.get_topic(0)  # Words and their weights that represent topic 0

[('fed', 0.05162266938856498),
 ('recession', 0.022961415044964392),
 ('federal', 0.019134789915175),
 ('federal reserve', 0.018707986373371736),
 ('rates', 0.01805916753443595),
 ('point', 0.01702120081695335),
 ('powell', 0.016891025062765394),
 ('reserve', 0.01639280916788377),
 ('fed president', 0.01618555471113198),
 ('says', 0.015638514307550273)]

In [6]:
%matplotlib inline
topic_model.visualize_topics() # Visualize the similarity between topics

In [7]:
%matplotlib inline
# Show the topic probability distribution for the document at index 200,
# passing min_probability=0.015 as the cutoff.
topic_model.visualize_distribution(probs[200], min_probability=0.015)

In [8]:
%matplotlib inline
topic_model.visualize_barchart(top_n_topics=5) # Bar charts of the top words per topic, limited to 5 topics

# Dynamic Bert
Modelado de temas con consideración temporal.

In [9]:
%matplotlib inline
# Compute how the fitted topics are represented over time, using the tweet
# timestamps split into 20 bins (nr_bins=20).
# NOTE(review): global_tuning and evolution_tuning are both enabled; confirm
# their exact effect against the BERTopic `topics_over_time` documentation.
topics_over_time = topic_model.topics_over_time(docs=docs, 
                                                topics=topics, 
                                                timestamps=timestamps, 
                                                global_tuning=True, 
                                                evolution_tuning=True, 
                                                nr_bins=20)

20it [00:03,  6.12it/s]


In [10]:
# Plot topic frequency over time, limited to 5 topics
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=5)

# Bibliografía
https://github.com/MaartenGr/BERTopic  <br>
https://maartengr.github.io/BERTopic/api/bertopic.html
