# BERTopic

In [1]:
!pip install bertopic sentence_transformers hdbscan flair

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bertopic
  Downloading bertopic-0.12.0-py2.py3-none-any.whl (90 kB)
[K     |████████████████████████████████| 90 kB 7.8 MB/s 
[?25hCollecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 5.5 MB/s 
[?25hCollecting hdbscan
  Downloading hdbscan-0.8.29.tar.gz (5.2 MB)
[K     |████████████████████████████████| 5.2 MB 62.4 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting flair
  Downloading flair-0.11.3-py3-none-any.whl (401 kB)
[K     |████████████████████████████████| 401 kB 50.4 MB/s 
[?25hCollecting umap-learn>=0.5.0
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[K     |████████████████████████████████| 88 kB 5.1 MB/s 
[?25hCollecting pyyaml<6.0
  Download

In [2]:
import pandas as pd
from flair.embeddings import TransformerDocumentEmbeddings
from sentence_transformers import SentenceTransformer
from umap import UMAP 
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic import BERTopic

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
data = pd.read_csv('/content/drive/Shareddrives/TEAM 3 NLP - AI /Reto/ETL/MainDatasetTwitter.csv')
data.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'Time', 'User'], inplace=True)
data.head()

Unnamed: 0,Tweet
0,@dzolecito Aquí tienes algunos recursos que ta...
1,"@dzolecito Hola, entendemos que en la vida est..."
2,The #FIL2022 has officially started. This is w...
3,@danielgonsan ¡Felicidades! Sin duda estás dej...
4,@MelyPaezDesign ¡Muchas felicidades!


In [7]:
# Load Dataset of Twitter
tweets_list = data.values.tolist()
tweets = [''.join(ele) for ele in tweets_list]

In [8]:
# Step 1 - Extract embeddings
"""
Sentence-transformers model: 'paraphrase-multilingual-MiniLM-L12-v2'
- one major difference is that the multilingual models work for 50+ languages.
"""
embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
# embedding_model = SentenceTransformer('symanto/sn-xlm-roberta-base-snli-mnli-anli-xnli')

Downloading:   0%|          | 0.00/968 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.79k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/645 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/471M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [9]:
# Step 2 - Reduce dimensionality
"""
Typically, embeddings are at least 384 in length and many clustering algorithms 
have difficulty clustering in such a high dimensional space.
"""
umap_model = UMAP(n_neighbors=50, n_components=5, min_dist=0.0, metric='cosine', low_memory=False)

In [10]:
# Step 3 - Cluster reduced embeddings
"""
This process of clustering is quite important because the more performant our 
clustering technique the more accurate our topic representations are.
- HDBSCAN as it is quite capable of capturing structures with different densities.
"""
hdbscan_model = HDBSCAN(min_cluster_size=50, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

In [11]:
# Step 4 - Tokenize topics
"""
When we use HDBSCAN as a cluster model, we may assume that our clusters having 
different degrees of density and different shapes. 
This means that a centroid-based topic representation technique might not be 
the best fitting model. 
In other words, we want a topic representation technique that makes little to 
no assumption on the expected structure of the clusters.
"""
# vectorizer_model = CountVectorizer(stop_words='english', ngram_range=(1, 3), min_df=10)
vectorizer_model = CountVectorizer(stop_words='english')

In [12]:
# Step 5 - Create topic representation
"""
TF-IDF was adjusted to work on a cluster/categorical/topic-level 
instead of a document-level. 
This adjusted TF-IDF representation is called c-TF-IDF takes into account what 
makes the documents in once cluster different from documents in another cluster
"""
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

In [13]:
# All steps together
"""
diversity: To improve the coherence of words, Maximal Marginal Relevance 
was used to find the most coherent words without having too much overlap between 
the words themselves. This results in the removal of words that do not 
contribute to a topic.

You can also use this technique to diversify the words generated in the
topic representation.
"""
topic_model = BERTopic(embedding_model=embedding_model,    # Step 1 - Extract embeddings
                       umap_model=umap_model,              # Step 2 - Reduce dimensionality
                       hdbscan_model=hdbscan_model,        # Step 3 - Cluster reduced embeddings
                       vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
                       ctfidf_model=ctfidf_model,          # Step 5 - Extract topic words
                       diversity=0.9,                      # Step 6 - Diversify topic words
                       language = "multilingual",
                       n_gram_range=3,
                       nr_topics=5,
                       calculate_probabilities=True,
                       )

In [14]:
# Fit the model on a corpus
topics, probs = topic_model.fit_transform(tweets)

In [15]:
# After generating topics and their probabilities, we can access the frequent topics that were generated:
"""
-1 refers to all outliers and should typically be ignored. 
Next, let's take a look at the most frequent topic that was generated
"""
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,22395,-1_https_la_tu_rt
1,0,1349,0_transmisión_twitch_final_borregostec
2,1,1210,1_hola_dm_apoyarte_podemos
3,2,881,2_felicidades_nosotros_workattec_tecdemonterrey
4,3,816,3_arquitectura_sistemas_industrial_robótica
5,4,727,4_pandemia_expertotec_la_aliados


In [16]:
# topic_model.update_topics(tweets, vectorizer_model=vectorizer_model)

In [17]:
# Reduce the number of topics
# topic_model.reduce_topics(tweets, nr_topics=5)

In [18]:
topic_model.get_topics()

{-1: [('https', 0.33847362202087955),
  ('la', 0.3266378598566583),
  ('tu', 0.27580621923341264),
  ('rt', 0.27409672186053424),
  ('tec', 0.27236522800561236),
  ('una', 0.2659523084747011),
  ('más', 0.25833337362891406),
  ('al', 0.24861887461686813),
  ('nuestra', 0.2339856769746622),
  ('día', 0.2237536039669257)],
 0: [('transmisión', 0.4352388681964396),
  ('twitch', 0.42891424446669235),
  ('final', 0.41900703698816266),
  ('borregostec', 0.3795636901726023),
  ('gaming', 0.3753367982264681),
  ('sky', 0.35394848145911806),
  ('https', 0.353557947836257),
  ('esportscup', 0.35342820626384275),
  ('506', 0.3248986560413467),
  ('tienes', 0.3168744651799194)],
 1: [('hola', 0.7980933455733945),
  ('dm', 0.678011547456814),
  ('apoyarte', 0.5956248763293296),
  ('podemos', 0.5561268514437558),
  ('mándanos', 0.550425366717527),
  ('mentor', 0.5101725918021087),
  ('situación', 0.4769629791076328),
  ('800', 0.4673579108967217),
  ('mensaje', 0.45657638186796473),
  ('24', 0.45635

In [19]:
topic_model.generate_topic_labels()

['-1_https_la_tu',
 '0_transmisión_twitch_final',
 '1_hola_dm_apoyarte',
 '2_felicidades_nosotros_workattec',
 '3_arquitectura_sistemas_industrial',
 '4_pandemia_expertotec_la']

In [20]:
topic_model.topic_sizes_

{-1: 22395, 0: 1349, 1: 1210, 2: 881, 3: 816, 4: 727}

In [21]:
topic_model.representative_docs_

{2: ['@ManziCleto 😂',
  '@kobemty 😮 ¡Que bien! 👏🏻 Nosotros también #Tqueremos.💙',
  '@hannasalaz ¡Feliz #DiaDeLosInocentes, Hanna! 😎'],
 1: ['@scordova971 Hola. Te sugerimos comunicarte a Tec Services para que tu correo sea dado de baja en nuestra base de datos. Puedes contactarlos vía:\n\nCorreo: tecservices@servicios.tec.mx\nWhatsApp: 811 625 5123\nTeléfono: +52 81 8358 2000\n\nEstaremos al pendiente. Saludos.',
  '@_csnow_ @TECcampusMTY Hola. Hemos recibido tu comentario. Por favor, comunícate con nuestros compañeros de Tec Services para reportar esta situación. Puedes contactarlos por: \n\nWhatsapp: +52 811 625 5123\nTeléfono: +52 81 8358 2000\n\nEstaremos al pendiente de este caso. Saludos.',
  '@dmendoza06011 Hola. Te sugerimos comunicarte a Tec Services, con gusto podrán ayudarte en esta situación. Contáctalos en:\n\nTeléfono: +52 81 8358 2000\nWhatsApp: +52811625 5123\n\nSi tienes alguna otra pregunta, no dudes en enviarnos un mensaje. Quedamos al pendiente. Saludos.',
  '@cris

In [22]:
topic_model.visualize_topics() 

In [23]:
topic_model.visualize_barchart()

In [24]:
topic_model.visualize_heatmap()

In [25]:
topic_model.visualize_documents(tweets)

In [26]:
topic_model.visualize_hierarchy()

In [27]:
topic_model.save('BERTopicV2')

In [28]:
# load model
BERTopicv1 = BERTopic.load('/content/drive/Shareddrives/TEAM 3 NLP - AI /Reto/BERTopicv1')

In [29]:
# prediction 1
BERTopicv1.find_topics('becas')

([-1, 3, 1, 4, 0],
 [0.8650532088062444,
  0.8363240234132664,
  0.8119295285881571,
  0.8081767100582393,
  0.7298679120578029])

In [30]:
# prediction 2
BERTopicv1.find_topics('graduación')

([4, 3, 1, 0, -1],
 [0.5525140601816712,
  0.4851111631054542,
  0.4834157007771734,
  0.46295586022140095,
  0.4599323453765263])

In [31]:
# prediction4
BERTopicv1.find_topics('TQueremos')

([-1, 1, 3, 4, 0],
 [0.8997670301134552,
  0.8849034913826039,
  0.8792841227465318,
  0.8305202920206516,
  0.7850631342741732])

In [32]:
# prediction5
BERTopicv1.find_topics('problema con mi beca acádemica')

([0, 4, 1, -1, 3],
 [0.4104852528568946,
  0.3828955991675881,
  0.38259119681475456,
  0.36975547828254307,
  0.3605072996925315])

In [33]:
# prediction6
BERTopicv1.find_topics('equipos represantitos del TEC, borregos TEC')

([3, 4, 2, 0, -1],
 [0.44121370273153304,
  0.42294243015588295,
  0.41178867822779514,
  0.4046093637526143,
  0.4027266789299212])

In [34]:
# prediction7
BERTopicv1.find_topics('quiero darme de baja temporalmente')

([1, 3, 0, 4, -1],
 [0.46567675068346454,
  0.39899731049865966,
  0.38938693647754313,
  0.3815804462098109,
  0.3615853603069022])

In [35]:
# prediction8
BERTopicv1.find_topics('ya soy exatec')

([1, 3, -1, 0, 4],
 [0.6228592538313453,
  0.6034949827955831,
  0.5796516905502391,
  0.49692904545832217,
  0.4882514155607525])