# Apprentissage with BertTOPIC

## What is BertTOPIC ?
BERTopic is a topic modeling python library that combines transformer embeddings and clustering model algorithms to identify topics in NLP (Natual Language Processing).

In [3]:
%pip install bertopic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.cm as cm

from bertopic import BERTopic

In [5]:
from umap import UMAP

In [6]:
%pip install langdetect

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [7]:
from langdetect import detect

In [13]:
final_dataset = pd.read_csv("df_10000.CSV")
final_dataset = final_dataset.drop(['Unnamed: 0'], axis=1) #remove columns creted bc of the to_csv() in the cleaning
final_dataset = final_dataset.iloc[0:10000]
print(final_dataset.shape)
ind = 0
for i in final_dataset['corps']:
    if ('nan' in i) and (len(i)<100):
        final_dataset.drop(ind, axis=0, inplace=True)
    
    try:
      langue = detect(' '.join(i))
    except:
      final_dataset.drop(ind, axis=0, inplace=True)
      langue = 'no'
    if 'fr' in langue or 'es' in langue:
      print(ind)
      final_dataset.drop(ind, axis=0, inplace=True)
    
    ind = ind +1


# final_dataset = final_dataset.loc(final_dataset['corps'] != ['nan'])
final_dataset.head()

(10000, 3)
692
3235


Unnamed: 0,objet,corps,adresse
0,"[']', 'mission', 'promotion', 'important', 'me...","['dear', 'student', 'ambassador', 'lutt', 'you...","['etudiants-request', 'utt', 'fr']"
1,"[']', 'message', 'pierre', 'koch', 'personal',...","['english', ']', 'dear', 'fellow', 'student', ...","['etudiants-request', 'utt', 'fr']"
2,"['your', 'instagram', 'password', 'change']","['this', 'confirmation', 'password', 'instagra...","['no-reply', 'mail.instagram', 'com']"
3,"[']', 'fwd', 'challenge', 'inter', 'entreprise...","['de', 'michel', 'legault', 'sport', 'cc', 'sd...","['sport-request', 'utt', 'fr']"
4,"['confirmation', 'ter', 'ticket', 'order']","['confirmation', 'trip', 'monday', 'november',...","['mesbilletsTER-noreply', 'ter-sncf', 'fr']"


In [14]:
final_dataset = final_dataset.reset_index()

In [50]:
final_dataset['corps'][1939]

"['hola', 'oscar', 'como', 'quieras', 'importa', 'en', 'tu', 'caso', 'es', 'verdad', 'que', 'es', 'mejor', 'hacerlo', 'con', 'el', 'master', 'perfecto', 'hasta', 'manana', 'envoye', 'de', 'mon', 'iphone', 'le', 'mai', 'oscar', 'benoit', 'ecrit', 'hola', 'senora', 'vous', 'voulez', 'que', 'nous', 'postulion', 'une', 'universite', 'pour', 'un', 'master', 'ou', 'pour', 'un', 'echange', 'si', 'c', 'pour', 'un', 'master', 'devon', 'nous', 'obligatoirement', 'choisir', 'une', 'ecole', 'partenaire', 'de', 'lutt', 'par', 'exemple', 'je', 'voudrai', 'completer', 'formation', 'ingenieur', 'dans', 'une', 'ecole', 'de', 'business', 'apre', 'mon', 'diplome', 'est', 'ce', 'que', 'faire', 'une', 'lettre', 'de', 'motivation', 'et', 'un', 'entretien', 'sur', 'ce', 'type', 'decole', 'et', 'de', 'formation', 'vous', 'conviendrai', 'bonne', 'journee', 'oscar', 'de', 'frederique', 'boulet', 'l', 'oscar', 'benoit', 'envoye', 'vendredi', 'mai', 'objet', 'entrevistas', 'de', 'mayo', 'bueno', 'dias', 'todo', '

In [15]:
# Initiate UMAP
umap_model = UMAP(n_neighbors=15, 
                  n_components=5, 
                  min_dist=0.0, 
                  metric='cosine', 
                  random_state=100)
# Initiate BERTopic
topic_model = BERTopic(umap_model=umap_model, language="multilingual", calculate_probabilities=True, nr_topics=12)
# Run BERTopic model
topics, probabilities = topic_model.fit_transform(final_dataset['corps'])

In [16]:
# Get the list of topics
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,6256,-1_the_message_we_hello
1,0,499,0_linkedin_ireland_wilton_company
2,1,427,1_baptiste_hello_thank_guichard
3,2,356,2_store_offer_eur_valid
4,3,355,3_password_connection_account_change
5,4,349,4_de_la_que_en
6,5,345,5_student_message_dear_utt
7,6,268,6_stage_paris_job_new
8,7,249,7_eur_lydia_paris_account
9,8,204,8_ticket_sncf_game_trip


In [17]:
# Get top 10 terms for a topic
topic_model.get_topic(0)

[('linkedin', 0.24451010250399127),
 ('ireland', 0.12508313375066674),
 ('wilton', 0.11867446245194352),
 ('company', 0.11539003753531193),
 ('unlimited', 0.11483015346180094),
 ('france', 0.08255115619698303),
 ('plaza', 0.0698065956929539),
 ('dublin', 0.06732533625400605),
 ('de', 0.05589024914173239),
 ('logo', 0.05428004554600633)]

In [18]:
# Visualize top topic keywords
topic_model.visualize_barchart(top_n_topics=12)

In [19]:
# Visualize term rank decrease
topic_model.visualize_term_rank()

In [20]:
# Visualize intertopic distance
topic_model.visualize_topics()

In [21]:
# Visualize connections between topics using hierachical clustering
topic_model.visualize_hierarchy(top_n_topics=12)

In [24]:
# Visualize similarity using heatmap
topic_model.visualize_heatmap()

In [32]:
# Visualize probability distribution
topic_model.visualize_distribution(topic_model.probabilities_[0], min_probability=0.015)

In [None]:
# Save the chart to a variable
chart = topic_model.visualize_distribution(topic_model.probabilities_[0]) 
# Write the chart as a html file
chart.write_html("amz_review_topic_probability_distribution.html")