## Text cleanup process

    Text cleanup: remove special characters, numbers, punctuation, and unnecessary white space.

    Eliminate stopwords: eliminate common words that do not add meaning to the analysis, such as "de", "la", "que", etc.

    Remove URLs and usernames: remove urls and usernames from posts

    Remove hashtags: remove hashtags from posts

    Remove emoticons: remove emoticons from posts

In [1]:
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from bertopic import BERTopic

In [3]:
df = pd.read_csv("Publicaciones2.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,post_id,text,time,image,video,video_watches,likes,comments,shares,...,links,user_id,username,reaction_count,likes_standar,comments_standar,shares_standar,dia,hora,total_reacciones
0,0,10160500185237520,Jamás habrá pierde en un jugosito #sánguche de...,2023-02-01 13:23:02,1.0,1,,121.0,19.0,8.0,...,10,129614200000.0,LA LUCHA SANGUCHERIA CRIOLLA,121.0,0.008643,0.003585,0.010323,Wednesday,13,0.034773
1,1,10160496038652520,Fuimos en búsqueda del mejor planchero 😝💪🏻 ¿qu...,2023-01-30 13:07:26,1.0,1,,64.0,39.0,5.0,...,10,129614200000.0,LA LUCHA SANGUCHERIA CRIOLLA,64.0,0.004571,0.007358,0.006452,Monday,13,0.032418
2,2,10160489394752520,"Y si estás con hambre, nuestro #pollo deluxe s...",2023-01-27 09:20:29,1.0,1,,81.0,12.0,2.0,...,9,129614200000.0,LA LUCHA SANGUCHERIA CRIOLLA,81.0,0.005786,0.002264,0.002581,Friday,9,0.015396
3,3,10160481576377520,"Te haremos antojar un sánguche de pavo en 3,2,...",2023-01-23 10:14:42,1.0,1,,85.0,11.0,2.0,...,10,129614200000.0,LA LUCHA SANGUCHERIA CRIOLLA,85.0,0.006071,0.002075,0.002581,Monday,10,0.015257
4,4,10160476026507520,Come desde S/5.90 en La Lucha Sanguchería 🤩✨,2023-01-20 18:36:27,1.0,1,,567.0,67.0,40.0,...,1,129614200000.0,LA LUCHA SANGUCHERIA CRIOLLA,567.0,0.0405,0.012642,0.051613,Friday,18,0.159266


In [4]:
def remove_stopwords(text:str)->str:
    stop_words = set(stopwords.words("spanish"))
    words = word_tokenize(text)
    filtered_sentence = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_sentence)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Laecs\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Laecs\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
def format_text(texto):
    import emoji

    sig_punt = {
        ord('\n'): None,
        ord('*'): None,
        ord(':'): None,
        ord('#'): None,
        ord('@'): None,
        ord('•'): None
    }
    
    marca = ['laluchasangucheriacriolla','desayunosfit','patriosangucheria','facebook',
             '.com','www.','https','.peru','elchinito','delidesayunosd','delidesayunosdelivery',
             'laluchasangucheria','eljardindejazmin','desayunodelivery','paraderov',
             'elchinovegano','seitanurbanbistro','asianica', 'asianicastreetfood','carnivorolahamburguesería','carnivorolahamburguesería',
             'lima141','palermocafe','monstruos' ,'sandmonstruos','maztikasanguchesurbanos','patrio','palermo','maztika']
    
    #If the post have Text
    if isinstance(texto, str):
        #print("texto es: ",texto)
        #Remove special characters
        texto = texto.translate(sig_punt)
        
        #Remove emoji
        texto = emoji.replace_emoji(texto,'')
        
        texto = texto.lower()
        
        #Remove URLs and hashtag words
        for i in marca:
            texto = texto.replace(i,'')
        
        #Remove Stopwords
        texto = remove_stopwords(texto)
        
        return texto
    else:
        return ''

## Topic Modelling

In [1]:
#Text transformation
df['textBert'] = df['text'].apply(format_text)
textos = df['textBert']

#Define BERT Model
topic_model = BERTopic(embedding_model="paraphrase-multilingual-MiniLM-L12-v2", nr_topics='10')

#Training with text
topics, probs = topic_model.fit_transform(textos)

#Reduce outliers
new_topics = topic_model.reduce_outliers(textos, topics, strategy="distributions")

#Update new topics
topic_model.update_topics(textos, topics=new_topics)
topic_model.topics_ = new_topics

documents = pd.DataFrame({"Document": textos, "Topic": new_topics})
topic_model._update_topic_size(documents)

NameError: name 'df' is not defined

In [11]:
#Get topics from texts
topicos = topic_model.get_document_info(textos)

#Concat posts with topics
df_topics = pd.concat([df, topicos], axis=1)
df_topics.head()

Unnamed: 0,Topic,Count,Name
0,0,663,0_lucha_mejor_hoy_8am
1,1,79,1_amor_regalospersonalizados_regalosoriginales...
2,2,30,2_menujardinero_paso_campechanito_pastrami


In [None]:
#Save df
df_topics.to_csv("Publicaciones_topicos.csv")