In [1]:
import nltk.stem
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN

ModuleNotFoundError: No module named 'nltk'

In [None]:
# Shared Snowball stemmer for English; StemmedCountVectorizer reads this
# module-level instance every time it analyzes a document.
english_stemmer = nltk.stem.SnowballStemmer('english')


class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose emitted terms are stemmed with ``english_stemmer``.

    Everything else (preprocessing, tokenization, stop-word filtering,
    n-gram construction) is inherited from ``CountVectorizer``; stemming is
    applied to the terms the inherited analyzer produces.
    """

    def build_analyzer(self):
        """Return a callable mapping one document to its list of stemmed terms."""
        analyzer = super().build_analyzer()

        def stem_term(term):
            # Word n-grams arrive as tokens joined by single spaces. The
            # stemmer works on one word at a time, so handing it the whole
            # n-gram would not stem each word the way the matching unigrams
            # are stemmed. Stem token by token and re-join instead.
            return " ".join(english_stemmer.stem(token) for token in term.split(" "))

        return lambda doc: [stem_term(term) for term in analyzer(doc)]

def train_bert(docs, model_path, save_model=False):
    """Fit a BERTopic model on ``docs`` and return the fitted model.

    The pipeline is assembled from explicit components: "all-MiniLM-L6-v2"
    sentence embeddings, HDBSCAN clustering, a stemmed uni-/bi-gram
    ``StemmedCountVectorizer`` and a c-TF-IDF transformer with BM25 weighting.

    Args:
        docs: Documents to fit on; passed unchanged to
            ``BERTopic.fit_transform``.
        model_path: Destination handed to ``topic_model.save``. Only used
            when ``save_model`` is true.
        save_model: When true, persist the fitted model to ``model_path``.
            Defaults to False, so by default nothing is written to disk.

    Returns:
        The fitted ``BERTopic`` instance.
    """
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

    # Clustering model. prediction_data=True is requested from HDBSCAN here;
    # NOTE(review): presumably needed for assigning topics to unseen
    # documents later - confirm against the HDBSCAN/BERTopic docs.
    cluster_model = HDBSCAN(min_cluster_size=15,
                            metric='euclidean',
                            cluster_selection_method='eom',
                            prediction_data=True)

    # Explicit c-TF-IDF transformer. Only BM25 weighting is switched on;
    # reduce_frequent_words is not passed and keeps its library default.
    ctfidf_model = ClassTfidfTransformer(bm25_weighting=True)

    # Stemmed unigrams and bigrams with English stop words removed.
    vectorizer_model = StemmedCountVectorizer(analyzer="word",
                                              stop_words="english",
                                              ngram_range=(1, 2))

    # BERTopic model wired together from the components above.
    topic_model = BERTopic(embedding_model=embedding_model,
                           hdbscan_model=cluster_model,
                           ctfidf_model=ctfidf_model,
                           vectorizer_model=vectorizer_model,
                           language="english")

    # Fit on the corpus. The returned (topics, probs) pair is not needed
    # here, so it is intentionally discarded.
    topic_model.fit_transform(docs)

    if save_model:
        topic_model.save(model_path)
    return topic_model

def load_bert(model_path):
    """Load a BERTopic model from ``model_path`` via ``BERTopic.load`` and return it."""
    return BERTopic.load(model_path)

In [None]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv('thiet-bi-kts-phu-kien-so.csv')

# Pull the 'Translated Text' column out as a plain Python list
docs = df['Translated Text'].tolist()


# Print the second document (index 1) as a quick sanity check
print(docs[1])

In [None]:
# Training corpus: every entry of `docs` coerced to str, in the same order.
# Non-string values are kept as their str() text rather than dropped, so
# docs_train always has the same length as docs.
docs_train = [str(doc) for doc in docs]

# Chăm sóc nhà cửa

In [None]:
# Fit the topic model on the string-coerced corpus. "model" is passed as
# train_bert's model_path argument; this call does not write a model to disk.
print('Start training!')
topic_model = train_bert(docs_train,"model")
print('End training!')
# Quick look at the first rows of the topic frequency table.
print(topic_model.get_topic_freq().head())

In [None]:
# Display tables without wrapping/truncating them at a fixed width.
pd.set_option('display.width', None)

freq_df = topic_model.get_topic_info()
print("Number of topics: {}".format(len(freq_df)))
# Share of all documents assigned to each topic, in percent (2 decimals).
freq_df['Percentage'] = (freq_df['Count'] / freq_df['Count'].sum() * 100).round(2)
# Select columns by name rather than by position: get_topic_info() returns
# extra columns on some BERTopic versions, and positional indexing would
# then pick the wrong columns and drop 'Percentage'.
freq_df = freq_df[['Topic', 'Count', 'Percentage', 'Name']]
freq_df.head(10)

In [None]:
# Build and display the figure returned by visualize_topics() for the fitted model.
fig1 = topic_model.visualize_topics()
fig1.show()

In [None]:
# Save topic-terms barcharts as HTML file
fig2 = topic_model.visualize_barchart(top_n_topics = 10)
fig2.show()

In [None]:
# Save documents projection as HTML file
fig3 = topic_model.visualize_documents(docs)
fig3.show()

In [None]:
# Save topics dendrogram as HTML file
fig4 = topic_model.visualize_hierarchy()
fig4.show()

In [None]:
# Build and display the topic heatmap at 1000x1000 with n_clusters=10.
fig5 = topic_model.visualize_heatmap(n_clusters=10, width=1000, height=1000)
fig5.show()

# Thiết bị kts

In [None]:
# Fit the topic model on docs_train and rebind topic_model to the new fit.
# NOTE(review): this is the same call as the earlier training cell; unless
# docs_train was rebuilt from a different CSV in between, it refits on the
# same corpus - confirm which dataset this section is meant to use.
print('Start training!')
topic_model = train_bert(docs_train,"model")
print('End training!')
# Quick look at the first rows of the topic frequency table.
print(topic_model.get_topic_freq().head())

In [None]:
freq_df = topic_model.get_topic_info()
print("Number of topics: {}".format(len(freq_df)))
# Share of all documents assigned to each topic, in percent (2 decimals).
freq_df['Percentage'] = (freq_df['Count'] / freq_df['Count'].sum() * 100).round(2)
# Select columns by name rather than by position: get_topic_info() returns
# extra columns on some BERTopic versions, and positional indexing would
# then pick the wrong columns and drop 'Percentage'.
freq_df = freq_df[['Topic', 'Count', 'Percentage', 'Name']]
freq_df.head(10)

In [None]:
# Build and display the figure returned by visualize_topics() for the fitted model.
fig1 = topic_model.visualize_topics()
fig1.show()

In [None]:
# Save topic-terms barcharts as HTML file
fig2 = topic_model.visualize_barchart(top_n_topics = 10)
fig2.show()

In [None]:
# Save documents projection as HTML file
fig3 = topic_model.visualize_documents(docs)
fig3.show()

In [None]:
# Save topics dendrogram as HTML file
fig4 = topic_model.visualize_hierarchy()
fig4.show()

In [None]:
# Build and display the topic heatmap at 1000x1000 with n_clusters=10.
fig5 = topic_model.visualize_heatmap(n_clusters=10, width=1000, height=1000)
fig5.show()