<a href="https://colab.research.google.com/github/Kluthra15/twitter_marketing_ML/blob/main/NLP_Model_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.metrics import silhouette_score
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from sentence_transformers import SentenceTransformer
from bertopic.representation import TextGeneration
from bertopic.representation import KeyBERTInspired
from bertopic.representation import MaximalMarginalRelevance
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from transformers import pipeline, AutoModel
from bertopic.representation import OpenAI
from hdbscan import HDBSCAN
import numpy as np

# Build a BERTopic pipeline with the current parameter settings, fit it on the
# preprocessed tweets, and print a silhouette score for the HDBSCAN clustering.

# English stop words plus a few extra tokens to exclude from topic terms.
# NOTE(review): `stopwords` is not imported in this cell -- presumably
# nltk.corpus.stopwords from an earlier cell; confirm.
stopwords_list = list(stopwords.words('english')) + [
    'http', 'https', 'amp', 'com', 'gtgtgt', 'please', 'send', 'dm',
]

# Bag-of-words over unigrams and bigrams used for the topic representations.
vectorizer_model = CountVectorizer(
    min_df=5,
    ngram_range=(1, 2),
    stop_words=stopwords_list,
)

# BERTopic's documented Hugging Face backend takes a transformers
# "feature-extraction" pipeline. A bare AutoModel instance is not one of the
# embedding-model types it recognises, so roberta-base is wrapped in a pipeline
# to make sure it is the model that actually produces the document embeddings.
embedding_model = pipeline('feature-extraction', model='roberta-base')

# Dimensionality reduction applied to the embeddings before clustering;
# random_state is fixed so repeated runs give the same projection.
umap_model = UMAP(
    n_neighbors=15,
    n_components=7,
    min_dist=0.1,
    random_state=42,
)

# Density-based clustering of the reduced embeddings.
# prediction_data=True is set so HDBSCAN keeps the data BERTopic needs for
# calculate_probabilities=True below.
hdbscan_model = HDBSCAN(
    min_cluster_size=100,
    min_samples=40,
    gen_min_span_tree=True,
    prediction_data=True,
)

ctfidf = ClassTfidfTransformer(reduce_frequent_words=True)
representation_model = KeyBERTInspired()

# NOTE(review): per the BERTopic docs, `min_topic_size` and `language` are not
# used once a custom hdbscan_model / embedding_model is supplied; they are kept
# here unchanged -- confirm whether they can be dropped.
model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    ctfidf_model=ctfidf,
    top_n_words=10,
    min_topic_size=100,
    language='english',
    calculate_probabilities=True,
    verbose=True,
    nr_topics=50,
)

# Fit the BERTopic model. fit_transform is documented to take a list of
# strings, so the DataFrame column is converted instead of passing a Series.
documents = df_tweets_preprocessed['text_preprocessed'].tolist()
topics, probs = model.fit_transform(documents)

# Calculate silhouette score.
# HDBSCAN marks noise points with the label -1; silhouette_score would treat
# them as one ordinary cluster and distort the average, so they are excluded.
# scikit-learn raises ValueError if fewer than two clusters remain.
# NOTE(review): the labels are the raw HDBSCAN clusters, while `probs` comes
# from BERTopic after the nr_topics reduction -- confirm this pairing is the
# intended basis for the score.
cluster_labels = np.asarray(hdbscan_model.labels_)
is_clustered = cluster_labels != -1
silhouette_avg = silhouette_score(probs[is_clustered], cluster_labels[is_clustered])

print(silhouette_avg)

In [None]:
# Persist the fitted topic model once per serialization format that
# BERTopic.save is called with below: safetensors, pytorch, and pickle.
models_dir = "/content/drive/MyDrive/Colab_Notebooks/My_Models/"

# Methods 1 and 2 - safetensors, then pytorch. Both write into the same
# directory with the same options, so only the serialization name varies.
# NOTE(review): BERTopic's docs describe `save_embedding_model` for these two
# formats as a string pointer to a Hugging Face model; here the object bound to
# `embedding_model` is passed instead -- confirm that the embedding model
# reference actually ends up in the saved config.
for serialization_format in ("safetensors", "pytorch"):
    model.save(
        models_dir,
        serialization=serialization_format,
        save_ctfidf=True,
        save_embedding_model=embedding_model,
    )

# Method 3 - pickle. Writes a single file using the remaining save defaults.
pickle_path = "/content/drive/MyDrive/Colab_Notebooks/My_Models/twitter_bert_model"
model.save(pickle_path, serialization="pickle")