In [1]:
from bertopic import BERTopic
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models.coherencemodel import CoherenceModel
import gensim.corpora as corpora
import matplotlib.pyplot as plt
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance
import sys
import os
import syspend

# Extend the module search path BEFORE the imports below, so a fresh kernel can resolve them.
# NOTE(review): utility, preprocess_class and model_base_class are presumably the project-local
# modules that live under "../.." -- confirm.
sys.path.append("../..")

from utility import parse_config, seed_everything, custom_print
from preprocess_class import create_datasets
from model_base_class import BaseModel
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from transformers.pipelines import pipeline
from sentence_transformers import SentenceTransformer
from bertopic.representation import KeyBERTInspired

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tingy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tingy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\tingy\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tingy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
class BERTopic_model(BaseModel):
    """
    BERTopic model for topic modelling. BERTopic is modular and the final topic model depends on the
    submodels chosen for each part of the task.
    The parts of the model that can be modified are as follows:
    1. Document embedding, 2. Dimensionality Reduction, 3. Clustering, 4. Tokenizer, 5. Weighting scheme,
    6. Representation Tuning (optional)
    """
    def __init__(self, embedding_model = None, dim_reduction_model=None,
                 clustering_model = None, vectorizer_model=None, 
                 ctfidf_model=None,  representation_model=None,
                 min_topic_size = 10):
        """
        @param embedding_model: Model to transform document into matrix of embedding
        @param dim_reduction_model: Dimensionality reduction algorithm to use
        @param clustering_model: Clustering algorithm to use
        @param vectorizer_model: Tokenizer to use
        @param ctfidf_model: weighting scheme to use
        @param representation_model: optional model to use to finetune the representations calculated using ctfidf
        @param min_topic_size [int]: value forwarded to BERTopic's min_topic_size when training
        """
        # The underlying BERTopic instance; stays None until train() or load_model() is called
        self.topic_model = None
        self.embedding_model = embedding_model
        self.dim_reduction_model = dim_reduction_model
        self.clustering_model = clustering_model
        self.vectorizer_model = vectorizer_model
        self.ctfidf_model = ctfidf_model
        self.representation_model = representation_model
        self.min_topic_size = min_topic_size

    def _require_model(self):
        """
        Return the underlying BERTopic model, failing with a clear message if none is available
        @return topic_model: the trained/loaded BERTopic instance
        @raise RuntimeError: if neither train() nor load_model() has been called yet
        """
        if self.topic_model is None:
            raise RuntimeError("No topic model available: call train() or load_model() first")
        return self.topic_model

    def train(self, dataset, probability = False):
        """
        fit and transform the BERTopic model to the dataset
        @param dataset [Dataset]: Dataset for the model to be fit and transform on (its .text attribute is used)
        @param probability [bool]: forwarded to BERTopic's calculate_probabilities
        """
        self.topic_model = BERTopic(embedding_model=self.embedding_model, ctfidf_model=self.ctfidf_model,
                        vectorizer_model=self.vectorizer_model, 
                        min_topic_size= self.min_topic_size, 
                        representation_model=self.representation_model, 
                        umap_model = self.dim_reduction_model, 
                        hdbscan_model = self.clustering_model, 
                        nr_topics= 'auto',
                        calculate_probabilities=probability, verbose=True)
        self.topic_model.fit_transform(dataset.text)

    def evaluate(self,dataset):
        """
        Evaluate performance of model using coherence_score. (Using normalised pointwise mutual information, range between -1 and 1, higher score is better)
        @param dataset [Dataset]: Dataset to evaluate performance
        @return c_score [float]: coherence score
        """
        c_score = self.get_coherence_score(dataset)
        return c_score
        
    def predict(self, dataset):
        '''
        Cluster the dataset into topics
        @param dataset Union[str,[Dataset]]: New dataset to predict
        @return prediction: whatever BERTopic.transform returns for the given documents
        @raise RuntimeError: if no topic model has been trained or loaded
        '''
        model = self._require_model()
        # A plain string is passed through as-is; anything else is expected to expose .text
        documents = dataset if isinstance(dataset, str) else dataset.text
        return model.transform(documents)

    def load_model(self, path):
        '''
        Load previously trained topic model
        @param path [str]: path to model
        '''
        self.topic_model = BERTopic.load(path)
        
    def get_coherence_score(self, dataset, coherence='c_npmi', topn=5):
        """
        Evaluation metric for model
        @param dataset [Dataset]: Training dataset (must be the documents the model's topics_ refer to)
        @param coherence [str]: gensim coherence measure, one of 'u_mass', 'c_v', 'c_uci', 'c_npmi'
        @param topn [int]: number of top words per topic passed to the coherence model
        @return c_score [float]: coherence score
        @raise RuntimeError: if no topic model has been trained or loaded
        @raise ValueError: if the dataset length does not match the number of topic assignments
        """
        model = self._require_model()
        if len(dataset.text) != len(model.topics_):
            raise ValueError("dataset has %d documents but the model holds %d topic assignments"
                             % (len(dataset.text), len(model.topics_)))

        # Concatenate all documents of a topic into one string per topic
        documents = pd.DataFrame({"Document": dataset.text,
                                  "Topic": model.topics_})
        documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
        cleaned_docs = model._preprocess_text(documents_per_topic.Document.values)

        # Tokenise with the same analyzer BERTopic's vectorizer uses
        analyzer = model.vectorizer_model.build_analyzer()

        # Extract features for Topic Coherence evaluation
        tokens = [analyzer(doc) for doc in cleaned_docs]
        dictionary = corpora.Dictionary(tokens)
        corpus = [dictionary.doc2bow(token) for token in tokens]

        # Use the topic ids that actually occur, excluding the outlier topic (-1). Counting
        # len(set(topics_)) - 1 would drop the last real topic when no outlier topic exists.
        topic_ids = sorted(topic for topic in set(model.topics_) if topic != -1)
        topic_words = [[word for word, _ in model.get_topic(topic)]
                       for topic in topic_ids]

        # Evaluate
        cm = CoherenceModel(topics=topic_words,
                            texts=tokens,
                            corpus=corpus,
                            dictionary=dictionary,
                            coherence=coherence,
                            topn=topn)
        return cm.get_coherence()

In [3]:
# Read the raw reviews, then let the project helper build the two datasets used below
reviews_csv = '../../data/reviews.csv'
df = pd.read_csv(reviews_csv)
train, test = create_datasets(df)

In [43]:
# Restore the previously saved topic model through the wrapper's own loader instead of
# assigning to its topic_model attribute directly.
miniLM_model = BERTopic_model()
miniLM_model.load_model('sentence-transformers-key-bigram-v2')

In [49]:
# Write the current topic model back under the name it is loaded from
model_path = "sentence-transformers-key-bigram-v2"
miniLM_model.topic_model.save(model_path)


In [7]:
# predict() forwards test.text to the underlying model's transform()
test_topic = miniLM_model.predict(test)

Batches:   0%|          | 0/35 [00:00<?, ?it/s]

365

In [48]:
# Per-topic summary table of the loaded model
topic_info = miniLM_model.topic_model.get_topic_info()
topic_info

Unnamed: 0,Topic,Count,Name,CustomName
0,-1,902,-1_pork rinds_microwave pork_vanilla beans_raw...,Outliers
1,0,1406,0_tangerine flavor_tangerine orange_orange tan...,Soft Drinks
2,1,624,1_quality tea_quality coffee_recommend tea_tea...,Tea/Coffee
3,2,167,2_dog foods_dog treats_products dogs_dogs chicken,Dog Food
4,3,163,3_sauce tasty_ranchero sauce_fajita sauce_spic...,Sauces
5,4,156,4_bisquick pancakes_cake mixes_pancake mixes_b...,Baking Products
6,5,111,5_diet bars_tasting bars_ingredients bars_tast...,Snack Bars
7,6,110,6_chips flavor_chips tasty_tasting chips_love ...,Chips
8,7,100,7_keurig coffee_kona coffee_keurig nice_keurig...,Keurig Coffee Products
9,8,91,8_healthy cereals_cereal tasty_cereal snack_ta...,Cereal


In [47]:
# Human-readable names, listed in topic-id order starting at the outlier topic (-1)
label_names = [
    'Outliers',
    'Soft Drinks',
    'Tea/Coffee',
    'Dog Food',
    'Sauces',
    'Baking Products',
    'Snack Bars',
    'Chips',
    'Keurig Coffee Products',
    'Cereal',
    'Noodle',
    'Salty Food Products',
    'Crackers',
    'Cat Food',
    'Coconut Products',
    'Popcorn',
    'Oil Products',
    'Soup',
    'Protein Powder',
    'Peanut Butter',
]
# Map topic ids -1..18 onto the names above
topic_labels = {topic_id: name for topic_id, name in enumerate(label_names, start=-1)}
miniLM_model.topic_model.set_topic_labels(topic_labels)

In [46]:
# Pair every training document with the topic id the model assigned to it
labeled_train = pd.DataFrame({'Text': train.text, 'Label': miniLM_model.topic_model.topics_})
# Inspect ten documents of topic 7; seeded like the other samples in this notebook so the
# same rows come back on every run
labeled_train.loc[labeled_train['Label'] == 7, 'Text'].sample(n=10, random_state=4263)

1530    this product is hard to find for us in stores ...
2138    I was expecting a plain old cup of coffee when...
1323    The products lid stays attached to the needle ...
937     I love Sumatra coffee and was excited to recei...
2797    Although I've only been able to try three of C...
2338    I love these K cups they have a nice tea taste...
1407    I'm a pretty big coffee drinker, but had never...
2409    I enjoyed these k-cups when I used them. They ...
676     I have tried about 25 different K-Cup flavors,...
139     Caused my cuisinart k-cup brewer to leak every...
Name: Text, dtype: object

In [60]:
# Custom names for topic ids -1..14.
# NOTE(review): an earlier cell labels ids -1..18 with a different naming scheme -- confirm
# which of the two label sets belongs to the model that is actually loaded.
topic_labels = {
    -1: 'Outliers',
    0: 'Staple Food',
    1: 'Coffee',
    2: 'Tea',
    3: 'Chips',
    4: 'Dog Food',
    5: 'Baby Food/Canned Food',
    6: 'Delivery',
    7: 'Processed Food',
    8: 'Cat Food',
    9: 'Concentrated Syrup',
    10: 'Popcorn',
    11: 'Coconut Products',
    12: 'Protein Powder',
    13: 'Consumable Oil',
    14: 'Soup',
}
miniLM_model.topic_model.set_topic_labels(topic_labels)

In [50]:
# Predict topics for the test documents; element 0 of the result is used as the topic column
test_label = miniLM_model.predict(test)
predicted_topics = test_label[0]
labeled_test = pd.DataFrame({'Text': test.text, 'Label': predicted_topics})

# Attach the custom topic name to each prediction by joining on the topic id
lab = miniLM_model.topic_model.get_topic_info()[['Topic', 'CustomName']]
labeled_test = pd.merge(labeled_test, lab, how='left', left_on='Label', right_on='Topic')

Batches:   0%|          | 0/35 [00:00<?, ?it/s]

In [13]:
# Show the test documents together with their predicted topic id and custom topic name
labeled_test

Unnamed: 0,Text,Label,Topic,CustomName
0,"This tea is amazing, I cannot believe the pric...",2,2,Tea
1,I've tried a handful of other brands of French...,-1,-1,Outliers
2,I would recommend not buying these nuts if you...,0,0,Staple Food
3,"I wish these came in larger quantities, if I c...",-1,-1,Outliers
4,Has a smooth taste like it claims without any ...,9,9,Concentrated Syrup
...,...,...,...,...
1084,I'd seen an add for these in a magazine recent...,-1,-1,Outliers
1085,I was a bit cautious when I first opened the b...,-1,-1,Outliers
1086,"I bought these for my wife for her birthday, s...",-1,-1,Outliers
1087,Lowrey's has changed it's microwave pork rinds...,-1,-1,Outliers


In [51]:
# Draw 10 rows per topic from the labelled test set for manual inspection.
# Outliers (-1) are sampled without replacement; regular topics with replacement, so topics
# with fewer than 10 test documents still contribute 10 rows.
samples = [labeled_test.loc[labeled_test['Label'] == -1].sample(n=10, random_state=4263)]

# Iterate over the topic ids that actually occur in the predictions rather than a fixed
# range: sampling from a topic with no test rows raises.
topic_ids = sorted(topic for topic in labeled_test['Label'].unique() if topic != -1)
for i in topic_ids:
    topic_rows = labeled_test.loc[labeled_test['Label'] == i]
    samples.append(topic_rows.sample(n=10, replace=True, random_state=4263))

# DataFrame.append was removed in pandas 2.0; a single concat keeps the original row index
out = pd.concat(samples)

In [54]:
# Display the per-topic sample of test documents
out

Unnamed: 0,Text,CustomName
54,I am the biggest fan of mushrooms you'll ever ...,Outliers
567,I have bought this at a local pet shop for my ...,Outliers
94,I can only drink decaf. Caffiene makes me nut...,Outliers
77,"I bought this product last week for our dogs, ...",Outliers
769,I had been looking for a healthier breakfast. ...,Outliers
...,...,...
965,"I really enjoy this peanut butter, it is sweet...",Peanut Butter
869,"I make an extended butter using this oil, wate...",Peanut Butter
819,The peanut butter was very fresh tasting. The...,Peanut Butter
1023,Absolutely the best Nutella alternative. I act...,Peanut Butter


In [53]:
# Keep only the text and the readable topic name. Rebinding instead of dropping in place,
# with errors='ignore', lets this cell be re-run without a KeyError.
out = out.drop(columns=['Label', 'Topic'], errors='ignore')

In [55]:
# Export the sample for manual review (plain string: the f-prefix had no placeholders)
out.to_csv('sentence-transformers-key-bigram-v2.csv', index=False)

In [44]:
# Use the wrapper defined in this notebook; the name `model` is never assigned here.
# NOTE(review): assumes train.date holds one timestamp per training document -- confirm
# against create_datasets.
topics_over_time = miniLM_model.topic_model.topics_over_time(docs=train.text,
                                                             timestamps=train.date,
                                                             global_tuning=True,
                                                             evolution_tuning=True,
                                                             nr_bins=20)
miniLM_model.topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=15)

In [25]:
# Save model
# The bare name `topic_model` is only bound by the load cell that follows, so save the
# model held by the wrapper instead.
miniLM_model.topic_model.save("bertopic_model")

In [28]:
# Load model
saved_model_path = "bertopic_model"
topic_model = BERTopic.load(saved_model_path)