In [89]:
from bertopic import BERTopic
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models.coherencemodel import CoherenceModel
import gensim.corpora as corpora
import matplotlib.pyplot as plt
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance
import sys
import os
import syspend
from utility import parse_config, seed_everything, custom_print
from preprocess_class import create_datasets
from model_base_class import BaseModel
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from transformers.pipelines import pipeline
from sentence_transformers import SentenceTransformer
from bertopic.representation import KeyBERTInspired, ZeroShotClassification
from BERTopic_model import BERTopic_model
sys.path.append("../..")

In [2]:
df = pd.read_csv('../../data/reviews.csv')
train, test = create_datasets(df)

In [53]:
candidate_topics = ['drinks', 'cat', 'dog', 'carbohydrates', 'chocolate', 'snacks', 'healthy alternatives', 'household', 'family', 'sauce', 'condiments']
representation_model = ZeroShotClassification(candidate_topics, model="facebook/bart-large-mnli", min_prob = 0.5)

# Use the representation model in BERTopic on top of the default pipeline
topic_model = BERTopic(representation_model=representation_model)
topics, _ = topic_model.fit_transform(train.text)

In [66]:
miniLM_model = BERTopic_model()
miniLM_model.topic_model = BERTopic.load('sentence-transformers-key-bigram')

In [49]:
miniLM_model.topic_model.save("sentence-transformers-key-bigram")	


In [7]:
test_topic = miniLM_model.topic_model.transform(test.text)

Batches:   0%|          | 0/35 [00:00<?, ?it/s]

365

In [54]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,1055,-1_the_and_to_it
1,0,918,0_drinks___
2,1,868,1_the_and_it_to
3,2,342,2_and_the_to_it
4,3,310,3_snacks___
5,4,252,4_chocolate___
6,5,229,5_cat___
7,6,156,6_sauce___
8,7,124,7_carbohydrates___
9,8,79,8_healthy alternatives___


In [55]:
topic_model.get_topic(1)

[('the', 0.050298565034729265),
 ('and', 0.04202089928639583),
 ('it', 0.04066654586185052),
 ('to', 0.03776439526073845),
 ('this', 0.03678709696743862),
 ('is', 0.031280962615838605),
 ('of', 0.030870238957200563),
 ('for', 0.02765312690854749),
 ('in', 0.025100899786985174),
 ('was', 0.024953398287246218)]

In [52]:
labeled_train = pd.DataFrame({'Text': train.text,'Label':topic_model.topics_})
labeled_train.loc[labeled_train['Label']==4, 'Text'].sample(n=10)

772     I purchased this at my local health food store...
820     I realize that it is not quite healthy to guzz...
1799    I purchased this lime juice a few years ago on...
2123    This is a good drink, tastes like soda, but fr...
2302    This is for sure not what I was expecting it t...
3848    I've tried other flavors of Switch carbonated ...
1781    I expected to like this drink because I like v...
1719    I like orange soda but usually cannot drink an...
3868    So far I like this cherry juice best.  I was e...
3142    I wasn't sure what to expect with this citrusy...
Name: Text, dtype: object

In [41]:
train.text[791]

'Did not like flavor.  Left significant residue in cup and flow was limited into cup.  Tried 2 and plan to return.  Very disappointing'

In [47]:
miniLM_model.topic_model.set_topic_labels({
-1: 'Outliers',
0: 'Soft Drinks',
1: 'Tea/Coffee',
2: 'Dog Food',
3: 'Sauces',
4: 'Baking Products',
5: 'Snack Bars',
6: 'Chips',
7: 'Keurig Coffee Products',
8: 'Cereal',
9: 'Noodle',
10: 'Salty Food Products',
11: 'Crackers',
12: 'Cat Food',
13: 'Coconut Products',
14: 'Popcorn',
15: 'Oil Products',
16: 'Soup',
17: 'Protein Powder',
18: 'Peanut Butter'
})

In [46]:
labeled_train = pd.DataFrame({'Text': train.text,'Label':miniLM_model.topic_model.topics_})
labeled_train.loc[labeled_train['Label']==7, 'Text'].sample(n=10)

1530    this product is hard to find for us in stores ...
2138    I was expecting a plain old cup of coffee when...
1323    The products lid stays attached to the needle ...
937     I love Sumatra coffee and was excited to recei...
2797    Although I've only been able to try three of C...
2338    I love these K cups they have a nice tea taste...
1407    I'm a pretty big coffee drinker, but had never...
2409    I enjoyed these k-cups when I used them. They ...
676     I have tried about 25 different K-Cup flavors,...
139     Caused my cuisinart k-cup brewer to leak every...
Name: Text, dtype: object

In [68]:
miniLM_model.topic_model.set_topic_labels({
-1: 'Outliers',
0: 'Staple Food/Cooking Products',
1: 'Coffee',
2: 'Tea',
3: 'Snacks',
4: 'Dog Food',
5: 'Baby Food/Canned Food',
6: 'Delivery',
7: 'Processed Food',
8: 'Cat Food',
9: 'Concentrated Syrup',
10: 'Popcorn',
11: 'Coconut Products',
12: 'Protein Powder',
13: 'Consumable Oil',
14: 'Soup'})

In [82]:
test_label = model.predict(test)
labeled_test = pd.DataFrame({'Text': test.text,'Label':test_label[0]})
lab = model.topic_model.get_topic_info().loc[:,['Topic', 'CustomName']]
labeled_test = labeled_test.merge(lab, how='left', left_on='Label', right_on='Topic')

Batches:   0%|          | 0/35 [00:00<?, ?it/s]

2023-04-05 14:48:07,900 - BERTopic - Reduced dimensionality
2023-04-05 14:48:07,969 - BERTopic - Predicted clusters


In [83]:
labeled_test.groupby('Label').count()

Unnamed: 0_level_0,Text,Topic,CustomName
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,400,400,400
0,262,262,262
1,107,107,107
2,107,107,107
3,67,67,67
4,55,55,55
5,41,41,41
6,37,37,37
7,13,13,13


In [84]:
out=labeled_test.loc[labeled_test['Label']==-1].sample(n=10, random_state= 4263)   
for i in range(8):
    out = out.append(labeled_test.loc[labeled_test['Label']==i].sample(n=10, replace=False, random_state= 4263))

In [63]:
###### Driver class
curr_dir = os.getcwd()
config_path = os.path.join(curr_dir, 'bert_topic_config.yml')
config_file = parse_config(config_path)
data_file = config_file['data_folder']
home_folder = os.path.abspath(os.path.join(os.getcwd(),'../..'))
data_df = pd.read_csv(os.path.join(home_folder,data_file))
train, test = create_datasets(data_df)
for i in range(len(config_file)-3):
    if i == 5:
        cur_model = f'model_{i}'
        model_name = config_file[cur_model]['model_name']
        embedding_model = eval(config_file[cur_model]['embedding_model'])
        dim_reduction_model = eval(config_file[cur_model]['dim_reduction_model'])
        clustering_model = eval(config_file[cur_model]['clustering_model'])
        vectorizer_params = dict(config_file[cur_model]['vectorizer_params'])    
        ctfidf_params = dict(config_file[cur_model]['ctfidf_params'])
        representation_model = eval(config_file[cur_model]['representation_model'])
        min_topic_size = int(config_file[cur_model]['min_topic_size'])
        nr_topics = config_file[cur_model]['nr_topics']
        logging_path = os.path.join(curr_dir,config_file['log_path'],f'{model_name}.log')
        image_path = os.path.join(curr_dir,config_file['image_folder'])
        ctfidf_model = ClassTfidfTransformer(bm25_weighting= ctfidf_params["bm25_weighting"], reduce_frequent_words= ctfidf_params["reduce_frequent_words"])        
        vectorizer_model = CountVectorizer(stop_words=train.stop_words_list, min_df = vectorizer_params['min_df'], max_df = vectorizer_params['min_df'],\
                                    ngram_range=(1,vectorizer_params['ngram_range']))
        logger = open(os.path.join(curr_dir, logging_path), 'w')
        custom_print('Training model',logger = logger)
        custom_print('---------------------------------\n',logger = logger)
        custom_print(f"model name: {model_name}",logger = logger)
        model = BERTopic_model(embedding_model = embedding_model,
                            dim_reduction_model=dim_reduction_model,
                            clustering_model = clustering_model,
                            vectorizer_model=vectorizer_model, 
                            ctfidf_model=ctfidf_model, 
                            representation_model=representation_model,
                            min_topic_size = min_topic_size)
        if nr_topics == 'None':
            nr_topics = None
        model.train(train, probability=False, nr_topics=nr_topics)
        custom_print(f'Coherence score: {model.evaluate(train)}', logger=logger)
        custom_print(f'{model.topic_model.get_topic_info()}', logger=logger)

        for i in range(len(model.topic_model.topic_labels_)-1):
            custom_print(f'Words and score of topic {i}:\n {model.topic_model.get_topic(i)}',
                            logger=logger)
        fig = model.topic_model.visualize_documents(train.text)
        fig.write_html(f"{image_path}{model_name}_doc_viz.html")
        model.topic_model.save(f"{model_name}")	

logger.close()

Training model
---------------------------------

model name: zero-shot


Batches:   0%|          | 0/137 [00:00<?, ?it/s]

2023-04-05 14:18:35,872 - BERTopic - Transformed documents to Embeddings
2023-04-05 14:18:40,406 - BERTopic - Reduced dimensionality
2023-04-05 14:18:40,581 - BERTopic - Clustered reduced embeddings

Mean of empty slice.


invalid value encountered in double_scalars



Coherence score: nan
    Topic  Count                                               Name
0      -1   1291                -1_rinds_pork rinds_lentils_similac
1       0    288                                        0_drinks___
2       1    282                                        1_drinks___
3       2    265                                        2_snacks___
4       3    167                                     3_household___
5       4    146                                           4_dog___
6       5    127                                         5_sauce___
7       6    112           6_bars good_banana nut_bars ve_kind bars
8       7    112                                        7_drinks___
9       8    108  8_salt vinegar_lime flavoring_chips really_reg...
10      9    107                                 9_carbohydrates___
11     10    106                                10_carbohydrates___
12     11    106                         11_healthy alternatives___
13     12    102           

In [65]:
topics_to_merge= [[0,1,7,12,13,21,28], [2,14,22,8,20], [3,25,31],[5,23],[9,10,16,19],[6,11,15,24,26,17],
                  [-1,27,29,30]]
model.topic_model.merge_topics(train.text, topics_to_merge)

In [80]:
model.topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,CustomName
0,-1,1406,-1_rinds_pork rinds_lentils_similac,Outliers
1,0,972,0_drinks___,Drinks
2,1,563,1_chicken jerky_movie_products china_theater,Snacks
3,2,452,2_healthy alternatives___,Healthy Alternatives
4,3,340,3_carbohydrates___,Carbohydrates
5,4,244,4_household___,Household
6,5,174,5_sauce___,Sauce
7,6,146,6_dog___,Dogs
8,7,58,7_cat___,Cats


In [79]:
model.topic_model.set_topic_labels({-1: "Outliers", 0: "Drinks", 1: "Snacks", 2:"Healthy Alternatives",
                                    3:"Carbohydrates", 4:"Household", 5:"Sauce", 6:'Dogs', 7:"Cats"})

In [85]:
out.drop(columns=['Label', 'Topic'], inplace=True)

In [87]:
out.to_csv(f'zero-shot.csv', index=False)

In [44]:
topics_over_time = model.topic_model.topics_over_time(docs=train.text, 
                                                timestamps=train.date, 
                                                global_tuning=True, 
                                                evolution_tuning=True, 
                                                nr_bins=20)
model.topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=15)

In [88]:
# Save model
model.topic_model.save("zero-shot")	

In [28]:
# Load model
topic_model = BERTopic.load("bertopic_model")	