In [1]:
import os
import numpy as np
import json

In [None]:
document_storage = 'documents'

In [4]:
props = np.load('documents/newsgroups.npy')

In [5]:
with open('documents/newsgroups.json') as f:
    docs = json.load(f) 

### 3.  Saving Model outputs
Before we move on to fine-tuning the topic representations, we will first store the results of our model fitting for future use. BERTopic provides 3 methods for this serialization process: safetensors, pytorch and pickle (documentation of this is available [in the documentation.](https://maartengr.github.io/BERTopic/getting_started/serialization/serialization.html).  

For our demonstration we will use the .safetensors method.

**Step 1: Explicitily state the embedding model used to generate the model.  For the default this is the sentence transformers 'all-MiniLM-L6-v2'.**

In [10]:
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"

**Step 2: Define the path to where you want to save the models.  In this case we will use the cookbook_models folder provided** 

In [11]:
model_storage_folder = 'cookbook_models'
model_name = 'newsgroups_default_model'
model_storage_path = os.path.join(model_storage_folder, model_name)

**Step 3: Save the model**

In [12]:
newsgroups_default_model.save(model_storage_path, serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)

**Load and check model**
If you navigate in the folder directory at left you can see your saved files, and you can now load the model from these files directly.  We will run the code to do that below and look at what we produce.  

In [13]:
# Load model
newsgroups_default_model_saved = BERTopic.load(model_storage_path)

In [14]:
# Look at topics
newsgroups_default_model_saved.get_topic_info().head()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,6422,-1_to_the_is_you,"[to, the, is, you, of, and, it, in, for, that]",
1,0,1836,0_game_team_games_he,"[game, team, games, he, players, season, hocke...",
2,1,560,1_key_clipper_chip_encryption,"[key, clipper, chip, encryption, keys, escrow,...",
3,2,524,2_ites_cheek_yep_huh,"[ites, cheek, yep, huh, ken, why, each, of, , ]",
4,3,471,3_fbi_batf_fire_koresh,"[fbi, batf, fire, koresh, compound, they, gas,...",


**Step 4: Save topic info with representative documents**
Looking at the data we see that the Representative_Docs were **NOT** saved in the default.  This is intentional on BERTopics side for data privacy reasons.  However those representative documents are necessary inputs if you want to use Large Language Models (LLM)s to generate sentence representations of your topics.  For this purpose we will save the topic_info dataframe explicitly.

In [17]:
# Set topics info as dataframe
topics_info_dataframe = newsgroups_default_model.get_topic_info()

# Set path to stored filein the same directory as the model. We are using csv for ease in demo, but other file options are available
topic_info_path = os.path.join(model_storage_path, 'topic_info.csv')

# Store Topics Information as a csv file 
topics_info_dataframe.to_csv(topic_info_path)

### 4.  Loading Saved Model outputs

We have already loaded the saved model in step 3 above when we checked it.  We will now load the topic info from the csv and merge this back into the loaded model.

In [21]:
newsgroups_default_model_saved_topics_info = pd.read_csv(topic_info_path)

In [22]:
newsgroups_default_model_saved_topics_info.head()

Unnamed: 0.1,Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,-1,6422,-1_to_the_is_you,"['to', 'the', 'is', 'you', 'of', 'and', 'it', ...",['Why should anyone (check: let\'s restrict th...
1,1,0,1836,0_game_team_games_he,"['game', 'team', 'games', 'he', 'players', 'se...","[""\nWales Conference, Adams Division, Semifina..."
2,2,1,560,1_key_clipper_chip_encryption,"['key', 'clipper', 'chip', 'encryption', 'keys...",['The following document summarizes the Clippe...
3,3,2,524,2_ites_cheek_yep_huh,"['ites', 'cheek', 'yep', 'huh', 'ken', 'why', ...","['\nHuh?', '\nYep.\n', '\n \n ..."
4,4,3,471,3_fbi_batf_fire_koresh,"['fbi', 'batf', 'fire', 'koresh', 'compound', ...","['Folks,\n\nIt\'s time to start building some ..."


In [35]:
newsgroups_default_model_saved.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,6422,-1_to_the_is_you,"[to, the, is, you, of, and, it, in, for, that]",
1,0,1836,0_game_team_games_he,"[game, team, games, he, players, season, hocke...",
2,1,560,1_key_clipper_chip_encryption,"[key, clipper, chip, encryption, keys, escrow,...",
3,2,524,2_ites_cheek_yep_huh,"[ites, cheek, yep, huh, ken, why, each, of, , ]",
4,3,471,3_fbi_batf_fire_koresh,"[fbi, batf, fire, koresh, compound, they, gas,...",
...,...,...,...,...,...
205,204,11,204_usenet_advertising_internet_commercial,"[usenet, advertising, internet, commercial, co...",
206,205,10,205_memory_shared_server_pixmaps,"[memory, shared, server, pixmaps, xputimage, e...",
207,206,10,206_ca_bbs_818_805,"[ca, bbs, 818, 805, 408, il, valley, 310, chic...",
208,207,10,207_freedom_speech_notre_dame,"[freedom, speech, notre, dame, speeches, speac...",


In [27]:
rep_docs_path = os.path.join(model_storage_path, 'representative_docs.json')

In [29]:
import json
with open(rep_docs_path, 'w', encoding='utf-8') as f:
    json.dump(newsgroups_default_model.representative_docs_, f)

In [32]:

with open(rep_docs_path, 'r', encoding='utf-8') as fr:
    rep_docs = json.load(fr)

In [34]:
newsgroups_default_model_saved.representative_docs_ = rep_docs

In [38]:
if newsgroups_default_model_saved.representative_docs_ is not None:
    print('here')
else:
    print('nope')

here


In [39]:
newsgroups_default_model_saved.get_topic_info().head()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,6422,-1_to_the_is_you,"[to, the, is, you, of, and, it, in, for, that]",
1,0,1836,0_game_team_games_he,"[game, team, games, he, players, season, hocke...",
2,1,560,1_key_clipper_chip_encryption,"[key, clipper, chip, encryption, keys, escrow,...",
3,2,524,2_ites_cheek_yep_huh,"[ites, cheek, yep, huh, ken, why, each, of, , ]",
4,3,471,3_fbi_batf_fire_koresh,"[fbi, batf, fire, koresh, compound, they, gas,...",


In [46]:
rep_docs = newsgroups_default_model.representative_docs_

In [47]:
rep_docs ==newsgroups_default_model.representative_docs_

True

In [48]:
import json

with open('saved_dict.json', 'w') as f:
  json.dump(newsgroups_default_model.representative_docs_, f)

with open('saved_dict.json') as f:
  loaded_dict = json.load(f)


In [23]:
# Method 1 - safetensors
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
base_topic_model.save("models/base_topic_model", serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)


In [24]:
# Load model
base_saved = BERTopic.load("models/base_topic_model")

In [25]:
base_saved.get_topic_info().head()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,6916,-1_to_the_is_of,"[to, the, is, of, and, you, for, in, it, that]",
1,0,1827,0_game_team_games_he,"[game, team, games, he, players, season, hocke...",
2,1,569,1_key_clipper_chip_encryption,"[key, clipper, chip, encryption, keys, escrow,...",
3,2,526,2_ites_cheek_yep_huh,"[ites, cheek, yep, huh, ken, ignore, forget, w...",
4,3,483,3_israel_israeli_jews_arab,"[israel, israeli, jews, arab, jewish, arabs, p...",


In [16]:
# Method 2 - pytorch
base_topic_model.save("models/base_topic_model_pytorch", serialization="pytorch", save_ctfidf=True, save_embedding_model=embedding_model)


In [18]:
# Load model
base_saved_pytorch = BERTopic.load("models/base_topic_model_pytorch")

In [21]:
base_saved_pytorch.get_topic_info().head()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,6916,-1_to_the_is_of,"[to, the, is, of, and, you, for, in, it, that]",
1,0,1827,0_game_team_games_he,"[game, team, games, he, players, season, hocke...",
2,1,569,1_key_clipper_chip_encryption,"[key, clipper, chip, encryption, keys, escrow,...",
3,2,526,2_ites_cheek_yep_huh,"[ites, cheek, yep, huh, ken, ignore, forget, w...",
4,3,483,3_israel_israeli_jews_arab,"[israel, israeli, jews, arab, jewish, arabs, p...",
