**Set-up**

In [None]:
#Install libraries and then go to Runtime->Restart Runtime
#No need to re-run this cell after restarting runtime
#Also it's recommended to go to Edit->Notebook Setting and turn on GPU accelerator
!pip install bertopic
!pip install --upgrade joblib==1.1.0
!pip install keybert

In [None]:
#Load the award history - the tsv.gz file must be uploaded to Colab first
import pandas as pd

#NOTE(review): the file name says TSV but no `sep` is passed, so pandas parses it
#as comma-separated - confirm the delimiter of the uploaded file.
df = pd.read_csv('sbir_sttr_history.tsv.gz')

#name_award and description_award are currently joined into a single text feature.
#BERTopic reportedly works best with sentences or paragraphs rather than multiple
#paragraphs, so this approach will probably need to change.
award_name = df['name_award']
award_description = df['description_award']
df['text_feature'] = award_name + ' ' + award_description
docs = df['text_feature']

In [None]:
#Write the full frame (including the combined text feature) to disk
df.to_csv(path_or_buf='sheldon.csv')

In [None]:
#Remove records that don't have a text feature, as BERTopic doesn't work with those.
#The list is rebuilt from df['text_feature'] (not from `docs` itself) so the cell can be
#re-run safely: calling .tolist() on `docs` fails once `docs` has already become a list.
#dropna() removes the missing values directly instead of comparing str(x) to 'nan'.
docs = df['text_feature'].dropna().tolist()

In [None]:
#To run the model efficiently, draw a fixed-size random sample of the documents
import random

SAMPLE_SIZE = 20000
SAMPLE_SEED = 42  #fixed seed so the same sample is drawn on every run

#min() keeps random.sample from raising ValueError when fewer documents are available
docs_20000 = random.Random(SAMPLE_SEED).sample(docs, min(SAMPLE_SIZE, len(docs)))

In [None]:
#Sanity check: display the type of the sampled collection
type(docs_20000)

In [None]:
#Store the sampled documents in a one-column frame and write it to disk
sample_docs = pd.DataFrame(data={"Doc": docs_20000})
sample_docs.to_csv(path_or_buf='sample_docs.csv')

**Train the Model**

In [None]:
#Extend scikit-learn's English stop words with words common in SBIR awards
from sklearn.feature_extraction import text

my_words = [
    'phase',
    'data',
    'technology',
    'system',
    'high',
    'develop',
    'development',
    'project',
    'use',
    'design',
]
#Built-in stop words first, then the SBIR-specific additions
stop_words = [*text.ENGLISH_STOP_WORDS, *my_words]


In [None]:
#Build a vocabulary from KeyBERT keywords
from keybert import KeyBERT

#Extract keywords for the documents in `docs`
kw_model = KeyBERT()
keywords = kw_model.extract_keywords(docs)

#Take the first element of every keyword entry; the set drops duplicates
vocabulary = list({entry[0] for doc_keywords in keywords for entry in doc_keywords})

In [None]:
#Sub-models that support online (batch-by-batch) learning
from bertopic.vectorizers import OnlineCountVectorizer
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import IncrementalPCA

#Holds an IncrementalPCA; it is named umap_model because it is passed to
#BERTopic's umap_model argument
umap_model = IncrementalPCA(n_components=5)

cluster_model = MiniBatchKMeans(random_state=0, n_clusters=50)

vectorizer_model = OnlineCountVectorizer(
    vocabulary=vocabulary,
    stop_words=stop_words,
    ngram_range=(1, 3),
    decay=.01,
)

In [None]:
#Build the BERTopic instance from the online-learning sub-models.
#BERTopic is imported here because no earlier cell imports it; without this line the
#cell raises NameError when the notebook is run top to bottom on a fresh runtime.
from bertopic import BERTopic

topic_model = BERTopic(umap_model=umap_model,
                       hdbscan_model=cluster_model,
                       vectorizer_model=vectorizer_model)

In [None]:
#BERTopic instance that clusters with HDBSCAN and uses a plain CountVectorizer.
#NOTE(review): this rebinds `topic_model` and `vectorizer_model`, replacing the
#online-learning objects created earlier - confirm which model the cells that
#follow are meant to use.
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic

hdbscan_model = HDBSCAN(
    min_cluster_size=10,
    min_samples=5,
    metric='euclidean',
    cluster_selection_method='leaf',
    prediction_data=True,
)

vectorizer_model = CountVectorizer(
    vocabulary=vocabulary,
    stop_words=stop_words,
    ngram_range=(1, 3),
)

#NOTE(review): `diversity` may not be accepted by newer BERTopic releases -
#check against the installed version.
topic_model = BERTopic(
    language="english",
    nr_topics='auto',
    diversity=0.2,
    calculate_probabilities=True,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    verbose=True,
)


In [None]:
!pip install --upgrade numba

In [None]:
#Train the model incrementally, one batch of documents at a time
import time

BATCH_SIZE = 1000
#Number of documents at the end of `docs` that the loop never reaches.
#NOTE(review): if len(docs) <= HOLDOUT_DOCS the loop runs zero times - confirm the intended cut-off.
HOLDOUT_DOCS = 150000

#NOTE(review): partial_fit is called on `topic_model`, so it must be the instance
#built from the online-learning sub-models - confirm it was not rebound in between.
batch_starts = range(0, len(docs) - HOLDOUT_DOCS, BATCH_SIZE)

#The counter is called round_number so the built-in round() is not shadowed
for round_number, batch_start in enumerate(batch_starts, start=1):
    started = time.perf_counter()
    topic_model.partial_fit(docs[batch_start:batch_start + BATCH_SIZE])
    #The model is saved after every batch (presumably as a checkpoint)
    topic_model.save('bertopic_model')
    elapsed = time.perf_counter() - started
    print(f'Round {round_number}: {elapsed}')

In [None]:
from bertopic import BERTopic
!pip install --upgrade numba
#Save model
topic_model.save('bertopic_002')

In [None]:
#Reload the saved model; the imports are repeated here, presumably so the cell
#can be run on its own in a fresh runtime
import pandas as pd
from keybert import KeyBERT
from bertopic import BERTopic

#Load model
topic_model = BERTopic.load(path='bertopic_002')

**Analyze Output**

In [None]:
#Summary table of all topics (columns Topic, Count, Name).
#Topic -1, when present, holds the unclustered outliers.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,279,0_instrument_materials_nosetip_properties
1,1,562,1_treatment_tb_ds_vasospasm
2,2,550,2_blood_flow_tissue_naloxone
3,3,153,3_guidance_launch_reusable launch vehicles_reu...
4,4,359,4_cells_delivery_skin_chiral
5,5,120,5_systems_failure_models_prognostic
6,6,273,6_coating_coatings_vent_trucks
7,7,404,7_stroke_cerebral_ischemic stroke_ischemic
8,8,173,8_hd_screening_irritability_ptsd
9,9,332,9_raster_reality_support_users


In [None]:
#Inspect the top n words (with their scores) of a single topic.
#NOTE(review): the recorded output for topic 0 contains only empty strings with a
#score of 1e-05 - check the vectorizer vocabulary / stop-word set-up.
topic_model.get_topic(0)

[('', 1e-05),
 ('', 1e-05),
 ('', 1e-05),
 ('', 1e-05),
 ('', 1e-05),
 ('', 1e-05),
 ('', 1e-05),
 ('', 1e-05),
 ('', 1e-05),
 ('', 1e-05)]

In [None]:
#Build the per-topic bar chart and save it as an HTML file
bar = topic_model.visualize_barchart()
bar.write_html(file="topic_barchart.html")


In [None]:
#Document distance map: embed the documents, project to 2-D, then plot them
from umap import UMAP
from sentence_transformers import SentenceTransformer

#Sentence embeddings for everything in `docs`
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs, show_progress_bar=False)

#Reduce the embeddings to two components for plotting
reducer_2d = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine')
reduced_embeddings = reducer_2d.fit_transform(embeddings)

#Plot the documents and save the figure as an HTML file
doc = topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)
doc.write_html("document_map.html")


In [None]:
# BERTopic model that uses cuML's GPU UMAP. J.D. can run it over all docs.
# NOTE(review): the instance is named cpu_model although it is described as a
# GPU model - confirm the intended name.

from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
# The GPU HDBSCAN import is left disabled:
#from cuml.cluster import HDBSCAN
# This import rebinds the name UMAP to cuML's implementation
from cuml.manifold import UMAP

# GPU-accelerated UMAP; the matching HDBSCAN instance is left disabled:
#hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True, prediction_data=True)
umap_model = UMAP(n_neighbors=30, n_components=5, min_dist=0.0)

# Unlike the earlier vectorizers, no fixed vocabulary is passed here
vectorizer_model = CountVectorizer(stop_words=stop_words, ngram_range=(1, 3))

# Hand the sub-models to BERTopic (hdbscan_model stays disabled, so none is passed)
cpu_model = BERTopic(
    language='English',
    nr_topics='auto',
    min_topic_size=30,
    diversity=0.2,
    calculate_probabilities=True,
    low_memory=True,
    umap_model=umap_model,
    vectorizer_model=vectorizer_model,
    #hdbscan_model=hdbscan_model,
    verbose=True,
)

In [None]:
#Visualize the topic clusters in 2D (the figure is the cell's output)
topic_model.visualize_topics()

In [None]:
#Visualize hierarchical clustering: compute the topic hierarchy from `docs`,
#then plot it.
#NOTE(review): a ValueError is recorded as this cell's output - presumably `docs`
#does not line up with the documents the model was fitted on; confirm before re-running.
hierarchical_topics = topic_model.hierarchical_topics(docs)
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

ValueError: ignored

In [None]:
#Print the topic tree built from the hierarchy computed in the hierarchical clustering cell
tree = topic_model.get_topic_tree(hierarchical_topics)
print(tree)


**Export Topic Assignments to Spreadsheet**

In [None]:
#Look up the label of each document's assigned topic.
#`topics` is read from the model so this cell does not depend on a variable left
#over from an earlier session (no other cell in the notebook defines it).
#NOTE(review): topics_ covers the documents the model was last fitted on, which may
#not line up one-to-one with `docs` - confirm before joining them.
topics = topic_model.topics_
topic_labels = [topic_model.topic_labels_[topic] for topic in topics]

In [None]:
#One row per document: its text, assigned topic id and topic label.
#NOTE(review): a ValueError is recorded as this cell's output - presumably `docs`,
#`topics` and `topic_labels` differ in length; confirm they cover the same documents.
results = pd.DataFrame({"Doc": docs, "Topic": topics, "Label": topic_labels})

ValueError: ignored

In [None]:
#Write the per-document topic assignments to disk
results.to_csv(path_or_buf='topics.csv')

In [None]:
#Write the topic hierarchy table to disk
hierarchical_topics.to_csv(path_or_buf='hierarchical_topics.csv')