In [1]:
!pip install bertopic
import pandas as pd
import re
import nltk
from bertopic import BERTopic
import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
import torch
# Pick the compute device: CUDA when a GPU is visible to torch, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    print("No GPU available, using CPU.")

Collecting bertopic
  Downloading bertopic-0.15.0-py2.py3-none-any.whl (143 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/143.4 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m96.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting umap-learn>=0.5.0
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.2/88.2 kB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting plotly>=4.7.0
  Downloading plotly-5.16.1-py2.py3-none-any.whl (15.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.6/15.6 MB[0m [31m86

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


GPU: NVIDIA A100-SXM4-80GB


## Data

In [30]:
import os

# Show which files are present in the cleaned Kickstarter dataset folder.
cleaned_dataset_dir = '../datasets/kickstarter_cleaned'
os.listdir(cleaned_dataset_dir)

[]

In [None]:
# Load the cleaned Kickstarter projects prepared for topic modelling.
kickstarter_csv_path = '../datasets/kickstarter_cleaned/kickstarter_cleaned_topicmodelling.csv'
df_kickstarter = pd.read_csv(kickstarter_csv_path)

In [None]:
# Keep only the projects whose 'num_words' value is at most 2000.
within_word_limit = df_kickstarter['num_words'] <= 2000
df_kickstarter = df_kickstarter[within_word_limit]

In [None]:
# Persist the filtered frame back to the topic-modelling CSV.
# index=False: the file is loaded with a plain pd.read_csv, so writing the
# index would add a spurious "Unnamed: 0" column on every save/load round trip.
# NOTE(review): this overwrites the very file the notebook loads from; consider
# a separate output path so the unfiltered input stays recoverable.
df_kickstarter.to_csv('../datasets/kickstarter_cleaned/kickstarter_cleaned_topicmodelling.csv', index=False)

# Functions

In [None]:
# Optional step: the embeddings are probably better when this is NOT applied.
def preprocess_text(document: str) -> str:
    """Normalise a document to lower-cased, space-separated word characters.

    Every non-word character (anything `\\W` matches) is replaced by a space,
    runs of whitespace are collapsed into a single space, and the result is
    lower-cased. Leading and trailing spaces are not stripped.

    Args:
        document: Raw text to normalise.

    Returns:
        The normalised text.
    """
    without_special_chars = re.sub(r'\W', ' ', document)
    single_spaced = re.sub(r'\s+', ' ', without_special_chars)
    return single_spaced.lower()

# BERTopic

In [42]:
# Work on an independent (deep) copy so df_kickstarter itself stays untouched.
df_bertopic_kickstarter = df_kickstarter.copy(deep=True)

In [43]:
# Keep only the columns needed for topic modelling; every other column is dropped.
# drop() returns a new frame here instead of mutating the existing one in place.
columns_to_keep = ['project_description', 'project_category_id', 'project_parent_category_id', 'project_state', 'project_launched_at']
df_bertopic_kickstarter = df_bertopic_kickstarter.drop(columns=df_bertopic_kickstarter.columns.difference(columns_to_keep))

### Technology Category

In [44]:
# Start the Technology subset from an independent (deep) copy of the reduced frame.
df_bertopic_kickstarter_technology = df_bertopic_kickstarter.copy(deep=True)

In [45]:
# Keep projects whose category, or whose parent category, is Technology (id 16).
is_technology_category = df_bertopic_kickstarter_technology['project_category_id'] == 16
has_technology_parent = df_bertopic_kickstarter_technology['project_parent_category_id'] == 16
df_bertopic_kickstarter_technology = df_bertopic_kickstarter_technology[is_technology_category | has_technology_parent]

In [46]:
# Project descriptions of the Technology subset as a plain Python list, in row order.
docs = df_bertopic_kickstarter_technology['project_description'].to_list()

In [None]:
from sentence_transformers import SentenceTransformer

# Author's note: this model was chosen for its 512-token input limit, whereas
# the default all-MiniLM-L6-v2 is limited to 256 tokens — TODO confirm both
# limits against the model cards.
embedding_model_name = "all-distilroberta-v1"
sentence_model = SentenceTransformer(embedding_model_name)

# Encode every document once so the embeddings can be reused.
embeddings = sentence_model.encode(docs, show_progress_bar=True)

In [8]:
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN

# Only form topics (clusters) with 15 documents or more.
# prediction_data=True: BERTopic needs HDBSCAN's prediction data to build the
# per-topic membership vectors requested via calculate_probabilities=True (and
# to assign new documents later); a custom HDBSCAN model does not generate it
# unless asked to.
hdbscan_model = HDBSCAN(min_cluster_size=15, prediction_data=True)

# Terms to exclude from the topic representations: embedded video-player
# boilerplate found in the scraped descriptions.
# NOTE(review): scikit-learn applies stop_words to single tokens before the
# n-grams are assembled, so the two-word entries probably filter nothing and
# only '0000' takes effect — confirm, and consider stripping the boilerplate
# from the documents instead.
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=(['0000', 'with sound', 'play replay', 'replay with', 'capable browser', 'sound 0000', 'html5 capable', 'content play', '0000 0000', 'sound play']))

# reduce_frequent_words=True reduces the impact of very frequent words in the
# c-TF-IDF weighting (it does not remove them). To drop English stop words
# outright, CountVectorizer(stop_words="english") is the alternative.
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# No embedding_model is passed, so BERTopic falls back to its default embedding
# model; add embedding_model=sentence_model to use the custom one.
# NOTE(review): n_gram_range is documented as unused when a vectorizer_model is
# supplied, so it is redundant here — confirm before removing.
# NOTE(review): no random seed is fixed anywhere, so topic ids may differ
# between runs.
topic_model = BERTopic(hdbscan_model=hdbscan_model, vectorizer_model=vectorizer_model, ctfidf_model=ctfidf_model, language="english", calculate_probabilities=True, verbose=True, n_gram_range=(1, 2))

# Pass embeddings=embeddings as an extra argument to reuse precomputed custom embeddings.
topics, probs = topic_model.fit_transform(docs)

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/843 [00:00<?, ?it/s]

2023-09-13 14:38:32,417 - BERTopic - Transformed documents to Embeddings
2023-09-13 14:39:10,944 - BERTopic - Reduced dimensionality


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

2023-09-13 14:40:36,726 - BERTopic - Clustered reduced embeddings


### Visualization of Topics

In [None]:
# Plot an overview of the topics found by the fitted model.
topic_model.visualize_topics()

In [None]:
# Plot the topic hierarchy, limited to at most 300 topics.
topic_model.visualize_hierarchy(top_n_topics=300)

In [None]:
# Plot bar charts of the top terms, limited to at most 300 topics.
topic_model.visualize_barchart(top_n_topics=300)

In [12]:
# Per-topic summary table (topic id, size, name, representation, example docs).
freq = topic_model.get_topic_info()

# Display up to the first 300 rows.
freq.head(300)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,11012,-1_my_project_phone_would,"[my, project, phone, would, work, people, will...",[I (Michael) have been a photography instructo...
1,0,638,0_light_led_lighting_lights,"[light, led, lighting, lights, leds, the light...",[Tired of tripping over things in the dark at ...
2,1,497,1_social_social media_events_event,"[social, social media, events, event, travel, ...",[IMPORTANT UPDATE: Our campaign has moved! Ch...
3,2,487,2_security_files_encryption_passwords,"[security, files, encryption, passwords, passw...","[We re-invented how your devices work Today, m..."
4,3,481,3_golf_sports_swing_players,"[golf, sports, swing, players, ball, athletes,...",[I'm on a mission to turn analytics into a too...
...,...,...,...,...,...
280,279,10,279_touch easel_easel_touch_playsurface,"[touch easel, easel, touch, playsurface, easyt...",[Sonzia presents Touch Easel: The School Desk ...
281,280,10,280_komunity_amal_and neil_life groups,"[komunity, amal, and neil, life groups, neil, ...",[Kindness Begins with Kom-Unity\n PLAY\n\nThe ...
282,281,10,281_paint_paint can_roller covers_the paint,"[paint, paint can, roller covers, the paint, r...",[ Product Info:The Paint Can Holder was design...
283,282,10,282_moto_tramigo_swagon_tramigo moto,"[moto, tramigo, swagon, tramigo moto, moto clo...","[FINALLY, A PRO-ACTIVE WAY TO HAVE A SAFER BIK..."


In [19]:
# Inspect the top terms of topic 15. In the recorded run they are video-player
# boilerplate ('html5 capable', 'replay with', '0000', ...), i.e. scraping
# noise rather than a genuine topic.
topic_model.get_topic(15, full=True)

{'Main': [('0000', 0.14969733203398805),
  ('with sound', 0.14967371210649202),
  ('play replay', 0.13543283793945604),
  ('replay with', 0.1354268425235859),
  ('capable browser', 0.13542084797498574),
  ('sound 0000', 0.13542084797498574),
  ('html5 capable', 0.13542084797498574),
  ('content play', 0.13541485429341663),
  ('0000 0000', 0.13541485429341663),
  ('sound play', 0.13537292277925492)]}

In [15]:
# Show the representative documents of topic 15 to see where its terms come from.
topic_model.get_representative_docs(15)

['So What Is Movyabox? Movyabox is a self-contained, portable streaming media server which allows users to easily load, take and stream their media collections (including both music and video content) at home or on the move. Simply put, a media server is a device that stores and shares media content of multiple types. Movyabox allows the user to play their content on up to six devices regardless of the operating platform of the device (smartphone, tablet, PC or Mac).         While at their homes, users can connect Movyabox to their TV, allowing them to watch streamed content from such sites as Netflix, HBO, and others, but also from content that has been downloaded to the device.         Movyabox at Home         You\'ll need an HTML5 capable browser to see this content.       Play                                   Replay with sound          Play with \r\n sound             00:00   00:00                                       Some additional screen shots from our TV interface    courtesy

### Dynamic Topic Modelling

In [25]:
from datetime import datetime

In [57]:
# Parse the launch dates, then convert each one to a POSIX timestamp (float
# seconds) so the list lines up one-to-one with `docs`.
launch_datetimes = pd.to_datetime(df_bertopic_kickstarter_technology['project_launched_at'])
timestamps = [datetime.timestamp(launched_at) for launched_at in launch_datetimes]

In [48]:
# Number of rows in the Technology subset (should match len(docs) and len(timestamps)).
df_bertopic_kickstarter_technology.shape[0]

26976

In [49]:
# Sanity check: there must be one timestamp per document.
len(timestamps)

26976

In [50]:
# Sanity check: number of documents passed to the topic model.
len(docs)

26976

In [54]:
# Spot-check the second document's raw text.
docs[1]

"I've begun prototyping a low-cost LED light for the Kindle. I'd like the finished product to be something that I can offer as a DIY kit for end users to purchase with parts + instructions, and I'll post how to do it on your own without the kit on my website too. So far, I've just been toying with what parts to use, how to position for good light distribution, etc. Here's a flickr set of the first very basic progression of prototyping to date:\r\n I've also posted about the process on my blog:\r\n\r\n For the next steps, I want to add a switch, move the battery to the back of the Kindle, and make the building of the LED light less of a bend and tape operation and a more polished LED kit. In order to make this next step I could use some money to buy a bunch of parts and tools for prototyping (I need a new soldering iron, wires, heat shrink tubing, etc.). If and when I get it to the level where I begin offering the LED kit to the public, I'll give all donors of this successfully funded p

In [59]:
# Track the fitted topics over time, grouping the launch timestamps into 20 bins.
# NOTE(review): the recorded run of this cell was interrupted (KeyboardInterrupt
# after ~46 s per iteration), so `topics_over_time` may be undefined for the
# plotting cells that follow.
topics_over_time = topic_model.topics_over_time(docs, timestamps, nr_bins=20)

3it [02:18, 46.06s/it]


KeyboardInterrupt: 

In [None]:
# Plot how the topics evolve over time, limited to 20 topics.
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20)

In [None]:
# Plot the evolution over time of a hand-picked set of topic ids.
topic_model.visualize_topics_over_time(topics_over_time, topics=[9, 10, 72, 83, 87, 91])

### Topic Document DataFrame

In [None]:
# Build the per-document table of topic information for every entry in `docs`.
document_info = topic_model.get_document_info(docs)