# Imports

In [36]:
#!pip install pyLDAvis
#!pip install visdom
import pandas as pd
import nltk
import re
import gensim
import gensim.corpora as corpora
from gensim.models import Phrases
from nltk.corpus import stopwords
from tqdm.notebook import tqdm
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
from sklearn.feature_extraction.text import CountVectorizer
tqdm.pandas()

# Data

In [37]:
#from google.colab import drive
#drive.mount('/content/drive')
df_lda_kickstarter = pd.read_csv('../kickstarter_cleaned.csv')

# Functions

In [38]:
def preprocess_text(document: str, stemmer: nltk.stem.WordNetLemmatizer, en_stop: set) -> list:
    """Preprocesses a document to remove special characters/whitespace/etc

    Args:
        document (str):
        stemmer (nltk.stem.WordNetLemmatizer): Stemmer from NLTK
        en_stop (set): Set of stop words, usually from NLTK

    Returns:
        str: preprocessed document
    """

    # Remove all the special characters
    document = re.sub(r'\W', ' ', document)

    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)

    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)

    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)

    # Converting to Lowercase
    document = document.lower()

    # Tokenization
    tokens = nltk.word_tokenize(document)

    # POS-Tagging
    tagged_tokens = nltk.pos_tag(tokens)

    # Filter Nouns and Lemmatization
    lemmatized_nouns = []
    for token, pos in tagged_tokens:
        if pos.startswith('N'):  # Check if the token is a noun
            lemma = stemmer.lemmatize(token)
            # FIlter stop words, words that contain only numbers and short words
            if lemma not in en_stop and not lemma.isdigit() and len(lemma) > 2:
                lemmatized_nouns.append(lemma)

    return lemmatized_nouns;

In [39]:
def preprocess_text_helper(t):
    stemmer = nltk.stem.WordNetLemmatizer()
    return preprocess_text(t, stemmer, stopwords.words('english'))

In [40]:
# Create function to calculate topic distribution
def get_topic_distribution(text, dictionary, model):
    bow = dictionary.doc2bow(text)
    topic_probs = model.get_document_topics(bow)
    return topic_probs

In [41]:
# Create a function to extract the probability for the chosen topic
def get_topic_probability(topic_dist, chosen_topic):
    for topic, prob in topic_dist:
        if topic == chosen_topic:
            return prob
    return 0  # Return 0 if the chosen topic is not found

In [105]:
# Create a function to extract the top n descriptions corresponding to a topic
def get_top_descriptions(df, chosen_topic, n, model):

    # Calculate topic distribution for projects
    #df['topic_distribution'] = df['processed_description'].progress_apply(lambda x: get_topic_distribution(x, dictionary, model))   
    
    # Add a new column with the probability for the chosen topic
    df['selected_topic_probability'] = df['topic_distribution'].progress_apply(lambda x: get_topic_probability(x, chosen_topic))

    # Sort the DataFrame based on the probability for the chosen topic
    sorted_df = df.sort_values(by='selected_topic_probability', ascending=False)

    # Print the top N project descriptions for the chosen topic
    top_n = n  # Number of top project descriptions to print
    print(model.show_topic(chosen_topic, topn=10))
    for i, (_, row) in enumerate(sorted_df.head(top_n).iterrows()):
        original_index = row.name
        print(f"Document {original_index}:")
        print(row['topic_distribution'])
        print("Topic Probability:", row['selected_topic_probability'])
        print("Project Description:", row['project_description'])
        print("processed Description:", row['processed_description'])
        print("\n")


# LDA

In [43]:
# Remove the columns
df_lda_kickstarter.drop(df_lda_kickstarter.columns.difference(['project_description', 'project_category_id', 'project_parent_category_id', 'project_state']), axis=1,inplace=True)

### Technology Category

In [44]:
df_lda_kickstarter_technology = df_lda_kickstarter.copy()

In [45]:
# Filter Dataframe by Technology category (number 16)
df_lda_kickstarter_technology = df_lda_kickstarter_technology[(df_lda_kickstarter_technology['project_category_id'] == 16) | (df_lda_kickstarter_technology['project_parent_category_id'] == 16)]

In [46]:
# Preprocessing general steps
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
df_lda_kickstarter_technology['processed_description'] = df_lda_kickstarter_technology['project_description'].progress_apply(preprocess_text_helper)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\CoolerMaster\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\CoolerMaster\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\CoolerMaster\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


  0%|          | 0/26976 [00:00<?, ?it/s]

In [None]:
data_technology = df_lda_kickstarter_technology['processed_description'].tolist()
# Compute n-grams
from gensim.models import Phrases

# Add n-grams to docs (only ones that appear 20 times or more).
ngrams_technology = Phrases(data_technology, min_count=20)

# Create an iterator for df_lda_kickstarter_technology
df_iterator = df_lda_kickstarter_technology.iterrows()

for idx in range(len(data_technology)):
    for token in ngrams_technology[data_technology[idx]]:
        if '_' in token:
            # Token is a n-grams, add to document.
            data_technology[idx].append(token)

            # Get the corresponding row from df_lda_kickstarter_technology using the iterator
            try:
                _, row = next(df_iterator)
                row['processed_description'].extend(token)
            except StopIteration:
                break

In [50]:
# create dictionary
dictionary_technology = corpora.Dictionary(data_technology)

In [51]:
# Filter out words that occur less than 5 documents, or more than 50% of the documents.
dictionary_technology.filter_extremes(no_below=5, no_above=0.5)

In [52]:
# Define your custom stoplist
custom_stoplist_technology = ["project", "kickstarter", "pledge", "backer", "campaign", "product"] 
                              #"people", "device", "system", "user", "way", "technology", "goal", "reward", "use"]
# Add your custom stop words to the dictionary
stop_ids_technology = [dictionary_technology.token2id[word] for word in custom_stoplist_technology if word in dictionary_technology.token2id]
# Remove the stop words from the dictionary
dictionary_technology.filter_tokens(bad_ids=stop_ids_technology)

In [53]:
# create corpus
corpus_technology = [dictionary_technology.doc2bow(tokens) for tokens in data_technology]

In [17]:
from collections import Counter

# Count word occurrences
word_counts_technology = Counter()
for doc in corpus_technology:
    for word_id, count in doc:
        word = dictionary_technology[word_id]
        word_counts_technology[word] += count

# Get most frequent words
most_common_technology = word_counts_technology.most_common(100)
print(most_common_technology)

[('app', 39720), ('design', 31958), ('people', 31013), ('device', 30676), ('system', 29730), ('user', 28330), ('year', 27333), ('way', 26500), ('power', 24389), ('world', 24184), ('technology', 23965), ('video', 21718), ('goal', 20748), ('phone', 20672), ('play', 19988), ('part', 18539), ('life', 18421), ('feature', 18172), ('team', 18028), ('development', 17840), ('experience', 17774), ('day', 17587), ('software', 17564), ('data', 17200), ('business', 16974), ('battery', 16912), ('home', 16488), ('prototype', 15725), ('application', 15594), ('support', 15540), ('community', 15178), ('work', 15047), ('idea', 14839), ('cost', 14608), ('production', 14579), ('company', 14483), ('service', 14265), ('board', 14064), ('platform', 14049), ('level', 13721), ('thing', 13459), ('information', 13440), ('reward', 12976), ('tool', 12566), ('quality', 12455), ('game', 12197), ('water', 12163), ('order', 11971), ('friend', 11919), ('student', 11894), ('hand', 11880), ('control', 11869), ('computer',

In [18]:
print('Number of unique tokens: %d' % len(dictionary_technology))
print('Number of documents: %d' % len(corpus_technology))

Number of unique tokens: 19789
Number of documents: 26976


In [19]:
from gensim.models.callbacks import CoherenceMetric, DiffMetric, PerplexityMetric, ConvergenceMetric

# define perplexity callback
pl = PerplexityMetric(corpus=corpus_technology, logger="visdom", title="Perplexity")

# define other remaining metrics available
ch_umass = CoherenceMetric(corpus=corpus_technology, coherence="u_mass", logger="visdom", title="Coherence (u_mass)")
ch_cv = CoherenceMetric(corpus=corpus_technology, texts=data_technology, coherence="c_v", logger="visdom", title="Coherence (c_v)")
diff_kl = DiffMetric(distance="kullback_leibler", logger="visdom", title="Diff (kullback_leibler)")
convergence_kl = ConvergenceMetric(distance="jaccard", logger="visdom", title="Convergence (jaccard)")

callbacks = [pl, ch_umass, ch_cv, diff_kl, convergence_kl]

In [20]:
# Iterate over the desired topic counts from 10 to 300 in steps of 10
#for num_topics in tqdm(range(10, 301, 10)):
    # Create an LDA model with the current topic count
lda_model = gensim.models.LdaModel(corpus=corpus_technology, id2word=dictionary_technology, num_topics=100, passes=10, per_word_topics=True, chunksize=1500, iterations=150, alpha='auto', callbacks=callbacks)

    # Save the model to Google Drive
    #model_path = main_path + f"lda_technology/{num_topics}/lda_model_technology_{num_topics}"
    #lda_model.save(model_path)

    # Append the model path to the list
    #lda_models_technology.append(lda_model)

Setting up a new session...


In [44]:
vis = gensimvis.prepare(lda_model, corpus_technology, dictionary_technology)
pyLDAvis.save_html(vis, './LDA/technology/vis/100_POS.html')

  default_term_info = default_term_info.sort_values(


In [21]:
lda_model.save('./LDA/technology/100/LDA_technology_100_POS')

In [24]:
lda_technology_50 = gensim.models.LdaModel.load('./LDA/technology/50/LDA_technology_50')
lda_technology_100 = gensim.models.LdaModel.load('./LDA/technology/100/LDA_technology_100')
lda_technology_200 = gensim.models.LdaModel.load('./LDA/technology/200/LDA_technology_200')

In [25]:
vis_50 = gensimvis.prepare(lda_technology_50, corpus_technology, dictionary_technology)
vis_100 = gensimvis.prepare(lda_technology_100, corpus_technology, dictionary_technology)
vis_200 = gensimvis.prepare(lda_technology_200, corpus_technology, dictionary_technology)

  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(


In [27]:
pyLDAvis.save_html(vis_50, './LDA/technology/vis/50.html')
pyLDAvis.save_html(vis_100, './LDA/technology/vis/100.html')
pyLDAvis.save_html(vis_200, './LDA/technology/vis/200.html')

In [None]:
# Loop through the created LDA models
for num_topics, lda_model in zip(range(10, 301, 10), lda_models_technology):
    # Create the visualization
    vis = gensimvis.prepare(lda_model, corpus_technology, dictionary_technology)

    # Save the visualization as an HTML file
    html_path = main_path + f"lda_technology/{num_topics}/lda_vis_technology_{num_topics}.html"
    pyLDAvis.save_html(vis, html_path)

In [71]:
lda_model = gensim.models.LdaModel.load('../LDA/technology/100/LDA_technology_100_POS')

# Get the document-topic distribution
document_topic_distribution = lda_model.get_document_topics(corpus_technology)


In [62]:
# Calculate topic distribution for projects
df_lda_kickstarter_technology['topic_distribution'] = df_lda_kickstarter_technology['processed_description'].progress_apply(lambda x: get_topic_distribution(x, dictionary_technology, lda_model)) 

  0%|          | 0/26976 [00:00<?, ?it/s]

In [122]:
get_top_descriptions(df_lda_kickstarter_technology, 49, 10, lda_model)

  0%|          | 0/26976 [00:00<?, ?it/s]

[('earth', 0.07328849), ('space', 0.05889715), ('planet', 0.05794305), ('mission', 0.053534202), ('rocket', 0.042700317), ('experiment', 0.030173728), ('launch', 0.029435897), ('exploration', 0.028161496), ('science', 0.026005017), ('moon', 0.025583562)]
Document 127101:
[(2, 0.018465027), (3, 0.011230439), (5, 0.024826935), (7, 0.06237485), (8, 0.01951306), (12, 0.010114677), (26, 0.010741617), (33, 0.064890526), (36, 0.0228165), (41, 0.028042266), (43, 0.055787455), (48, 0.012700078), (49, 0.21737455), (50, 0.1082855), (55, 0.013706272), (56, 0.019996902), (63, 0.0123601975), (69, 0.06220742), (72, 0.062977225), (74, 0.027659697), (80, 0.014657998), (93, 0.017964711)]
Topic Probability: 0.2173745483160019
Project Description:                                            booksplatform.comBooks platform uses the same concept of buying selling and renting the books but with some little modification. In this platform we will help student’s rent and sell their books to other students by con

In [115]:
document_topic_distribution = lda_model.get_document_topics(corpus_technology)

In [None]:
# Convert the TransformedCorpus object to a list of lists
document_topic_distribution_list = [list(doc) for doc in document_topic_distribution]

# Print the structure of the document-topic distribution
for doc in document_topic_distribution_list:
    print(doc)

In [127]:
chosen_topic = 49

print(lda_model.show_topic(chosen_topic, topn=10))

# Get the document-topic distribution
document_topic_distribution = lda_model.get_document_topics(corpus_technology)

# Ensure chosen_topic is within the valid range
if 0 <= chosen_topic < lda_model.num_topics:
    # Sort documents by their probability score for the chosen topic
    sorted_documents = sorted(
        enumerate(document_topic_distribution),
        key=lambda x: next((prob for topic, prob in x[1] if topic == chosen_topic), 0),
        reverse=True
    )
    top_n = 5  # Number of top documents to print
    for i, (doc_id, topic_probs) in enumerate(sorted_documents[:top_n]):
        document = corpus_technology[doc_id]
        project_description = df_lda_kickstarter_technology.iloc[doc_id]['project_description']
        print(f"Document {i + 1} (Corpus ID {doc_id}):")
        print("Topic Probability:", next((prob for topic, prob in topic_probs if topic == chosen_topic), 0))
        words = [word for word, freq in document]
        print("Document Text:", words)
        print("Project Description:", project_description)
        print("\n")
else:
    print(f"Chosen topic index {chosen_topic} is out of range.")


[('earth', 0.07328849), ('space', 0.05889715), ('planet', 0.05794305), ('mission', 0.053534202), ('rocket', 0.042700317), ('experiment', 0.030173728), ('launch', 0.029435897), ('exploration', 0.028161496), ('science', 0.026005017), ('moon', 0.025583562)]
Document 1 (Corpus ID 5260):
Topic Probability: 0.22032104
Document Text: [9, 30, 38, 48, 64, 83, 84, 101, 113, 115, 119, 128, 133, 138, 139, 172, 173, 182, 206, 263, 319, 329, 332, 335, 354, 366, 391, 411, 429, 450, 453, 472, 488, 490, 503, 518, 524, 526, 532, 596, 605, 618, 645, 669, 897, 921, 928, 1132, 1159, 1198, 1224, 1354, 1410, 1416, 1542, 1567, 1683, 2506, 2508, 3145, 3148, 3229, 3596, 3609, 3634, 4416, 4686, 4893, 5836, 6580, 8128, 12231]
Project Description:                                            booksplatform.comBooks platform uses the same concept of buying selling and renting the books but with some little modification. In this platform we will help student’s rent and sell their books to other students by connecting t

In [97]:
lda_model.show_topics(num_topics=100, num_words=5,formatted=False)

[(0,
  [('model', 0.27340782),
   ('hand', 0.18517405),
   ('paper', 0.07428981),
   ('pocket', 0.06711247),
   ('palm', 0.021995557)]),
 (1,
  [('filter', 0.23052648),
   ('oil', 0.049532704),
   ('waste', 0.038064633),
   ('protection', 0.03258945),
   ('filtration', 0.028714458)]),
 (2,
  [('art', 0.18539728),
   ('artist', 0.097372755),
   ('collection', 0.032964494),
   ('history', 0.031125648),
   ('museum', 0.030880507)]),
 (3,
  [('output', 0.06567293),
   ('input', 0.06203519),
   ('channel', 0.035808913),
   ('voltage', 0.030149069),
   ('effect', 0.025455726)]),
 (4,
  [('user', 0.25149477),
   ('medium', 0.06860956),
   ('message', 0.036952436),
   ('group', 0.036429774),
   ('profile', 0.030639045)]),
 (5,
  [('content', 0.09667209),
   ('creator', 0.040244915),
   ('movie', 0.039695006),
   ('character', 0.03113036),
   ('show', 0.026449785)]),
 (6,
  [('radio', 0.123276606),
   ('receiver', 0.046684355),
   ('gps', 0.046156887),
   ('antenna', 0.045875195),
   ('bundle',

In [98]:
lda_model.show_topic(49, topn=10)


[('earth', 0.07328849),
 ('space', 0.05889715),
 ('planet', 0.05794305),
 ('mission', 0.053534202),
 ('rocket', 0.042700317),
 ('experiment', 0.030173728),
 ('launch', 0.029435897),
 ('exploration', 0.028161496),
 ('science', 0.026005017),
 ('moon', 0.025583562)]

### Games Category

In [22]:
df_lda_kickstarter_games = df_lda_kickstarter.copy()

In [23]:
# Filter Dataframe by Games category (number 12)
df_lda_kickstarter_games = df_lda_kickstarter_games[(df_lda_kickstarter_games['project_category_id'] == 12) | (df_lda_kickstarter_games['project_parent_category_id'] == 12)]

In [24]:
# Preprocessing
df_lda_kickstarter_games['project_description'] = df_lda_kickstarter_games['project_description'].progress_apply(preprocess_text_helper)

  0%|          | 0/39654 [00:00<?, ?it/s]

In [25]:
data_games = df_lda_kickstarter_games['project_description'].tolist()
# Compute bigrams/trigrams
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
ngrams_games = Phrases(data_games, min_count=20)
for idx in range(len(data_games)):
    for token in ngrams_games[data_games[idx]]:
        if '_' in token:
            # Token is a bigram/trigram, add to document.
            data_games[idx].append(token)

In [26]:
# create dictionary
dictionary_games = corpora.Dictionary(data_games)

In [27]:
# Filter out words that occur less than 5 documents, or more than 50% of the documents.
dictionary_games.filter_extremes(no_below=5, no_above=0.5)

In [28]:
# Define your custom stoplist
custom_stoplist_games = ["project", "kickstarter", "pledge", "backer", "campaign", "product"] #to be extended
# Add your custom stop words to the dictionary
stop_ids_games = [dictionary_games.token2id[word] for word in custom_stoplist_games if word in dictionary_games.token2id]
# Remove the stop words from the dictionary
dictionary_games.filter_tokens(bad_ids=stop_ids_games)

In [29]:
# create corpus
corpus_games = [dictionary_games.doc2bow(tokens) for tokens in data_games]

In [30]:
from collections import Counter

# Count word occurrences
word_counts_games = Counter()
for doc in corpus_games:
    for word_id, count in doc:
        word = dictionary_games[word_id]
        word_counts_games[word] += count

# Get most frequent words
most_common_games = word_counts_games.most_common(100)
print(most_common_games)

[('card', 187046), ('character', 65596), ('level', 62734), ('deck', 61114), ('book', 48977), ('reward', 48967), ('play', 44231), ('art', 41949), ('design', 41142), ('year', 40574), ('stretch', 39548), ('dice', 38203), ('stretch_goal', 37227), ('people', 35017), ('story', 34738), ('copy', 34333), ('board', 34033), ('rule', 33813), ('adventure', 32497), ('shipping', 32293), ('system', 32043), ('team', 31380), ('box', 30285), ('set', 30063), ('edition', 29871), ('version', 29191), ('item', 29123), ('cost', 28641), ('life', 27998), ('point', 27868), ('friend', 27775), ('order', 27523), ('thing', 27511), ('color', 26079), ('experience', 25413), ('hand', 25394), ('part', 25250), ('work', 25206), ('page', 25084), ('day', 23422), ('power', 23205), ('action', 22846), ('monster', 22786), ('video', 22296), ('print', 22199), ('ability', 21917), ('number', 21794), ('battle', 21661), ('map', 21426), ('idea', 20619), ('support', 20463), ('piece', 20376), ('everyone', 20289), ('fun', 20132), ('hero', 

In [31]:
print('Number of unique tokens: %d' % len(dictionary_games))
print('Number of documents: %d' % len(corpus_games))

Number of unique tokens: 31660
Number of documents: 39654


In [None]:
# Initialize an empty list to store the LDA models
lda_models_games = []

In [41]:
from gensim.models.callbacks import CoherenceMetric, DiffMetric, PerplexityMetric, ConvergenceMetric

# define perplexity callback
pl = PerplexityMetric(corpus=corpus_games, logger="visdom", title="Perplexity")

# define other remaining metrics available
ch_umass = CoherenceMetric(corpus=corpus_games, coherence="u_mass", logger="visdom", title="Coherence (u_mass)")
ch_cv = CoherenceMetric(corpus=corpus_games, texts=data_games, coherence="c_v", logger="visdom", title="Coherence (c_v)")
diff_kl = DiffMetric(distance="kullback_leibler", logger="visdom", title="Diff (kullback_leibler)")
convergence_kl = ConvergenceMetric(distance="jaccard", logger="visdom", title="Convergence (jaccard)")

callbacks = [pl, ch_umass, ch_cv, diff_kl, convergence_kl]

In [32]:
lda_model_70 = gensim.models.LdaModel(corpus=corpus_games, id2word=dictionary_games, num_topics=70, passes=5, per_word_topics=True, chunksize=2000, iterations=150, alpha='auto')

In [67]:
lda_model_100 = gensim.models.LdaModel(corpus=corpus_games, id2word=dictionary_games, num_topics=100, passes=3, per_word_topics=True, chunksize=20000, iterations=150, alpha='auto')

In [68]:
lda_model_125 = gensim.models.LdaModel(corpus=corpus_games, id2word=dictionary_games, num_topics=125, passes=3, per_word_topics=True, chunksize=20000, iterations=150, alpha='auto')

In [45]:
vis = gensimvis.prepare(lda_model_70, corpus_games, dictionary_games)
pyLDAvis.save_html(vis, './LDA/games/vis/70_POS.html')

  default_term_info = default_term_info.sort_values(


In [69]:
vis_75 = gensimvis.prepare(lda_model_75, corpus_games, dictionary_games)
vis_100 = gensimvis.prepare(lda_model_100, corpus_games, dictionary_games)
vis_125 = gensimvis.prepare(lda_model_125, corpus_games, dictionary_games)

  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(


In [70]:
pyLDAvis.save_html(vis_75, './LDA/games/vis/75.html')
pyLDAvis.save_html(vis_100, './LDA/games/vis/100.html')
pyLDAvis.save_html(vis_125, './LDA/games/vis/125.html')

In [None]:
# Iterate over the desired topic counts from 10 to 300 in steps of 10
for num_topics in range(10, 301, 10):
    # Create an LDA model with the current topic count
    lda_model = gensim.models.LdaMulticore(corpus=corpus_games, id2word=dictionary_games, num_topics=num_topics, passes=30, workers=8, per_word_topics=True, chunksize=100, iterations=150, eval_every=None, gamma_threshold=0.001)

    # Save the model to Google Drive
    model_path = main_path + f"lda_games/{num_topics}/lda_model_games_{num_topics}"
    lda_model.save(model_path)

    # Append the model path to the list
    lda_models_games.append(lda_model)

In [None]:
# Loop through the created LDA models
for num_topics, lda_model in zip(range(10, 301, 10), lda_models_games):
    # Create the visualization
    vis = gensimvis.prepare(lda_model, corpus_games, dictionary_games)

    # Save the visualization as an HTML file
    html_path = main_path + f"lda_games/{num_topics}/lda_vis_games_{num_topics}.html"
    pyLDAvis.save_html(vis, html_path)

### Design Category

In [33]:
df_lda_kickstarter_design = df_lda_kickstarter.copy()

In [34]:
# Filter Dataframe by Design category (number 7)
df_lda_kickstarter_design = df_lda_kickstarter_design[(df_lda_kickstarter_design['project_category_id'] == 7) | (df_lda_kickstarter_design['project_parent_category_id'] == 7)]

In [35]:
# Preprocessing general steps
nltk.download('stopwords')
nltk.download('wordnet')
df_lda_kickstarter_design['project_description'] = df_lda_kickstarter_design['project_description'].progress_apply(preprocess_text_helper)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\CoolerMaster\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\CoolerMaster\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


  0%|          | 0/28290 [00:00<?, ?it/s]

In [36]:
data_design = df_lda_kickstarter_design['project_description'].tolist()
# Compute n-grams
from gensim.models import Phrases

# Add n-grams to docs (only ones that appear 20 times or more).
ngrams_design = Phrases(data_design, min_count=20)
for idx in range(len(data_design)):
    for token in ngrams_design[data_design[idx]]:
        if '_' in token:
            # Token is a n-grams, add to document.
            data_design[idx].append(token)

In [37]:
# create dictionary
dictionary_design = corpora.Dictionary(data_design)

In [38]:
# Filter out words that occur less than 5 documents, or more than 50% of the documents.
dictionary_design.filter_extremes(no_below=5, no_above=0.5)

In [39]:
# Define your custom stoplist
custom_stoplist_design = ["project", "kickstarter", "pledge", "backer", "campaign", "product"] #to be extended
# Add your custom stop words to the dictionary
stop_ids_design = [dictionary_design.token2id[word] for word in custom_stoplist_design if word in dictionary_design.token2id]
# Remove the stop words from the dictionary
dictionary_design.filter_tokens(bad_ids=stop_ids_design)

In [40]:
# create corpus
corpus_design = [dictionary_design.doc2bow(tokens) for tokens in data_design]

In [41]:
from collections import Counter

# Count word occurrences
word_counts_design = Counter()
for doc in corpus_design:
    for word_id, count in doc:
        word = dictionary_design[word_id]
        word_counts_design[word] += count

# Get most frequent words
most_common_design = word_counts_design.most_common(100)
print(most_common_design)

[('year', 29868), ('way', 26199), ('production', 25489), ('color', 25434), ('material', 23784), ('goal', 23654), ('hand', 22765), ('people', 21379), ('day', 21053), ('reward', 20921), ('quality', 20735), ('bag', 20164), ('prototype', 19939), ('world', 19687), ('life', 19384), ('case', 17866), ('order', 17849), ('water', 17793), ('part', 17763), ('play', 17668), ('size', 17603), ('idea', 16367), ('card', 16111), ('support', 15584), ('work', 15074), ('process', 14131), ('home', 14068), ('tool', 13981), ('system', 13738), ('thing', 13736), ('cost', 13113), ('company', 12783), ('pocket', 12689), ('use', 12287), ('phone', 12187), ('experience', 11951), ('option', 11928), ('friend', 11918), ('piece', 11869), ('place', 11760), ('space', 11305), ('video', 11272), ('something', 11270), ('help', 11210), ('steel', 11046), ('business', 10898), ('market', 10878), ('team', 10829), ('level', 10716), ('device', 10470), ('wallet', 10402), ('family', 10353), ('watch', 10252), ('line', 10079), ('shipping

In [42]:
print('Number of unique tokens: %d' % len(dictionary_design))
print('Number of documents: %d' % len(corpus_design))

Number of unique tokens: 21627
Number of documents: 28290


In [None]:
# Initialize an empty list to store the LDA models
lda_models_design = []

In [None]:
from gensim.models.callbacks import CoherenceMetric, DiffMetric, PerplexityMetric, ConvergenceMetric

# define perplexity callback
pl = PerplexityMetric(corpus=corpus_design, logger="visdom", title="Perplexity")

# define other remaining metrics available
ch_umass = CoherenceMetric(corpus=corpus_design, coherence="u_mass", logger="visdom", title="Coherence (u_mass)")
ch_cv = CoherenceMetric(corpus=corpus_design, texts=data_design, coherence="c_v", logger="visdom", title="Coherence (c_v)")
diff_kl = DiffMetric(distance="kullback_leibler", logger="visdom", title="Diff (kullback_leibler)")
convergence_kl = ConvergenceMetric(distance="jaccard", logger="visdom", title="Convergence (jaccard)")

callbacks = [pl, ch_umass, ch_cv, diff_kl, convergence_kl]

In [81]:
lda_model_75 = gensim.models.LdaModel(corpus=corpus_design, id2word=dictionary_design, num_topics=75, passes=3, per_word_topics=True, chunksize=15000, iterations=150, alpha='auto')

In [43]:
lda_model_100 = gensim.models.LdaModel(corpus=corpus_design, id2word=dictionary_design, num_topics=100, passes=5, per_word_topics=True, chunksize=1500, iterations=150, alpha='auto')

In [83]:
lda_model_125 = gensim.models.LdaModel(corpus=corpus_design, id2word=dictionary_design, num_topics=125, passes=3, per_word_topics=True, chunksize=15000, iterations=150, alpha='auto')

In [46]:
vis = gensimvis.prepare(lda_model_100, corpus_design, dictionary_design)
pyLDAvis.save_html(vis, './LDA/design/vis/100_POS.html')

  default_term_info = default_term_info.sort_values(


In [84]:
vis_75 = gensimvis.prepare(lda_model_75, corpus_design, dictionary_design)
vis_100 = gensimvis.prepare(lda_model_100, corpus_design, dictionary_design)
vis_125 = gensimvis.prepare(lda_model_125, corpus_design, dictionary_design)

  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(


In [85]:
pyLDAvis.save_html(vis_75, './LDA/design/vis/75.html')
pyLDAvis.save_html(vis_100, './LDA/design/vis/100.html')
pyLDAvis.save_html(vis_125, './LDA/design/vis/125.html')

In [None]:
# Iterate over the desired topic counts from 10 to 300 in steps of 10
for num_topics in tqdm(range(10, 301, 10)):
    # Create an LDA model with the current topic count
    lda_model = gensim.models.LdaMulticore(corpus=corpus_design, id2word=dictionary_design, num_topics=num_topics, passes=30, workers=8, per_word_topics=True, chunksize=100, iterations=150, eval_every=None, gamma_threshold=0.001)

    # Save the model to Google Drive
    model_path = main_path + f"lda_design/{num_topics}/lda_model_design_{num_topics}"
    lda_model.save(model_path)

    # Append the model path to the list
    lda_models_design.append(lda_model)

In [None]:
# Loop through the created LDA models
for num_topics, lda_model in zip(range(10, 301, 10), lda_models_design):
    # Create the visualization
    vis = gensimvis.prepare(lda_model, corpus_design, dictionary_design)

    # Save the visualization as an HTML file
    html_path = main_path + f"lda_design/{num_topics}/lda_vis_design_{num_topics}.html"
    pyLDAvis.save_html(vis, html_path)