# Imports

In [1]:
#!pip install pyLDAvis
#!pip install visdom
import pandas as pd
import nltk
import re
import gensim
import gensim.corpora as corpora
from gensim.models import Phrases
from nltk.corpus import stopwords
from tqdm.notebook import tqdm
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
from sklearn.feature_extraction.text import CountVectorizer
tqdm.pandas()

# Data

In [2]:
#from google.colab import drive
#drive.mount('/content/drive')
df_lda_kickstarter = pd.read_csv('../kickstarter_cleaned.csv')

# Functions

In [3]:
def preprocess_text(document: str, stemmer: nltk.stem.WordNetLemmatizer, en_stop: set) -> list:
    """Preprocesses a document to remove special characters/whitespace/etc

    Args:
        document (str):
        stemmer (nltk.stem.WordNetLemmatizer): Stemmer from NLTK
        en_stop (set): Set of stop words, usually from NLTK

    Returns:
        str: preprocessed document
    """

    # Remove all the special characters
    document = re.sub(r'\W', ' ', document)

    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)

    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)

    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)

    # Converting to Lowercase
    document = document.lower()

    # Tokenization
    tokens = nltk.word_tokenize(document)

    # POS-Tagging
    tagged_tokens = nltk.pos_tag(tokens)

    # Filter Nouns and Lemmatization
    lemmatized_nouns = []
    for token, pos in tagged_tokens:
        if pos.startswith('N'):  # Check if the token is a noun
            lemma = stemmer.lemmatize(token)
            # FIlter stop words, words that contain only numbers and short words
            if lemma not in en_stop and not lemma.isdigit() and len(lemma) > 2:
                lemmatized_nouns.append(lemma)

    return lemmatized_nouns;

In [4]:
def preprocess_text_helper(t):
    stemmer = nltk.stem.WordNetLemmatizer()
    return preprocess_text(t, stemmer, stopwords.words('english'))

In [5]:
# Create function to calculate topic distribution
def get_topic_distribution(text, dictionary, model):
    bow = dictionary.doc2bow(text)
    topic_probs = model.get_document_topics(bow)
    return topic_probs

In [6]:
# Create a function to extract the top n descriptions corresponding to a topic
def get_top_descriptions(model, df, corpus, chosen_topic, n):

    print(model.show_topic(chosen_topic, topn=10))

    # Get the document-topic distribution
    document_topic_distribution = model.get_document_topics(corpus)

    # Ensure chosen_topic is within the valid range
    if 0 <= chosen_topic < model.num_topics:
        # Sort documents by their probability score for the chosen topic
        sorted_documents = sorted(
            enumerate(document_topic_distribution),
            key=lambda x: next((prob for topic, prob in x[1] if topic == chosen_topic), 0),
            reverse=True
        )
        top_n = 5  # Number of top documents to print
        for i, (doc_id, topic_probs) in enumerate(sorted_documents[:top_n]):
            document = corpus[doc_id]
            project_description = df.iloc[doc_id]['project_description']
            processed_description = df.iloc[doc_id]['processed_description']
            print(f"Document {i + 1} (Corpus ID {doc_id}):")
            print("Topic Probability:", next((prob for topic, prob in topic_probs if topic == chosen_topic), 0))
            words = [word for word, freq in document]
            print("Document Text:", words)
            print("Project Description:", project_description)
            print("processed Description:", processed_description)
            print("\n")
    else:
        print(f"Chosen topic index {chosen_topic} is out of range.")

# LDA

In [7]:
# Remove the columns
df_lda_kickstarter.drop(df_lda_kickstarter.columns.difference(['project_description', 'project_category_id', 'project_parent_category_id', 'project_state']), axis=1,inplace=True)

### Technology Category

In [8]:
df_lda_kickstarter_technology = df_lda_kickstarter.copy()

In [9]:
# Filter Dataframe by Technology category (number 16)
df_lda_kickstarter_technology = df_lda_kickstarter_technology[(df_lda_kickstarter_technology['project_category_id'] == 16) | (df_lda_kickstarter_technology['project_parent_category_id'] == 16)]

In [10]:
# Preprocessing general steps
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
df_lda_kickstarter_technology['processed_description'] = df_lda_kickstarter_technology['project_description'].progress_apply(preprocess_text_helper)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\CoolerMaster\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\CoolerMaster\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\CoolerMaster\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


  0%|          | 0/25653 [00:00<?, ?it/s]

In [11]:
data_technology = df_lda_kickstarter_technology['processed_description'].tolist()
# Compute n-grams
from gensim.models import Phrases

# Add n-grams to docs (only ones that appear 20 times or more).
ngrams_technology = Phrases(data_technology, min_count=20)

for idx in range(len(data_technology)):
    for token in ngrams_technology[data_technology[idx]]:
        if '_' in token:
            # Token is a n-grams, add to document.
            data_technology[idx].append(token)

In [12]:
# create dictionary
dictionary_technology = corpora.Dictionary(data_technology)

In [13]:
# Filter out words that occur less than 5 documents, or more than 50% of the documents.
dictionary_technology.filter_extremes(no_below=5, no_above=0.5)

In [14]:
# Define your custom stoplist
custom_stoplist_technology = ["project", "kickstarter", "pledge", "backer", "campaign", "goal", "product", "kickstarter_campaign", "funding_goal", "reward", "stretch", "stretch_goal", "fund", "funding", "pledge_level",
                            "tier", "reward_tier", "pledge_amount",
                            "play", "replay", "browser", "html5", "play_replay", "html5_browser", 
                            "people", "world", "thing", "lot", "device", "system", "user", "way", "technology", "use", "design", "year", "month", "day", "hour", "www", "com", "one"
                            "information", "opportunity", "fund", "funding", "technology", "solution", "developement", "tech", "experience", "level", "support", "stretch", "stretch_goal", "let", "detail",
                            "option", "please", "help", "life", "idea", "share", "everything", "thank", "quality", "version"]
# Add your custom stop words to the dictionary
stop_ids_technology = [dictionary_technology.token2id[word] for word in custom_stoplist_technology if word in dictionary_technology.token2id]
# Remove the stop words from the dictionary
dictionary_technology.filter_tokens(bad_ids=stop_ids_technology)

In [15]:
# create corpus
corpus_technology = [dictionary_technology.doc2bow(tokens) for tokens in data_technology]

In [16]:
from collections import Counter

# Count word occurrences
word_counts_technology = Counter()
for doc in corpus_technology:
    for word_id, count in doc:
        word = dictionary_technology[word_id]
        word_counts_technology[word] += count

# Get most frequent words
most_common_technology = word_counts_technology.most_common(100)
print(most_common_technology)

[('app', 35322), ('power', 19311), ('video', 17889), ('phone', 17790), ('feature', 15033), ('part', 15025), ('team', 14904), ('business', 14458), ('development', 14429), ('battery', 13692), ('software', 13581), ('home', 13580), ('data', 13508), ('application', 12942), ('prototype', 12919), ('community', 12682), ('work', 12404), ('service', 12134), ('cost', 12047), ('company', 11999), ('platform', 11886), ('production', 11741), ('information', 11277), ('board', 10995), ('student', 10781), ('tool', 10613), ('friend', 10443), ('game', 10051), ('order', 10042), ('water', 10029), ('market', 9947), ('money', 9893), ('hand', 9853), ('access', 9654), ('light', 9309), ('control', 9257), ('case', 9192), ('computer', 9152), ('color', 9131), ('process', 9073), ('need', 9073), ('problem', 8656), ('place', 8630), ('everyone', 8574), ('medium', 8550), ('music', 8547), ('family', 8519), ('sensor', 8286), ('child', 8258), ('event', 8189), ('material', 7918), ('website', 7917), ('camera', 7880), ('space

In [17]:
print('Number of unique tokens: %d' % len(dictionary_technology))
print('Number of documents: %d' % len(corpus_technology))

Number of unique tokens: 17583
Number of documents: 25653


In [18]:
from gensim.models.callbacks import CoherenceMetric, DiffMetric, PerplexityMetric, ConvergenceMetric

# define perplexity callback
pl = PerplexityMetric(corpus=corpus_technology, logger="visdom", title="Perplexity")

# define other remaining metrics available
ch_umass = CoherenceMetric(corpus=corpus_technology, coherence="u_mass", logger="visdom", title="Coherence (u_mass)")
ch_cv = CoherenceMetric(corpus=corpus_technology, texts=data_technology, coherence="c_v", logger="visdom", title="Coherence (c_v)")
diff_kl = DiffMetric(distance="kullback_leibler", logger="visdom", title="Diff (kullback_leibler)")
convergence_kl = ConvergenceMetric(distance="jaccard", logger="visdom", title="Convergence (jaccard)")

callbacks = [pl, ch_umass, ch_cv, diff_kl, convergence_kl]

In [19]:
lda_model_technology = gensim.models.LdaModel(corpus=corpus_technology, id2word=dictionary_technology, num_topics=100, passes=15, per_word_topics=True, chunksize=1500, iterations=150, alpha='auto') #callbacks=callbacks

In [20]:
lda_model_technology.save('../LDA/technology/100/LDA_technology_100_final')

In [21]:
vis = gensimvis.prepare(lda_model_technology, corpus_technology, dictionary_technology)
pyLDAvis.save_html(vis, '../LDA/technology/vis/100_final.html')

  default_term_info = default_term_info.sort_values(


In [None]:
get_top_descriptions(lda_model_technology, df_lda_kickstarter_technology, corpus_technology, 21, 10)

### Games Category

In [23]:
df_lda_kickstarter_games = df_lda_kickstarter.copy()

In [24]:
# Filter Dataframe by Games category (number 12)
df_lda_kickstarter_games = df_lda_kickstarter_games[(df_lda_kickstarter_games['project_category_id'] == 12) | (df_lda_kickstarter_games['project_parent_category_id'] == 12)]

In [25]:
# Preprocessing
df_lda_kickstarter_games['project_description'] = df_lda_kickstarter_games['project_description'].progress_apply(preprocess_text_helper)

  0%|          | 0/35790 [00:00<?, ?it/s]

In [26]:
data_games = df_lda_kickstarter_games['project_description'].tolist()
# Compute bigrams/trigrams
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
ngrams_games = Phrases(data_games, min_count=20)
for idx in range(len(data_games)):
    for token in ngrams_games[data_games[idx]]:
        if '_' in token:
            # Token is a bigram/trigram, add to document.
            data_games[idx].append(token)

In [27]:
# create dictionary
dictionary_games = corpora.Dictionary(data_games)

In [28]:
# Filter out words that occur less than 5 documents, or more than 50% of the documents.
dictionary_games.filter_extremes(no_below=5, no_above=0.5)

In [29]:
# Define your custom stoplist
custom_stoplist_games = ["project", "kickstarter", "pledge", "backer", "campaign", "goal", "product", "kickstarter_campaign", "funding_goal", "reward", "stretch", "stretch_goal", "fund", "funding", "pledge_level",
                        "tier", "reward_tier", "pledge_amount",
                        "play", "replay", "browser", "html5", "play_replay", "html5_browser",
                        "year", "month", "day", "hour",
                        "people", "thing", "lot", "let", "something", "anyone", "card", "help", "one", "thing", "character", "video", "level", "design", "use",
                        "system", "feature", "play", "style", "title", "feedback", "support", "version", "please", "www", "com", "life", "way", "world", "idea", "share", "everything", "thank", "quality", "version"]                       
# Add your custom stop words to the dictionary
stop_ids_games = [dictionary_games.token2id[word] for word in custom_stoplist_games if word in dictionary_games.token2id]
# Remove the stop words from the dictionary
dictionary_games.filter_tokens(bad_ids=stop_ids_games)

In [30]:
# create corpus
corpus_games = [dictionary_games.doc2bow(tokens) for tokens in data_games]

In [31]:
from collections import Counter

# Count word occurrences
word_counts_games = Counter()
for doc in corpus_games:
    for word_id, count in doc:
        word = dictionary_games[word_id]
        word_counts_games[word] += count

# Get most frequent words
most_common_games = word_counts_games.most_common(100)
print(most_common_games)

[('deck', 51270), ('book', 30165), ('dice', 29503), ('art', 29073), ('board', 25934), ('rule', 24708), ('story', 24141), ('box', 23815), ('copy', 23200), ('shipping', 22741), ('team', 22395), ('adventure', 22082), ('friend', 21723), ('set', 21591), ('cost', 20616), ('order', 20413), ('point', 20404), ('edition', 19916), ('item', 19505), ('hand', 19351), ('color', 19134), ('experience', 18563), ('work', 17718), ('part', 17517), ('page', 17139), ('action', 16457), ('fun', 16303), ('number', 15853), ('monster', 15520), ('power', 15290), ('everyone', 15282), ('print', 15147), ('piece', 15054), ('battle', 14921), ('ability', 14715), ('artist', 14209), ('map', 14118), ('development', 14014), ('money', 13944), ('option', 13186), ('expansion', 13156), ('place', 13135), ('family', 12912), ('end', 12906), ('name', 12869), ('hero', 12617), ('space', 12525), ('city', 12448), ('turn', 12390), ('community', 12026), ('custom', 12001), ('pack', 11588), ('production', 11576), ('event', 11535), ('compan

In [32]:
print('Number of unique tokens: %d' % len(dictionary_games))
print('Number of documents: %d' % len(corpus_games))

Number of unique tokens: 25819
Number of documents: 35790


In [33]:
from gensim.models.callbacks import CoherenceMetric, DiffMetric, PerplexityMetric, ConvergenceMetric

# define perplexity callback
pl = PerplexityMetric(corpus=corpus_games, logger="visdom", title="Perplexity")

# define other remaining metrics available
ch_umass = CoherenceMetric(corpus=corpus_games, coherence="u_mass", logger="visdom", title="Coherence (u_mass)")
ch_cv = CoherenceMetric(corpus=corpus_games, texts=data_games, coherence="c_v", logger="visdom", title="Coherence (c_v)")
diff_kl = DiffMetric(distance="kullback_leibler", logger="visdom", title="Diff (kullback_leibler)")
convergence_kl = ConvergenceMetric(distance="jaccard", logger="visdom", title="Convergence (jaccard)")

callbacks = [pl, ch_umass, ch_cv, diff_kl, convergence_kl]

In [34]:
lda_model_games = gensim.models.LdaModel(corpus=corpus_games, id2word=dictionary_games, num_topics=70, passes=15, per_word_topics=True, chunksize=2000, iterations=150, alpha='auto')

In [35]:
lda_model_games.save('../LDA/games/70/LDA_games_70_final')

In [36]:
vis = gensimvis.prepare(lda_model_games, corpus_games, dictionary_games)
pyLDAvis.save_html(vis, '../LDA/games/vis/70_final.html')

  default_term_info = default_term_info.sort_values(


### Design Category

In [37]:
df_lda_kickstarter_design = df_lda_kickstarter.copy()

In [38]:
# Filter Dataframe by Design category (number 7)
df_lda_kickstarter_design = df_lda_kickstarter_design[(df_lda_kickstarter_design['project_category_id'] == 7) | (df_lda_kickstarter_design['project_parent_category_id'] == 7)]

In [39]:
# Preprocessing general steps
nltk.download('stopwords')
nltk.download('wordnet')
df_lda_kickstarter_design['project_description'] = df_lda_kickstarter_design['project_description'].progress_apply(preprocess_text_helper)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\CoolerMaster\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\CoolerMaster\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


  0%|          | 0/27356 [00:00<?, ?it/s]

In [40]:
data_design = df_lda_kickstarter_design['project_description'].tolist()
# Compute n-grams
from gensim.models import Phrases

# Add n-grams to docs (only ones that appear 20 times or more).
ngrams_design = Phrases(data_design, min_count=20)
for idx in range(len(data_design)):
    for token in ngrams_design[data_design[idx]]:
        if '_' in token:
            # Token is a n-grams, add to document.
            data_design[idx].append(token)

In [41]:
# create dictionary
dictionary_design = corpora.Dictionary(data_design)

In [42]:
# Filter out words that occur less than 5 documents, or more than 50% of the documents.
dictionary_design.filter_extremes(no_below=5, no_above=0.5)

In [43]:
# Define your custom stoplist
custom_stoplist_design = ["project", "kickstarter", "pledge", "backer", "campaign", "goal", "product", "kickstarter_campaign", "funding_goal", "reward", "stretch", "stretch_goal", "fund", "funding", "pledge_level",
                        "tier", "reward_tier", "pledge_amount",
                        "play", "replay", "browser", "html5", "play_replay", "html5_browser",
                        "year", "month", "day", "hour",
                        "people", "life", "way", "world", "idea", "share", "everything", "thank", "quality", "version", "thing", "lot", "let", "something", "anyone", "card", "help", "one", 
                        "thing", "character", "video", "level", "design", "use", "system", "feature", "play", "style", "title", "feedback", "support", "version", "please", "www", "com"]
# Add your custom stop words to the dictionary
stop_ids_design = [dictionary_design.token2id[word] for word in custom_stoplist_design if word in dictionary_design.token2id]
# Remove the stop words from the dictionary
dictionary_design.filter_tokens(bad_ids=stop_ids_design)

In [44]:
# create corpus
corpus_design = [dictionary_design.doc2bow(tokens) for tokens in data_design]

In [45]:
from collections import Counter

# Count word occurrences
word_counts_design = Counter()
for doc in corpus_design:
    for word_id, count in doc:
        word = dictionary_design[word_id]
        word_counts_design[word] += count

# Get most frequent words
most_common_design = word_counts_design.most_common(100)
print(most_common_design)

[('production', 23017), ('color', 22842), ('material', 21412), ('hand', 20498), ('prototype', 18012), ('bag', 17607), ('case', 16266), ('order', 15837), ('size', 15638), ('part', 15272), ('water', 15033), ('work', 13376), ('home', 12581), ('process', 12509), ('tool', 12506), ('cost', 11615), ('pocket', 11440), ('company', 11289), ('phone', 11129), ('friend', 10742), ('piece', 10544), ('place', 10535), ('option', 10520), ('experience', 10493), ('space', 10281), ('steel', 9811), ('market', 9763), ('business', 9712), ('wallet', 9606), ('team', 9534), ('device', 9348), ('family', 9285), ('line', 9028), ('shipping', 8790), ('side', 8739), ('box', 8713), ('watch', 8697), ('bottle', 8558), ('child', 8368), ('model', 8134), ('end', 8120), ('everyone', 8039), ('item', 8037), ('power', 7976), ('community', 7944), ('problem', 7926), ('need', 7921), ('body', 7918), ('art', 7628), ('manufacturing', 7575), ('price', 7525), ('custom', 7490), ('money', 7420), ('strap', 7216), ('plastic', 7177), ('boar

In [46]:
print('Number of unique tokens: %d' % len(dictionary_design))
print('Number of documents: %d' % len(corpus_design))

Number of unique tokens: 19990
Number of documents: 27356


In [47]:
from gensim.models.callbacks import CoherenceMetric, DiffMetric, PerplexityMetric, ConvergenceMetric

# define perplexity callback
pl = PerplexityMetric(corpus=corpus_design, logger="visdom", title="Perplexity")

# define other remaining metrics available
ch_umass = CoherenceMetric(corpus=corpus_design, coherence="u_mass", logger="visdom", title="Coherence (u_mass)")
ch_cv = CoherenceMetric(corpus=corpus_design, texts=data_design, coherence="c_v", logger="visdom", title="Coherence (c_v)")
diff_kl = DiffMetric(distance="kullback_leibler", logger="visdom", title="Diff (kullback_leibler)")
convergence_kl = ConvergenceMetric(distance="jaccard", logger="visdom", title="Convergence (jaccard)")

callbacks = [pl, ch_umass, ch_cv, diff_kl, convergence_kl]

In [48]:
lda_model_design = gensim.models.LdaModel(corpus=corpus_design, id2word=dictionary_design, num_topics=100, passes=15, per_word_topics=True, chunksize=1500, iterations=150, alpha='auto')

In [49]:
lda_model_design.save('../LDA/design/100/LDA_design_100_final')

In [50]:
vis = gensimvis.prepare(lda_model_design, corpus_design, dictionary_design)
pyLDAvis.save_html(vis, '../LDA/design/vis/100_final.html')

  default_term_info = default_term_info.sort_values(
