Using the text in one of the project CSV files:


* Find the top 10 uni-, bi-, and tri-grams
* Convert the text to BERT embeddings
* Cluster the posts using these embeddings
* Extract topics using these embeddings

In [None]:
# Install additional libraries

!pip install bertopic
!pip install gensim
!pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Load the forum export into a DataFrame.
# The path is relative, so the CSV must sit in the notebook's working directory.

import pandas as pd

csv_path = "GetEmployedForum.csv"
df = pd.read_csv(csv_path)

In [None]:
# Preview the raw comment text (pandas truncates the display of a long Series)
df.loc[:, "CommentContent"]

0        that s actually 20 jobs in a month lol i didn ...
1        yeahh the shining shimmering splendid that is ...
2              thank you my friend i really appreciate it 
3        first of all congratulations we all know how h...
4        absolutely is a factor i had the same issue wh...
                               ...                        
52692    i literally can t do anything and this is how ...
52693    i d like to counter this two years out of coll...
52694             what do you do if you don t know people 
52695    i can attest to this i make a fuck ton of mone...
52696    thanks this is awesome anyone reading please t...
Name: CommentContent, Length: 52697, dtype: object

In [None]:
# Tokenize
from gensim.parsing.preprocessing import preprocess_string, remove_stopwords, strip_punctuation, strip_numeric, strip_multiple_whitespaces, strip_short, strip_tags

# Each comment is passed through the filters below, in order, and then split
# on whitespace into a list of tokens.
#
# The first two filters operate on str rather than on UTF-8 encoded bytes:
# bytes.strip()/bytes.lower() only handle ASCII, so going through bytes would
# leave non-ASCII capitals (e.g. "É") un-lowercased and Unicode whitespace
# un-stripped.
tokens = df["CommentContent"].apply(preprocess_string, filters=(
  str.strip,
  str.lower,
  strip_multiple_whitespaces,
  strip_tags,
  strip_numeric,
  strip_punctuation,
  strip_short,
  remove_stopwords
))
tokens

0        [actually, jobs, month, lol, know, people, app...
1        [yeahh, shining, shimmering, splendid, network...
2                              [thank, friend, appreciate]
3        [congratulations, know, hard, best, interview,...
4        [absolutely, factor, issue, going, college, co...
                               ...                        
52692                                    [literally, jobs]
52693    [like, counter, years, college, graduated, got...
52694                                       [know, people]
52695               [attest, fuck, ton, money, networking]
52696    [thanks, awesome, reading, talk, want, moderator]
Name: CommentContent, Length: 52697, dtype: object

In [None]:
# Compute top n-grams

from collections import Counter
from nltk.util import ngrams

# For each n-gram order, count every n-gram across all tokenized comments
# and report the ten most frequent ones.
for n in (1, 2, 3):
  ngram_counter = Counter(
    ngram
    for sentence_tokens in tokens
    for ngram in ngrams(sentence_tokens, n)
  )
  top_ngrams = ngram_counter.most_common(10)
  # Header line, the list itself, then a blank separator line.
  print(f"Most common {n}-grams:", top_ngrams, "", sep="\n")

Most common 1-grams:
[(('job',), 24275), (('like',), 14008), (('work',), 13840), (('know',), 9736), (('good',), 9693), (('people',), 9509), (('time',), 9462), (('resume',), 9192), (('want',), 8742), (('experience',), 7410)]

Most common 2-grams:
[(('good', 'luck'), 2139), (('cover', 'letter'), 1664), (('let', 'know'), 1266), (('https', 'www'), 1131), (('entry', 'level'), 1071), (('job', 'search'), 992), (('sounds', 'like'), 963), (('hiring', 'manager'), 809), (('resources', 'like'), 804), (('career', 'resources'), 802)]

Most common 3-grams:
[(('ref', 'careerbot', 'free'), 460), (('let', 'know', 'comments'), 400), (('share', 'let', 'know'), 399), (('know', 'comments', 'add'), 399), (('hey', 'free', 'resources'), 398), (('free', 'resources', 'hope'), 398), (('resources', 'hope', 'useful'), 398), (('way', 'careerbot', 'reddit'), 398), (('careerbot', 'reddit', 'bot'), 398), (('reddit', 'bot', 'points'), 398)]



In [40]:
# Compute the embeddings
from sentence_transformers import SentenceTransformer

# NOTE(review): the model card for this checkpoint reportedly marks it as
# deprecated — confirm before reusing it outside of this demo.
sentence_transformer = SentenceTransformer('distilbert-base-nli-mean-tokens')

# Only the first NUM_DEMO_COMMENTS comments are embedded to make this fast enough to demo.
NUM_DEMO_COMMENTS = 500

# Treat each comment as a "sentence" so we're comparing comment-to-comment.
# The token lists from the tokenize cell are reused (instead of re-running a
# copy of the same filter pipeline) so both steps always see identical
# preprocessing; each comment is re-joined into one space-separated string.
sentences = [
  ' '.join(sentence_tokens)
  for sentence_tokens in tokens.iloc[:NUM_DEMO_COMMENTS]
]
sentence_embeddings = sentence_transformer.encode(sentences)

In [41]:
# Cluster with k-means

from sklearn.cluster import KMeans
num_clusters = 5
# k-means starts from random centroids, so without a fixed random_state the
# cluster assignments change on every run. n_init is set explicitly because
# its default value differs between scikit-learn releases.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
kmeans.fit(sentence_embeddings)

# Group the sentences by assigned cluster; kmeans.labels_ holds one cluster id
# per embedded sentence, in the same order as `sentences`.
clustered_sentences = [[] for _ in range(num_clusters)]
for sentence, cluster_id in zip(sentences, kmeans.labels_):
    clustered_sentences[cluster_id].append(sentence)

# Print clusters with 1-based numbering.
for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i + 1)
    print(cluster)
    print("")



Cluster  1
['brain car culture fuckcars', 'difficult know people discussing problem https blogs stonesteps', 'think applying different companies single day developed tailored resume definitely mistaken realistic advice way', 'problem graduated got months placement ended month sure employment gap matter', 'case deleted entire conversation thread remote angellist', 'incredibly disappointing applied tonight approximately places interview wednesday pay benefits better experience working lab', 'looking people lie time respond leverage nego', 'true hope job listing removed day interview knowthat mean coincidence good vibes interview', 'think appreciate passing wouldn overthink decision probably unlikely wound way basis verbal bouquet flowers face', 'times asked previous designation title different looking similar job duties dropped thought thing modify title requirement changing description work releving letter scares dropped lying title complex dillema', 'growing industry good career discou

In [42]:
# Model topics

from bertopic import BERTopic

# Fit on the embeddings computed earlier rather than letting BERTopic embed
# the documents itself.
# NOTE(review): no seed is set here, so topic assignments may vary between
# runs — confirm whether reproducibility matters for this demo.
topic_model = BERTopic()
topic_model.fit(sentences, sentence_embeddings)

# One row per document with its assigned topic.
topic_model.get_document_info(sentences)

Unnamed: 0,Document,Topic,Name,Top_n_words,Probability,Representative_document
0,actually jobs month lol know people applying h...,1,1_amp_earn_lifestyle_designarticles,amp - earn - lifestyle - designarticles - post...,0.913622,False
1,yeahh shining shimmering splendid networking m...,0,0_job_work_like_time,job - work - like - time - jobs - know - good ...,1.000000,False
2,thank friend appreciate,0,0_job_work_like_time,job - work - like - time - jobs - know - good ...,1.000000,False
3,congratulations know hard best interview thank...,0,0_job_work_like_time,job - work - like - time - jobs - know - good ...,1.000000,False
4,absolutely factor issue going college coop pla...,0,0_job_work_like_time,job - work - like - time - jobs - know - good ...,1.000000,False
...,...,...,...,...,...,...
495,appreciate criticism post obviously agree fail...,0,0_job_work_like_time,job - work - like - time - jobs - know - good ...,0.970469,False
496,create resume https play google com store apps...,0,0_job_work_like_time,job - work - like - time - jobs - know - good ...,1.000000,False
497,quizzes earn dollars easily earn dollars minut...,1,1_amp_earn_lifestyle_designarticles,amp - earn - lifestyle - designarticles - post...,0.424676,False
498,worked,0,0_job_work_like_time,job - work - like - time - jobs - know - good ...,1.000000,False
