In [1]:
import pandas as pd
import numpy as np

from cleantext import clean

from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic import BERTopic       

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df=pd.read_csv('spotify2.csv')

In [3]:
df['content']=df['content'].apply(lambda x: clean(x, no_emoji=True, no_punct=True))

In [4]:
docs=df.content.values
docs=docs.astype('str')

In [15]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words="english")

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# All steps together
topic_model = BERTopic(
  embedding_model=embedding_model,    # Step 1 - Extract embeddings
  umap_model=umap_model,              # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,        # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,          # Step 5 - Extract topic words
  #diversity=0.5,                      # Diversify topic words
  calculate_probabilities=False, 
  n_gram_range=(1,2),
  nr_topics=10,       
  verbose=True
)

topics, probs = topic_model.fit_transform(docs)
topic_model.get_topic_info()

2024-01-10 14:23:56,955 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 15187/15187 [41:05<00:00,  6.16it/s] 
2024-01-10 15:05:07,598 - BERTopic - Embedding - Completed ✓
2024-01-10 15:05:07,599 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-01-10 16:01:54,631 - BERTopic - Dimensionality - Completed ✓
2024-01-10 16:01:54,681 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-01-10 16:03:09,472 - BERTopic - Cluster - Completed ✓
2024-01-10 16:03:09,473 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-01-10 16:03:21,886 - BERTopic - Representation - Completed ✓
2024-01-10 16:03:21,895 - BERTopic - Topic reduction - Reducing number of topics
2024-01-10 16:03:29,717 - BERTopic - Topic reduction - Reduced number of topics from 3431 to 10


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,162826,-1_app_music_spotify_song,"[app, music, spotify, song, songs, premium, pl...",[this was my favourite music streaming app the...
1,0,196586,0_app_music_spotify_update,"[app, music, spotify, update, premium, worst, ...","[its such a good app to listen the music, best..."
2,1,101113,1_good_nice_love_awesome,"[good, nice, love, awesome, bad, excellent, gr...","[good, good, good]"
3,2,11328,2_hai_que_la_bycott,"[hai, que, la, bycott, se, bycot, bekar, boyco...","[bekar hai, una excelente app para escuchar y ..."
4,3,7181,3_boycott_sweden_quran_islam,"[boycott, sweden, quran, islam, hate, love, bu...","[boycott sweden app, boycott all sweden app, w..."
5,4,4603,4_adds_star_stars_add,"[adds, star, stars, add, rating, zero, deserve...","[to much of adds, so many adds, adds]"
6,5,1932,5_poor_money_greedy_expensive,"[poor, money, greedy, expensive, price, paid, ...","[very poor side, poor, poor]"
7,6,241,6_vary_wrapped_worest_seamless,"[vary, wrapped, worest, seamless, 2022, bad, a...","[vary vary vary bad app, vary vary bad app, va..."
8,7,131,7_joe_rogan_neil_young,"[joe, rogan, neil, young, love, experience, mi...","[joe rogan, joe rogan d, only here for joe rogan]"
9,8,17,8_tin_says_does_exactly,"[tin, says, does, exactly, faultlessly, tinks,...","[does what it says on the tin, does what it sa..."


In [5]:
timestamps=df['at']
import datetime
timestamps=pd.to_datetime(timestamps).dt.date
timestamps = timestamps.to_list()

In [16]:
topics_over_time=topic_model.topics_over_time(docs, timestamps)

362it [01:44,  3.48it/s]


In [17]:
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=5, normalize_frequency=True)

In [10]:
topic_model.reduce_topics(docs,nr_topics=15)

2024-01-10 11:39:18,047 - BERTopic - Topic reduction - Reducing number of topics
2024-01-10 11:39:25,394 - BERTopic - Topic reduction - Reduced number of topics from 3448 to 15


<bertopic._bertopic.BERTopic at 0x1a582cf59f0>

In [11]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,164976,-1_app_spotify_music_song,"[app, spotify, music, song, songs, premium, pl...",[i hate spotify it adds songs to my playlist e...
1,0,194258,0_app_music_worst_update,"[app, music, worst, update, hate, spotify, pre...","[worst app after update, worst app for music, ..."
2,1,111184,1_good_nice_awesome_love,"[good, nice, awesome, love, bad, excellent, gr...","[good, very good, good]"
3,2,7479,2_sweden_boycott_quran_islam,"[sweden, boycott, quran, islam, hate, love, bu...","[boycott sweden app, boycott sweden app, boyco..."
4,3,4212,3_adds_star_stars_rating,"[adds, star, stars, rating, add, zero, deserve...","[too many adds, adds, too many adds]"
5,4,1723,4_que_la_es_musica,"[que, la, es, musica, en, para, el, se, las, c...",[esta es la mejor aplicacion para escuchar mus...
6,5,579,5_fake_ban_restrictions_scam,"[fake, ban, restrictions, scam, banned, legit,...","[fake, fake, fake]"
7,6,469,6_sleep_timer_bed_sleeping,"[sleep, timer, bed, sleeping, night, alarm, se...","[i what sleep timer, is there any sleep timer,..."
8,7,294,7_west_nyc_land_bike,"[west, nyc, land, bike, court, time, app, code...","[west app ever, west app, its west app]"
9,8,255,8_class_3rd_app_cls,"[class, 3rd, app, cls, learned, classy, thirdc...","[third class app, third class app, third class..."


In [12]:
topics_over_time=topic_model.topics_over_time(docs, timestamps)

362it [01:25,  4.24it/s]


In [13]:
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10, normalize_frequency=True)

In [7]:
topics_model1 = BERTopic(
    n_gram_range=(1, 2),
    nr_topics=10,
    min_topic_size=15,
    calculate_probabilities=False)

In [8]:
topics1, probs1 = topics_model1.fit_transform(docs)

In [9]:
topics_model1.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,161097,-1_the_to_and_it,"[the, to, and, it, app, is, this, you, for, mu...",[you can find almost every song you need on he...
1,0,196525,0_app_the_to_this,"[app, the, to, this, is, it, and, music, for, ...","[i hate this app, i hate this app, i hate this..."
2,1,100634,1_good_very_nice_love it,"[good, very, nice, love it, love, it, bad, gre...","[good, good, good]"
3,2,12931,2_de_hai_que_la,"[de, hai, que, la, se, bycott, boycot, bycot, ...",[de repente se salen canciones de mi playlist ...
4,3,7387,3_boycott_sweden_quran_islam,"[boycott, sweden, quran, islam, boycott sweden...","[boycott sweden, boycott sweden, boycott sweden]"
5,4,4309,4_adds_star_give_stars,"[adds, star, give, stars, too, many adds, it, ...","[adds, adds, adds]"
6,5,2357,5_poor_greedy_money_very poor,"[poor, greedy, money, very poor, expensive, ve...","[very poor, very poor, very poor]"
7,6,332,6_heart_the heart_back_bring,"[heart, the heart, back, bring, back the, the,...","[bring back the heart, bring back the heart, b..."
8,7,228,7_class_class app_third class_third,"[class, class app, third class, third, 3rd cla...","[third class app, third class app, third class..."
9,8,158,8_joe_rogan_joe rogan_neil,"[joe, rogan, joe rogan, neil, neil young, youn...","[joe rogan, joe rogan d, joe rogan]"


In [10]:
topics_over_time=topics_model1.topics_over_time(docs, timestamps)



In [13]:
topics_model1.visualize_topics_over_time(topics_over_time, top_n_topics=5, normalize_frequency=True)