In [1]:
from app.tools.youtube import fetch_comments
from app.tools.preprocess import select_fast_batch, preprocess_comments_df

# Step 1 - fetch the video's comments (replies included, at most 5000),
# handing the fetcher ./.cache.db as its SQLite path.
video_url = "https://www.youtube.com/watch?v=26riTPNOJbc"
fetch_opts = {
    "sqlite_path": "./.cache.db",
    "include_replies": True,
    "max_comments": 5000,
}
df_all = fetch_comments(video_url, **fetch_opts)

# Step 2 - "fast" subset: mode="top_likes", capped at 1200 rows, replies excluded.
fast_opts = {"mode": "top_likes", "limit": 1200, "include_replies": False}
df_fast = select_fast_batch(df_all, **fast_opts)

# Step 3 - preprocess. With return_debug=True the call is unpacked into a pair:
# the cleaned frame and a debug table.
preprocess_opts = {
    "min_chars": 12,  # was 20, lowered to 12
    "keep_langs": ("uk", "ru", "en", "pl", "cs", "sk"),
    "drop_spam": True,
    "deduplicate": True,
    "aggressive_stopword_check": False,
    "return_debug": True,
}
df_pre, dbg = preprocess_comments_df(df_fast, **preprocess_opts)

# Step 4 - inspect: the debug table as plain text without its index,
# followed by the first rows of the columns of interest.
preview_cols = ["comment_id", "text_clean", "lang", "like_count"]
debug_text = dbg.to_string(index=False)
print(debug_text)
print(df_pre[preview_cols].head())


 n_in  after_minlen  after_lang  after_spam  after_dedup  dropped_minlen  dropped_lang  dropped_spam  dropped_dup                                                                     lang_counts
   40            38          34          34           34               2             4             0            0 {'pl': 20, 'uk': 10, 'ru': 1, 'unknown': 1, 'sk': 1, 'hr': 1, 'en': 1, 'hu': 1}
                   comment_id  \
0  Ugx8YMgNr70XWO9_PGN4AaABAg   
1  UgwnH4Az1RliZfk5B7x4AaABAg   
2  UgxITgSgyx1a2BgJHd94AaABAg   
3  UgyPwsZ3OWS21CNg0AF4AaABAg   
4  Ugxa42Xs9RDHwgB3iJd4AaABAg   

                                          text_clean lang  like_count  
0  Чернівці люблю, чекаю на наступне відео) Вітаю...   uk          10  
1  W końcu doczekałem się nowego odcinka. Pozdrawiam   pl           9  
2  Щоб відчути справжню Україну у часи війни вам ...   uk           8  
3  Kowalski,nie pieprz bzdury,bo u was parkany ta...   pl           7  
4  Cześć, bardzo poruszające filmy. Jadąc z Karpa... 

In [1]:
from app.tools.youtube import fetch_comments
from app.tools.preprocess import select_fast_batch, preprocess_comments_df
from app.tools.embeddings import embed_texts
from app.tools.cluster import cluster_embeddings, summarize_clusters, attach_cluster_columns

# 1) Fetch the comments / use the cache
video_url = "https://www.youtube.com/watch?v=26riTPNOJbc"
df_all = fetch_comments(
    video_url,
    sqlite_path="./.cache.db",
    include_replies=True,
    max_comments=5000,
)

# 2) Fast mode + preprocessing
df_fast = select_fast_batch(
    df_all,
    mode="top_likes",
    limit=1200,
    include_replies=False,
)
kept_langs = ("uk", "ru", "en", "pl", "cs", "sk")
df_pre = preprocess_comments_df(df_fast, min_chars=12, keep_langs=kept_langs)

# 3) Embeddings (uses EMB_MODEL from .env)
clean_texts = df_pre["text_clean"].tolist()
X = embed_texts(clean_texts)

# 4) Clustering
cluster_opts = {"sim_threshold": 0.34, "min_cluster_size": 3}
labels = cluster_embeddings(X, **cluster_opts)

# 5) Short per-cluster summary (no LLM-generated labels yet)
topics_df = summarize_clusters(df_pre, X, labels, topk_quotes=3)
topics_preview = topics_df.head()
display(topics_preview)

# 6) (optional) Attach the cluster columns to the frame for later steps
df_with_clusters = attach_cluster_columns(df_pre, X, labels)


  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

Unnamed: 0,topic_id,size,share,example_comment_ids,example_quotes
0,0,33,0.9706,"[UgxKXMF8Lyu6gTeuLch4AaABAg, UgwNvS1NJusXYwXlv...","[Cześć, ceny za budynek to oszustwo, chiba gdz..."
