In [2]:
pip install pandas nltk gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy>=1.23.2 (from pandas)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m69.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━

### Reddit Topic Modeling

In [6]:
import pandas as pd
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora
from gensim.models.ldamodel import LdaModel
import nltk

# 1. Load data
df = pd.read_csv('reddit_hot_posts 2.csv', low_memory=False)

# 2. Define engagement metric
df['engagement'] = df['score'].fillna(0) + df['num_comments'].fillna(0)

# 3. NLTK only for stopwords + lemmatizer data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')    # for WordNet lemmatizer

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    if pd.isna(text):
        return []
    # simple_preprocess: lowercases, strips accents, tokenizes on word boundaries, removes tokens <2 or >15 chars
    tokens = simple_preprocess(text, deacc=True)
    return [lemmatizer.lemmatize(t) for t in tokens if t not in stop_words]

# Combine title + selftext
df['doc'] = (df['title'].fillna('') + ' ' + df['selftext'].fillna('')).apply(preprocess)

# 4. Build dictionary & corpus
dictionary = corpora.Dictionary(df['doc'])
dictionary.filter_extremes(no_below=5, no_above=0.5)
corpus = [dictionary.doc2bow(doc) for doc in df['doc']]

# 5. Fit LDA
NUM_TOPICS = 200
lda = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    passes=10,
    random_state=42
)

# 6. Assign dominant topic
def get_dominant_topic(bow):
    topics = lda.get_document_topics(bow)
    return max(topics, key=lambda x: x[1])[0] if topics else None

df['topic'] = [get_dominant_topic(b) for b in corpus]

# 7. Aggregate engagement
agg = (
    df.groupby('topic')
      .agg(
          total_engagement=('engagement', 'sum'),
          doc_count=('engagement', 'size')
      )
      .reset_index()
)

# 8. Extract top words per topic
agg['top_words'] = agg['topic'].apply(lambda t: [w for w, _ in lda.show_topic(int(t), topn=10)])

# … after you’ve built `agg` and populated `agg['top_words']` …

# 9a. Auto‐generate a simple label from the first two top words
agg['topic_label'] = agg['top_words'].apply(lambda words: ' '.join(words[:2]))

# 9b. Pick top 200 topics by engagement
top200 = (
    agg
    .sort_values('total_engagement', ascending=False)
    .head(200)
    # reorder columns for clarity
    [['topic', 'topic_label', 'top_words', 'total_engagement', 'doc_count']]
)

# 10. Save or inspect
top200.to_csv('top_200_topics_labeled.csv', index=False)
print(top200)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


     topic    topic_label                                          top_words  \
39    72.0   trump israel  [trump, israel, iran, thought, ceasefire, know...   
79   144.0       day hand  [day, hand, baby, two, around, move, may, big,...   
58   103.0        car lol  [car, lol, almost, model, ended, found, rest, ...   
53    97.0     first time  [first, time, make, favorite, see, series, cha...   
84   153.0     made never  [made, never, least, cake, color, coffee, stil...   
..     ...            ...                                                ...   
104  196.0    source game  [source, game, gt, good, see, really, back, no...   
61   116.0       http com  [http, com, original, say, right, post, coming...   
34    63.0      even told  [even, told, enough, one, ever, maybe, woman, ...   
69   128.0  going article  [going, article, get, go, six, made, dinner, w...   
31    57.0     room would  [room, would, cover, like, stay, house, door, ...   

     total_engagement  doc_count  
39  

# Youtube Topic Modeling

In [10]:
import pandas as pd
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora
from gensim.models.ldamodel import LdaModel
import nltk

# 1. Load YouTube data
df = pd.read_csv('youtube_trending_analysis_2025-06-20 (2).csv', low_memory=False)

# 2. Engagement = View Count + Like Count
df['engagement'] = (
    df['View Count'].fillna(0)
  + df['Like Count'].fillna(0)
)

# 3. Download NLTK resources (if you haven’t already)
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    if pd.isna(text):
        return []
    tokens = simple_preprocess(text, deacc=True)
    return [lemmatizer.lemmatize(t) for t in tokens if t not in stop_words]

# 4. Build “documents” from Title + Description
#    (make sure your file has a 'Description' column—if it’s named differently, swap in the exact name)
df['doc'] = (
    df['Title'].fillna('') + ' '
  + df['Description'].fillna('')
).apply(preprocess)

# 5. Dictionary & Corpus
dictionary = corpora.Dictionary(df['doc'])
dictionary.filter_extremes(no_below=5, no_above=0.5)
corpus = [dictionary.doc2bow(doc) for doc in df['doc']]

# 6. Fit LDA
NUM_TOPICS = 100
lda = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    passes=10,
    random_state=42
)

# 7. Dominant topic per video
def get_dominant_topic(bow):
    topics = lda.get_document_topics(bow)
    return max(topics, key=lambda x: x[1])[0] if topics else None

df['topic'] = [get_dominant_topic(b) for b in corpus]

# 8. Aggregate engagement by topic
agg = (
    df.groupby('topic')
      .agg(
          total_engagement=('engagement', 'sum'),
          video_count=('engagement', 'size')
      )
      .reset_index()
)

# 9. Extract top words & naive labels
agg['top_words'] = agg['topic'].apply(
    lambda t: [w for w, _ in lda.show_topic(int(t), topn=10)]
)
agg['topic_label'] = agg['top_words'].apply(lambda ws: ' '.join(ws[:2]))

# 10. Select top topics
top_topics = (
    agg.sort_values('total_engagement', ascending=False)
       .head(min(200, NUM_TOPICS))
       [['topic','topic_label','top_words','total_engagement','video_count']]
)

# 11. Save & inspect
top_topics.to_csv('youtube_top_topics.csv', index=False)
print(top_topics)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


    topic         topic_label  \
20   20.0         枚ランタム封入 種より   
67   68.0      official music   
48   49.0      official ateez   
63   64.0     youtube twitter   
60   61.0      music director   
..    ...                 ...   
7     7.0            nbc news   
24   24.0          gd peacock   
85   87.0  bensonboone benson   
77   79.0               कर ki   
14   14.0           sky sport   

                                            top_words  total_engagement  \
20  [枚ランタム封入, 種より, japan, cd, book, photo, 枚封入, of...         333009128   
67  [official, music, song, sidhu, punjabi, apple,...         211752939   
48  [official, ateez, kqent, ojo, osaka, kq, faceb...         194464050   
63  [youtube, twitter, jp, channel, co, tiktok, su...         129678402   
60  [music, director, producer, song, production, ...         128813574   
..                                                ...               ...   
7   [nbc, news, wjar, u, follow, app, read, karen,...           1370891   
24 