In [1]:
import pandas as pd
import numpy as np
import json
import glob
import gensim

from youtube_transcript_api import YouTubeTranscriptApi
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora
from gensim.models import LdaModel
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
import string
import pyLDAvis
import pyLDAvis.gensim

In [2]:
NUM_TOPICS = 50
NUM_PASSES = 100
NUM_WORDS_PER_TOPIC = 20

In [3]:
stop_words = set(stopwords.words('english'))

In [4]:
def lemmatization(texts, allowed_postags=["NOUN", "ADJ","VERB"]):
    # nlp = spacy.load("en_core_web_sm", disable=["parset", "ner"])
    nlp = spacy.load("en_core_web_sm")

    texts_out = []
    for text in texts:
        doc = nlp(text)
        new_text = []
        for token in doc:
            if token.pos_ in allowed_postags:
                new_text.append(token.lemma_)
        new_text = [token for token in new_text if token not in stop_words]
        final = " ".join(new_text)
        texts_out.append(final)

    return texts_out

In [5]:
def gen_words(texts):
    final = []
    for text in texts:
        new = gensim.utils.simple_preprocess(text, deacc=True)
        final.append(new)
    return final

In [6]:
def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [token.lower() for token in tokens]
    tokens = [token for token in tokens if token not in string.punctuation]

    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    preprocessed_text = ' '.join(tokens)
    return tokens

In [7]:
in_videos_path = "./trending_videos/USvideos.csv"
in_df = pd.read_csv(in_videos_path)

# Extracting transcripts and preprocessing 
documents = []
documents_raw = []
documents_vid = []
i = 0
for vid in in_df['video_id']:
    print(vid)
    try:
        transcript_list = YouTubeTranscriptApi.get_transcript(vid)
        transcript_text = ' '.join([item['text'] for item in transcript_list])
        # preprocessed_text = preprocess_text(transcript_text)
        # documents.append(preprocessed_text)
        documents.append(transcript_text)
        documents_raw.append(transcript_text)
        documents_vid.append(vid)

        i = i + 1
        if i >= 150:
            break
    except:
        print("Subtitles are turned off")
    print("")

2kyS6SvSYSE

1ZAPwfrtAFY

5qpjK5DgCt4
Subtitles are turned off

puqaWrEC7tY

d380meD0W0M

gHZ1Qz0KiKM

39idVpFF7NQ

nc99ccSXST0

jr9QtXwC9vc

TUmyygCMMGA

9wRQljFNDW8

VifQlJit6A0
Subtitles are turned off

5E4ZBSInqUU

GgVmn66oK_A

TaTleo4cOs8

kgaO45SyaO4
Subtitles are turned off

ZAQs-ctOqXQ

YVfyYrEmzgM

eNSN6qet1kE

B5HORANmzHw

vU14JY3x81A

6VhU_T463sU

_-aDHxoblr4

JBZTZZAcFTw

lZ68j2J_GOM

dRpNZV18N_g

fcVjitaM3LY

qeWvgZLz9yU

iIxy3JN3-jc

n30k5CwLhS4
Subtitles are turned off

U0hAC8O7RoI

CBVGjS_EJok
Subtitles are turned off

n1WpP7iowLc
Subtitles are turned off

hz7ukDjuq4w
Subtitles are turned off

p2hJxyF7mok
Subtitles are turned off

0mlNzVSJrT0
Subtitles are turned off

Om_zGhJLZ5U

e_7zHm7GsYc
Subtitles are turned off

dQvIbulWCM4
Subtitles are turned off

zZ9FciUx6gs
Subtitles are turned off

PaJCFHXcWmM
Subtitles are turned off

goP4Z5wyOlM

NZFhMSgbKKM

0tO_l_Ed5Rs
Subtitles are turned off

STI2fI7sKMo
Subtitles are turned off

BWPrk9PUwQE
Subtitles are turned off

og

In [8]:
lemmatized_documents = lemmatization(documents)
documents = gen_words(lemmatized_documents)

In [17]:
# Create dictionary and corpus
dictionary = corpora.Dictionary(documents)
corpus = [dictionary.doc2bow(doc) for doc in documents]

# Train LDA model
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=NUM_TOPICS, passes=NUM_PASSES)

print("Topics:")
for idx, topic in lda_model.print_topics(num_words=NUM_WORDS_PER_TOPIC, num_topics=NUM_TOPICS):
    print(f"Topic {idx}: {topic}")

# Compute coherence score
coherence_model = CoherenceModel(model=lda_model, texts=documents, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print(f"Coherence Score: {coherence_score}")

Topics:
Topic 0: 0.061*"go" + 0.039*"look" + 0.032*"use" + 0.019*"feel" + 0.019*"face" + 0.019*"think" + 0.017*"makeup" + 0.015*"guy" + 0.014*"foundation" + 0.014*"little" + 0.013*"eye" + 0.013*"love" + 0.013*"powder" + 0.012*"bit" + 0.012*"makeover" + 0.012*"brush" + 0.012*"like" + 0.010*"side" + 0.010*"shade" + 0.010*"artist"
Topic 1: 0.021*"people" + 0.021*"put" + 0.021*"know" + 0.014*"go" + 0.014*"world" + 0.014*"see" + 0.014*"fake" + 0.007*"number" + 0.007*"moment" + 0.007*"trick" + 0.007*"smile" + 0.007*"spotlight" + 0.007*"like" + 0.007*"place" + 0.007*"act" + 0.007*"tell" + 0.007*"life" + 0.007*"lady" + 0.007*"stage" + 0.007*"promise"
Topic 2: 0.035*"go" + 0.027*"get" + 0.018*"say" + 0.017*"good" + 0.015*"look" + 0.014*"phone" + 0.014*"see" + 0.013*"think" + 0.013*"know" + 0.013*"make" + 0.013*"device" + 0.011*"give" + 0.010*"guy" + 0.010*"win" + 0.008*"thing" + 0.008*"want" + 0.008*"need" + 0.008*"come" + 0.007*"let" + 0.007*"fake"
Topic 3: 0.047*"go" + 0.030*"spaghetti" + 0.0

In [18]:
columns = [f"word_{i}" for i in range(1, NUM_WORDS_PER_TOPIC + 1)]

# Initialize an empty list to collect data for DataFrame rows
data_rows = []

# Process each topic from the LDA model
for idx, topic in lda_model.print_topics(num_words = NUM_WORDS_PER_TOPIC, num_topics=NUM_TOPICS):
    # Split the topic string and extract words with scores
    words_scores = [word.split("*") for word in topic.split("+")]
    words_scores = [(word.strip().strip('"'), float(score.strip())) for score, word in words_scores]
    
    # Sort words_scores by score (descending)
    words_scores.sort(key=lambda x: x[1], reverse=True)
    
    # Create a list of (word, score) tuples for the topic
    word_score_tuples = [(word, score) for word, score in words_scores[:NUM_WORDS_PER_TOPIC]]
    
    # Append the row data (list of tuples) to the data_rows list
    data_rows.append(word_score_tuples)

# Create DataFrame from collected data_rows
topics_df = pd.DataFrame(data_rows, columns=columns)

# Add index for each topic (e.g., Topic 1, Topic 2, ...)
topics_df.index = [f"Topic {i}" for i in range(len(topics_df))]

# Display the topics DataFrame
print("Topics DataFrame:")
print(topics_df)

Topics DataFrame:
                       word_1             word_2                word_3  \
Topic 0           (go, 0.061)      (look, 0.039)          (use, 0.032)   
Topic 1       (people, 0.021)       (put, 0.021)         (know, 0.021)   
Topic 2           (go, 0.035)       (get, 0.027)          (say, 0.018)   
Topic 3           (go, 0.047)  (spaghetti, 0.03)         (none, 0.028)   
Topic 4     (detector, 0.059)       (lie, 0.059)         (buzz, 0.048)   
Topic 5       (record, 0.019)     (track, 0.013)         (food, 0.013)   
Topic 6           (know, 0.0)         (get, 0.0)             (go, 0.0)   
Topic 7           (go, 0.036)       (wait, 0.03)          (get, 0.027)   
Topic 8             (go, 0.0)        (know, 0.0)            (get, 0.0)   
Topic 9           (go, 0.027)     (think, 0.018)         (make, 0.017)   
Topic 10          (go, 0.051)       (get, 0.031)         (make, 0.015)   
Topic 11         (fbe, 0.049)      (face, 0.034)    (recognize, 0.027)   
Topic 12       (than

In [11]:
# Topic Inference

from gensim.models import TfidfModel
tfidf_model = TfidfModel(corpus)

# Transform the corpus to TF-IDF space
corpus_tfidf = tfidf_model[corpus]

# # Print TF-IDF 
# for doc in corpus_tfidf:
#     print(doc)

In [19]:
doc_topic_distribution = [lda_model.get_document_topics(doc_tfidf) for doc_tfidf in corpus_tfidf]

# Map documents to topics
document_topics = []
for doc_topics in doc_topic_distribution:
    # Sort by probability 
    doc_topics.sort(key=lambda x: x[1], reverse=True)
    
    #taking top three topics
    top_three_topics = doc_topics[:3]
    document_topics.append(top_three_topics)


for i, topics in enumerate(document_topics):
    print(f"Document {i} is assigned to topics:")
    for topic, prob in topics:
        print(f"  Topic {topic}: Probability {prob:.3f}")

Document 0 is assigned to topics:
  Topic 9: Probability 0.907
  Topic 40: Probability 0.018
Document 1 is assigned to topics:
  Topic 40: Probability 0.771
  Topic 20: Probability 0.076
  Topic 42: Probability 0.065
Document 2 is assigned to topics:
  Topic 33: Probability 0.869
  Topic 40: Probability 0.045
Document 3 is assigned to topics:
  Topic 10: Probability 0.918
Document 4 is assigned to topics:
  Topic 28: Probability 0.930
Document 5 is assigned to topics:
  Topic 40: Probability 0.648
  Topic 43: Probability 0.252
Document 6 is assigned to topics:
  Topic 28: Probability 0.630
  Topic 10: Probability 0.198
  Topic 20: Probability 0.043
Document 7 is assigned to topics:
  Topic 1: Probability 0.879
Document 8 is assigned to topics:
  Topic 18: Probability 0.637
  Topic 40: Probability 0.193
  Topic 43: Probability 0.103
Document 9 is assigned to topics:
  Topic 19: Probability 0.842
Document 10 is assigned to topics:
  Topic 29: Probability 0.637
Document 11 is assigned to 

In [22]:
print(documents_vid[124])
print("")
print(documents[124])
print("")
print(documents_raw[124])
print("")
print(lda_model.print_topics(num_words=NUM_WORDS_PER_TOPIC, num_topics=NUM_TOPICS)[39])

6hTzM1BPdU8

['think', 'least', 'year', 'violin', 'hi', 'guy', 'draw', 'know', 'go', 'draw', 'way', 'laugh', 'drawing', 'think', 'first', 'thing', 'pop', 'head', 'flag', 'flag', 'guess', 'describe', 'flag', 'look', 'star', 'blue', 'part', 'rectangle', 'shape', 'star', 'state', 'many', 'star', 'think', 'think', 'know', 'cheese', 'hot', 'dog', 'american', 'cheese', 'american', 'cheese', 'draw', 'think', 'thing', 'know', 'describe', 'look', 'person', 'person', 'real', 'person', 'man', 'lady', 'lady', 'lady', 'hold', 'candle', 'book', 'get', 'candle', 'hand', 'wear', 'hat', 'girl', 'book', 'crown', 'long', 'hair', 'short', 'hair', 'girl', 'long', 'make', 'hold', 'puppy', 'cover', 'know', 'president', 'know', 'president', 'think', 'think', 'never', 'rosy', 'cheek', 'look', 'ghost', 'ghost', 'first', 'president', 'old', 'first', 'president', 'age', 'old', 'guy', 'think', 'bajillion', 'year', 'old', 'think', 'dinosaur', 'use', 'live', 'dinosaur', 'tear', 'make', 'one', 'laugh', 'woman', 'aliv