In [1]:
import pandas as pd
import numpy as np
import json
import glob
import gensim

from youtube_transcript_api import YouTubeTranscriptApi
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora
from gensim.models import LdaModel
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
import string
import pyLDAvis
import pyLDAvis.gensim

In [2]:
NUM_TOPICS = 20
NUM_PASSES = 50

In [3]:
stop_words = set(stopwords.words('english'))

In [4]:
def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    nlp = spacy.load("en_core_web_sm", disable=["parset", "ner"])
    texts_out = []
    for text in texts:
        doc = nlp(text)
        new_text = []
        for token in doc:
            if token.pos_ in allowed_postags:
                new_text.append(token.lemma_)
        new_text = [token for token in new_text if token not in stop_words]
        final = " ".join(new_text)
        texts_out.append(final)

    return texts_out

In [5]:
def gen_words(texts):
    final = []
    for text in texts:
        new = gensim.utils.simple_preprocess(text, deacc=True)
        final.append(new)
    return final

In [6]:
def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [token.lower() for token in tokens]
    tokens = [token for token in tokens if token not in string.punctuation]

    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    preprocessed_text = ' '.join(tokens)
    return tokens

In [7]:
in_videos_path = "./trending_videos/USvideos.csv"
in_df = pd.read_csv(in_videos_path)

# Extracting transcripts and preprocessing 
documents = []
documents_raw = []
documents_vid = []
i = 0
for vid in in_df['video_id']:
    print(vid)
    try:
        transcript_list = YouTubeTranscriptApi.get_transcript(vid)
        transcript_text = ' '.join([item['text'] for item in transcript_list])
        # preprocessed_text = preprocess_text(transcript_text)
        # documents.append(preprocessed_text)
        documents.append(transcript_text)
        documents_raw.append(transcript_text)
        documents_vid.append(vid)

        i = i + 1
        if i > 150:
            break
    except:
        print("Subtitles are turned off")
    print("")

2kyS6SvSYSE

1ZAPwfrtAFY

5qpjK5DgCt4
Subtitles are turned off

puqaWrEC7tY

d380meD0W0M

gHZ1Qz0KiKM

39idVpFF7NQ

nc99ccSXST0

jr9QtXwC9vc

TUmyygCMMGA

9wRQljFNDW8

VifQlJit6A0
Subtitles are turned off

5E4ZBSInqUU

GgVmn66oK_A

TaTleo4cOs8

kgaO45SyaO4
Subtitles are turned off

ZAQs-ctOqXQ

YVfyYrEmzgM

eNSN6qet1kE

B5HORANmzHw

vU14JY3x81A

6VhU_T463sU

_-aDHxoblr4

JBZTZZAcFTw

lZ68j2J_GOM

dRpNZV18N_g

fcVjitaM3LY

qeWvgZLz9yU

iIxy3JN3-jc

n30k5CwLhS4
Subtitles are turned off

U0hAC8O7RoI

CBVGjS_EJok
Subtitles are turned off

n1WpP7iowLc
Subtitles are turned off

hz7ukDjuq4w
Subtitles are turned off

p2hJxyF7mok
Subtitles are turned off

0mlNzVSJrT0
Subtitles are turned off

Om_zGhJLZ5U

e_7zHm7GsYc
Subtitles are turned off

dQvIbulWCM4
Subtitles are turned off

zZ9FciUx6gs
Subtitles are turned off

PaJCFHXcWmM
Subtitles are turned off

goP4Z5wyOlM

NZFhMSgbKKM

0tO_l_Ed5Rs
Subtitles are turned off

STI2fI7sKMo
Subtitles are turned off

BWPrk9PUwQE
Subtitles are turned off

og

In [8]:
lemmatized_documents = lemmatization(documents)
documents = gen_words(lemmatized_documents)

In [14]:
# Create dictionary and corpus
dictionary = corpora.Dictionary(documents)
corpus = [dictionary.doc2bow(doc) for doc in documents]

# Train LDA model
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=NUM_TOPICS, passes=NUM_PASSES)

print("Topics:")
for idx, topic in lda_model.print_topics(num_words=15):
    print(f"Topic {idx}: {topic}")

# Compute coherence score
coherence_model = CoherenceModel(model=lda_model, texts=documents, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print(f"Coherence Score: {coherence_score}")

Topics:
Topic 0: 0.024*"get" + 0.020*"go" + 0.018*"let" + 0.017*"prime" + 0.017*"know" + 0.011*"really" + 0.009*"never" + 0.009*"thing" + 0.008*"time" + 0.008*"take" + 0.008*"see" + 0.008*"want" + 0.007*"start" + 0.007*"apartment" + 0.007*"love"
Topic 1: 0.021*"go" + 0.015*"think" + 0.015*"movie" + 0.013*"see" + 0.012*"come" + 0.012*"know" + 0.011*"get" + 0.011*"guy" + 0.010*"time" + 0.010*"look" + 0.009*"good" + 0.009*"make" + 0.008*"say" + 0.008*"thing" + 0.007*"let"
Topic 2: 0.022*"phone" + 0.021*"face" + 0.021*"really" + 0.017*"go" + 0.016*"see" + 0.015*"get" + 0.014*"video" + 0.014*"know" + 0.014*"thing" + 0.011*"take" + 0.010*"use" + 0.009*"look" + 0.009*"try" + 0.009*"cat" + 0.009*"lot"
Topic 3: 0.016*"people" + 0.010*"earthquake" + 0.008*"happen" + 0.007*"much" + 0.007*"come" + 0.006*"day" + 0.006*"call" + 0.006*"know" + 0.006*"big" + 0.005*"thing" + 0.005*"say" + 0.005*"bad" + 0.005*"building" + 0.005*"see" + 0.005*"go"
Topic 4: 0.024*"noise" + 0.014*"go" + 0.012*"speaker" + 0

In [18]:
num_words_per_topic = 15  # Number of words per topic
columns = [f"word_{i}" for i in range(1, num_words_per_topic + 1)]

# Initialize an empty list to collect data for DataFrame rows
data_rows = []

# Process each topic from the LDA model
for idx, topic in lda_model.print_topics():
    # Split the topic string and extract words with scores
    words_scores = [word.split("*") for word in topic.split("+")]
    words_scores = [(word.strip().strip('"'), float(score.strip())) for score, word in words_scores]
    
    # Sort words_scores by score (descending)
    words_scores.sort(key=lambda x: x[1], reverse=True)
    
    # Create a list of (word, score) tuples for the topic
    word_score_tuples = [(word, score) for word, score in words_scores[:num_words_per_topic]]
    
    # Append the row data (list of tuples) to the data_rows list
    data_rows.append(word_score_tuples)

# Create DataFrame from collected data_rows
topics_df = pd.DataFrame(data_rows, columns=columns)

# Add index for each topic (e.g., Topic 1, Topic 2, ...)
topics_df.index = [f"Topic {i + 1}" for i in range(len(topics_df))]

# Display the topics DataFrame
print("Topics DataFrame:")
print(topics_df)

ValueError: 15 columns passed, passed data had 10 columns

In [16]:
# Topic Inference

from gensim.models import TfidfModel
tfidf_model = TfidfModel(corpus)

# Transform the corpus to TF-IDF space
corpus_tfidf = tfidf_model[corpus]

# # Print TF-IDF 
# for doc in corpus_tfidf:
#     print(doc)

In [17]:
doc_topic_distribution = [lda_model.get_document_topics(doc_tfidf) for doc_tfidf in corpus_tfidf]

# Map documents to topics
document_topics = []
for doc_topics in doc_topic_distribution:
    # Sort by probability 
    doc_topics.sort(key=lambda x: x[1], reverse=True)
    
    #taking top three topics
    top_three_topics = doc_topics[:3]
    document_topics.append(top_three_topics)


for i, topics in enumerate(document_topics):
    print(f"Document {i} is assigned to topics:")
    for topic, prob in topics:
        print(f"  Topic {topic}: Probability {prob:.3f}")

Document 0 is assigned to topics:
  Topic 19: Probability 0.929
Document 1 is assigned to topics:
  Topic 7: Probability 0.959
Document 2 is assigned to topics:
  Topic 19: Probability 0.834
  Topic 12: Probability 0.089
Document 3 is assigned to topics:
  Topic 15: Probability 0.922
Document 4 is assigned to topics:
  Topic 2: Probability 0.936
Document 5 is assigned to topics:
  Topic 15: Probability 0.590
  Topic 14: Probability 0.164
  Topic 3: Probability 0.143
Document 6 is assigned to topics:
  Topic 15: Probability 0.875
Document 7 is assigned to topics:
  Topic 14: Probability 0.887
Document 8 is assigned to topics:
  Topic 13: Probability 0.380
  Topic 11: Probability 0.364
  Topic 19: Probability 0.135
Document 9 is assigned to topics:
  Topic 6: Probability 0.848
Document 10 is assigned to topics:
  Topic 15: Probability 0.648
  Topic 0: Probability 0.019
  Topic 1: Probability 0.019
Document 11 is assigned to topics:
  Topic 8: Probability 0.926
Document 12 is assigned to 

In [13]:
print(documents[18])

['norm', 'kind', 'hiding', 'massive', 'helmet', 'great', 'see', 'first', 'first', 'tell', 'guy', 'make', 'lot', 'costume', 'end', 'film', 'go', 'consistently', 'busy', 'amazing', 'project', 'work', 'sure', 'movie', 'well', 'receive', 'image', 'look', 'amazing', 'think', 'first', 'trailer', 'come', 'people', 'stun', 'silhouette', 'helmet', 'helmet', 'big', 'antler', 'unique', 'distinct', 'realize', 'guy', 'make', 'physical', 'helmet', 'wear', 'think', 'want', 'moment', 'wear', 'even', 'feeling', 'great', 'challenge', 'pretty', 'big', 'challenge', 'get', 'weight', 'right', 'balance', 'head', 'get', 'secure', 'actually', 'come', 'far', 'forehead', 'think', 'nail', 'first', 'see', 'walk', 'studio', 'pretty', 'impressive', 'let', 'talk', 'fabrication', 'bunch', 'way', 'make', 'assume', 'original', 'thing', 'model', 'correct', 'hand', 'sculpt', 'print', 'pretty', 'much', 'model', 'computer', 'scan', 'datum', 'make', 'skull', 'cap', 'fit', 'really', 'well', 'start', 'model', 'heavily', 'invol