In [1]:
import glob
import json
import string

import gensim
import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim
import spacy
from gensim import corpora
from gensim.models import CoherenceModel
from gensim.models import LdaModel
from gensim.models import TfidfModel
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from youtube_transcript_api import YouTubeTranscriptApi

In [2]:
# Number of latent topics the LDA model is asked to fit (passed as `num_topics`).
# NOTE(review): the transcript loop stops after roughly 100 documents, so 50
# topics may be a lot for a corpus of that size — confirm with the coherence score.
NUM_TOPICS = 50
# Number of training passes over the corpus (passed as `passes`).
NUM_PASSES = 20

In [3]:
# English stop-word set from NLTK; `lemmatization` filters its lemmas against it.
# NOTE(review): presumably needs the NLTK "stopwords" corpus to be downloaded
# beforehand — confirm for a fresh environment.
stop_words = set(stopwords.words('english'))

In [4]:
def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    """Lemmatize each text, keeping only tokens with selected parts of speech.

    Args:
        texts: Iterable of raw document strings.
        allowed_postags: Coarse spaCy POS tags (``token.pos_``) to keep. The
            default list is only read, never mutated.

    Returns:
        A list with one string per input text: the space-joined lemmas of the
        kept tokens, minus any lemma present in the module-level ``stop_words``.
    """
    # "parser" was misspelled as "parset", so the dependency parser was not
    # being disabled; it is not needed for POS tags or lemmas.
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    texts_out = []
    # nlp.pipe streams the texts through the pipeline in batches.
    for doc in nlp.pipe(texts):
        lemmas = [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        kept = [lemma for lemma in lemmas if lemma not in stop_words]
        texts_out.append(" ".join(kept))

    return texts_out

In [5]:
def gen_words(texts):
    """Tokenize every text with gensim's ``simple_preprocess`` (``deacc=True``).

    Returns a list holding one token list per input text, in input order.
    """
    return [gensim.utils.simple_preprocess(doc, deacc=True) for doc in texts]

In [6]:
def preprocess_text(text):
    """Tokenize, normalize, and lemmatize a raw text.

    Steps, in order: NLTK word tokenization, lower-casing, removal of
    punctuation-only tokens, removal of English stop words, WordNet
    lemmatization.

    Args:
        text: Raw document string.

    Returns:
        The list of processed tokens (not a joined string).
    """
    english_stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    tokens = []
    for raw_token in word_tokenize(text):
        token = raw_token.lower()
        # Drop tokens made up solely of punctuation characters. The previous
        # substring test (`token not in string.punctuation`) let
        # multi-character punctuation tokens such as "``", "''" or "..." through.
        if all(char in string.punctuation for char in token):
            continue
        if token in english_stop_words:
            continue
        tokens.append(lemmatizer.lemmatize(token))

    return tokens

In [None]:
in_videos_path = "./trending_videos/USvideos.csv"
in_df = pd.read_csv(in_videos_path)

# Upper bound on how many transcripts to collect for the topic model.
MAX_TRANSCRIPTS = 100

# Extracting transcripts and preprocessing
documents = []
documents_directory = []
# A video id may be listed on more than one row; fetch each id only once so the
# same transcript is not added to the corpus repeatedly.
for vid in in_df['video_id'].drop_duplicates():
    print(vid)
    try:
        transcript_list = YouTubeTranscriptApi.get_transcript(vid)
        transcript_text = ' '.join([item['text'] for item in transcript_list])
    except Exception as exc:
        # Best-effort: skip videos without a usable transcript. Catching
        # Exception (not a bare except) lets Ctrl-C stop the loop, and the
        # exception name shows the real reason instead of always blaming
        # disabled subtitles.
        print(f"Transcript unavailable ({type(exc).__name__})")
    else:
        documents.append(transcript_text)
        # Stop at exactly MAX_TRANSCRIPTS; the old `i > 100` check let 101 through.
        if len(documents) >= MAX_TRANSCRIPTS:
            break
    print("")

2kyS6SvSYSE
Subtitles are turned off

1ZAPwfrtAFY

5qpjK5DgCt4
Subtitles are turned off

puqaWrEC7tY

d380meD0W0M

gHZ1Qz0KiKM
Subtitles are turned off

39idVpFF7NQ

nc99ccSXST0

jr9QtXwC9vc
Subtitles are turned off

TUmyygCMMGA

9wRQljFNDW8

VifQlJit6A0
Subtitles are turned off

5E4ZBSInqUU


In [None]:
# `documents` is rebound below from raw transcript strings to token lists, so a
# second run of this cell would feed token lists back into `lemmatization`.
# Only preprocess while `documents` still holds strings (or is empty), which
# makes the cell safe to re-run.
if not documents or isinstance(documents[0], str):
    lemmatized_documents = lemmatization(documents)
    documents = gen_words(lemmatized_documents)

In [None]:
# Create dictionary and corpus
dictionary = corpora.Dictionary(documents)
corpus = [dictionary.doc2bow(doc) for doc in documents]

# Train LDA model
# LDA training is stochastic; a fixed seed keeps the topics and the coherence
# score repeatable across Restart & Run All.
LDA_RANDOM_SEED = 42
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    passes=NUM_PASSES,
    random_state=LDA_RANDOM_SEED,
)

print("Topics:")
# print_topics() shows only 20 topics by default; num_topics=-1 lists all of them.
for idx, topic in lda_model.print_topics(num_topics=-1):
    print(f"Topic {idx}: {topic}")

# Compute coherence score
coherence_model = CoherenceModel(model=lda_model, texts=documents, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print(f"Coherence Score: {coherence_score}")

In [None]:
word_columns = [f"Word_{i}" for i in range(1, 11)]

# formatted=False returns (topic_id, [(word, weight), ...]) pairs, so the
# 'weight*"word" + ...' strings do not have to be parsed; num_topics=-1 asks
# for every topic rather than the default subset.
topic_rows = []
topic_labels = []
for idx, word_weights in lda_model.show_topics(
    num_topics=-1, num_words=len(word_columns), formatted=False
):
    ranked = sorted(
        ((word, float(weight)) for word, weight in word_weights),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # One row per topic; each cell holds a (word, weight) tuple.
    topic_rows.append(dict(zip(word_columns, ranked)))
    # Label rows with the model's own topic id, matching the other cells.
    topic_labels.append(f"Topic {idx}")

# Build the frame once from the collected rows. Passing the dict of tuples
# straight to pd.DataFrame (as before) expanded every topic into two rows —
# one of words and one of weights — and concatenating inside the loop was quadratic.
topics_df = pd.DataFrame(topic_rows, columns=word_columns, index=topic_labels)

print("Topics DataFrame:")
print(topics_df)

In [None]:
# Topic Inference

# TfidfModel is imported with the other gensim imports at the top of the notebook.
tfidf_model = TfidfModel(corpus)

# Transform the corpus to TF-IDF space
# NOTE(review): lda_model was trained on the raw bag-of-words `corpus`, so these
# TF-IDF vectors use a different weighting than the one the model was fitted on.
corpus_tfidf = tfidf_model[corpus]

In [None]:
# Number of highest-probability topics reported per document.
TOP_TOPICS_PER_DOC = 3

# Infer on the bag-of-words `corpus` the model was trained on. Feeding TF-IDF
# weights to a model fitted on raw counts mismatches the input scale and skews
# the inferred topic mixtures.
doc_topic_distribution = [lda_model.get_document_topics(doc_bow) for doc_bow in corpus]

# Map documents to topics
document_topics = []
for doc_topics in doc_topic_distribution:
    # Sort by probability, most likely topic first (in place, as before).
    doc_topics.sort(key=lambda pair: pair[1], reverse=True)

    # Keep only the leading topics for the report below.
    document_topics.append(doc_topics[:TOP_TOPICS_PER_DOC])


for i, topics in enumerate(document_topics):
    print(f"Document {i} is assigned to topics:")
    for topic, prob in topics:
        print(f"  Topic {topic}: Probability {prob:.3f}")