In [1]:
import pandas as pd
import numpy as np
import json
import glob
import gensim

from youtube_transcript_api import YouTubeTranscriptApi
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora
from gensim.models import LdaModel
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
import string
import pyLDAvis
import pyLDAvis.gensim

In [2]:
# English stop-word set shared by the cleaning helpers below (lemmatization()
# reads this module-level name). Built as a set so membership tests are O(1).
# NOTE(review): stopwords.words() presumably needs the NLTK "stopwords" corpus
# to be downloaded already — confirm on a fresh environment.
stop_words = set(stopwords.words('english'))

In [3]:
def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    """Lemmatize each text, keeping only tokens with the allowed parts of speech.

    Args:
        texts: Iterable of raw text strings (one per document).
        allowed_postags: spaCy coarse POS tags (``token.pos_``) to keep.
            The default list is only read, never mutated.

    Returns:
        A list with one string per input text: the lemmas of the kept tokens,
        minus any lemma found in the module-level ``stop_words`` set, joined
        by single spaces.
    """
    # The component is called "parser"; the previous spelling "parset" matched
    # no pipeline component, so the dependency parser was never disabled and
    # ran on every transcript for nothing.
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    keep_pos = set(allowed_postags)  # O(1) membership inside the token loop

    texts_out = []
    # nlp.pipe batches the documents instead of calling nlp() once per text.
    for doc in nlp.pipe(texts):
        lemmas = [
            token.lemma_
            for token in doc
            if token.pos_ in keep_pos and token.lemma_ not in stop_words
        ]
        texts_out.append(" ".join(lemmas))

    return texts_out

In [4]:
def gen_words(texts):
    """Tokenize every text with gensim's ``simple_preprocess``.

    Args:
        texts: Iterable of text strings.

    Returns:
        A list of token lists, one per input text, in the same order.
        ``deacc=True`` asks gensim to strip accent marks from the tokens.
    """
    return [gensim.utils.simple_preprocess(doc, deacc=True) for doc in texts]

In [5]:
def preprocess_text(text):
    """Tokenize, lowercase, filter and lemmatize a single text with NLTK.

    Args:
        text: Raw text string.

    Returns:
        A list of lemmatized tokens (not a joined string), with punctuation
        tokens and English stop words removed.
    """
    tokens = [token.lower() for token in word_tokenize(text)]
    # `in string.punctuation` is a substring test against the punctuation
    # string, so it drops single punctuation characters (and any token that
    # happens to be a contiguous slice of that string).
    tokens = [token for token in tokens if token not in string.punctuation]

    # Local set; shadows the module-level stop_words on purpose so the
    # function does not depend on notebook state.
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    lemmatizer = WordNetLemmatizer()
    # The previous version also built a space-joined string here and then
    # discarded it; callers have always received the token list.
    return [lemmatizer.lemmatize(token) for token in tokens]

In [6]:
in_videos_path = "./trending_videos/USvideos.csv"
in_df = pd.read_csv(in_videos_path)

# Download raw transcripts for the trending videos. The text is stored
# unprocessed; cleaning happens afterwards via lemmatization()/gen_words().
documents = []
documents_directory = []  # not used by the cells visible here; kept for later cells
i = 0  # number of transcripts collected so far
for vid in in_df['video_id']:
    print(vid)
    try:
        transcript_list = YouTubeTranscriptApi.get_transcript(vid)
        transcript_text = ' '.join([item['text'] for item in transcript_list])
        documents.append(transcript_text)

        i = i + 1
        # Stops once i exceeds 10, i.e. after the 11th successful download.
        if i > 10:
            break
    except Exception:
        # `Exception` instead of a bare `except:` so that KeyboardInterrupt /
        # SystemExit still abort the loop. Every other failure is reported
        # with the same message, whatever its real cause.
        print("Subtitles are turned off")
    print("")

2kyS6SvSYSE

1ZAPwfrtAFY

5qpjK5DgCt4
Subtitles are turned off

puqaWrEC7tY

d380meD0W0M

gHZ1Qz0KiKM

39idVpFF7NQ

nc99ccSXST0

jr9QtXwC9vc

TUmyygCMMGA

9wRQljFNDW8

VifQlJit6A0
Subtitles are turned off

5E4ZBSInqUU


In [7]:
# Lemmatize the raw transcripts, then split each result into a token list.
# Note: `documents` is rebound from raw strings to lists of tokens here, so
# this cell expects `documents` to still hold the raw transcript strings.
lemmatized_documents = lemmatization(texts=documents)
documents = gen_words(texts=lemmatized_documents)

In [8]:
# Create dictionary and corpus
dictionary = corpora.Dictionary(documents)
corpus = [dictionary.doc2bow(doc) for doc in documents]

# Train LDA model
# LDA starts from a random initialisation; without a fixed random_state the
# topics and the coherence score printed below change on every run.
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=5,
    passes=10,
    random_state=42,
)

print("Topics:")
for idx, topic in lda_model.print_topics():
    print(f"Topic {idx}: {topic}")

# Compute coherence score
coherence_model = CoherenceModel(model=lda_model, texts=documents, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print(f"Coherence Score: {coherence_score}")

Topics:
Topic 0: 0.029*"go" + 0.022*"ice" + 0.020*"cream" + 0.019*"get" + 0.017*"make" + 0.013*"let" + 0.010*"want" + 0.009*"come" + 0.009*"see" + 0.008*"think"
Topic 1: 0.001*"go" + 0.001*"make" + 0.001*"get" + 0.001*"know" + 0.001*"thing" + 0.001*"really" + 0.001*"want" + 0.001*"phone" + 0.001*"say" + 0.001*"think"
Topic 2: 0.024*"drum" + 0.015*"say" + 0.012*"come" + 0.012*"kicking" + 0.010*"go" + 0.010*"hear" + 0.009*"get" + 0.008*"block" + 0.008*"know" + 0.007*"president"
Topic 3: 0.024*"go" + 0.022*"get" + 0.015*"face" + 0.013*"really" + 0.012*"phone" + 0.012*"thing" + 0.011*"know" + 0.011*"right" + 0.010*"see" + 0.010*"point"
Topic 4: 0.016*"job" + 0.012*"technology" + 0.010*"automation" + 0.009*"work" + 0.009*"see" + 0.007*"go" + 0.007*"people" + 0.007*"new" + 0.006*"really" + 0.006*"thing"
Coherence Score: 0.375266417023381
