In [None]:
import pandas as pd
import numpy as np
import json
import glob
import gensim

from youtube_transcript_api import YouTubeTranscriptApi
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora
from gensim.models import LdaModel
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
import string
import pyLDAvis
import pyLDAvis.gensim

In [None]:
# Number of latent topics requested from the LDA model (LdaModel num_topics).
NUM_TOPICS = 50
# Number of training passes over the corpus (LdaModel passes).
NUM_PASSES = 100
# Number of top words shown and tabulated for each topic.
NUM_WORDS_PER_TOPIC = 20

In [None]:
# English stop words as a set; lemmatization() reads this global to drop stop-word lemmas.
# NOTE(review): presumably needs the NLTK "stopwords" corpus available locally — confirm.
stop_words = set(stopwords.words('english'))

In [None]:
def lemmatization(texts, allowed_postags=["NOUN", "ADJ","VERB"]):
    """Reduce every text to a space-joined string of selected lemmas.

    Args:
        texts: Iterable of raw text strings.
        allowed_postags: Coarse part-of-speech tags (``token.pos_``) to keep.

    Returns:
        A list with one string per input text. Each string holds the lemmas
        of the tokens whose tag is in ``allowed_postags``, minus any lemma
        found in the module-level ``stop_words`` set, joined by single spaces.
    """
    nlp = spacy.load("en_core_web_sm")

    def _clean(raw_text):
        # Keep only the lemmas of tokens carrying an accepted POS tag ...
        kept = [tok.lemma_ for tok in nlp(raw_text) if tok.pos_ in allowed_postags]
        # ... then discard stop words and glue the remainder back together.
        return " ".join(lemma for lemma in kept if lemma not in stop_words)

    return [_clean(raw_text) for raw_text in texts]

In [None]:
def gen_words(texts):
    """Tokenize every text with gensim's ``simple_preprocess``.

    Args:
        texts: Iterable of strings.

    Returns:
        A list of token lists, one per input text, produced with
        ``deacc=True``.
    """
    return [gensim.utils.simple_preprocess(doc, deacc=True) for doc in texts]

In [None]:
def preprocess_text(text):
    """Tokenize, normalize and lemmatize a single text with NLTK.

    Steps, applied per token in this order: lowercase, drop punctuation
    tokens, drop English stop words, lemmatize with ``WordNetLemmatizer``.

    Args:
        text: Raw text string.

    Returns:
        A list of processed tokens (not a joined string).
    """
    punctuation_chars = set(string.punctuation)
    english_stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    tokens = []
    for token in word_tokenize(text):
        token = token.lower()
        # Drop tokens made up solely of punctuation characters. A plain
        # `token in string.punctuation` is a substring test and lets
        # multi-character tokens such as "..." or "``" slip through.
        if all(char in punctuation_chars for char in token):
            continue
        if token in english_stop_words:
            continue
        tokens.append(lemmatizer.lemmatize(token))

    return tokens

In [None]:
in_videos_path = "./trending_videos/USvideos.csv"
in_df = pd.read_csv(in_videos_path)

# Stop after this many transcripts have been collected successfully.
MAX_DOCUMENTS = 150

# Extracting transcripts
# documents / documents_raw both hold the raw transcript text at this point;
# documents_vid holds the matching video id at the same position.
documents = []
documents_raw = []
documents_vid = []
for vid in in_df['video_id']:
    print(vid)
    try:
        transcript_list = YouTubeTranscriptApi.get_transcript(vid)
        transcript_text = ' '.join(item['text'] for item in transcript_list)
        documents.append(transcript_text)
        documents_raw.append(transcript_text)
        documents_vid.append(vid)

        if len(documents_vid) >= MAX_DOCUMENTS:
            break
    except Exception as exc:
        # Best effort: skip videos whose transcript cannot be fetched, but
        # report the real reason and let KeyboardInterrupt / SystemExit
        # propagate (a bare `except:` would swallow those as well).
        print(f"Transcript unavailable for {vid}: {type(exc).__name__}")
    print("")

In [None]:
# Read from documents_raw (never reassigned) rather than documents: this cell
# rebinds documents to token lists, so reading documents here would feed token
# lists into spaCy whenever the cell is run a second time.
lemmatized_documents = lemmatization(documents_raw)
documents = gen_words(lemmatized_documents)

In [None]:
# Fixed seed so that the learned topics (and everything derived from them)
# are reproducible across kernel restarts.
RANDOM_SEED = 42

# Create dictionary and bag-of-words corpus from the tokenized documents
dictionary = corpora.Dictionary(documents)
corpus = [dictionary.doc2bow(doc) for doc in documents]

# Train LDA model
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    passes=NUM_PASSES,
    random_state=RANDOM_SEED,
)

print("Topics:")
for idx, topic in lda_model.print_topics(num_words=NUM_WORDS_PER_TOPIC, num_topics=NUM_TOPICS):
    print(f"Topic {idx}: {topic}")

# Compute coherence score (c_v) against the tokenized documents
coherence_model = CoherenceModel(model=lda_model, texts=documents, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print(f"Coherence Score: {coherence_score}")

In [None]:
columns = [f"word_{i}" for i in range(1, NUM_WORDS_PER_TOPIC + 1)]

# One row per topic; every cell is a (word, probability) tuple.
data_rows = []
row_labels = []

# Ask the model for structured (word, probability) pairs rather than
# re-parsing the formatted "0.012*"word" + ..." display string, which loses
# precision and would break on tokens containing '*' or '+'.
for topic_id, word_probs in lda_model.show_topics(
    num_topics=NUM_TOPICS, num_words=NUM_WORDS_PER_TOPIC, formatted=False
):
    # Highest-probability words first; cast to plain Python floats.
    ranked = sorted(
        ((word, float(prob)) for word, prob in word_probs),
        key=lambda pair: pair[1],
        reverse=True,
    )
    data_rows.append(ranked[:NUM_WORDS_PER_TOPIC])
    # Label the row with the topic id reported by the model.
    row_labels.append(f"Topic {topic_id}")

# Create DataFrame from collected data_rows
topics_df = pd.DataFrame(data_rows, columns=columns)
topics_df.index = row_labels

# Display the topics DataFrame
print("Topics DataFrame:")
print(topics_df)

In [None]:
# Topic Inference

# Fit a TF-IDF weighting model on the bag-of-words corpus.
from gensim.models import TfidfModel
tfidf_model = TfidfModel(corpus)

# Transform the corpus to TF-IDF space
# NOTE(review): lda_model is trained on the raw bag-of-words `corpus`; confirm
# that TF-IDF-weighted vectors are really intended wherever corpus_tfidf is
# passed to it.
corpus_tfidf = tfidf_model[corpus]

# # Print TF-IDF
# for doc in corpus_tfidf:
#     print(doc)

In [None]:
# Number of highest-probability topics kept for each document.
TOP_TOPICS_PER_DOC = 3

# Infer topics from the same bag-of-words vectors the model was trained on.
# User queries are also inferred from bag-of-words vectors, so document and
# query topic probabilities stay comparable.
doc_topic_distribution = [lda_model.get_document_topics(doc_bow) for doc_bow in corpus]

# Map documents to topics: per document, the top topics sorted by probability
document_topics = [
    sorted(doc_topics, key=lambda pair: pair[1], reverse=True)[:TOP_TOPICS_PER_DOC]
    for doc_topics in doc_topic_distribution
]

for i, topics in enumerate(document_topics):
    print(f"Document {i} is assigned to topics:")
    for topic, prob in topics:
        print(f"  Topic {topic}: Probability {prob:.3f}")

In [None]:
# print(documents_vid[124])
# print("")
# print(documents[124])
# print("")
# print(documents_raw[124])
# print("")
# print(lda_model.print_topics(num_words=NUM_WORDS_PER_TOPIC, num_topics=NUM_TOPICS)[39])

In [None]:
# Size the lookup by the model's topic count, not by the highest topic id that
# happens to occur in document_topics: a query may map to a topic that no
# document has among its top topics, and that index must still be valid.
num_topics = lda_model.num_topics

# topic_documents[t] is the list of (document-number, probability) tuples for
# the documents that have topic t among their top topics.
topic_documents = [[] for _ in range(num_topics)]

# Populate topic_documents with document-number and its probability for each topic
for doc_index, doc_topics in enumerate(document_topics):
    for topic_index, prob in doc_topics:
        topic_documents[topic_index].append((doc_index, prob))

# Print the topic_documents array
for topic_index, topic_docs in enumerate(topic_documents):
    print(f"Topic {topic_index}:")
    for doc_num, prob in topic_docs:
        print(f"  Document {doc_num}: Probability {prob:.3f}")
    print()

In [None]:
def lemmatization_query(text, allowed_postags=["NOUN", "ADJ","VERB", "ADV"]):
    """Lemmatize a single query string.

    Unlike ``lemmatization``, no part-of-speech or stop-word filtering is
    applied: every token's lemma is kept.

    Args:
        text: The raw query string.
        allowed_postags: Accepted for signature compatibility; currently not
            used by the body.

    Returns:
        The lemmas of all tokens in ``text`` joined by single spaces.
    """
    nlp = spacy.load("en_core_web_sm")
    return " ".join(tok.lemma_ for tok in nlp(text))

In [None]:
def gen_words_query(text):
    """Tokenize one query string with gensim's ``simple_preprocess``.

    Args:
        text: The (already lemmatized) query string.

    Returns:
        The list of tokens produced with ``deacc=True``.
    """
    return gensim.utils.simple_preprocess(text, deacc=True)

In [None]:
def get_document_numbers(topic_documents, user_topic_list):
    """Rank documents by their relevance to a set of weighted topics.

    A document's relevance is the sum, over the given topics, of
    ``topic probability * document probability for that topic``.

    Args:
        topic_documents: Sequence indexed by topic id; each entry is a list of
            ``(document_number, probability)`` tuples.
        user_topic_list: List of ``(topic_index, probability)`` tuples. It is
            not modified.

    Returns:
        Document numbers sorted by descending relevance. Topic indices that
        fall outside ``topic_documents`` contribute nothing.
    """
    # Work on a sorted copy so the caller's list is left untouched.
    ranked_topics = sorted(user_topic_list, key=lambda pair: pair[1], reverse=True)

    relevance_scores = {}
    for topic_index, topic_prob in ranked_topics:
        # Skip topics with no entry in the lookup instead of raising
        # IndexError (or silently wrapping around on a negative index).
        if not 0 <= topic_index < len(topic_documents):
            continue
        for doc_num, doc_prob in topic_documents[topic_index]:
            relevance_scores[doc_num] = (
                relevance_scores.get(doc_num, 0.0) + topic_prob * doc_prob
            )

    # Stable sort: ties keep the order in which documents were first seen.
    return sorted(relevance_scores, key=relevance_scores.get, reverse=True)

In [None]:
def get_video_urls(document_numbers, document_vid):
    """Turn document numbers into YouTube watch URLs.

    Args:
        document_numbers: Iterable of indices into ``document_vid``.
        document_vid: List of video ids, indexed by document number.

    Returns:
        A list of URLs, in the order of ``document_numbers``. A number that is
        not below ``len(document_vid)`` is reported on stdout and skipped.
    """
    base_url = "https://www.youtube.com/watch?v="  # Base URL for YouTube videos
    video_urls = []

    for doc_num in document_numbers:
        # Guard clause: report and skip anything past the end of the id list.
        if not doc_num < len(document_vid):
            print(f"Document number {doc_num} is out of range of document_vid list.")
            continue
        video_urls.append(base_url + document_vid[doc_num])

    return video_urls

In [None]:
def find_videos():
    """Prompt for search keywords and print the most relevant video URLs.

    Reads a query from stdin, lemmatizes and tokenizes it, infers its topic
    distribution with the module-level ``lda_model`` / ``dictionary``, ranks
    documents through ``topic_documents`` and prints the matching URLs built
    from ``documents_vid``.

    Returns:
        None. All results are printed. If none of the query tokens are in the
        dictionary, a notice is printed and no ranking is attempted.
    """
    user_query = input("Enter the search keywords")
    user_query_preprocessed = lemmatization_query(user_query)
    print(user_query_preprocessed)
    user_query_tokens = gen_words_query(user_query_preprocessed)
    print(user_query_tokens)

    user_query_bow = dictionary.doc2bow(user_query_tokens)

    # An empty bag-of-words carries no information about the query, so any
    # topics inferred from it would yield arbitrary "relevant" videos.
    if not user_query_bow:
        print("None of the search keywords appear in the transcript vocabulary; try different keywords.")
        return

    query_topic_distribution = lda_model.get_document_topics(user_query_bow)

    # Sort topics by probability and select top three
    query_topics = sorted(query_topic_distribution, key=lambda x: x[1], reverse=True)[:3]

    # Print top topics for user prompt
    print(f"Top three topics for the prompt '{user_query}':")
    for topic, prob in query_topics:
        print(f"  Topic {topic}: Probability {prob:.3f}")

    print(query_topics)

    document_numbers = get_document_numbers(topic_documents, query_topics)

    # Print the sorted document numbers based on relevance to the user's topics
    print("Sorted Document Numbers based on Relevance:")
    print(document_numbers)

    video_urls = get_video_urls(document_numbers, documents_vid)

    print("Video URLs for Relevant Documents:")
    for url in video_urls:
        print(url)

In [None]:
# Interactive search loop. Interrupting the kernel (KeyboardInterrupt) or
# closing stdin (EOFError) ends the session cleanly instead of with a traceback.
while True:
    try:
        find_videos()
    except (KeyboardInterrupt, EOFError):
        print("Search session ended.")
        break