In [1]:
from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd


def get_youtube_video_transcript_dataframe(youtube_url):
    try:
        # Extract video ID from the URL
        video_id = youtube_url.split("v=")[-1]

        # Get the transcript
        transcript = YouTubeTranscriptApi.get_transcript(video_id)

        # Create an empty list to store the transcript data
        transcript_data = []

        # Loop through each transcript item and extract timestamp and text
        for item in transcript:
            start_time = item["start"]
            end_time = item["start"] + item["duration"]
            text = item["text"]
            transcript_data.append({"Start": start_time, "End": end_time, "Text": text})

        # Convert the list to a pandas DataFrame
        df = pd.DataFrame(transcript_data)

        return df

    except Exception as e:
        print("Error:", e)
        return None

# Replace the following with the YouTube video URL of your choice
youtube_url = "https://www.youtube.com/watch?v=VyFk2sdw230&ab_channel=BiggerPockets"
transcript_df = get_youtube_video_transcript_dataframe(youtube_url)

if transcript_df is not None:
    print(transcript_df.head())
else:
    print("Failed to get the transcript.")

   Start     End                                      Text
0  0.000   5.339  welcome to mortgage Mondays Today's Show
1  2.280   6.720    we are going to break down an FHA loan
2  5.339   8.280     pretty much everything that you could
3  6.720   9.720      possibly need to know but if you now
4  8.280  10.980   show up to an appointment with the loan


# Preprocessed

# Segemnt into chapter

In [2]:
import pandas as pd
import gensim
from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# If you haven't downloaded the stopwords set, do it once
nltk.download('stopwords')
nltk.download('punkt')

# Load dataset
df = transcript_df

# Pre-process the data
def preprocess(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    tokens = [t for t in tokens if t not in stop_words and t.isalpha()]  # Remove stopwords and non-alpha tokens
    return tokens

# Apply preprocessing
df['processed_text'] = df['Text'].apply(preprocess)

# Create Dictionary
id2word = corpora.Dictionary(df['processed_text'])

# Create Corpus
texts = df['processed_text']

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=5,  # You can change this based on your assessment of how many topics there might be
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)

# View the top words for each topic
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\drago\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\drago\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Topic: 0 
Words: 0.025*"going" + 0.025*"credit" + 0.019*"make" + 0.017*"little" + 0.015*"fha" + 0.015*"really" + 0.014*"score" + 0.012*"want" + 0.011*"loan" + 0.010*"payment"
Topic: 1 
Words: 0.052*"loan" + 0.024*"go" + 0.023*"percent" + 0.017*"half" + 0.017*"one" + 0.016*"conventional" + 0.013*"much" + 0.012*"people" + 0.012*"fha" + 0.012*"income"
Topic: 2 
Words: 0.016*"mean" + 0.016*"mortgage" + 0.014*"people" + 0.014*"time" + 0.013*"get" + 0.013*"take" + 0.013*"primary" + 0.013*"residence" + 0.012*"conventional" + 0.012*"yeah"
Topic: 3 
Words: 0.031*"one" + 0.031*"right" + 0.024*"fha" + 0.024*"know" + 0.019*"loan" + 0.016*"still" + 0.011*"get" + 0.010*"yeah" + 0.010*"like" + 0.010*"show"
Topic: 4 
Words: 0.039*"fha" + 0.036*"buy" + 0.034*"four" + 0.030*"three" + 0.024*"loan" + 0.021*"property" + 0.020*"want" + 0.019*"use" + 0.019*"unit" + 0.015*"buying"


In [3]:
# Assign each sentence to a topic
topics = []

for text in df['processed_text']:
    bow = id2word.doc2bow(text)
    topic_probs = lda_model.get_document_topics(bow)
    # Get the topic with the highest probability
    dominant_topic = sorted(topic_probs, key=lambda x: x[1], reverse=True)[0][0]
    topics.append(dominant_topic)

df['Topic'] = topics


In [4]:
df

Unnamed: 0,Start,End,Text,processed_text,Topic
0,0.000,5.339,welcome to mortgage Mondays Today's Show,"[welcome, mortgage, mondays, today, show]",3
1,2.280,6.720,we are going to break down an FHA loan,"[going, break, fha, loan]",0
2,5.339,8.280,pretty much everything that you could,"[pretty, much, everything, could]",2
3,6.720,9.720,possibly need to know but if you now,"[possibly, need, know]",3
4,8.280,10.980,show up to an appointment with the loan,"[show, appointment, loan]",3
...,...,...,...,...,...
512,920.160,924.160,Pockets video thanks a lot love you guys,"[pockets, video, thanks, lot, love, guys]",0
513,922.260,933.559,go build some wealth,"[go, build, wealth]",1
514,924.160,933.559,[Music],[music],0
515,937.390,944.480,[Music],[music],0


In [5]:
df.to_csv("LDA_topicsegmented.csv")

## Group the text based on topic

In [None]:
# pick each 10, then pick the most freq topic
# pick using KNN algo

In [8]:
# Assuming the 'Topic' column is already added to df
chunk_size = 10

# Create a new DataFrame to store the combined sentences for each segment
segments_df = pd.DataFrame(columns=['Segment', 'Dominant_Topic', 'Combined_Text'])

for start in range(0, len(df), chunk_size):
    end = start + chunk_size
    chunk = df.iloc[start:end]
    
    # Determine the dominant topic for this chunk
    dominant_topic = chunk['Topic'].value_counts().idxmax()
    
    # Combine the sentences
    combined_text = ' '.join(chunk['Text'])
    
    segments_df = segments_df.append({
        'Segment': f"{start+1}-{end}",
        'Dominant_Topic': dominant_topic,
        'Combined_Text': combined_text
    }, ignore_index=True)

print(segments_df)


  segments_df = segments_df.append({
  segments_df = segments_df.append({
  segments_df = segments_df.append({
  segments_df = segments_df.append({
  segments_df = segments_df.append({
  segments_df = segments_df.append({
  segments_df = segments_df.append({
  segments_df = segments_df.append({
  segments_df = segments_df.append({
  segments_df = segments_df.append({
  segments_df = segments_df.append({
  segments_df = segments_df.append({
  segments_df = segments_df.append({
  segments_df = segments_df.append({
  segments_df = segments_df.append({
  segments_df = segments_df.append({
  segments_df = segments_df.append({
  segments_df = segments_df.append({
  segments_df = segments_df.append({
  segments_df = segments_df.append({
  segments_df = segments_df.append({
  segments_df = segments_df.append({
  segments_df = segments_df.append({
  segments_df = segments_df.append({
  segments_df = segments_df.append({
  segments_df = segments_df.append({
  segments_df = segments_df.append({
 

Unnamed: 0,Segment,Dominant_Topic,Combined_Text
0,1-10,3,welcome to mortgage Mondays Today's Show we ar...
1,11-20,3,person so prepare to be educated on FHA Loans ...
2,21-30,1,mortgage you can clear your debt build wealth ...
3,31-40,3,like BiggerPockets welcome to mortgage Mondays...
4,41-50,0,wanted to hear and you listen and many of you ...
5,51-60,3,with when you go have that conversation with t...
6,61-70,4,only loan so that means you have to live there...
7,71-80,2,picking up a rental property via a primary res...
8,81-90,1,it never goes away yeah it doesn't go away whe...
9,91-100,1,go buy up to what your county is determined to...


In [10]:
segments_df

Unnamed: 0,Segment,Dominant_Topic,Combined_Text
0,1-10,3,welcome to mortgage Mondays Today's Show we ar...
1,11-20,3,person so prepare to be educated on FHA Loans ...
2,21-30,1,mortgage you can clear your debt build wealth ...
3,31-40,3,like BiggerPockets welcome to mortgage Mondays...
4,41-50,0,wanted to hear and you listen and many of you ...
5,51-60,3,with when you go have that conversation with t...
6,61-70,4,only loan so that means you have to live there...
7,71-80,2,picking up a rental property via a primary res...
8,81-90,1,it never goes away yeah it doesn't go away whe...
9,91-100,1,go buy up to what your county is determined to...


# Text ranking algorithm / other curtation algo

# Result

# Evaluation