In [2]:
from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd


def get_youtube_video_transcript_dataframe(youtube_url):
    try:
        # Extract video ID from the URL
        video_id = youtube_url.split("v=")[-1]

        # Get the transcript
        transcript = YouTubeTranscriptApi.get_transcript(video_id)

        # Create an empty list to store the transcript data
        transcript_data = []

        # Loop through each transcript item and extract timestamp and text
        for item in transcript:
            start_time = item["start"]
            end_time = item["start"] + item["duration"]
            text = item["text"]
            transcript_data.append({"Start": start_time, "End": end_time, "Text": text})

        # Convert the list to a pandas DataFrame
        df = pd.DataFrame(transcript_data)

        return df

    except Exception as e:
        print("Error:", e)
        return None

# Replace the following with the YouTube video URL of your choice
youtube_url = "https://www.youtube.com/watch?v=VyFk2sdw230&ab_channel=BiggerPockets"
transcript_df = get_youtube_video_transcript_dataframe(youtube_url)

if transcript_df is not None:
    print(transcript_df.head())
else:
    print("Failed to get the transcript.")

   Start     End                                      Text
0  0.000   5.339  welcome to mortgage Mondays Today's Show
1  2.280   6.720    we are going to break down an FHA loan
2  5.339   8.280     pretty much everything that you could
3  6.720   9.720      possibly need to know but if you now
4  8.280  10.980   show up to an appointment with the loan


# SUb topic segementation

In [18]:
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models

def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens if word.isalpha()]
    
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    return tokens

# Example DataFrame
df = transcript_df

def cluster_dialogue(dataframe, num_topics=3):
    texts = [preprocess_text(text) for text in dataframe['Text']]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    
    lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary)
    
    topic_assignments = [max(lda_model[text], key=lambda item: item[1])[0] for text in corpus]
    dataframe['Subtopic'] = topic_assignments
    
    return dataframe

df_with_subtopics = cluster_dialogue(df, num_topics=2)

# Group messages by subtopic
subtopic_groups = df_with_subtopics.groupby('Subtopic')['Text'].apply(list)

# Print subtopic messages in order
for topic_id, messages in subtopic_groups.items():
    print(f"Subtopic {topic_id + 1}:\n")
    for message in messages:
        print(message)
    print("\n")


Subtopic 1:

pretty much everything that you could
possibly need to know but if you now
they don't know these things they're the
wrong load off so we're going to tell
you everything that you need to know to
make sure you're working with the right
person so prepare to be educated on FHA
University Christian can you tell us a
little about them if you guys want to
pay off your home in as little as five
to seven years what if you could do it
can retire even earlier want to learn
Christian welcome how we doing good beer
we're doing great and today is gonna be
specific loan products so I asked you
of you said we want to have a whole
should be using I'd rather if you had
but episodes like this could actually
give you some information to be equipped
with the lo as we call them in the
about the FHA loan Christian let's start
off with describing why we call it FHA
it mean yeah so it's just the agency
that oversees it it's the federal
straightforward it's a primary residence
there this is not som

In [16]:
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models

def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens if word.isalpha()]
    
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    return tokens

def cluster_dialogue(dataframe, num_topics=10):
    texts = [preprocess_text(text) for text in dataframe['Text']]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    
    lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary)
    
    topic_assignments = [max(lda_model[text], key=lambda item: item[1])[0] for text in corpus]
    topic_clusters = {}
    
    for i, topic_id in enumerate(topic_assignments):
        if topic_id not in topic_clusters:
            topic_clusters[topic_id] = []
        topic_clusters[topic_id].append(dataframe.iloc[i]['Text'])
    
    return topic_clusters

# Example DataFrame
df = transcript_df

subtopic_clusters = cluster_dialogue(df, num_topics=10)
for topic_id, messages in subtopic_clusters.items():
    print(f"Subtopic {topic_id + 1}:")
    for message in messages:
        print(message)
    print("\n")


Subtopic 8:
welcome to mortgage Mondays Today's Show
you everything that you need to know to
without earning big bucks like your guy
mortgage you can clear your debt build
wealth and generate passive income in
like BiggerPockets welcome to mortgage
is sounds to me like a lot of you are
with the lo as we call them in the
housing Administration pretty
there this is not something that you can
refinance to get it off and that's
housing Administration my understanding
credit score that's a good point let's
know when you get lower than 600 though
easy because it could still be more
you're going to have to hurdle through
if their debt to income ratio wasn't
too much debt for the income that they
are regarding the debts income ratios
income ratio than conventional that
what FHA stands for it does not stand
purchase and especially when you get
your old house into a rent this is
this you're turning your old home into a
another thing people don't think about
mortgage but in your big metropolitan


In [17]:
subtopic_clusters

{7: ["welcome to mortgage Mondays Today's Show",
  'you everything that you need to know to',
  'without earning big bucks like your guy',
  'mortgage you can clear your debt build',
  'wealth and generate passive income in',
  'like BiggerPockets welcome to mortgage',
  'is sounds to me like a lot of you are',
  'with the lo as we call them in the',
  'housing Administration pretty',
  'there this is not something that you can',
  "refinance to get it off and that's",
  'housing Administration my understanding',
  "credit score that's a good point let's",
  'know when you get lower than 600 though',
  'easy because it could still be more',
  "you're going to have to hurdle through",
  "if their debt to income ratio wasn't",
  'too much debt for the income that they',
  'are regarding the debts income ratios',
  'income ratio than conventional that',
  'what FHA stands for it does not stand',
  'purchase and especially when you get',
  'your old house into a rent this is',
  "this you'

# Text ranking

In [13]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer


# Example subtopic data
subtopic_data = subtopic_clusters

# Apply TextRank summarization to each subtopic
for subtopic_id, messages in subtopic_data.items():
    subtopic_text = '. '.join(messages)
    
    parser = PlaintextParser.from_string(subtopic_text, Tokenizer("english"))
    summarizer = TextRankSummarizer()
    
    print(f"Subtopic {subtopic_id + 1} Summary:")
    for sentence in summarizer(parser.document, 5):  # Adjust the number of sentences as needed
        print(sentence)
    
    print("\n")

Subtopic 1 Summary:
yeah you'd have to actually do a. basically the insurance policy that you.
the FHA one you can only have one at a. have one ever okay so you can buy.
qualify you for the loan FHA has a. really weird role that unless you move.
future could be absolute and this is I. our borrowers all the time we take a. what if we refinanced and and lost a. property but we gained 1200 on another.
you pick the one that's right for you.


Subtopic 2 Summary:
you everything that you need to know to.
nice thing about it is that it can be a. or four unit that allows you to house.
at it globally does it impact you on a. greater scale than what it's hurting you.
and to round it out you get a more.
you're looking to do and I'll get you.




# Processed into timeline and make a short video

In [13]:
# Original

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer

# Combine all sentences into one paragraph
text = '. '.join(transcript_df['Text'])

# Parsing the text
parser = PlaintextParser.from_string(text, Tokenizer("english"))

# Applying TextRank
summarizer = TextRankSummarizer()
summary = summarizer(parser.document, 10)  # Here, we're extracting 3 sentences. Adjust as needed.

# Extracted sentences
for sentence in summary:
    print(sentence)


you everything that you need to know to.
nice thing about it is that it can be a. one to four unit so obviously the most.
yeah you'd have to actually do a. refinance to get it off and that's.
lower than that FHA can go down to 500. and that's like I don't want to say.
I don't want to say hey you got a 502. credit score FHA loan is going to be a. slam dunk just because something's.
the FHA one you can only have one at a. time that does not mean you can only.
qualify you for the loan FHA has a. really weird role that unless you move.
going to be 22 unit is going to be 15.. there we go and fhas you could still use.
that you're going to be able to use it.
you pick the one that's right for you.


In [14]:
df = transcript_df
# Mapping sentences to their timestamps
for sentence in summary:
    sentence_str = str(sentence)
    idx = df[df['Text'] == sentence_str].index[0]
    start_time = df.iloc[idx]['Start']
    end_time = df.iloc[idx]['End']
    print(f"{start_time} to {end_time}: {sentence_str}")


IndexError: index 0 is out of bounds for axis 0 with size 0

Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl (97 kB)
     ---------------------------------------- 97.3/97.3 kB 2.8 MB/s eta 0:00:00
Collecting docopt<0.7,>=0.6.1
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting pycountry>=18.2.23
  Downloading pycountry-22.3.5.tar.gz (10.1 MB)
     --------------------------------------- 10.1/10.1 MB 14.4 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting breadability>=0.1.20
  Downloading breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Buil