# Link to transcript

In [5]:
from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd


def get_youtube_video_transcript_dataframe(youtube_url):
    try:
        # Extract video ID from the URL
        video_id = youtube_url.split("v=")[-1]

        # Get the transcript
        transcript = YouTubeTranscriptApi.get_transcript(video_id)

        # Create an empty list to store the transcript data
        transcript_data = []

        # Loop through each transcript item and extract timestamp and text
        for item in transcript:
            start_time = item["start"]
            end_time = item["start"] + item["duration"]
            text = item["text"]
            transcript_data.append({"Start": start_time, "End": end_time, "Text": text})

        # Convert the list to a pandas DataFrame
        df = pd.DataFrame(transcript_data)

        return df

    except Exception as e:
        print("Error:", e)
        return None

# Replace the following with the YouTube video URL of your choice
youtube_url = "https://www.youtube.com/watch?v=VyFk2sdw230&ab_channel=BiggerPockets"
transcript_df = get_youtube_video_transcript_dataframe(youtube_url)

if transcript_df is not None:
    print(transcript_df.head())
else:
    print("Failed to get the transcript.")

   Start     End                                      Text
0  0.000   5.339  welcome to mortgage Mondays Today's Show
1  2.280   6.720    we are going to break down an FHA loan
2  5.339   8.280     pretty much everything that you could
3  6.720   9.720      possibly need to know but if you now
4  8.280  10.980   show up to an appointment with the loan


# Preprocessed transcript

In [6]:
transcript_df

Unnamed: 0,Start,End,Text
0,0.000,5.339,welcome to mortgage Mondays Today's Show
1,2.280,6.720,we are going to break down an FHA loan
2,5.339,8.280,pretty much everything that you could
3,6.720,9.720,possibly need to know but if you now
4,8.280,10.980,show up to an appointment with the loan
...,...,...,...
512,920.160,924.160,Pockets video thanks a lot love you guys
513,922.260,933.559,go build some wealth
514,924.160,933.559,[Music]
515,937.390,944.480,[Music]


## Selected word from short

In [10]:
selected_word = pd.read_csv('slected_word.csv')
selected_word.head()

Unnamed: 0.1,Unnamed: 0,Word,Frequency
0,0,people,37
1,2,property,34
2,4,market,23
3,5,one,22
4,6,properties,20


# Algo (take directly from the df)

In [11]:
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

df = transcript_df

# Load NLTK stopwords
stop_words = set(stopwords.words("english"))

# Create the engaging words DataFrame
engaging_words_df = selected_word

# Function to preprocess the text and tokenize it
def preprocess_text(text):
    words = word_tokenize(text.lower())
    words = [word for word in words if word.isalnum() and word not in stop_words]
    return words

# Preprocess the text in the DataFrame
df["ProcessedText"] = df["Text"].apply(preprocess_text)

# Calculate word frequency across the entire transcript with weights for engaging words
word_frequency = {}
for _, row in df.iterrows():
    for word in row["ProcessedText"]:
        weight = engaging_words_df[engaging_words_df["Word"] == word]["Frequency"].values
        weight = weight[0] if len(weight) > 0 else 1  # Use 1 if word not found in engaging words DataFrame
        word_frequency[word] = word_frequency.get(word, 0) + weight

# Function to get the frequency score for a sentence
def get_sentence_frequency_score(sentence):
    words = preprocess_text(sentence)
    score = sum(word_frequency.get(word, 0) for word in words)
    return score

# Rank sentences based on frequency score
df["FrequencyScore"] = df["Text"].apply(get_sentence_frequency_score)

# Set the number of sentences you want in the summary
num_sentences = 10

# Extract the top sentences to form the summary
summary_sentences = df.nlargest(num_sentences, "FrequencyScore")["Text"].values


In [12]:
summary_sentences

array(["that's a big one that people a lot of",
       'the FHA one you can only have one at a',
       "you pick the one that's right for you",
       'purchases almost impossible right people',
       'like to note a lot of people say go buy',
       'people are having right now is what I',
       "just gonna buy one house a year and I'm",
       "buy a four Plex you're spending one and",
       'have one ever okay so you can buy',
       'media the one broker do you want to'], dtype=object)

# Expand to select 1-2 min long

In [13]:
# Get the result and put it back into orignal data
result = pd.DataFrame()
for sentence in summary_sentences:
    rows = df[df['Text'] == sentence]
    result = result.append(rows)
    
result

  result = result.append(rows)
  result = result.append(rows)
  result = result.append(rows)
  result = result.append(rows)
  result = result.append(rows)
  result = result.append(rows)
  result = result.append(rows)
  result = result.append(rows)
  result = result.append(rows)
  result = result.append(rows)


Unnamed: 0,Start,End,Text,ProcessedText,FrequencyScore
218,390.419,392.94,that's a big one that people a lot of,"[big, one, people, lot]",1486
179,317.04,320.52,the FHA one you can only have one at a,"[fha, one, one]",1454
473,851.279,854.1,you pick the one that's right for you,"[pick, one, right]",1199
248,446.4,450.0,purchases almost impossible right people,"[purchases, almost, impossible, right, people]",1129
172,303.66,307.68,like to note a lot of people say go buy,"[like, note, lot, people, say, go, buy]",1127
311,560.459,564.899,people are having right now is what I,"[people, right]",1123
220,392.94,395.4,just gonna buy one house a year and I'm,"[gon, na, buy, one, house, year]",1116
243,437.94,440.88,buy a four Plex you're spending one and,"[buy, four, plex, spending, one]",1025
181,320.52,324.66,have one ever okay so you can buy,"[one, ever, okay, buy]",1011
483,867.54,870.44,media the one broker do you want to,"[media, one, broker, want]",992


In [20]:
import pandas as pd

def filter_transcript_by_range(original_df, selected_df):
    filtered_dfs = []
    
    for _, row in selected_df.iterrows():
        start_time = row['Start']
        new_id = start_time

        # Calculate the range based on the midpoint of selected_df start
        lower_bound = start_time - 60
        upper_bound = start_time + 60
        
        # Filter rows based on the range
        filtered_df = original_df[(original_df['Start'] >= lower_bound) & 
                                  (original_df['Start'] <= upper_bound)].copy()
        
        # Add the 'new_id' column to represent the selected_df start
        filtered_df['new_id'] = new_id
        
        # Append the filtered DataFrame to the list
        filtered_dfs.append(filtered_df)

    # Concatenate all the filtered DataFrames into one
    filtered_df = pd.concat(filtered_dfs, ignore_index=True)

    return filtered_df

# Example usage
# Assuming your original DataFrame is called 'youtube_df' and the selected DataFrame is 'selected_df'
new_dataset = filter_transcript_by_range(transcript_df, result)
new_dataset

Unnamed: 0,Start,End,Text,ProcessedText,FrequencyScore,new_id
0,330.720,334.380,for first home it is not a first time,"[first, home, first, time]",24,390.419
1,332.639,336.479,home buyers you could use it for your,"[home, buyers, could, use]",34,390.419
2,334.380,337.860,third fourth fifth Property okay FHA,"[third, fourth, fifth, property, okay, fha]",495,390.419
3,336.479,339.900,loan is does not have to be your first,"[loan, first]",59,390.419
4,337.860,342.600,purchase and especially when you get,"[purchase, especially, get]",20,390.419
...,...,...,...,...,...,...
654,916.680,920.160,money with real estate if you've got,"[money, real, estate, got]",331,867.540
655,918.180,922.260,some time check out another Bigger,"[time, check, another, bigger]",20,867.540
656,920.160,924.160,Pockets video thanks a lot love you guys,"[pockets, video, thanks, lot, love, guys]",160,867.540
657,922.260,933.559,go build some wealth,"[go, build, wealth]",25,867.540


In [21]:
import pandas as pd

def group_rows_with_overlap_groupby(dataframe, window_size, overlap):
    combined_rows = []

    # Group the DataFrame by the 'new_id' column
    grouped_data = dataframe.groupby('new_id')

    # Iterate over each group
    for group_name, group_data in grouped_data:
        num_rows = len(group_data)

        # Calculate the adjusted overlap to ensure the final sequence is of the desired window_size
        adjusted_overlap = max(overlap, window_size - num_rows)

        # Iterate over the rows of the current group with the adjusted overlap
        for i in range(0, num_rows, adjusted_overlap):
            # Get the current window of rows from the group
            window_rows = group_data.iloc[i:i+window_size]

            if len(window_rows) == window_size:
                # Get the 'text' column values and combine them
                combined_text = ' '.join(window_rows['Text'])

                # Get the first 'Start' value from the window
                start_value = window_rows['Start'].iloc[0]

                # Create a new row with the combined 'text' and first 'start' value
                new_row = {
                    'Start': start_value,
                    'Text': combined_text,
                    'new_id': group_name  # Add the 'new_id' to the new row
                }

                # Append the new row to the combined_rows list
                combined_rows.append(new_row)

    # Create a new DataFrame from the combined_rows list
    combined_df = pd.DataFrame(combined_rows)

    return combined_df


window_size = 30
overlap = 1

sequence = group_rows_with_overlap_groupby(new_dataset, window_size, overlap)
sequence

Unnamed: 0,Start,Text,new_id
0,244.680,easy because it could still be more expensive ...,303.66
1,245.940,expensive loan with the lower rates right or t...,303.66
2,247.379,right or the lower credit score yeah and there...,303.66
3,249.120,there's underwriting criteria that you're goin...,303.66
4,250.500,you're going to have to hurdle through but yes...,303.66
...,...,...,...
364,863.880,to find more about you and connect with you ye...,867.54
365,865.620,you yep anywhere online I'm on social media th...,867.54
366,867.540,media the one broker do you want to direct ema...,867.54
367,869.279,direct email me Christian theonbrokers.com tha...,867.54


## Pick the best from the 1-2 min long

In [22]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.probability import FreqDist

# Download necessary resources (uncomment if not already downloaded)
# nltk.download('punkt')
# nltk.download('stopwords')

def calculate_fluency(text):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    
    # Remove stopwords to focus on meaningful words
    stop_words = set(stopwords.words('english'))
    words = [word.lower() for sentence in sentences for word in nltk.word_tokenize(sentence) if word.lower() not in stop_words and word.isalpha()]
    
    # Calculate word frequency distribution
    word_freq = FreqDist(words)
    
    # Compute fluency score as the number of unique words divided by the total number of words
    fluency_score = len(word_freq) / len(words)
    
    return fluency_score

df_seq = sequence

# Calculate fluency scores for each text in the DataFrame
df_seq['Fluency Score'] = df_seq['Text'].apply(calculate_fluency)


# Sort the DataFrame by fluency scores from highest to lowest
df_sorted = df_seq.sort_values(by='Fluency Score', ascending=False)

# Display the DataFrame with fluency scores
df_sorted

Unnamed: 0,Start,Text,new_id,Fluency Score
360,857.160,much for your time and sharing your knowledge ...,867.540,0.875000
329,857.160,much for your time and sharing your knowledge ...,851.279,0.875000
324,849.959,knows different products that can help you pic...,851.279,0.864583
355,849.959,knows different products that can help you pic...,867.540,0.864583
359,855.480,otherwise right Christian thank you so much fo...,867.540,0.855670
...,...,...,...,...
33,299.699,are best for you okay tell me more about FHA L...,303.660,0.644860
34,301.440,FHA Loans now a couple things that I'd like to...,303.660,0.644860
96,299.699,are best for you okay tell me more about FHA L...,320.520,0.644860
62,299.699,are best for you okay tell me more about FHA L...,317.040,0.644860


# Clip with timestamp

# (optional) shows the video with cutted part