In [5]:
from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd


def _extract_video_id(youtube_url):
    """Return the video ID contained in a YouTube URL.

    Handles ``watch?v=<id>`` URLs (ignoring any other query parameters such
    as ``&ab_channel=...`` or ``&t=...``), ``youtu.be/<id>`` short links and
    ``/embed/<id>``, ``/shorts/<id>``, ``/live/<id>``, ``/v/<id>`` paths.
    Anything else falls back to the text after the last ``v=`` (up to the
    next ``&``), which also lets a bare video ID pass through unchanged.
    """
    from urllib.parse import parse_qs, urlparse

    parsed = urlparse(youtube_url.strip())

    # Standard watch URL: the ID is the value of the "v" query parameter.
    query_ids = parse_qs(parsed.query).get("v")
    if query_ids:
        return query_ids[0]

    host = (parsed.hostname or "").lower()
    segments = [segment for segment in parsed.path.split("/") if segment]
    if host.endswith("youtu.be") and segments:
        return segments[0]
    if len(segments) >= 2 and segments[0] in {"embed", "shorts", "live", "v"}:
        return segments[1]

    # Fallback for bare IDs and unrecognised URL shapes.
    return youtube_url.split("v=")[-1].split("&")[0]


def get_youtube_video_transcript_dataframe(youtube_url):
    """Fetch a YouTube transcript and return it as a DataFrame.

    Parameters
    ----------
    youtube_url : str
        URL of the video (or a bare video ID).

    Returns
    -------
    pandas.DataFrame or None
        One row per transcript item with columns ``Start`` (the item's
        ``start``), ``End`` (``start + duration``) and ``Text``. The three
        columns are present even when the transcript is empty. ``None`` is
        returned, after printing the error, if anything goes wrong.
    """
    try:
        # Extract only the video ID; extra query parameters must not leak into it.
        video_id = _extract_video_id(youtube_url)

        # Get the transcript (each item is indexed by "start", "duration", "text").
        transcript = YouTubeTranscriptApi.get_transcript(video_id)

        transcript_data = [
            {
                "Start": item["start"],
                "End": item["start"] + item["duration"],
                "Text": item["text"],
            }
            for item in transcript
        ]

        # Fixing the columns keeps the schema stable for an empty transcript.
        return pd.DataFrame(transcript_data, columns=["Start", "End", "Text"])

    except Exception as e:
        # Best-effort by design: callers check for None instead of catching.
        print("Error:", e)
        return None

# Replace the following with the YouTube video URL of your choice
youtube_url = "https://www.youtube.com/watch?v=VyFk2sdw230&ab_channel=BiggerPockets"
transcript_df = get_youtube_video_transcript_dataframe(youtube_url)

# The helper returns None when the transcript could not be fetched,
# so handle that case before previewing the first rows.
if transcript_df is None:
    print("Failed to get the transcript.")
else:
    print(transcript_df.head())

   Start     End                                      Text
0  0.000   5.339  welcome to mortgage Mondays Today's Show
1  2.280   6.720    we are going to break down an FHA loan
2  5.339   8.280     pretty much everything that you could
3  6.720   9.720      possibly need to know but if you now
4  8.280  10.980   show up to an appointment with the loan


In [6]:
transcript_df.head()

Unnamed: 0,Start,End,Text
0,0.0,5.339,welcome to mortgage Mondays Today's Show
1,2.28,6.72,we are going to break down an FHA loan
2,5.339,8.28,pretty much everything that you could
3,6.72,9.72,possibly need to know but if you now
4,8.28,10.98,show up to an appointment with the loan


# Show Selected Words

In [7]:
# Read the selected-word table and discard the 'Unnamed: 0' column in one chained step.
selected_word = pd.read_csv('slected_word.csv').drop(columns='Unnamed: 0')
selected_word.head()

Unnamed: 0,Word,Frequency
0,people,37
1,property,34
2,market,23
3,one,22
4,properties,20


# Weighted Selection

In [8]:
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# NOTE: `df` is bound to the same object as `transcript_df` (no copy is made),
# so the "ProcessedText" and "FrequencyScore" columns added to `df` below
# also appear on `transcript_df`.
df = transcript_df

# Load NLTK stopwords as a set for fast membership tests
# (presumably requires the NLTK "stopwords" corpus to be installed — TODO confirm).
stop_words = set(stopwords.words("english"))

# Alias the selected-word table; its "Word" / "Frequency" columns are used
# below as per-word weights.
engaging_words_df = selected_word

# Lower-case and tokenize a piece of text, keeping only meaningful tokens
def preprocess_text(text):
    """Return the alphanumeric, non-stopword tokens of ``text``, lower-cased.

    The text is lower-cased, split with ``word_tokenize``, and every token
    that is not purely alphanumeric or that appears in ``stop_words`` is
    dropped. Token order is preserved.
    """
    kept_tokens = []
    for token in word_tokenize(text.lower()):
        if not token.isalnum():
            continue
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Preprocess the text in the DataFrame
df["ProcessedText"] = df["Text"].apply(preprocess_text)

# Build the word -> weight lookup once instead of scanning the engaging-words
# DataFrame for every token. If a word is listed more than once, the first
# occurrence wins.
engaging_word_weights = (
    engaging_words_df.drop_duplicates(subset="Word", keep="first")
    .set_index("Word")["Frequency"]
    .to_dict()
)

# Calculate word frequency across the entire transcript with weights for engaging words:
# each occurrence of a word adds its engaging-word weight, or 1 if the word
# is not in the engaging-words table.
word_frequency = {}
for processed_words in df["ProcessedText"]:
    for word in processed_words:
        word_frequency[word] = word_frequency.get(word, 0) + engaging_word_weights.get(word, 1)

# Score a sentence by the accumulated weights of its words
def get_sentence_frequency_score(sentence):
    """Return the sum of ``word_frequency`` values over the sentence's tokens.

    The sentence is tokenized with ``preprocess_text``; tokens missing from
    ``word_frequency`` contribute 0, and repeated tokens are counted each time.
    """
    total = 0
    for token in preprocess_text(sentence):
        total += word_frequency.get(token, 0)
    return total

# Number of caption lines to keep in the summary
num_sentences = 10

# Score every caption line with the weighted word frequencies
df["FrequencyScore"] = df["Text"].apply(get_sentence_frequency_score)

# Keep the text of the highest-scoring lines, best first
summary_sentences = (
    df.nlargest(num_sentences, "FrequencyScore")
    .loc[:, "Text"]
    .values
)


In [9]:
summary_sentences

array(["that's a big one that people a lot of",
       'the FHA one you can only have one at a',
       "you pick the one that's right for you",
       'purchases almost impossible right people',
       'like to note a lot of people say go buy',
       'people are having right now is what I',
       "just gonna buy one house a year and I'm",
       "buy a four Plex you're spending one and",
       'have one ever okay so you can buy',
       'media the one broker do you want to'], dtype=object)

# Processed selected word

In [10]:
# Collect the transcript rows whose text equals each summary sentence, in
# ranking order. The frames are gathered first and concatenated once:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies it on every iteration.
matched_rows = [df[df['Text'] == sentence] for sentence in summary_sentences]

# pd.concat raises on an empty list, so fall back to an empty frame when
# there are no summary sentences.
result = pd.concat(matched_rows) if matched_rows else pd.DataFrame()

result

Unnamed: 0,Start,End,Text,ProcessedText,FrequencyScore
218,390.419,392.94,that's a big one that people a lot of,"[big, one, people, lot]",1486
179,317.04,320.52,the FHA one you can only have one at a,"[fha, one, one]",1454
473,851.279,854.1,you pick the one that's right for you,"[pick, one, right]",1199
248,446.4,450.0,purchases almost impossible right people,"[purchases, almost, impossible, right, people]",1129
172,303.66,307.68,like to note a lot of people say go buy,"[like, note, lot, people, say, go, buy]",1127
311,560.459,564.899,people are having right now is what I,"[people, right]",1123
220,392.94,395.4,just gonna buy one house a year and I'm,"[gon, na, buy, one, house, year]",1116
243,437.94,440.88,buy a four Plex you're spending one and,"[buy, four, plex, spending, one]",1025
181,320.52,324.66,have one ever okay so you can buy,"[one, ever, okay, buy]",1011
483,867.54,870.44,media the one broker do you want to,"[media, one, broker, want]",992


In [11]:
import pandas as pd

def filter_transcript_by_range(original_df, selected_df, window_seconds=60):
    """Collect the transcript rows surrounding each selected row.

    For every ``Start`` value in ``selected_df``, all rows of ``original_df``
    whose ``Start`` lies within ``window_seconds`` of it (bounds inclusive)
    are gathered. The per-selection slices are stacked in the order of
    ``selected_df``, so rows that fall into several overlapping windows
    appear once per window.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Full transcript; must have a ``Start`` column.
    selected_df : pandas.DataFrame
        Rows to build windows around; their ``Start`` values are the window
        centres. A frame without a ``Start`` column is treated as empty.
    window_seconds : float, default 60
        Half-width of the window on each side of a selected ``Start``.

    Returns
    -------
    pandas.DataFrame
        The gathered rows with the columns (and dtypes) of ``original_df``
        and a fresh 0..n-1 index.
    """
    transcript_starts = original_df['Start']

    windows = []
    # .get() tolerates an empty selection frame that has no 'Start' column.
    for start_time in selected_df.get('Start', []):
        # The window is centred on the selected row's start time.
        in_window = transcript_starts.between(start_time - window_seconds,
                                              start_time + window_seconds)
        windows.append(original_df[in_window])

    if not windows:
        # Nothing selected: return an empty frame with the original columns.
        return original_df.iloc[0:0].reset_index(drop=True)

    # Concatenating the slices keeps the column dtypes intact.
    return pd.concat(windows, ignore_index=True)

# Example usage: `transcript_df` is the full transcript and `result` holds the
# top-scoring caption rows; the output contains every transcript row within
# the window around each of those rows.
new_dataset = filter_transcript_by_range(transcript_df, result)

In [12]:
new_dataset

Unnamed: 0,Start,End,Text,ProcessedText,FrequencyScore
0,330.720,334.380,for first home it is not a first time,"[first, home, first, time]",24
1,332.639,336.479,home buyers you could use it for your,"[home, buyers, could, use]",34
2,334.380,337.860,third fourth fifth Property okay FHA,"[third, fourth, fifth, property, okay, fha]",495
3,336.479,339.900,loan is does not have to be your first,"[loan, first]",59
4,337.860,342.600,purchase and especially when you get,"[purchase, especially, get]",20
...,...,...,...,...,...
654,916.680,920.160,money with real estate if you've got,"[money, real, estate, got]",331
655,918.180,922.260,some time check out another Bigger,"[time, check, another, bigger]",20
656,920.160,924.160,Pockets video thanks a lot love you guys,"[pockets, video, thanks, lot, love, guys]",160
657,922.260,933.559,go build some wealth,"[go, build, wealth]",25


In [14]:
# Combined: join the caption text around each selected start time into a single entry

import pandas as pd

def filter_transcript_by_range(original_df, selected_df, window_seconds=60):
    """Join the transcript text surrounding each selected row.

    NOTE: this redefines the ``filter_transcript_by_range`` declared in an
    earlier cell; once this cell has run, the name refers to this version.

    For every ``Start`` value in ``selected_df``, the ``Text`` of all rows of
    ``original_df`` whose ``Start`` lies within ``window_seconds`` of it
    (bounds inclusive) is joined with single spaces. Windows that yield no
    text are skipped.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Full transcript; must have ``Start`` and ``Text`` columns.
    selected_df : pandas.DataFrame
        Rows to build windows around; their ``Start`` values are the window
        centres. A frame without a ``Start`` column is treated as empty.
    window_seconds : float, default 60
        Half-width of the window on each side of a selected ``Start``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Start`` and ``CombinedText``, one row per distinct selected
        start time in first-seen order. Each ``CombinedText`` cell is a
        *list* of joined strings: one entry per occurrence of that start
        time in ``selected_df``.
    """
    grouped_text = {}
    transcript_starts = original_df['Start']

    # .get() tolerates an empty selection frame that has no 'Start' column.
    for start_time in selected_df.get('Start', []):
        # The window is centred on the selected row's start time.
        in_window = transcript_starts.between(start_time - window_seconds,
                                              start_time + window_seconds)

        # Combine the text from the rows inside the window
        combined_text = ' '.join(original_df.loc[in_window, 'Text'])

        # Skip windows that produced no text at all
        if combined_text:
            grouped_text.setdefault(start_time, []).append(combined_text)

    # Convert the grouped_text dictionary into a DataFrame
    grouped_df = pd.DataFrame({'Start': list(grouped_text.keys()),
                               'CombinedText': list(grouped_text.values())})

    return grouped_df

# Example usage: `transcript_df` is the full transcript and `result` holds the
# top-scoring caption rows. NOTE: this rebinds `new_dataset`, replacing the
# value assigned by the earlier cell.
new_dataset = filter_transcript_by_range(transcript_df, result)


In [16]:
# Persist the combined windows. With pandas' default `index=True`, the row
# index is written as an unnamed first column of the CSV.
new_dataset.to_csv('result_algo4_1.csv')

# Video Edit cut