# Link to transcript

In [14]:
from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd


def get_youtube_video_transcript_dataframe(youtube_url):
    try:
        # Extract video ID from the URL
        video_id = youtube_url.split("v=")[-1]

        # Get the transcript
        transcript = YouTubeTranscriptApi.get_transcript(video_id)

        # Create an empty list to store the transcript data
        transcript_data = []

        # Loop through each transcript item and extract timestamp and text
        for item in transcript:
            start_time = item["start"]
            end_time = item["start"] + item["duration"]
            text = item["text"]
            transcript_data.append({"Start": start_time, "End": end_time, "Text": text})

        # Convert the list to a pandas DataFrame
        df = pd.DataFrame(transcript_data)

        return df

    except Exception as e:
        print("Error:", e)
        return None

# Replace the following with the YouTube video URL of your choice
youtube_url = "https://www.youtube.com/watch?v=VyFk2sdw230&ab_channel=BiggerPockets"
transcript_df = get_youtube_video_transcript_dataframe(youtube_url)

if transcript_df is not None:
    print(transcript_df.head())
else:
    print("Failed to get the transcript.")

   Start     End                                      Text
0  0.000   5.339  welcome to mortgage Mondays Today's Show
1  2.280   6.720    we are going to break down an FHA loan
2  5.339   8.280     pretty much everything that you could
3  6.720   9.720      possibly need to know but if you now
4  8.280  10.980   show up to an appointment with the loan


# Preprocessed transcritpt

In [15]:
transcript_df

Unnamed: 0,Start,End,Text
0,0.000,5.339,welcome to mortgage Mondays Today's Show
1,2.280,6.720,we are going to break down an FHA loan
2,5.339,8.280,pretty much everything that you could
3,6.720,9.720,possibly need to know but if you now
4,8.280,10.980,show up to an appointment with the loan
...,...,...,...
512,920.160,924.160,Pockets video thanks a lot love you guys
513,922.260,933.559,go build some wealth
514,924.160,933.559,[Music]
515,937.390,944.480,[Music]


In [None]:
import pandas as pd

def group_rows_with_overlap(dataframe, window_size, overlap):
    combined_rows = []
    num_rows = len(dataframe)

    # Iterate over the rows with the specified overlap
    for i in range(0, num_rows, overlap):
        # Get the current window of rows
        window_rows = dataframe.iloc[i:i+window_size]

        if len(window_rows) > 0:
            # Get the 'text' column values and combine them
            combined_text = ' '.join(window_rows['Text'])

            # Get the first 'Start' value
            start_value = window_rows['Start'].iloc[0]

            # Create a new row with the combined 'text' and first 'start' value
            new_row = {
                'Start': start_value,
                'Text': combined_text
            }

            # Append the new row to the combined_rows list
            combined_rows.append(new_row)

    # Create a new DataFrame from the combined_rows list
    combined_df = pd.DataFrame(combined_rows)

    return combined_df

# Example usage
# Assuming you have a DataFrame called 'your_dataframe'
# and you want to take a sequence of 30 rows with an overlap of 10 rows
window_size = 30
overlap = 10

new_dataset_overlap = group_rows_with_overlap(transcript_df, window_size, overlap)
new_dataset_overlap.head()


In [18]:
new_dataset_overlap.head()

Unnamed: 0,Start,Text
0,0.0,welcome to mortgage Mondays Today's Show we ar...
1,16.26,person so prepare to be educated on FHA Loans ...
2,33.899,mortgage you can clear your debt build wealth ...
3,54.059,like BiggerPockets welcome to mortgage Mondays...
4,75.6,wanted to hear and you listen and many of you ...


## The short trasncript data for algotihm

In [20]:
selected_word = pd.read_csv('slected_word.csv')
selected_word.head()

Unnamed: 0.1,Unnamed: 0,Word,Frequency
0,0,people,37
1,2,property,34
2,4,market,23
3,5,one,22
4,6,properties,20


# Algo select clip

In [21]:
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

df = new_dataset_overlap

# Load NLTK stopwords
stop_words = set(stopwords.words("english"))

# Create the engaging words DataFrame
engaging_words_df = selected_word

# Function to preprocess the text and tokenize it
def preprocess_text(text):
    words = word_tokenize(text.lower())
    words = [word for word in words if word.isalnum() and word not in stop_words]
    return words

# Preprocess the text in the DataFrame
df["ProcessedText"] = df["Text"].apply(preprocess_text)

# Calculate word frequency across the entire transcript with weights for engaging words
word_frequency = {}
for _, row in df.iterrows():
    for word in row["ProcessedText"]:
        weight = engaging_words_df[engaging_words_df["Word"] == word]["Frequency"].values
        weight = weight[0] if len(weight) > 0 else 1  # Use 1 if word not found in engaging words DataFrame
        word_frequency[word] = word_frequency.get(word, 0) + weight

# Function to get the frequency score for a sentence
def get_sentence_frequency_score(sentence):
    words = preprocess_text(sentence)
    score = sum(word_frequency.get(word, 0) for word in words)
    return score

# Rank sentences based on frequency score
df["FrequencyScore"] = df["Text"].apply(get_sentence_frequency_score)

# Set the number of sentences you want in the summary
num_sentences = 10

# Extract the top sentences to form the summary
summary_sentences = df.nlargest(num_sentences, "FrequencyScore")["Text"].values

In [22]:
summary_sentences

array(["FHA loan so in most cases when someone comes to us with the one brokerage and says I want to buy a primary residence I want a house act what can I do we start with the FHA low if you don't already have one that's usually what we do and if you do have one then we say well here's a conventional option or do you want to refinance out of the FHA and use that to buy so Christian what advice do you have for people that are going to approach their loan officer with this Vegas is that if you're not hearing this advice probably talking to the wrong guy you need to talk to somebody who's knowledgeable bread products don't talk to a loan officer that just offers one product it's the biggest mistake people make I go to Chase and I get a Chase loan okay that's not shade at Chase that's just a single bank that offers a single luck right go somewhere that knows different products that can help you pick the one that's right for you and can help you get in the property where you otherwise would

# Clip with timestamp

# (optional) shows the video with cutted part