In [2]:
from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd


def get_youtube_video_transcript_dataframe(youtube_url):
    """Fetch a YouTube video's transcript and return it as a DataFrame.

    Parameters
    ----------
    youtube_url : str
        A YouTube URL or a bare video ID. The ID is taken from the ``v``
        query parameter when present (so trailing parameters such as
        ``&ab_channel=...`` or ``&t=...`` are ignored); otherwise the last
        path segment is used, which covers ``youtu.be/<id>``, ``/embed/<id>``
        and bare IDs.

    Returns
    -------
    pandas.DataFrame or None
        One row per transcript item with columns ``Start``, ``End``
        (``start + duration``) and ``Text``. ``None`` is returned, after
        printing the error, when no video ID can be found or the transcript
        request fails.
    """
    # Function-scope import keeps the cell runnable without touching the
    # notebook's import cell.
    from urllib.parse import parse_qs, urlparse

    try:
        parsed_url = urlparse(youtube_url)
        v_values = parse_qs(parsed_url.query).get("v")
        if v_values:
            video_id = v_values[0]
        else:
            # Short links, embed URLs and bare IDs carry the ID in the path.
            video_id = parsed_url.path.rstrip("/").split("/")[-1]

        if not video_id:
            raise ValueError(f"could not find a video ID in {youtube_url!r}")

        # Get the transcript
        transcript = YouTubeTranscriptApi.get_transcript(video_id)

        # One record per caption; End is derived from start + duration
        transcript_data = [
            {
                "Start": item["start"],
                "End": item["start"] + item["duration"],
                "Text": item["text"],
            }
            for item in transcript
        ]

        # Explicit columns so an empty transcript still has the expected schema
        return pd.DataFrame(transcript_data, columns=["Start", "End", "Text"])

    except Exception as e:
        # Best-effort contract: callers check for None rather than catching
        print("Error:", e)
        return None

# Video to process -- swap in any YouTube URL here
youtube_url = "https://www.youtube.com/watch?v=VyFk2sdw230&ab_channel=BiggerPockets"
transcript_df = get_youtube_video_transcript_dataframe(youtube_url)

# Report a failed fetch, otherwise preview the first rows
if transcript_df is None:
    print("Failed to get the transcript.")
else:
    print(transcript_df.head())

   Start     End                                      Text
0  0.000   5.339  welcome to mortgage Mondays Today's Show
1  2.280   6.720    we are going to break down an FHA loan
2  5.339   8.280     pretty much everything that you could
3  6.720   9.720      possibly need to know but if you now
4  8.280  10.980   show up to an appointment with the loan


# Preprocessed

In [11]:
df = transcript_df

# Merge consecutive caption rows into chunks of `group_size` rows
combined_rows = []
group_size = 10
num_groups = (len(df) + group_size - 1) // group_size  # ceil(len(df) / group_size)

for group in range(num_groups):
    # Positional slice: does not depend on the index labels, and the slice is
    # clipped automatically for a final chunk with fewer than `group_size` rows.
    chunk = df.iloc[group * group_size:(group + 1) * group_size]
    combined_rows.append({
        'Start': chunk['Start'].iloc[0],  # start of the first caption in the chunk
        'End': chunk['End'].iloc[-1],     # end of the last caption in the chunk
        'Text': ' '.join(chunk['Text']),
    })

# Create a new DataFrame with combined rows
combined_df = pd.DataFrame(combined_rows)

combined_df.head()

Unnamed: 0,Start,End,Text
0,0.0,19.26,welcome to mortgage Mondays Today's Show we ar...
1,16.26,35.579,person so prepare to be educated on FHA Loans ...
2,33.899,56.52,mortgage you can clear your debt build wealth ...
3,54.059,77.4,like BiggerPockets welcome to mortgage Mondays...
4,75.6,94.259,wanted to hear and you listen and many of you ...


# Short Selected word

In [12]:
# Load the word/frequency table and discard its 'Unnamed: 0' column
# (presumably an index saved with the CSV -- confirm against the file).
selected_word = pd.read_csv('slected_word.csv').drop(columns='Unnamed: 0')
selected_word.head()

Unnamed: 0,Word,Frequency
0,people,37
1,property,34
2,market,23
3,one,22
4,properties,20


# Weighted Selection

In [13]:
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# `df` is an alias of `combined_df`, not a copy: columns assigned to `df`
# also show up on `combined_df`.
df = combined_df

# English stopwords from NLTK, held in a set for fast membership checks
stop_words = {*stopwords.words("english")}

# The engaging-words table is the selected-word frame loaded earlier
engaging_words_df = selected_word

def preprocess_text(text):
    """Lower-case and tokenize `text`, keeping only useful tokens.

    A token is kept when it is purely alphanumeric and is not in the
    module-level `stop_words` set. Returns the kept tokens as a list, in
    their original order.
    """
    kept_tokens = []
    for token in word_tokenize(text.lower()):
        if not token.isalnum() or token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Preprocess the text in the DataFrame
df["ProcessedText"] = df["Text"].apply(preprocess_text)

# Build the word -> weight lookup once, instead of scanning the whole
# engaging-words table for every token. keep="first" matches the previous
# behaviour of using the first matching row when a word is listed twice.
engaging_weights = (
    engaging_words_df
    .drop_duplicates(subset="Word", keep="first")
    .set_index("Word")["Frequency"]
    .to_dict()
)

# Weighted word frequency across the entire transcript: each occurrence of a
# word adds its engaging-word weight, or 1 when the word is not in the table.
word_frequency = {}
for tokens in df["ProcessedText"]:
    for word in tokens:
        word_frequency[word] = word_frequency.get(word, 0) + engaging_weights.get(word, 1)

def get_sentence_frequency_score(sentence):
    """Return the summed `word_frequency` value of the tokens in `sentence`.

    The sentence goes through `preprocess_text` first; tokens missing from
    `word_frequency` contribute 0.
    """
    total = 0
    for token in preprocess_text(sentence):
        total += word_frequency.get(token, 0)
    return total

# How many top-scoring chunks make up the summary
num_sentences = 10

# Score every chunk of text with its weighted word-frequency total
df["FrequencyScore"] = df["Text"].apply(get_sentence_frequency_score)

# Keep the text of the `num_sentences` highest-scoring chunks
summary_sentences = (
    df.nlargest(num_sentences, "FrequencyScore")
    .loc[:, "Text"]
    .values
)


In [14]:
# Show the texts of the top-scoring chunks selected for the summary
summary_sentences

array(["are best for you okay tell me more about FHA Loans now a couple things that I'd like to note a lot of people say go buy a home every year FHA loan FHA loan rinse and repeat right that's not impossible but there's some obstacles to overcome number one you can only have one FHA loan at a time so these are going to be kind of some myths around the FHA one you can only have one at a",
       "FHA loan so in most cases when someone comes to us with the one brokerage and says I want to buy a primary residence I want a house act what can I do we start with the FHA low if you don't already have one that's usually what we do and if you do have one then we say well here's a conventional option or do you want to refinance out of the FHA and use that to buy so Christian what advice do",
       "all the same so what are the issues people are having right now is what I refer to as the golden handcuffs dilemma you bought a property with an FHA loan you have a killer interest rate 3.25 3.75 ra

# Processed selected word

In [15]:
# Collect the scored rows behind each summary text, in ranking order.
# DataFrame.append is deprecated and was removed in pandas 2.0, so the
# matches are gathered in a list and concatenated once.
matching_rows = [df[df['Text'] == sentence] for sentence in summary_sentences]

# pd.concat rejects an empty list; fall back to an empty frame in that case
result = pd.concat(matching_rows) if matching_rows else pd.DataFrame()

result

  result = result.append(rows)
  result = result.append(rows)
  result = result.append(rows)
  result = result.append(rows)
  result = result.append(rows)
  result = result.append(rows)
  result = result.append(rows)
  result = result.append(rows)
  result = result.append(rows)
  result = result.append(rows)


Unnamed: 0,Start,End,Text,ProcessedText,FrequencyScore
17,299.699,320.52,are best for you okay tell me more about FHA L...,"[best, okay, tell, fha, loans, couple, things,...",5253
45,814.38,833.1,FHA loan so in most cases when someone comes t...,"[fha, loan, cases, someone, comes, us, one, br...",3950
31,559.26,582.36,all the same so what are the issues people are...,"[issues, people, right, refer, golden, handcuf...",3307
21,376.68,394.02,that doesn't make the move impossible you have...,"[make, move, impossible, make, enough, money, ...",3110
47,846.959,863.88,that's just a single bank that offers a single...,"[single, bank, offers, single, luck, right, go...",3092
36,650.64,669.839,flow you know everybody can focus on one speci...,"[flow, know, everybody, focus, one, specific, ...",3088
22,392.94,412.68,just gonna buy one house a year and I'm gonna ...,"[gon, na, buy, one, house, year, gon, na, buy,...",2919
13,232.26,250.5,absolutely right that's that's a Rob answer ri...,"[absolutely, right, rob, answer, right, yeah, ...",2791
46,832.079,848.639,you have for people that are going to approach...,"[people, going, approach, loan, officer, vegas...",2764
29,523.979,544.2,three or a four unit property assuming it meet...,"[three, four, unit, property, assuming, meets,...",2723


In [39]:
import pandas as pd

def filter_transcript_by_range(original_df, selected_df, window_seconds=30):
    """Collect the rows of `original_df` that start near each selected row.

    For every ``Start`` value in `selected_df`, the rows of `original_df`
    whose ``Start`` lies within ``[Start - window_seconds,
    Start + window_seconds]`` (inclusive) are taken. The per-row windows are
    stacked in the order of `selected_df`; rows that fall inside several
    windows appear once per window.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Frame to pick rows from; must have a ``Start`` column.
    selected_df : pandas.DataFrame
        Frame whose ``Start`` values are the window centres.
    window_seconds : float, default 30
        Half-width of the window around each selected ``Start``.

    Returns
    -------
    pandas.DataFrame
        The stacked rows with the columns of `original_df` and a fresh
        0..n-1 index. Empty (same columns) when nothing matches.
    """
    windows = []
    for start_time in selected_df['Start']:
        in_window = original_df['Start'].between(
            start_time - window_seconds, start_time + window_seconds
        )
        window = original_df[in_window]
        # Skip empty windows so concat only ever sees populated frames
        if not window.empty:
            windows.append(window)

    if not windows:
        return pd.DataFrame(columns=original_df.columns)

    return pd.concat(windows, ignore_index=True)

# Pull the chunks of the combined transcript that surround each selected row
new_dataset = filter_transcript_by_range(original_df=combined_df, selected_df=result)

In [40]:
# Show the rows returned by filter_transcript_by_range
new_dataset

Unnamed: 0,Start,End,Text,ProcessedText,FrequencyScore
0,283.62,301.44,whereas FHA we've got them through as high as ...,"[whereas, fha, got, high, 57, half, grossing, ...",959
1,299.699,320.52,are best for you okay tell me more about FHA L...,"[best, okay, tell, fha, loans, couple, things,...",5253
2,318.72,339.9,time that does not mean you can only have one ...,"[time, mean, one, ever, okay, buy, refinance, ...",2169
3,795.779,816.6,it's going to be tough to use an FHA loan to g...,"[going, tough, use, fha, loan, get, triplex, f...",951
4,814.38,833.1,FHA loan so in most cases when someone comes t...,"[fha, loan, cases, someone, comes, us, one, br...",3950
5,832.079,848.639,you have for people that are going to approach...,"[people, going, approach, loan, officer, vegas...",2764
6,542.399,560.459,these properties correct that's the massive ad...,"[properties, correct, massive, advantage, fha,...",1263
7,559.26,582.36,all the same so what are the issues people are...,"[issues, people, right, refer, golden, handcuf...",3307
8,580.2,599.519,but you can't use the FHA loan to buy future p...,"[ca, use, fha, loan, buy, future, properties, ...",1279
9,358.08,378.36,your old house into a rent this is getting int...,"[old, house, rent, getting, weeds, little, bit...",1160


In [42]:
# Put them together
df = new_dataset

# Merge consecutive rows into chunks of `group_size` rows.
# NOTE(review): `new_dataset` stacks one time window per selected row, and a
# window does not always hold exactly `group_size` rows, so a fixed-size chunk
# can straddle two windows (a chunk whose End is earlier than its Start is the
# symptom). Confirm whether grouping per window is what is wanted here.
combined_rows = []
group_size = 3
num_groups = (len(df) + group_size - 1) // group_size  # ceil(len(df) / group_size)

for group in range(num_groups):
    # Positional slice: does not depend on the index labels, and the slice is
    # clipped automatically for a final chunk with fewer than `group_size` rows.
    chunk = df.iloc[group * group_size:(group + 1) * group_size]
    combined_rows.append({
        'Start': chunk['Start'].iloc[0],  # start of the first row in the chunk
        'End': chunk['End'].iloc[-1],     # end of the last row in the chunk
        'Text': ' '.join(chunk['Text']),
    })

# Create a new DataFrame with combined rows
combined_df = pd.DataFrame(combined_rows)

combined_df

Unnamed: 0,Start,End,Text
0,283.62,339.9,whereas FHA we've got them through as high as ...
1,795.779,848.639,it's going to be tough to use an FHA loan to g...
2,542.399,599.519,these properties correct that's the massive ad...
3,358.08,412.68,your old house into a rent this is getting int...
4,832.079,882.6,you have for people that are going to approach...
5,633.779,685.44,would ever be financially advantageous to refi...
6,376.68,432.6,that doesn't make the move impossible you have...
7,215.22,266.759,significant it's probably a much wider margin ...
8,814.38,863.88,FHA loan so in most cases when someone comes t...
9,862.079,544.2,does as well where can people reach out to fin...


In [43]:
# Finished: write the combined chunks to CSV
combined_df.to_csv('result4-2_2.csv')

# Combined: another version

In [24]:
import pandas as pd

def filter_transcript_by_range(original_df, selected_df, window_seconds=30):
    """Join the text of `original_df` rows that start near each selected row.

    For every ``Start`` value in `selected_df`, the ``Text`` of all rows of
    `original_df` whose ``Start`` lies within ``[Start - window_seconds,
    Start + window_seconds]`` (inclusive) is joined with single spaces.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Frame to pick rows from; must have ``Start`` and ``Text`` columns.
    selected_df : pandas.DataFrame
        Frame whose ``Start`` values are the window centres.
    window_seconds : float, default 30
        Half-width of the window around each selected ``Start``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Start`` (the selected start time) and ``CombinedText`` (a
        list of joined strings -- one entry per selected row sharing that
        start time). Selected rows whose window yields no text are omitted.
    """
    grouped_text = {}

    for start_time in selected_df['Start']:
        in_window = original_df['Start'].between(
            start_time - window_seconds, start_time + window_seconds
        )
        combined_text = ' '.join(original_df.loc[in_window, 'Text'])

        # An empty string means nothing fell inside the window; skip it
        if combined_text:
            grouped_text.setdefault(start_time, []).append(combined_text)

    # Convert the grouped_text dictionary into a DataFrame
    return pd.DataFrame({
        'Start': list(grouped_text.keys()),
        'CombinedText': list(grouped_text.values()),
    })

# Build one combined-text entry per selected row from the combined transcript
new_dataset = filter_transcript_by_range(original_df=combined_df, selected_df=result)


In [25]:
# Show the grouped text returned by filter_transcript_by_range
new_dataset

Unnamed: 0,Start,CombinedText
0,299.699,[whereas FHA we've got them through as high as...
1,814.38,[it's going to be tough to use an FHA loan to ...
2,559.26,[these properties correct that's the massive a...
3,376.68,[your old house into a rent this is getting in...
4,846.959,[you have for people that are going to approac...
5,650.64,[would ever be financially advantageous to ref...
6,392.94,[that doesn't make the move impossible you hav...
7,232.26,[significant it's probably a much wider margin...
8,832.079,[FHA loan so in most cases when someone comes ...
9,523.979,[telling people to do for the whole time we've...


In [27]:
# Write the grouped text to CSV
new_dataset.to_csv('result4-2_1.csv')

Unnamed: 0,Start,End,CombinedText
0,16 283.620 17 299.699 18 318.720 Name...,320.52,whereas FHA we've got them through as high as ...
1,44 795.779 45 814.380 46 832.079 Name...,833.1,it's going to be tough to use an FHA loan to g...
2,30 542.399 31 559.260 32 580.200 Name...,582.36,these properties correct that's the massive ad...
3,20 358.08 21 376.68 22 392.94 Name: S...,394.02,your old house into a rent this is getting int...
4,46 832.079 47 846.959 48 862.079 Name...,863.88,you have for people that are going to approach...
5,35 633.779 36 650.640 37 667.860 Name...,669.839,would ever be financially advantageous to refi...
6,21 376.68 22 392.94 23 411.30 Name: S...,412.68,that doesn't make the move impossible you have...
7,12 215.22 13 232.26 14 249.12 Name: S...,250.5,significant it's probably a much wider margin ...
8,45 814.380 46 832.079 47 846.959 48 ...,848.639,FHA loan so in most cases when someone comes t...
9,28 507.180 29 523.979 30 542.399 Name...,544.2,telling people to do for the whole time we've ...


Unnamed: 0,Start,End,CombinedText
0,299.699,320.52,are best for you okay tell me more about FHA L...
1,814.38,833.1,FHA loan so in most cases when someone comes t...
2,559.26,582.36,all the same so what are the issues people are...
3,376.68,394.02,that doesn't make the move impossible you have...
4,846.959,863.88,that's just a single bank that offers a single...
5,650.64,669.839,flow you know everybody can focus on one speci...
6,392.94,412.68,just gonna buy one house a year and I'm gonna ...
7,232.26,250.5,absolutely right that's that's a Rob answer ri...
8,832.079,848.639,you have for people that are going to approach...
9,523.979,544.2,three or a four unit property assuming it meet...
