In [1]:
from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd


def _extract_video_id(youtube_url):
    """Return the video ID contained in a YouTube URL.

    Handles ``watch?v=<id>`` URLs (ignoring any other query parameters),
    ``youtu.be/<id>`` short links and ``/embed/<id>``, ``/shorts/<id>``,
    ``/live/<id>`` and ``/v/<id>`` paths. Any other input (for example a bare
    video ID) falls back to taking everything after the last ``v=``, which
    leaves a bare ID unchanged.
    """
    # Function-scope import so the helper does not depend on a separate import cell.
    from urllib.parse import parse_qs, urlparse

    url = youtube_url.strip()
    parsed = urlparse(url)

    # Standard watch URL: the ID is the value of the "v" query parameter only,
    # not everything that follows it (e.g. "&ab_channel=...").
    query_ids = parse_qs(parsed.query).get("v")
    if query_ids:
        return query_ids[0]

    host = (parsed.hostname or "").lower()
    path_parts = [part for part in parsed.path.split("/") if part]

    # Short link: https://youtu.be/<id>?t=10
    if (host == "youtu.be" or host.endswith(".youtu.be")) and path_parts:
        return path_parts[0]

    # Path-style URL: https://www.youtube.com/embed/<id> and similar.
    if host and len(path_parts) >= 2 and path_parts[0] in {"embed", "shorts", "live", "v"}:
        return path_parts[1]

    # Fallback: same result as the previous split-based extraction.
    return url.split("v=")[-1]


def get_youtube_video_transcript_dataframe(youtube_url):
    """Fetch a YouTube video's transcript and return it as a DataFrame.

    Parameters
    ----------
    youtube_url : str
        URL of the video (watch, youtu.be, embed, shorts or live form) or a
        bare video ID.

    Returns
    -------
    pandas.DataFrame or None
        One row per transcript item with the columns ``Start`` (the item's
        ``start`` value), ``End`` (``start + duration``) and ``Text``. The
        three columns are present even when the transcript is empty.
        ``None`` is returned when anything goes wrong; the error is printed
        rather than raised so the notebook keeps running.
    """
    try:
        # Extract video ID from the URL
        video_id = _extract_video_id(youtube_url)

        # Get the transcript
        transcript = YouTubeTranscriptApi.get_transcript(video_id)

        # Build one record per transcript item; End is derived from start + duration
        transcript_data = [
            {
                "Start": item["start"],
                "End": item["start"] + item["duration"],
                "Text": item["text"],
            }
            for item in transcript
        ]

        # Explicit columns keep the schema stable for an empty transcript
        return pd.DataFrame(transcript_data, columns=["Start", "End", "Text"])

    except Exception as e:
        # Deliberate best-effort: callers check for None instead of catching
        print("Error:", e)
        return None

# Source video: swap in any other YouTube video URL to analyse a different transcript
youtube_url = "https://www.youtube.com/watch?v=sJwIQW4rbUM&ab_channel=BiggerPockets"
transcript_df = get_youtube_video_transcript_dataframe(youtube_url)

# The fetch function returns None on failure, so report that case first;
# otherwise preview the first rows of the transcript.
if transcript_df is None:
    print("Failed to get the transcript.")
else:
    print(transcript_df.head())

   Start     End                                     Text
0  0.000   3.959   how many bits do you believe about the
1  2.639   4.980      mortgage industry is there a chance
2  3.959   6.600     you've been running around with that
3  4.980   8.280  info in your head that if you got right
4  6.600  10.019   could actually make you a lot of money


# Processed

In [2]:
# Source frame: one row per transcript item with Start / End / Text columns
df = transcript_df

# Combine rows in groups of 10
combined_rows = []
group_size = 10
num_groups = (len(df) + group_size - 1) // group_size  # Ceiling division: number of groups

for i in range(num_groups):
    start_idx = i * group_size
    # Positional slicing works whatever the frame's index labels are, and the
    # slice is simply shorter for a final group with fewer than group_size rows.
    group = df.iloc[start_idx:start_idx + group_size]
    combined_rows.append({
        'Start': group['Start'].iloc[0],   # start time of the group's first row
        'End': group['End'].iloc[-1],      # end time of the group's last row
        'Text': ' '.join(group['Text']),   # all texts in the group, space-separated
    })

# Create a new DataFrame with combined rows
combined_df = pd.DataFrame(combined_rows)

combined_df.head()

Unnamed: 0,Start,End,Text
0,0.0,17.64,how many bits do you believe about the mortgag...
1,16.139,35.579,perfectly as we can we've come to the right pl...
2,33.239,53.82,coming up on today's show of mortgage Mondays ...
3,52.379,74.28,glad to hear it all right let's get into this ...
4,72.54,90.36,the Burr in most cases now is there a value to...


# Selected words (short transcript)

In [3]:
# Read the word/frequency table and discard its 'Unnamed: 0' column in one chained step
selected_word = pd.read_csv('slected_word.csv').drop(columns='Unnamed: 0')
selected_word.head()

Unnamed: 0,Word,Frequency
0,people,37
1,property,34
2,market,23
3,one,22
4,properties,20


In [4]:
# Flatten the 'Word' column into one space-separated string
short_combined = ' '.join(word for word in selected_word['Word'])
short_combined


'people property market one properties real right money really want going estate good would cash buy look lot deal make house two find equity way'

# Cosine similarity selection

In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the datasets
# Work on a copy so that adding the score column below does not modify combined_df
video_transcript_df = combined_df.copy()

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english')

# Fit the vocabulary on the transcript segments, then vectorize the selected-word
# string with that same fitted vectorizer so both live in the same feature space
video_transcript_vectors = vectorizer.fit_transform(video_transcript_df['Text'])
short_combined_vector = vectorizer.transform([short_combined])

# Calculate cosine similarity for each row in video transcript against the combined short transcript
cosine_similarities = cosine_similarity(video_transcript_vectors, short_combined_vector)

# cosine_similarity returns a 2-D array with one column here (a single query
# vector); flatten it to 1-D before storing it as a DataFrame column
video_transcript_df['SimilarityScore'] = cosine_similarities.ravel()

# Sort by similarity score (highest first) and renumber the rows from 0
result_df = (
    video_transcript_df
    .sort_values(by='SimilarityScore', ascending=False)
    .reset_index(drop=True)
)

# Display the result DataFrame
result_df

Unnamed: 0,Start,End,Text,SimilarityScore
0,597.839,618.0,offer it it's more difficult cold than a norma...,0.247375
1,385.38,404.46,good for me right don't just go pull your cred...,0.247273
2,756.12,774.959,appreciate you helping me dispel some of these...,0.228006
3,616.32,634.98,them on Zillow are the hardest to put in a con...,0.223601
4,106.5,128.58,just buy a primary residence every year if pos...,0.183452
5,0.0,17.64,how many bits do you believe about the mortgag...,0.180537
6,164.34,184.019,deal and if you get more than your money out s...,0.172303
7,52.379,74.28,glad to hear it all right let's get into this ...,0.167504
8,126.96,146.16,payment and the Renault or if you're just refi...,0.148629
9,544.2,562.56,conventional one and there's a hard money one ...,0.144501


In [6]:
# Persist the ranked segments; with to_csv's defaults the DataFrame index is
# written as the first CSV column.
output_path = 'Algo4-4_result.csv'
result_df.to_csv(output_path)