In [1]:
from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd


def get_youtube_video_transcript_dataframe(youtube_url):
    try:
        # Extract video ID from the URL
        video_id = youtube_url.split("v=")[-1]

        # Get the transcript
        transcript = YouTubeTranscriptApi.get_transcript(video_id)

        # Create an empty list to store the transcript data
        transcript_data = []

        # Loop through each transcript item and extract timestamp and text
        for item in transcript:
            start_time = item["start"]
            end_time = item["start"] + item["duration"]
            text = item["text"]
            transcript_data.append({"Start": start_time, "End": end_time, "Text": text})

        # Convert the list to a pandas DataFrame
        df = pd.DataFrame(transcript_data)

        return df

    except Exception as e:
        print("Error:", e)
        return None

# Replace the following with the YouTube video URL of your choice
youtube_url = "https://www.youtube.com/watch?v=sJwIQW4rbUM&ab_channel=BiggerPockets"
transcript_df = get_youtube_video_transcript_dataframe(youtube_url)

if transcript_df is not None:
    print(transcript_df.head())
else:
    print("Failed to get the transcript.")

   Start     End                                     Text
0  0.000   3.959   how many bits do you believe about the
1  2.639   4.980      mortgage industry is there a chance
2  3.959   6.600     you've been running around with that
3  4.980   8.280  info in your head that if you got right
4  6.600  10.019   could actually make you a lot of money


# Preprocessed

In [2]:
df = transcript_df

# Combine rows in groups of 10
combined_rows = []
group_size = 10
num_groups = (len(df) + group_size - 1) // group_size  # Calculate the number of groups

for i in range(num_groups):
    start_idx = i * group_size
    end_idx = min(start_idx + group_size, len(df)) - 1  # Handle last group with fewer than 10 rows
    start_text = df.loc[start_idx, 'Text']
    end_text = df.loc[end_idx, 'Text']
    combined_text = ' '.join(df.loc[start_idx:end_idx, 'Text'])
    combined_rows.append({'Start': df.loc[start_idx, 'Start'], 'End': df.loc[end_idx, 'End'], 'Text': combined_text})

# Create a new DataFrame with combined rows
combined_df = pd.DataFrame(combined_rows)

combined_df.head()

Unnamed: 0,Start,End,Text
0,0.0,17.64,how many bits do you believe about the mortgag...
1,16.139,35.579,perfectly as we can we've come to the right pl...
2,33.239,53.82,coming up on today's show of mortgage Mondays ...
3,52.379,74.28,glad to hear it all right let's get into this ...
4,72.54,90.36,the Burr in most cases now is there a value to...


# Short Selected word

In [3]:
# Instead of the word it should be whole sentences of short video


shorts = pd.read_csv('Short_combinedtranscript.csv')
shorts.drop(columns='Unnamed: 0', inplace=True)
shorts.head()

Unnamed: 0,VideoID,CombinedText
0,0,so a winning bur is anytime you get more of yo...
1,1,I think one of the problems that people have a...
2,2,I think that the reason most people don't scal...
3,3,what's your advice for people who are chasing ...
4,4,everybody focuses on buying Equity buying Equi...


# Cosine SImilarity Selection V1

In [13]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the datasets
video_transcript_df = combined_df
short_video_transcript_df = shorts

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english')

# Vectorize the video transcript and short video transcript text
video_transcript_vectors = vectorizer.fit_transform(video_transcript_df['Text'])
short_video_transcript_vectors = vectorizer.transform(short_video_transcript_df['CombinedText'])

# Calculate cosine similarity between short video transcript and video transcript
cosine_similarities = cosine_similarity(short_video_transcript_vectors, video_transcript_vectors)

# Find the indices and values of most similar rows in video transcript for each short video transcript
most_similar_indices = np.argmax(cosine_similarities, axis=1)
max_similarity_values = np.max(cosine_similarities, axis=1)

# Create a DataFrame to store the results
result_df = pd.DataFrame(columns=['VideoID', 'CombinedText', 'Start', 'End', 'Text', 'SimilarityScore'])

# Populate the result DataFrame with most similar rows from video transcript
for short_idx, video_idx in enumerate(most_similar_indices):
    result_df = result_df.append({
        'VideoID': short_idx,
        'CombinedText': short_video_transcript_df.loc[short_idx, 'CombinedText'],
        'Start': video_transcript_df.loc[video_idx, 'Start'],
        'End': video_transcript_df.loc[video_idx, 'End'],
        'Text': video_transcript_df.loc[video_idx, 'Text'],
        'SimilarityScore': max_similarity_values[short_idx]
    }, ignore_index=True)

# Sort the result DataFrame by similarity score in descending order
result_df = result_df.sort_values(by='SimilarityScore', ascending=False)

# Reset the index of the sorted DataFrame
result_df = result_df.reset_index(drop=True)

# Display the result DataFrame
result_df

  result_df = result_df.append({
  result_df = result_df.append({
  result_df = result_df.append({
  result_df = result_df.append({
  result_df = result_df.append({
  result_df = result_df.append({
  result_df = result_df.append({
  result_df = result_df.append({
  result_df = result_df.append({
  result_df = result_df.append({
  result_df = result_df.append({
  result_df = result_df.append({
  result_df = result_df.append({
  result_df = result_df.append({
  result_df = result_df.append({
  result_df = result_df.append({
  result_df = result_df.append({
  result_df = result_df.append({
  result_df = result_df.append({
  result_df = result_df.append({
  result_df = result_df.append({
  result_df = result_df.append({
  result_df = result_df.append({
  result_df = result_df.append({
  result_df = result_df.append({
  result_df = result_df.append({
  result_df = result_df.append({
  result_df = result_df.append({
  result_df = result_df.append({
  result_df = result_df.append({
  result_d

Unnamed: 0,VideoID,CombinedText,Start,End,Text,SimilarityScore
0,2,I think that the reason most people don't scal...,450.0,468.539,conventional loan or just put more money down ...,0.448474
1,17,tell us about the nature of the housing market...,597.899,617.459,it's a one and a half percent down payment dif...,0.303672
2,40,you know how sometimes you'll be trying really...,215.22,233.58,significant it's probably a much wider margin ...,0.301923
3,3,what's your advice for people who are chasing ...,0.0,19.26,welcome to mortgage Mondays Today's Show we ar...,0.296844
4,18,this is an unpopular opinion people get mad at...,916.68,944.48,money with real estate if you've got some time...,0.288358
5,10,how do you optimize that part of the process t...,318.72,339.9,time that does not mean you can only have one ...,0.286322
6,13,what's the first thing you do whenever you're ...,0.0,19.26,welcome to mortgage Mondays Today's Show we ar...,0.278878
7,5,we all learn as Real Estate Investors every si...,814.38,833.1,FHA loan so in most cases when someone comes t...,0.274576
8,39,don't just think the llc's or magic pill is go...,814.38,833.1,FHA loan so in most cases when someone comes t...,0.265357
9,36,what are your thoughts on selling our primary ...,110.759,130.739,only loan so that means you have to live there...,0.262191


# Cosine Similarity V2 (combined all the shorts into 1 rows)

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the datasets
video_transcript_df = combined_df
short_video_transcript_df = shorts

# Combine the short video transcript rows into a single text
short_combined_text = ' '.join(short_video_transcript_df['CombinedText'])

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english')

# Vectorize the video transcript and short combined transcript text
video_transcript_vectors = vectorizer.fit_transform(video_transcript_df['Text'])
short_combined_vector = vectorizer.transform([short_combined_text])

# Calculate cosine similarity for each row in video transcript against the combined short transcript
cosine_similarities = cosine_similarity(video_transcript_vectors, short_combined_vector)

# Add the similarity scores to the video transcript DataFrame
video_transcript_df['SimilarityScore'] = cosine_similarities

# Sort the video transcript DataFrame by similarity score in descending order
result_df = video_transcript_df.sort_values(by='SimilarityScore', ascending=False)

# Reset the index of the sorted DataFrame
result_df = result_df.reset_index(drop=True)

# Display the result DataFrame
result_df


Unnamed: 0,Start,End,Text,SimilarityScore
0,597.839,618.0,offer it it's more difficult cold than a norma...,0.27325
1,756.12,774.959,appreciate you helping me dispel some of these...,0.263108
2,52.379,74.28,glad to hear it all right let's get into this ...,0.259854
3,616.32,634.98,them on Zillow are the hardest to put in a con...,0.251078
4,454.56,473.819,someone did I would say this is a stupid way t...,0.248314
5,385.38,404.46,good for me right don't just go pull your cred...,0.247323
6,72.54,90.36,the Burr in most cases now is there a value to...,0.247315
7,126.96,146.16,payment and the Renault or if you're just refi...,0.241056
8,632.94,652.32,been sitting on the market for 250 days but th...,0.225071
9,348.78,367.86,can get multiple credit pulls and it will only...,0.208119


In [5]:
result_df.to_csv('Algo4-3_result.csv')

# Cosine similairty V3 (using Word embedding instead of TF/IDF)

In [17]:
!pip install spacy
!python -m spacy download en_core_web_md


Collecting en-core-web-md==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.6.0/en_core_web_md-3.6.0-py3-none-any.whl (42.8 MB)
     ---------------------------------------- 42.8/42.8 MB 9.1 MB/s eta 0:00:00
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.6.0
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [19]:
import pandas as pd
import spacy
from sklearn.metrics.pairwise import cosine_similarity

# Load the pre-trained GloVe embeddings model
nlp = spacy.load("en_core_web_md")

# Load the datasets
video_transcript_df = combined_df
short_video_transcript_df = shorts

# Combine the short video transcript rows into a single text
short_combined_text = ' '.join(short_video_transcript_df['CombinedText'])

# Calculate embeddings for the video transcript and short combined transcript text
video_transcript_embeddings = [nlp(text).vector for text in video_transcript_df['Text']]
short_combined_embedding = nlp(short_combined_text).vector

# Calculate cosine similarity for each row in video transcript against the combined short transcript
cosine_similarities = cosine_similarity(video_transcript_embeddings, [short_combined_embedding])

# Add the similarity scores to the video transcript DataFrame
video_transcript_df['SimilarityScore'] = cosine_similarities.T

# Sort the video transcript DataFrame by similarity score in descending order
result_df = video_transcript_df.sort_values(by='SimilarityScore', ascending=False)

# Reset the index of the sorted DataFrame
result_df = result_df.reset_index(drop=True)

# Display the result DataFrame
result_df


ValueError: Length of values (1) does not match length of index (52)