In [1]:
from youtube_transcript_api import YouTubeTranscriptApi
import re
import pandas as pd
import requests
from dotenv import load_dotenv
import os


# load env variables
load_dotenv()

True

In [2]:
# get video id from url. the following is the video id for the video at https://www.youtube.com/watch?v=psU4YbCf0bE
video_id = "psU4YbCf0bE"

In [3]:
# takes a video id and returns the english transcript of the video
def get_transcript(video_id):
    try:
        # get the transcript
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])

        # join all the text in the transcript into one string
        transcript = " ".join([entry['text'] for entry in transcript])

        # remove backslashes and line breaks
        transcript = re.sub(r'\n', ' ', transcript)  # Replace line breaks with space
        transcript = re.sub(r'\\', '', transcript)  # Remove backslashes

        return transcript
    except Exception as e:
        return f"An error occurred: {e}"

get_transcript(video_id)

'- [Narrator] Countries around the world have pumped millions and even billions of dollars into huge construction projects that make our lives better. But some megaprojects should have been thrown in the garbage at the planning stage. So grab a hard hat and a hi-vis jacket. We\'ve got megaprojects, mega-prospects, and something a little unstable to look at. Ouch! All right, it\'s time to check out the most useless megaprojects in the world. (upbeat music) Road to Nowhere. Hawaii is a beautiful place. It\'s got beaches, grass skirts, coconut bras, oh, and this super expensive blight on the landscape, the H-3 Interstate. Now I know what you\'re thinking. Interstate, Hawaii? Well, that\'s one big highway. Well, here\'s the thing, it doesn\'t connect to any other state. The name just means it\'s been funded by the federal government. So rather than spanning 2,500 miles to the nearest state, California it spans just 15 from northwest of downtown Honolulu to the Marine Corps Base Hawaii. Oh,

In [4]:
# %%timeit
get_transcript("4-Tas6Uw_OQ")

'So if you go to the stock market and look at the price of say a favourite company’s share. So what you would observe is the following. So if the horizontal axis is the time axis and if this is the price axis it tells you what is the price then you will see starting from a certain time price  you will see some zigzagging motions like this. So you observe it up to time T and you observe this zigzagging motion. This is of course random. Nobody knows what is the next price is. So if a particular scenario evolves, you have a particular path. This is called a sample path. So if another scenario evolves, if another scenario evolves there would be another path, for example it could be like this. The stock price is going down down down down and you are in a bad shape and then it again climbs up and again it falls down down down and again then again climbs up. So under a different scenario it has a different path. So it is sample path 1 it is sample path 2. So it is what type of scenario one ev

In [16]:
# youtube api endpoint
endpoint = "https://www.googleapis.com/youtube/v3/search"

# question asked by the user
user_query = "brownian motion"

# parameters for the endpoint based on documentation https://developers.google.com/youtube/v3/docs/search/list
params = {
    "q": user_query,
    "type": "video",
    "part": "snippet",
    "maxResults": 3,
    "key": os.getenv("YOUTUBE_API_KEY"),
    "videoCaption": "closedCaption" # only return videos with closed captions
}

# send a request to the endpoint
response = requests.get(endpoint, params=params)

In [18]:
import time

# Get the start time
start_time = time.time()

# Check if the request was successful
if response.status_code == 200:
    # Parse and print the JSON response
    data = response.json()

    # Create a list to store video information
    videos = []

    # Extract relevant data from the response
    for item in data["items"]:
        video_id = item['id']['videoId']
        title = item['snippet']['title']
        channel = item['snippet']['channelTitle']
        
        videos.append([video_id, title, channel, user_query])

    # Create a Pandas DataFrame from the list of videos
    df = pd.DataFrame(videos, columns=["video_id", "title", "channel", "question"])

    # Get transcript for each video by applying the get_transcript(video_id) function
    df["transcript"] = df.apply(lambda row: get_transcript(row["video_id"]), axis=1)

    # Save the DataFrame as a CSV file
    df.to_csv("videos.csv")
else:
    print(f"Request failed with status code {response.status_code}")
    print(response.text)

# Get the end time
end_time = time.time()

# Print the total time
print(f"Total time: {end_time - start_time} seconds")

Total time: 9.441158056259155 seconds


In [19]:
df.columns

Index(['video_id', 'title', 'channel', 'question', 'transcript'], dtype='object')

In [9]:
import mindsdb_sdk

# connect to mindsdb server
server = mindsdb_sdk.connect(
    login = os.getenv('MINDSDB_USERNAME'),
    password = os.getenv('MINDSDB_PASSWORD'),
)

In [10]:
# get default mindsdb project
project = server.get_project()

In [11]:
# run queries on models, tables and views stored in a project
query = project.query('SHOW TABLES;')
query.fetch()

Unnamed: 0,Tables_in_mindsdb
0,models
1,models_versions
2,jobs
3,jobs_history
4,mdb_triggers
5,chatbots
6,sentiment_classifier
7,summarizer_10_20


In [20]:
# create a new table to store the videos
files_db = server.get_database('files')

# remove the table if it already exists
files_db.drop_table('video')

# create a new table
files_db.create_table('video', df)

Table(video)

In [22]:
# list all models
project.list_models()

[Model(sentiment_classifier, status=complete),
 Model(summarizer_10_20, status=complete),
 Model(youtube_video_recommender, status=error)]

In [None]:
# CREATE MODEL youtube_video_recommender
# PREDICT recommendation_score
# USING
#     engine = 'openai_engine',
#     prompt_template = 'give an integer for the relevancy score for how well the transcipt text:{{transcript}} answers the question text:{{question}}. the higher the integer, the higher the relevancy of the transcript in answering the question';

# drop model youtube_video_recommender

In [None]:
project.query('SHOW TABLES;')