In [30]:
import subprocess
import os
import json
import re
import yt_dlp

In [1]:
import google.generativeai as genai

# Read the Gemini API key from the environment instead of hardcoding it:
# a key stored in a notebook is exposed to everyone who can read the file.
# NOTE(review): any key that was ever committed here should be revoked.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it before running this notebook."
    )
genai.configure(api_key=GOOGLE_API_KEY)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def upload_to_gemini(path, mime_type=None):
  """Upload a local file to Gemini and return the resulting file object.

  The file at ``path`` is passed to ``genai.upload_file`` together with the
  optional ``mime_type``. The display name and URI of the uploaded file are
  printed, and the object returned by ``genai.upload_file`` is handed back.

  Reference: https://ai.google.dev/gemini-api/docs/prompting_with_media
  """
  uploaded = genai.upload_file(path, mime_type=mime_type)
  name, uri = uploaded.display_name, uploaded.uri
  print(f"Uploaded file '{name}' as: {uri}")
  return uploaded

In [3]:
# Generation settings for the Gemini model.
# Reference: https://ai.google.dev/api/python/google/generativeai/GenerativeModel
generation_config = dict(
  temperature=1,
  top_p=0.95,
  top_k=64,
  max_output_tokens=8192,
  response_mime_type="text/plain",
)

In [4]:
# Gemini model handle built from generation_config. No safety_settings are
# passed; see https://ai.google.dev/gemini-api/docs/safety-settings to adjust.
model = genai.GenerativeModel(model_name="gemini-1.5-flash", generation_config=generation_config)

In [31]:
# Directory used in the yt_dlp output template for downloaded videos.
VIDEO_DOWNLOAD_PATH = "video_downloads"
# Directory where extracted frame images are written and read back from.
FRAME_OUTPUT_PATH = os.path.join("static", "images")
def download_video_and_subtitles(video_url):
    """Fetch a video (requesting its subtitles too) through yt_dlp.

    Returns a ``(video_path, subtitle_path)`` tuple built from the ``id`` and
    ``ext`` fields yt_dlp reports. ``subtitle_path`` is only the expected
    ``<id>.en.srt`` location; this function does not check that it exists.
    """
    output_template = os.path.join(VIDEO_DOWNLOAD_PATH, '%(id)s.%(ext)s')
    options = dict(
        outtmpl=output_template,
        writesubtitles=True,
        subtitlesformat='srt',
        format='best',
    )
    with yt_dlp.YoutubeDL(options) as downloader:
        metadata = downloader.extract_info(video_url, download=True)
        identifier = metadata.get("id")
        extension = metadata.get("ext")
        video_file = os.path.join(VIDEO_DOWNLOAD_PATH, f"{identifier}.{extension}")
        subtitle_file = os.path.join(VIDEO_DOWNLOAD_PATH, f"{identifier}.en.srt")
        return video_file, subtitle_file

In [15]:

def _extract_video_id(url):
    """Return the YouTube video id embedded in ``url``, or "" if none is found."""
    if 'youtube.com' in url:
        pieces = url.split('v=')
        # A youtube.com URL without a ``v=`` parameter carries no usable id.
        return pieces[1].split('&')[0] if len(pieces) > 1 else ""
    if 'youtu.be' in url:
        return url.split('/')[-1].split('?')[0]
    return ""


def _format_captions(raw_output):
    """Turn the scraper's raw stdout into one ``start:text,`` string."""
    cleaned = raw_output.strip()
    texts = re.findall(r'"text":"(.*?)"', cleaned)
    starts = re.findall(r'"start":"(.*?)"', cleaned)
    if len(texts) != len(starts):
        # Pairing stops at the shorter list; report it instead of failing.
        print(f"Caption/timestamp count mismatch: {len(texts)} texts vs {len(starts)} timestamps")
    return "".join(f"{start}:{text}," for start, text in zip(starts, texts))


def get_captions_with_time(url):
    """Scrape a YouTube video's captions and return them with timestamps.

    The video id is taken from ``url`` (``youtube.com`` URLs with ``v=`` or
    ``youtu.be`` short links) and passed to ``node ./caption-scraper.js``.
    Every ``"start":"..."`` / ``"text":"..."`` pair found in the scraper's
    stdout is emitted as ``start:text,``.

    Args:
        url: YouTube video URL.

    Returns:
        The concatenated ``start:text,`` string (empty if stdout contained no
        pairs), or ``None`` when no video id can be extracted, the subprocess
        cannot be run, it exits with a non-zero code, or it prints nothing.
        Failures are reported with ``print`` rather than raised.
    """
    video_id = _extract_video_id(url)
    if not video_id:
        # Don't launch the scraper with an empty argument.
        print(f"Could not extract a YouTube video id from: {url}")
        print("No captions in video")
        return None

    try:
        result = subprocess.run(
            ['node', './caption-scraper.js', video_id],
            capture_output=True,
            text=True,
            encoding='utf-8',
        )
    except (OSError, subprocess.SubprocessError, UnicodeError) as e:
        # e.g. node is not installed, or the output is not valid UTF-8.
        print(f"An error occurred while running the subprocess: {e}")
        print("No captions in video")
        return None

    if result.returncode != 0:
        # A failed scraper run must not be parsed as if it had succeeded.
        print(f"Subprocess returned a non-zero exit code: {result.returncode}")
        print(f"Subprocess stderr: {result.stderr}")
        print("No captions in video")
        return None

    if not result.stdout:
        print("No captions in video")
        return None

    return _format_captions(result.stdout)


In [59]:

def extract_frame(video_path, timestamps_seconds, output_dir=None):
    """Save one JPEG frame per timestamp from ``video_path`` using ffmpeg.

    Args:
        video_path: Path of the video file handed to ffmpeg via ``-i``.
        timestamps_seconds: Iterable of timestamps (numbers or numeric
            strings). Duplicates are dropped, keeping first-seen order.
        output_dir: Directory for the frames. Defaults to FRAME_OUTPUT_PATH.

    Returns:
        Path of the last frame requested, or ``None`` when no timestamps are
        given. Each frame is named ``frame_<whole seconds>.jpg``.
    """
    if output_dir is None:
        output_dir = FRAME_OUTPUT_PATH
    if output_dir:
        # ffmpeg does not create missing directories for its output file.
        os.makedirs(output_dir, exist_ok=True)

    unique_timestamps = list(dict.fromkeys(timestamps_seconds))  # drop duplicates, keep order

    output_frame = None  # stays None when there is nothing to extract
    for ts in unique_timestamps:
        # int(float(...)) also accepts fractional strings such as "4.56".
        output_frame = os.path.join(output_dir, f"frame_{int(float(ts))}.jpg")
        cmd = [
            "ffmpeg",
            "-ss", str(ts),   # Specify the start time
            "-i", video_path,
            "-vframes", "1",  # Number of frames to output
            "-q:v", "2",      # Quality
            "-y",             # Overwrite output files without asking
            output_frame
        ]
        completed = subprocess.run(cmd, capture_output=True)
        if completed.returncode != 0:
            # Report the failure instead of silently leaving the frame missing.
            print(f"ffmpeg failed for timestamp {ts} (exit code {completed.returncode})")
    return output_frame

In [16]:
# Video to analyse; the same URL is passed to the download step in a later cell.
url = 'https://youtu.be/v5wQFm4KfJQ?si=zGyPZfVzvgb17eXF'

# Captions as one "start:text," string; None when the scraper reports a failure.
main_text = get_captions_with_time(url)

0


In [17]:
# Prompt asking for the most important highlights, one "['text', 'timestamp']"
# list per line; the scraped captions (main_text) are interpolated at the end.
prompt = f'''provided the following captions of a YouTube video along with their respective timestamps, I want you to provide me with a list of most important\n highlights from these video captions. provide me with important highlight\'s text along with their respective timestamp.\n
Note you should provide output in the following format given below: \n
['highlight text 1', 'timestamp1']\n
['highlight text 2', 'timestamp2']\n
.\n
.\n
['highlight text N', 'timestampN']\n\n
The captions for the YouTube video are:\n
{main_text}'''

In [21]:
# Send the highlights prompt to the model and keep its reply.
# NOTE(review): `output` is whatever generate_content returns — presumably a
# response object whose text is read via `.text`, not a list of lines; confirm.

output = model.generate_content(prompt)

In [50]:
result_dict = {}

# `output` may be the model's response object (read its `.text`), a plain
# string, or an already-split list of lines; normalise all three to lines.
raw_output = getattr(output, "text", output)
highlight_lines = raw_output.splitlines() if isinstance(raw_output, str) else list(raw_output)

for line in highlight_lines:
    # Drop surrounding whitespace first so the brackets are actually at the ends.
    entry = line.strip().strip("[]")

    # Skip lines that do not carry both a description and a timestamp.
    if ", " not in entry:
        continue

    # Split on the LAST ", " so commas inside the description are preserved.
    description, timestamp = entry.rsplit(", ", 1)

    # Clean up the quotation marks around the description and timestamp.
    description = description.strip("'")
    timestamp = timestamp.strip("'")

    # Map highlight text -> timestamp (a repeated description keeps the last one).
    result_dict[description] = timestamp

# Print the resulting dictionary
print(result_dict)

{'why are Americans so unhappy with their leaders': '4.56', 'American Trust in their government has been on a decades long steady decline': '9.639', 'elections for many have become a matter of choosing the least bad option': '16.84', 'the candidate who will do the least damage to the United States': '21.16', 'many others now believe that the system itself is broken': '27.88', 'the only solution is for an outsider to come in and more or less break the system': '34.68', 'how did things get to be this way': '41.399', 'the United States has had its share of inspirational leaders': '43.28', 'leaders that were widely popular in their time': '44.76', 'who were seen to have positively contributed to the welfare of the American people': '49.079', 'and who went on to be venerated in American lore as one of the greats': '53.28', 'why did these leaders seem so few and far between': '57.879', 'the United States has hundreds of millions of citizens to choose from': '59.6', 'how can such a massive co

In [32]:
# Download the video; subtitle_path is only the expected "<id>.en.srt" location.
video_path, subtitle_path = download_video_and_subtitles(url)

[youtube] Extracting URL: https://youtu.be/v5wQFm4KfJQ?si=zGyPZfVzvgb17eXF
[youtube] v5wQFm4KfJQ: Downloading webpage
[youtube] v5wQFm4KfJQ: Downloading ios player API JSON
[youtube] v5wQFm4KfJQ: Downloading m3u8 information
[info] v5wQFm4KfJQ: Downloading 1 format(s): 18
[info] There are no subtitles for the requested languages
[download] Destination: video_downloads\v5wQFm4KfJQ.mp4
[download] 100% of   78.36MiB in 00:00:36 at 2.13MiB/s   


In [54]:
timestamps = []

# Convert every highlight timestamp to whole seconds, stored as a string.
# Values <= 1 second are shifted by 2, all others by 1; entries that cannot
# be parsed as numbers are skipped.
for raw_ts in result_dict.values():
    try:
        seconds = float(raw_ts.strip())
        whole_seconds = int(seconds)
    except ValueError:
        continue
    shift = 2 if seconds <= 1 else 1
    timestamps.append(str(whole_seconds + shift))

In [None]:
image_frames = []
image_filenames = []

# Walk every timestamp, recording it together with the frame file name
# derived from it; progress is printed on each step.
for ts in timestamps:
    image_frames.append(ts)
    print(image_frames)
    print("timeframe", ts)
    image_filenames.append(f"frame_{ts}.jpg")

In [None]:
# Extract one frame per collected timestamp; the returned path is not kept.
print("extracting image from", image_frames)
extract_frame(video_path, image_frames)

In [81]:
# Upload each extracted frame to Gemini and store its URI in result_dict,
# keyed by the frame's file name.

for frame_name in image_filenames:
    uploaded = upload_to_gemini(os.path.join(FRAME_OUTPUT_PATH, frame_name), mime_type="image/jpeg")
    result_dict[frame_name] = uploaded.uri

Uploaded file 'frame_5.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/rqziwebapw4y
Uploaded file 'frame_10.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/830guvh30mca
Uploaded file 'frame_17.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/sia2l9oogtv7
Uploaded file 'frame_22.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/mcwwvuf7ec8y
Uploaded file 'frame_28.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/dpe8b9gvndto
Uploaded file 'frame_35.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/zcv57f913zh9
Uploaded file 'frame_42.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/eze81u4418xe
Uploaded file 'frame_44.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/8302uiqd88e5
Uploaded file 'frame_45.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/jmnqn01ldiv1
Uploaded file 'frame_50.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/h4wy

In [84]:
import os

# Build the directory with os.path.join so the separator matches the host OS;
# a literal "static\\images" only resolves on Windows.
image_directory = os.path.join("static", "images")
image_files = os.listdir(image_directory)

# Map every file name in the directory to its path relative to the notebook.
image_dict = {
    image_file: os.path.join(image_directory, image_file)
    for image_file in image_files
}

print(image_dict)

{'frame_10.jpg': 'static\\images\\frame_10.jpg', 'frame_1004.jpg': 'static\\images\\frame_1004.jpg', 'frame_1007.jpg': 'static\\images\\frame_1007.jpg', 'frame_1015.jpg': 'static\\images\\frame_1015.jpg', 'frame_102.jpg': 'static\\images\\frame_102.jpg', 'frame_1025.jpg': 'static\\images\\frame_1025.jpg', 'frame_1031.jpg': 'static\\images\\frame_1031.jpg', 'frame_1036.jpg': 'static\\images\\frame_1036.jpg', 'frame_1039.jpg': 'static\\images\\frame_1039.jpg', 'frame_1046.jpg': 'static\\images\\frame_1046.jpg', 'frame_1050.jpg': 'static\\images\\frame_1050.jpg', 'frame_1059.jpg': 'static\\images\\frame_1059.jpg', 'frame_1068.jpg': 'static\\images\\frame_1068.jpg', 'frame_1071.jpg': 'static\\images\\frame_1071.jpg', 'frame_108.jpg': 'static\\images\\frame_108.jpg', 'frame_1080.jpg': 'static\\images\\frame_1080.jpg', 'frame_1084.jpg': 'static\\images\\frame_1084.jpg', 'frame_1089.jpg': 'static\\images\\frame_1089.jpg', 'frame_1091.jpg': 'static\\images\\frame_1091.jpg', 'frame_1097.jpg': '

In [6]:
image_files = []

print('Appending images in image_files...')
# Collect the image paths (the values of image_dict) in insertion order.
image_files.extend(image_dict.values())

Appending images in image_files...


In [7]:
# Show the image paths that will be uploaded.
print(image_files)

['static\\images\\frame_134.jpg', 'static\\images\\frame_166.jpg', 'static\\images\\frame_194.jpg', 'static\\images\\frame_8.jpg']


In [8]:
# Frame directory, re-declared here with the same value as the definition
# next to VIDEO_DOWNLOAD_PATH.
FRAME_OUTPUT_PATH = os.path.join("static","images")

In [9]:
# Upload every collected image and keep the returned file objects in order.
files = []
for image_path in image_files:
    files.append(upload_to_gemini(image_path, mime_type="image/jpeg"))


Uploaded file 'frame_134.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/8ljzdz2c8vzv
Uploaded file 'frame_166.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/2xl05du03pyy
Uploaded file 'frame_194.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/lsd4nce2wmsn
Uploaded file 'frame_8.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/7ku35veojbmq


In [66]:
# Print each uploaded file object to inspect its metadata.
for file in files:
    print(file)

genai.File({
    'name': 'files/ekomrdoifxsr',
    'display_name': 'frame_1.jpg',
    'mime_type': 'image/jpeg',
    'sha256_hash': 'ZjJkZDQ2OGM4MjIyYTNkNWJmYTBmMTYzMjhhYzRkNjBmMjk1ZjFlODdkNjNhODI1YzJiZDk1YWJmY2ZkNzJiMQ==',
    'size_bytes': '10415',
    'state': 'ACTIVE',
    'uri': 'https://generativelanguage.googleapis.com/v1beta/files/ekomrdoifxsr',
    'create_time': '2024-07-07T17:16:40.059077Z',
    'expiration_time': '2024-07-09T17:16:40.041061384Z',
    'update_time': '2024-07-07T17:16:40.059077Z'})
genai.File({
    'name': 'files/7y2259b9bl9j',
    'display_name': 'frame_1002.jpg',
    'mime_type': 'image/jpeg',
    'sha256_hash': 'ZWM5OWQxZDY5ZmZjMDBhMjY5ZWQ0NWU2MTk5ZWNhMjMxODMyYmRiNDE5Mzg0YmZhZmMyZjM5YmVhYWJhMGUzOA==',
    'size_bytes': '27490',
    'state': 'ACTIVE',
    'uri': 'https://generativelanguage.googleapis.com/v1beta/files/7y2259b9bl9j',
    'create_time': '2024-07-07T17:16:43.835915Z',
    'expiration_time': '2024-07-09T17:16:43.786828717Z',
    'update_time': '

In [10]:
# Open a chat whose history starts with a single user turn carrying the
# uploaded image files.
opening_turn = {"role": "user", "parts": files}
chat_session = model.start_chat(history=[opening_turn])

In [78]:
# Numbered key points stored as one newline-separated string.
# NOTE(review): not referenced by any other visible cell, and the topic differs
# from the captions output recorded above — presumably left from another run; confirm.
key_points = '1. Researchers trained an AI to predict heart conditions from eye images in 2018.\n2. Unexpectedly, this AI also learned to identify people\'s biological sex with very high accuracy.\n3. This highlights the black box nature of deep learning, where AIs learn complex relationships between input and output without explicitly being taught.\n4. Current AI models are extremely complex, with some using billions of parameters and multiple layers.\n5. These models are increasingly used in various sectors, including hiring, healthcare, criminal justice, and online recommendations.\n6. Concerns about the potential risks of such models to humanity highlight the importance of understanding their decision-making processes.\n7. Researchers like Chris Olah investigated the inner workings of these models by analyzing small groups of neurons.\n8. They discovered distinct parts of the network responsible for detecting various features, from simple shapes like curves and circles to objects like dog heads and cars.\n9. The researchers utilized feature visualization techniques to understand what individual neurons and channels are doing by generating images that maximize their activation.\n10. They found that while these techniques can reveal some insights, they don\'t always capture the full picture due to a phenomenon called polysemanticity, where neurons can track multiple features simultaneously.\n11. Further research aims to address these challenges by exploring how models become polysemantic and developing methods to discover patterns of neuron activation linked to specific features.\n12. The field of mechanistic interpretability is emerging as a key tool for understanding how AI systems make decisions and ensuring their responsible development and deployment.'

In [None]:
# image_response = chat_session.send_message("I have attached few images. Can you tell me what each image represents?")
# NOTE(review): an empty message is sent here, yet the recorded output is a
# list of image indices — presumably the prompt actually used was removed
# before saving; restore it so the cell is reproducible.
image_response = chat_session.send_message('')

In [80]:
# Show the text of the model's reply about the uploaded images.
print('Useful images', image_response.text)

Useful images [0, 1, 5, 6, 7, 8, 9, 10, 11] 

