In [1]:
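# pip install torch torchvision transformers opencv-python pandas ultralytics moviepy ollama pillow tqdm
# Note: this notebook imports `moviepy.editor`, which only exists in moviepy 1.x (install "moviepy<2" if the import fails).
# Download Ollama from its website: https://ollama.com/download
# Run in console: ollama pull llava:13b
#   (the code below requests the model tag 'llava'; pass 'llava:13b' as the model name if you want the 13b weights to be used)
# Run in console: ollama run llava
# Then run this notebook from top to bottom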

In [2]:
import ollama
import pandas as pd
from moviepy.editor import VideoFileClip
from PIL import Image
import io
from tqdm import tqdm

In [3]:
def extract_frames(video_path, frame_rate=1):
    """Sample frames from a video and return them as in-memory JPEG buffers.

    Every frame of the clip is iterated and one frame is kept every
    ``int(fps / frame_rate)`` frames, so ``frame_rate`` is the approximate
    number of frames kept per second of video (``0.02`` keeps roughly one
    frame every 50 seconds).

    Args:
        video_path: Path of the video file, passed to ``VideoFileClip``.
        frame_rate: Target number of sampled frames per second; must be > 0.
            Values above the clip's own fps keep every frame.

    Returns:
        A ``(frame_list, timestamps)`` tuple of equal-length lists.
        ``frame_list`` holds ``io.BytesIO`` buffers (rewound to position 0)
        containing the JPEG-encoded frames; ``timestamps`` holds each kept
        frame's time in seconds (frame index divided by fps).

    Raises:
        ValueError: If ``frame_rate`` is not positive, or the clip reports
            no frame rate.
    """
    if frame_rate <= 0:
        raise ValueError(f"frame_rate must be positive, got {frame_rate!r}")

    clip = VideoFileClip(video_path)
    try:
        fps = clip.fps
        if not fps:
            raise ValueError(f"could not determine fps for {video_path!r}")

        # Clamp to 1: when frame_rate exceeds fps the integer step would be 0
        # and the modulo below would raise ZeroDivisionError.
        step = max(1, int(fps / frame_rate))

        frame_list = []
        timestamps = []
        for index, frame in enumerate(clip.iter_frames()):
            if index % step:
                continue
            # Encode the frame (numpy array) as JPEG into an in-memory buffer.
            buffer = io.BytesIO()
            Image.fromarray(frame).save(buffer, format="JPEG")
            buffer.seek(0)  # rewind so consumers can read from the start
            frame_list.append(buffer)
            timestamps.append(index / fps)
    finally:
        # Release the reader held by the clip even if decoding fails.
        clip.close()

    return frame_list, timestamps

In [4]:
def detect_objects(input, model='llava',
                   prompt='Describe all objects seen in this image'):
    """Ask a vision model to describe the objects in each image.

    One ``ollama.chat`` request is sent per image, each consisting of a
    single user message carrying ``prompt`` and that image.

    Args:
        input: Iterable of images. Items exposing ``getvalue()`` (such as
            ``io.BytesIO``) are converted to raw bytes; anything else is
            passed to the client unchanged. (The name shadows the builtin
            but is kept so existing keyword callers keep working.)
        model: Name of the Ollama model to query.
        prompt: Text of the user message sent with every image.

    Returns:
        A list with the ``['message']['content']`` of each reply, in input
        order. Empty input yields an empty list.
    """
    message_list = []

    # Use tqdm for progress bar
    for image in tqdm(input, desc="Detecting Objects in Images"):
        # Send raw bytes rather than a stream: getvalue() does not move the
        # buffer position, so the same frames can be reused by another pass.
        # NOTE(review): whether the client accepts file-like objects at all
        # appears to depend on the installed ollama version - confirm.
        payload = image.getvalue() if hasattr(image, "getvalue") else image
        res = ollama.chat(
            model=model,
            messages=[
                {
                    'role': 'user',
                    'content': prompt,
                    'images': [payload]
                }
            ]
        )
        message_list.append(res['message']['content'])

    return message_list


In [5]:
def theme_detection(input, model='llava',
                    prompt='Describe the theme of this image'):
    """Ask a vision model to describe the theme of each image.

    One ``ollama.chat`` request is sent per image, each consisting of a
    single user message carrying ``prompt`` and that image.

    Args:
        input: Iterable of images. Items exposing ``getvalue()`` (such as
            ``io.BytesIO``) are converted to raw bytes; anything else is
            passed to the client unchanged. (The name shadows the builtin
            but is kept so existing keyword callers keep working.)
        model: Name of the Ollama model to query.
        prompt: Text of the user message sent with every image.

    Returns:
        A list with the ``['message']['content']`` of each reply, in input
        order. Empty input yields an empty list.
    """
    message_list = []

    # Use tqdm for progress bar
    for image in tqdm(input, desc="Processing Images"):
        # Send raw bytes rather than a stream: getvalue() does not move the
        # buffer position, so the same frames can be reused by another pass.
        # NOTE(review): whether the client accepts file-like objects at all
        # appears to depend on the installed ollama version - confirm.
        payload = image.getvalue() if hasattr(image, "getvalue") else image
        res = ollama.chat(
            model=model,
            messages=[
                {
                    'role': 'user',
                    'content': prompt,
                    'images': [payload]
                }
            ]
        )
        message_list.append(res['message']['content'])

    return message_list


In [None]:
def analyze_video(video_path, frame_rate=0.02, include_themes=False):
    """Sample frames from a video, describe them, and tabulate the results.

    Args:
        video_path: Path of the video file to analyze.
        frame_rate: Frames to sample per second of video, forwarded to
            ``extract_frames``. The default ``0.02`` samples roughly one
            frame every 50 seconds; ``1`` samples about one frame per second.
        include_themes: When true, also run ``theme_detection`` on the same
            frames and add its output as a "Theme Embeddings" column.

    Returns:
        A ``pandas.DataFrame`` with one row per sampled frame and the columns
        "Timestamp" and "Detected Objects" (plus "Theme Embeddings" when
        ``include_themes`` is set).
    """
    print(f"Extracting frames from video ({video_path})...")
    frames, timestamps = extract_frames(video_path, frame_rate=frame_rate)

    print("Running object detection...")
    object_data = detect_objects(frames)

    # Create a DataFrame with results
    data = {
        "Timestamp": timestamps,
        "Detected Objects": object_data
    }

    if include_themes:
        # The first pass may have read file-like frames to the end; rewind
        # them so the second pass sees the full image data.
        # NOTE(review): depends on how the ollama client consumes streams.
        for frame in frames:
            if hasattr(frame, "seek"):
                frame.seek(0)
        print("Running theme detection...")
        data["Theme Embeddings"] = theme_detection(frames)

    return pd.DataFrame(data)

In [7]:
# Point video_path at the .mp4 file to analyze, then run the full pipeline on it.
video_path = "Test.mp4"
df_results = analyze_video(video_path=video_path)

# Show the per-frame results table.
print(df_results)

Extracting frames from video (Test.mp4)...
Running object detection...


Detecting Objects in Images: 100%|██████████| 11/11 [31:34<00:00, 172.26s/it]


     Timestamp                                   Detected Objects
0     0.000000   The image is quite small and low-resolution, ...
1    49.983267   The image shows a person with their arm raise...
2    99.966533   In the image, there is a person sitting at a ...
3   149.949800   The image shows a person using a smartwatch w...
4   199.933067   In the image, there is a person holding a sma...
5   249.916333   The image shows a person seated and looking d...
6   299.899600   In the image, there is a person giving a thum...
7   349.882867   The image is a screenshot from a video, likel...
8   399.866133   The image appears to be a screenshot from a v...
9   449.849400   The image shows a person sitting at a desk wi...
10  499.832667   In the image, we see a man standing indoors. ...


In [8]:
# Save the results table to CSV, leaving out the DataFrame index column.
df_results.to_csv('VideoResults2.csv', index=False)