In [2]:
import re

import pandas as pd
# Word table read from CSV; a later cell ranks it by its 'Value' column and
# takes the words themselves from its 'Item' column.
df = pd.read_csv('output.csv')
# Transcript table read from JSON; later cells use its 'text', 'start' and
# 'duration' columns.
df_json = pd.read_json('output.json')

In [3]:
# Preview the transcript table that was loaded from output.json
df_json

Unnamed: 0,text,start,duration
0,okay let me guess quesadilla with soy,0.160,3.680
1,cheese for the lactose intolerant,2.200,3.880
2,Leonard thank you shrimp Caesar salad,3.840,4.240
3,with no almonds for the highly allergic,6.080,4.800
4,kosher only on the high holidays Howard,8.080,4.400
...,...,...,...
906,about you so,2566.640,4.560
907,much hello some of us need to check our,2568.680,4.399
908,hair because we might have a shot with,2571.200,5.399
909,Ward's mother,2573.079,3.520


In [4]:
# Get the top 5 words from df based on the 'Value' column
top_5_words = df.nlargest(5, 'Value')['Item']

# Build ONE whole-word, case-insensitive pattern per word and use it for both
# the counts and the occurrence lookup, so the two results can never disagree.
# - re.escape keeps regex metacharacters inside a word literal.
# - str() tolerates non-string values in the 'Item' column.
# - The lookarounds behave like \b for ordinary words but also work when a
#   word starts or ends with a non-alphanumeric character.
word_patterns = {
    word: rf'(?<!\w){re.escape(str(word))}(?!\w)'
    for word in top_5_words
}

# Rows of df_json whose 'text' contains each word (rows with missing text never match)
word_matches = {
    word: df_json[df_json['text'].str.contains(pattern, case=False, na=False)]
    for word, pattern in word_patterns.items()
}

# Number of transcript rows mentioning each word.
# Only the last expression of a cell is rendered, so inspect `word_counts`
# in its own cell if the counts are needed on screen.
word_counts = {word: len(matches) for word, matches in word_matches.items()}

# Occurrences of each word, including the row's timestamp and duration
word_occurrences = {
    word: matches[['text', 'start', 'duration']].to_dict('records')
    for word, matches in word_matches.items()
}

# Display the occurrences
word_occurrences

{'water': [{'text': 'stayed for the California Venture water',
   'start': 845.639,
   'duration': 5.12},
  {'text': 'bodies they can draw water in through',
   'start': 1131.2,
   'duration': 4.3}],
 'babies': [{'text': 'he said to me that their babies would be',
   'start': 2518.68,
   'duration': 4.76}],
 'story': [{'text': "story there's a guy I liked and I never",
   'start': 1786.48,
   'duration': 4.319}],
 'birthday': [{'text': 'birthday party and for another I told',
   'start': 672.12,
   'duration': 3.6},
  {'text': "birthday today's your birthday yes what",
   'start': 1187.88,
   'duration': 7.159},
  {'text': 'birthday in a', 'start': 2542.88, 'duration': 6.239}],
 'molecular': [{'text': 'symposium on molecular patronum I think',
   'start': 587.12,
   'duration': 4.64}]}

In [5]:
# Padding subtracted from / added to an occurrence's `start` when the clip
# window is built in the next cell. Same units as the transcript's `start`
# column (presumably seconds — confirm against the transcript source).
time_range = 15

In [6]:
from math import floor, ceil

# Build a clip window around every occurrence of every word, then snap the
# window outward to the nearest caption start times found in df_json.
caption_starts = df_json['start']
adjusted_timestamps = {}

for word, occurrences in word_occurrences.items():
    windows = []
    for occurrence in occurrences:
        start_time = occurrence['start']
        # Never let the window begin before time 0: an occurrence closer than
        # `time_range` to the beginning would otherwise yield a negative bound
        # that is handed to the video cutter as-is.
        lower_bound = max(start_time - time_range, 0.0)
        upper_bound = start_time + time_range

        # Filter once per side instead of once for the emptiness check and
        # again for the aggregate.
        starts_at_or_before = caption_starts[caption_starts <= lower_bound]
        starts_at_or_after = caption_starts[caption_starts >= upper_bound]

        # Snap to the closest caption boundary outside the raw window; when no
        # caption lies on that side, keep the raw bound.
        nearest_floor = starts_at_or_before.max() if not starts_at_or_before.empty else lower_bound
        nearest_ceil = starts_at_or_after.min() if not starts_at_or_after.empty else upper_bound

        windows.append({
            'original_start': start_time,
            'lower_bound': nearest_floor,
            'upper_bound': nearest_ceil
        })
    adjusted_timestamps[word] = windows

# Display the adjusted timestamps
adjusted_timestamps

{'water': [{'original_start': 845.639,
   'lower_bound': 830.04,
   'upper_bound': 861.48},
  {'original_start': 1131.2, 'lower_bound': 1114.24, 'upper_bound': 1147.08}],
 'babies': [{'original_start': 2518.68,
   'lower_bound': 2501.0,
   'upper_bound': 2536.319}],
 'story': [{'original_start': 1786.48,
   'lower_bound': 1769.88,
   'upper_bound': 1804.08}],
 'birthday': [{'original_start': 672.12,
   'lower_bound': 655.279,
   'upper_bound': 687.48},
  {'original_start': 1187.88, 'lower_bound': 1171.28, 'upper_bound': 1204.36},
  {'original_start': 2542.88, 'lower_bound': 2527.76, 'upper_bound': 2558.16}],
 'molecular': [{'original_start': 587.12,
   'lower_bound': 572.079,
   'upper_bound': 603.12}]}

In [7]:
from moviepy.video.io.VideoFileClip import VideoFileClip
import os
import yt_dlp

In [8]:
def download_youtube_video(url, output_dir='.'):
    """Download ``url`` with yt_dlp and return the path the file was saved under.

    The file is written to ``output_dir`` as ``source_video.<ext>``.

    Args:
        url: Address of the video to download.
        output_dir: Directory the file is written to (default: current directory).

    Returns:
        The path ``<output_dir>/source_video.<ext>``, where ``<ext>`` is the
        ``'ext'`` entry of the metadata returned by yt_dlp (``'mp4'`` when that
        entry is absent), or ``None`` if anything raised along the way. In the
        failure case the error message and traceback are printed, not raised.
    """
    output_template = os.path.join(output_dir, 'source_video.%(ext)s')
    options = {'format': 'best', 'outtmpl': output_template}

    try:
        with yt_dlp.YoutubeDL(options) as downloader:
            # download=True fetches the media in addition to the metadata
            metadata = downloader.extract_info(url, download=True)
            # Rebuild the saved file's name from the reported extension
            extension = metadata.get('ext', 'mp4')
            return os.path.join(output_dir, f'source_video.{extension}')
    except Exception as e:
        import traceback
        print(f"Error downloading video: {str(e)}")
        traceback.print_exc()
        return None

In [9]:
def create_trimmed_videos(source_path, timestamps_dict, output_dir='.'):
    """Cut one MP4 clip out of the source video for every timestamp window.

    Args:
        source_path: Path of the video file to cut from.
        timestamps_dict: Mapping ``word -> list of windows``; each window is a
            dict with ``'lower_bound'`` and ``'upper_bound'`` entries giving
            the clip's start and end time.
        output_dir: Directory the clips are written to, named
            ``trimmed_video_<word>_<n>.mp4`` with ``n`` starting at 1.

    Returns:
        None. Errors are printed together with their traceback rather than
        raised; the first error stops the processing of the remaining windows.
    """
    video = None
    try:
        # Load the source video once
        video = VideoFileClip(source_path)
        duration = video.duration

        for word, timestamps in timestamps_dict.items():
            for i, timestamp in enumerate(timestamps):
                # Clamp the window into [0, duration]: windows built around an
                # occurrence near either end of the video can overshoot, and
                # the cutter is not expected to treat out-of-range times as
                # "clip to the edge" (see MoviePy's Clip.subclipped docs).
                start_time = max(timestamp['lower_bound'], 0.0)
                end_time = timestamp['upper_bound']
                if duration is not None:
                    end_time = min(end_time, duration)

                if end_time <= start_time:
                    # Nothing left to cut after clamping; skip this window
                    # instead of aborting all remaining ones.
                    print(f"Skipping empty window for {word} #{i + 1}: {start_time} - {end_time}")
                    continue

                output_file = os.path.join(output_dir, f"trimmed_video_{word}_{i + 1}.mp4")

                # Create subclip
                trimmed_video = video.subclipped(start_time, end_time)
                try:
                    trimmed_video.write_videofile(output_file, codec="libx264", audio_codec="aac")
                finally:
                    # Release the subclip even when encoding fails
                    trimmed_video.close()

                print(f"Created {output_file}")

    except Exception as e:
        print(f"Error creating trimmed videos: {str(e)}")
        import traceback
        traceback.print_exc()

    finally:
        # Always release the source video. A handle left open after a failure
        # can prevent the caller from deleting the source file afterwards.
        if video is not None:
            video.close()

In [10]:
youtube_url = 'https://www.youtube.com/watch?v=dLuQ1wSJACU'
output_directory = '.'

# Download the source video once; a falsy result means the download failed
source_video_path = download_youtube_video(youtube_url, output_directory)

if source_video_path:
    create_trimmed_videos(source_video_path, adjusted_timestamps, output_directory)

    # The returned path is rebuilt from the reported extension rather than
    # verified on disk, so confirm the file exists before deleting it.
    # Otherwise a missing file would raise FileNotFoundError here.
    if os.path.exists(source_video_path):
        os.remove(source_video_path)
        print(f"Removed temporary source file: {source_video_path}")

[youtube] Extracting URL: https://www.youtube.com/watch?v=dLuQ1wSJACU
[youtube] dLuQ1wSJACU: Downloading webpage
[youtube] dLuQ1wSJACU: Downloading tv client config
[youtube] dLuQ1wSJACU: Downloading player 69f581a5
[youtube] dLuQ1wSJACU: Downloading tv player API JSON
[youtube] dLuQ1wSJACU: Downloading ios player API JSON
[youtube] dLuQ1wSJACU: Downloading m3u8 information
[info] dLuQ1wSJACU: Downloading 1 format(s): 18
[download] source_video.mp4 has already been downloaded
[download] 100% of  137.92MiB
{'video_found': True, 'audio_found': True, 'metadata': {'major_brand': 'mp42', 'minor_version': '0', 'compatible_brands': 'isommp42', 'encoder': 'Google'}, 'inputs': [{'streams': [{'input_number': 0, 'stream_number': 0, 'stream_type': 'video', 'language': None, 'default': True, 'size': [640, 360], 'bitrate': 348, 'fps': 23.976023976023978, 'codec_name': 'h264', 'profile': '(Main)', 'metadata': {'Metadata': '', 'handler_name': 'ISO Media file produced by Google Inc.', 'vendor_id': '[0]

                                                                    

MoviePy - Done.
MoviePy - Writing video .\trimmed_video_water_1.mp4



                                                                         

MoviePy - Done !
MoviePy - video ready .\trimmed_video_water_1.mp4
Created .\trimmed_video_water_1.mp4
Proc not detected
{'video_found': True, 'audio_found': True, 'metadata': {'major_brand': 'mp42', 'minor_version': '0', 'compatible_brands': 'isommp42', 'encoder': 'Google'}, 'inputs': [{'streams': [{'input_number': 0, 'stream_number': 0, 'stream_type': 'video', 'language': None, 'default': True, 'size': [640, 360], 'bitrate': 348, 'fps': 23.976023976023978, 'codec_name': 'h264', 'profile': '(Main)', 'metadata': {'Metadata': '', 'handler_name': 'ISO Media file produced by Google Inc.', 'vendor_id': '[0][0][0][0]'}}, {'input_number': 0, 'stream_number': 1, 'stream_type': 'audio', 'language': 'eng', 'default': True, 'fps': 44100, 'bitrate': 95, 'metadata': {'Metadata': '', 'handler_name': 'ISO Media file produced by Google Inc.', 'vendor_id': '[0][0][0][0]'}}], 'input_number': 0}], 'duration': 2588.82, 'bitrate': 446, 'start': 0.0, 'default_video_input_number': 0, 'default_video_stream_n

                                                                   

MoviePy - Done.
MoviePy - Writing video .\trimmed_video_water_2.mp4



                                                                         

MoviePy - Done !
MoviePy - video ready .\trimmed_video_water_2.mp4
Created .\trimmed_video_water_2.mp4
Proc not detected
{'video_found': True, 'audio_found': True, 'metadata': {'major_brand': 'mp42', 'minor_version': '0', 'compatible_brands': 'isommp42', 'encoder': 'Google'}, 'inputs': [{'streams': [{'input_number': 0, 'stream_number': 0, 'stream_type': 'video', 'language': None, 'default': True, 'size': [640, 360], 'bitrate': 348, 'fps': 23.976023976023978, 'codec_name': 'h264', 'profile': '(Main)', 'metadata': {'Metadata': '', 'handler_name': 'ISO Media file produced by Google Inc.', 'vendor_id': '[0][0][0][0]'}}, {'input_number': 0, 'stream_number': 1, 'stream_type': 'audio', 'language': 'eng', 'default': True, 'fps': 44100, 'bitrate': 95, 'metadata': {'Metadata': '', 'handler_name': 'ISO Media file produced by Google Inc.', 'vendor_id': '[0][0][0][0]'}}], 'input_number': 0}], 'duration': 2588.82, 'bitrate': 446, 'start': 0.0, 'default_video_input_number': 0, 'default_video_stream_n

                                                                    

MoviePy - Done.
MoviePy - Writing video .\trimmed_video_babies_1.mp4



                                                                         

MoviePy - Done !
MoviePy - video ready .\trimmed_video_babies_1.mp4
Created .\trimmed_video_babies_1.mp4
Proc not detected
{'video_found': True, 'audio_found': True, 'metadata': {'major_brand': 'mp42', 'minor_version': '0', 'compatible_brands': 'isommp42', 'encoder': 'Google'}, 'inputs': [{'streams': [{'input_number': 0, 'stream_number': 0, 'stream_type': 'video', 'language': None, 'default': True, 'size': [640, 360], 'bitrate': 348, 'fps': 23.976023976023978, 'codec_name': 'h264', 'profile': '(Main)', 'metadata': {'Metadata': '', 'handler_name': 'ISO Media file produced by Google Inc.', 'vendor_id': '[0][0][0][0]'}}, {'input_number': 0, 'stream_number': 1, 'stream_type': 'audio', 'language': 'eng', 'default': True, 'fps': 44100, 'bitrate': 95, 'metadata': {'Metadata': '', 'handler_name': 'ISO Media file produced by Google Inc.', 'vendor_id': '[0][0][0][0]'}}], 'input_number': 0}], 'duration': 2588.82, 'bitrate': 446, 'start': 0.0, 'default_video_input_number': 0, 'default_video_stream

                                                                   

MoviePy - Done.
MoviePy - Writing video .\trimmed_video_story_1.mp4



                                                                         

MoviePy - Done !
MoviePy - video ready .\trimmed_video_story_1.mp4
Created .\trimmed_video_story_1.mp4
Proc not detected
{'video_found': True, 'audio_found': True, 'metadata': {'major_brand': 'mp42', 'minor_version': '0', 'compatible_brands': 'isommp42', 'encoder': 'Google'}, 'inputs': [{'streams': [{'input_number': 0, 'stream_number': 0, 'stream_type': 'video', 'language': None, 'default': True, 'size': [640, 360], 'bitrate': 348, 'fps': 23.976023976023978, 'codec_name': 'h264', 'profile': '(Main)', 'metadata': {'Metadata': '', 'handler_name': 'ISO Media file produced by Google Inc.', 'vendor_id': '[0][0][0][0]'}}, {'input_number': 0, 'stream_number': 1, 'stream_type': 'audio', 'language': 'eng', 'default': True, 'fps': 44100, 'bitrate': 95, 'metadata': {'Metadata': '', 'handler_name': 'ISO Media file produced by Google Inc.', 'vendor_id': '[0][0][0][0]'}}], 'input_number': 0}], 'duration': 2588.82, 'bitrate': 446, 'start': 0.0, 'default_video_input_number': 0, 'default_video_stream_n

                                                                    

MoviePy - Done.
MoviePy - Writing video .\trimmed_video_birthday_1.mp4



                                                                         

MoviePy - Done !
MoviePy - video ready .\trimmed_video_birthday_1.mp4
Created .\trimmed_video_birthday_1.mp4
Proc not detected
{'video_found': True, 'audio_found': True, 'metadata': {'major_brand': 'mp42', 'minor_version': '0', 'compatible_brands': 'isommp42', 'encoder': 'Google'}, 'inputs': [{'streams': [{'input_number': 0, 'stream_number': 0, 'stream_type': 'video', 'language': None, 'default': True, 'size': [640, 360], 'bitrate': 348, 'fps': 23.976023976023978, 'codec_name': 'h264', 'profile': '(Main)', 'metadata': {'Metadata': '', 'handler_name': 'ISO Media file produced by Google Inc.', 'vendor_id': '[0][0][0][0]'}}, {'input_number': 0, 'stream_number': 1, 'stream_type': 'audio', 'language': 'eng', 'default': True, 'fps': 44100, 'bitrate': 95, 'metadata': {'Metadata': '', 'handler_name': 'ISO Media file produced by Google Inc.', 'vendor_id': '[0][0][0][0]'}}], 'input_number': 0}], 'duration': 2588.82, 'bitrate': 446, 'start': 0.0, 'default_video_input_number': 0, 'default_video_st

                                                                   

MoviePy - Done.
MoviePy - Writing video .\trimmed_video_birthday_2.mp4



                                                                         

MoviePy - Done !
MoviePy - video ready .\trimmed_video_birthday_2.mp4
Created .\trimmed_video_birthday_2.mp4
Proc not detected
{'video_found': True, 'audio_found': True, 'metadata': {'major_brand': 'mp42', 'minor_version': '0', 'compatible_brands': 'isommp42', 'encoder': 'Google'}, 'inputs': [{'streams': [{'input_number': 0, 'stream_number': 0, 'stream_type': 'video', 'language': None, 'default': True, 'size': [640, 360], 'bitrate': 348, 'fps': 23.976023976023978, 'codec_name': 'h264', 'profile': '(Main)', 'metadata': {'Metadata': '', 'handler_name': 'ISO Media file produced by Google Inc.', 'vendor_id': '[0][0][0][0]'}}, {'input_number': 0, 'stream_number': 1, 'stream_type': 'audio', 'language': 'eng', 'default': True, 'fps': 44100, 'bitrate': 95, 'metadata': {'Metadata': '', 'handler_name': 'ISO Media file produced by Google Inc.', 'vendor_id': '[0][0][0][0]'}}], 'input_number': 0}], 'duration': 2588.82, 'bitrate': 446, 'start': 0.0, 'default_video_input_number': 0, 'default_video_st

                                                                    

MoviePy - Done.
MoviePy - Writing video .\trimmed_video_birthday_3.mp4



                                                                         

MoviePy - Done !
MoviePy - video ready .\trimmed_video_birthday_3.mp4
Created .\trimmed_video_birthday_3.mp4
Proc not detected
{'video_found': True, 'audio_found': True, 'metadata': {'major_brand': 'mp42', 'minor_version': '0', 'compatible_brands': 'isommp42', 'encoder': 'Google'}, 'inputs': [{'streams': [{'input_number': 0, 'stream_number': 0, 'stream_type': 'video', 'language': None, 'default': True, 'size': [640, 360], 'bitrate': 348, 'fps': 23.976023976023978, 'codec_name': 'h264', 'profile': '(Main)', 'metadata': {'Metadata': '', 'handler_name': 'ISO Media file produced by Google Inc.', 'vendor_id': '[0][0][0][0]'}}, {'input_number': 0, 'stream_number': 1, 'stream_type': 'audio', 'language': 'eng', 'default': True, 'fps': 44100, 'bitrate': 95, 'metadata': {'Metadata': '', 'handler_name': 'ISO Media file produced by Google Inc.', 'vendor_id': '[0][0][0][0]'}}], 'input_number': 0}], 'duration': 2588.82, 'bitrate': 446, 'start': 0.0, 'default_video_input_number': 0, 'default_video_st

                                                                   

MoviePy - Done.
MoviePy - Writing video .\trimmed_video_molecular_1.mp4



                                                                         

MoviePy - Done !
MoviePy - video ready .\trimmed_video_molecular_1.mp4
Created .\trimmed_video_molecular_1.mp4
Removed temporary source file: .\source_video.mp4
