## 1. Transcribing the videos
 ([Go to top](#Capstone-8:-Bringing-It-All-Together))

Use this section to implement your solution to transcribe the videos. 

In [None]:
!pip install moviepy SpeechRecognition mutagen

In [2]:
prefix = 'CUR-TF-200-ACMNLP-1/video/'
bucket_name = 'your-bucket-name'

In [3]:
import boto3
import io
import os
from moviepy.editor import VideoFileClip
import speech_recognition as sr
from urllib.request import urlopen

Matplotlib is building the font cache; this may take a moment.


In [4]:
def generate_presigned_urls(s3_client, bucket_name, prefix, external_bucket=False):

    response = s3_client.list_objects_v2(Bucket = bucket_name, Prefix=prefix)
    presigned_urls = []
    obj_keys = []

    for item in response.get('Contents', []):
        file_key = item['Key']
        presigned_url = s3_client.generate_presigned_url(
            ClientMethod='get_object',
            Params={'Bucket': bucket_name, 'Key': file_key},
            ExpiresIn=3600
            )
        presigned_urls.append(presigned_url)
        obj_keys.append(file_key)
    # obj_keys.pop(0)
    # presigned_urls.pop(0)
    return [presigned_urls, obj_keys]

# CONVERTING MP4 TO WAV

In [None]:
s3_client = boto3.client('s3')
presigned_urls_mp4, obj_keys_mp4 = generate_presigned_urls(s3_client, bucket_name, prefix)
print(presigned_urls_mp4)

In [6]:
def mp4_to_wav(url, obj_key):
    obj_key = obj_key.replace(' ', '_')
    obj_key = obj_key.split('/')
    # print(obj_key)
    output_filename = f'audio/{obj_key[-1][:-4]}.wav'
    # print(output_filename)
    video_clip = VideoFileClip(url)
    video_clip.audio.write_audiofile(output_filename)
    print(f"Successfully converted {obj_key} to {output_filename}")

In [None]:
obj_count = 0
for url in presigned_urls_mp4:
    mp4_to_wav(url, obj_keys_mp4[obj_count])
    obj_count += 1

In [None]:
mp4_to_wav(presigned_urls_mp4[0], obj_keys_mp4[0])

In [None]:
!pip install torch transformers

In [2]:
import torch
from transformers import pipeline
import os

whisper = pipeline("automatic-speech-recognition", "openai/whisper-large-v2", device="cuda:0")

config.json:   0%|          | 0.00/1.99k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.17G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/4.29k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

In [5]:
def transcribe_audio_folder(audio_folder_path="audio", output_folder_path="subtitles"):

    os.makedirs(output_folder_path, exist_ok=True)

    audio_files = [os.path.join(audio_folder_path, f) for f in os.listdir(audio_folder_path) if f.endswith((".mp3", ".wav"))]

    for audio_file in audio_files:
        filename, _ = os.path.splitext(os.path.basename(audio_file))
        output_filename = os.path.join(output_folder_path, f"{filename}.txt")

        try:
            transcription = whisper(audio_file, chunk_length_s=30)

            with open(output_filename, "w") as transcript_file:
                transcript_file.write(transcription["text"])
            print(f"Transcribed: {audio_file} to {output_filename}")
        except Exception as e:
            print(f"Error transcribing {audio_file}: {e}")

In [6]:
transcribe_audio_folder()

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


Transcribed: audio/Mod02_Sect02.wav to subtitles/Mod02_Sect02.txt
Transcribed: audio/Mod03_Sect04_part1.wav to subtitles/Mod03_Sect04_part1.txt
Transcribed: audio/Mod06_WrapUp.wav to subtitles/Mod06_WrapUp.txt
Transcribed: audio/Mod06_Sect01.wav to subtitles/Mod06_Sect01.txt
Transcribed: audio/Mod03_Sect02_part3.wav to subtitles/Mod03_Sect02_part3.txt
Transcribed: audio/Mod03_Sect03_part1.wav to subtitles/Mod03_Sect03_part1.txt
Transcribed: audio/Mod03_Sect03_part2.wav to subtitles/Mod03_Sect03_part2.txt
Transcribed: audio/Mod05_Sect03_part2.wav to subtitles/Mod05_Sect03_part2.txt
Transcribed: audio/Mod01_Course_Overview.wav to subtitles/Mod01_Course_Overview.txt
Transcribed: audio/Mod02_Sect04.wav to subtitles/Mod02_Sect04.txt


--- Logging error ---
Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/logging/__init__.py", line 1100, in emit
    msg = self.format(record)
  File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/logging/__init__.py", line 943, in format
    return fmt.format(record)
  File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/logging/__init__.py", line 678, in format
    record.message = record.getMessage()
  File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/logging/__init__.py", line 368, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/ec2-use

Transcribed: audio/Mod04_WrapUp.wav to subtitles/Mod04_WrapUp.txt
Transcribed: audio/Mod02_Sect05.wav to subtitles/Mod02_Sect05.txt
Transcribed: audio/Mod03_Sect08.wav to subtitles/Mod03_Sect08.txt
Transcribed: audio/Mod04_Sect02_part1.wav to subtitles/Mod04_Sect02_part1.txt
Transcribed: audio/Mod05_Sect02_part1_ver2.wav to subtitles/Mod05_Sect02_part1_ver2.txt
Transcribed: audio/Mod07_Sect01.wav to subtitles/Mod07_Sect01.txt
Transcribed: audio/Mod03_Sect02_part1.wav to subtitles/Mod03_Sect02_part1.txt
Transcribed: audio/Mod06_Sect02.wav to subtitles/Mod06_Sect02.txt
Transcribed: audio/Mod05_Sect03_part4_ver2.wav to subtitles/Mod05_Sect03_part4_ver2.txt
Transcribed: audio/Mod02_Intro.wav to subtitles/Mod02_Intro.txt
Transcribed: audio/Mod03_Sect06.wav to subtitles/Mod03_Sect06.txt
Error transcribing audio/.wav: Soundfile is either not in the correct format or is malformed. Ensure that the soundfile has a valid audio file extension (e.g. wav, flac or mp3) and is not corrupted. If readin

## 2. Normalizing the text
([Go to top](#Capstone-8:-Bringing-It-All-Together))

Use this section to perform any text normalization steps that are necessary for your solution.

In [None]:
!pip install nltk numpy spacy

In [None]:
!python -m spacy download en_core_web_sm

In [3]:
import nltk
from nltk import FreqDist
from collections import Counter
import numpy as np
import spacy
import os

In [5]:
def preprocess_text(text_file_path):

    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)

    with open(text_file_path, 'r') as f:
        text = f.read()

    normalized_text = text.lower()
    normalized_text = ''.join([c for c in normalized_text if c.isalnum() or c.isspace()])
    stopwords = nltk.corpus.stopwords.words('english')
    normalized_text = ' '.join([word for word in normalized_text.split() if word not in stopwords])

    return normalized_text

## 3. Extracting key phrases and topics
([Go to top](#Capstone-8:-Bringing-It-All-Together))

Use this section to extract the key phrases and topics from the videos.

In [6]:
def tf_idf(text):

    word_counts = FreqDist(text.split())

    total_words = len(text.split())

    documents = [text]

    tf_idf_scores = {}

    for word, count in word_counts.items():
        tf = count / total_words 
        df = 0

        for doc in documents:
            if word in doc:
                df += 1

        idf = np.log((len(documents) / (df + 1)))

        tf_idf_scores[word] = tf * idf

    return tf_idf_scores

In [7]:
def extract_key_phrases(text):

    nlp = spacy.load('en_core_web_sm')  # Load spaCy model
    doc = nlp(text)
    phrases = []
    for noun_phrase in doc.noun_chunks:
        phrases.append(noun_phrase.text)
    return phrases

In [8]:
def analyze_subtitles(subtitles_folder="subtitles"):
    all_results = {}
    for filename in os.listdir(subtitles_folder):
        if filename.endswith(".txt"):
            file_path = os.path.join(subtitles_folder, filename)
            normalized_text = preprocess_text(file_path)
            tf_idf_scores = tf_idf(normalized_text)
            top_10_keywords = FreqDist(tf_idf_scores).most_common(10)  
            key_phrases = extract_key_phrases(normalized_text)

            all_results[filename] = {
                  "top_keywords": top_10_keywords,
                  "key_phrases": key_phrases
              }

    return all_results

In [9]:
all_analysis_results = analyze_subtitles()

In [11]:
def search_videos(all_results, search_query):

    matching_videos = []
    query_keywords = [word.strip() for word in search_query.split(",") if word.strip()]
    query_phrases = [phrase.strip() for phrase in search_query.split(",") if phrase.strip()]

    for filename, video_data in all_results.items():

        top_keywords = [word for word, _ in video_data["top_keywords"]]
        key_phrases = video_data["key_phrases"]

        keyword_match = any(keyword in top_keywords for keyword in query_keywords)

        phrase_match = any(phrase in key_phrases for phrase in query_phrases)

        if keyword_match or phrase_match:
            matching_videos.append(filename)

    return matching_videos

## 4. Creating the dashboard
([Go to top](#Capstone-8:-Bringing-It-All-Together))

Use this section to create the dashboard for your solution.

In [12]:
import boto3
s3_client=boto3.client('s3')
bucket_name = 'your-bucket-name'
prefix = 'CUR-TF-200-ACMNLP-1/video/'

In [107]:
def get_urls(s3_client, bucket_name, prefix, key):
    response = s3_client.list_objects_v2(Bucket = bucket_name, Prefix=prefix)
    presigned_urls = []
    obj_keys = []
    key = key[:-4]
    for item in response.get('Contents', []):
        file_key = item['Key']
        modified_key = file_key.replace(' ', '_')
        if key in modified_key:
            presigned_url = s3_client.generate_presigned_url(
                ClientMethod='get_object',
                Params={'Bucket': bucket_name, 'Key': file_key},
                ExpiresIn=3600  
                )
            presigned_urls.append(presigned_url)
            file_key = file_key.split('/')[-1][:-4]
            obj_keys.append(file_key)

    return [presigned_urls, obj_keys]

In [108]:
from ipywidgets import interact, Layout, Textarea, Button, Label, VBox, HTML

In [112]:
initial_video_list = list(all_analysis_results.keys())
video_list_sorted = initial_video_list.copy()
video_list_sorted.sort()

search_bar = Textarea(description="Search Keywords or Phrases (separated by commas)", value="")

search_button = Button(description="Search")

video_list_output = HTML(layout={'width': '100%'})

links_html = ''
for key in video_list_sorted:
    video_url, video_name = get_urls(s3_client, bucket_name, prefix, key)
    link_html = f'<h3><a href=\"{video_url[0]}\"  target=\'blank\'>{video_name[0]}</a></h3><br>'
    links_html += link_html
video_list_output.value = links_html

In [113]:
def update_video_list(search_query):

    matching_videos = search_videos(all_analysis_results, search_bar.value)
    matching_videos_sorted = matching_videos.copy()
    matching_videos_sorted.sort()
    links_html = ""
    for key in matching_videos_sorted:
        video_url, video_name = get_urls(s3_client, bucket_name, prefix, key)
        link_html = f'<h3><a href=\"{video_url[0]}\" target=\'blank\'>{video_name[0]}</a></h3><br>'
        links_html += link_html
    video_list_output.value = links_html


search_button.on_click(update_video_list)


layout = VBox([search_bar, search_button, video_list_output])

display(layout)

VBox(children=(Textarea(value='', description='Search Keywords or Phrases (separated by commas)'), Button(desc…