# YouTube Videos Speech-to-Text

## Define paths

In [1]:
# Directory (relative to the notebook's working directory) that holds the
# downloaded audio files and the .wav files converted from them.
AUDIO_PATH = '../Data/Audio'

## Import scraped links

In [2]:
import pandas as pd

# Load the scraped video list (tab-separated); the later cells read its
# 'title' and 'link' columns.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which suggests
# the TSV was written together with its index; index_col=0 would probably
# absorb it — TODO confirm.
links = pd.read_csv('../Data/YoutubeLinks.tsv', sep='\t')
links

Unnamed: 0,creator,title,link
0,@gradehacker,Coursera Review: Our Experience and How it Works,https://www.youtube.com/watch?v=l5V2BaoYnWo&pp...
1,@RichardWalls,Coursera Review | My Thoughts After 5 Years an...,https://www.youtube.com/watch?v=V79x7045Bp0&pp...
2,@Daniel-Dann,Coursera Review (2024) - Is Coursera Worth it?...,https://www.youtube.com/watch?v=rnpzU7GBHlI&pp...
3,@gradehacker,Top 5 Online Learning Platforms 2024 | Review ...,https://www.youtube.com/watch?v=wY5n3uGZ6Js&pp...
4,@loistalagrand,Coursera Review (The Best E-learning Site?),https://www.youtube.com/watch?v=LdQBrMWAU_w&pp...
5,@MotasemHamdan,Palo Alto Networks Cybersecurity Professional ...,https://www.youtube.com/watch?v=Y6YNM-2P32Y&pp...
6,@JonGoodCyber,ONLY UNSPONSORED Review of the Google Cybersec...,https://www.youtube.com/watch?v=lZ6p_djgNWI&pp...
7,@vitaliylahno,Coursera Review: Why Is It the Best Online Lea...,https://www.youtube.com/watch?v=91w68nfT3Qw&pp...
8,@Khosomaty,Coursera Plus 2023 Review 7000+ Online Courses...,https://www.youtube.com/shorts/Lds9UVRlzlQ
9,@thesocialguide7659,Coursera Review - Best Platform for Courses?,https://www.youtube.com/watch?v=QXb9gNPLB4A&pp...


## Cleanse the video titles

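Each title is reused as a file name when the audio is downloaded, converted, and transcribed. To keep those file names valid and consistent, we strip every character that is not a letter, digit, or whitespace from every title.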

In [3]:
import re

# Anything that is not an ASCII letter, a digit or whitespace is removed, so
# the cleansed titles can double as file names in the cells that follow.
non_filename_chars = re.compile(r'[^a-zA-Z0-9\s]')

titles_list = [non_filename_chars.sub('', title) for title in links['title']]

## Download audio data

In [4]:
# from pytube import YouTube
from pytubefix import YouTube

# Walk the scraped links together with their original and cleansed titles:
# the original title is printed as progress, the cleansed one names the file.
for video_url, video_title, file_stem in zip(links['link'], links['title'], titles_list):
    video = YouTube(video_url)
    print(video_title)
    # Audio-only streams, sorted by mime type in descending order; take the first.
    audio_streams = video.streams.filter(only_audio=True)
    chosen_stream = audio_streams.order_by('mime_type').desc().first()
    # NOTE(review): mp3=True presumably makes pytubefix save the file with an
    # .mp3 extension (the conversion cell reads '<title>.mp3') — TODO confirm.
    chosen_stream.download(output_path=AUDIO_PATH, filename=file_stem, mp3=True)

Coursera Review: Our Experience and How it Works
Coursera Review | My Thoughts After 5 Years and 40 Online Courses
Coursera Review (2024) - Is Coursera Worth it? - My Honest Feedback After Using it for Several Years
Top 5 Online Learning Platforms 2024 | Review of Coursera / SkillShare / Udemy / EdX / LinkedIn
Coursera Review (The Best E-learning Site?)
Palo Alto Networks Cybersecurity Professional Certificate Review | Coursera
ONLY UNSPONSORED Review of the Google Cybersecurity Certificate From Coursera
Coursera Review: Why Is It the Best Online Learning Platform?
Coursera Plus 2023 Review 7000+ Online Courses #shortsvideo
Coursera Review - Best Platform for Courses?
Microsoft UX Design Professional Certificate Review - 2024 | Coursera Review
Coursera certificate value in India | Get a job with Coursera certificate | Coursera Review 2022
Google it Support Professional Certificate Review - 2024 (Coursera Review)
Meta Marketing Analytics Professional Certificate on Coursera | Review
Cou

## Convert mp3 to wav

### Using pydub

In [5]:
# from pydub import AudioSegment
# try:
#     sound = AudioSegment.from_mp3(f"{AUDIO_PATH}/Coursera Review  My Thoughts After 5 Years and 40 Online Courses.mp3")
#     sound.export(f"{AUDIO_PATH}/Coursera Review  My Thoughts After 5 Years and 40 Online Courses.wav", format="wav")

# except Exception as e:
#     print(e)

### Using ffmpeg

In [6]:
import subprocess

# Convert every downloaded '<title>.mp3' file into '<title>.wav' with ffmpeg,
# because the speech-to-text cell reads the .wav files.
for i in range(len(links)):
    mp3_path = f'{AUDIO_PATH}/{titles_list[i]}.mp3'
    wav_path = f'{AUDIO_PATH}/{titles_list[i]}.wav'
    # -y overwrites an existing .wav instead of asking for confirmation, so
    # re-running this cell regenerates the output rather than stalling on
    # (or skipping at) ffmpeg's interactive prompt.
    result = subprocess.run(['ffmpeg', '-y', '-i', mp3_path, wav_path])
    # Surface failed conversions here; otherwise the problem only shows up
    # later, when the transcription cell cannot open the missing .wav file.
    if result.returncode != 0:
        print(f"ffmpeg failed with exit code {result.returncode} for {mp3_path}")

## Speech to text for summarization

### Import needed libraries and create the recognizer instance

In [7]:
import speech_recognition as sr
from mutagen.wave import WAVE
import numpy as np

# Single Recognizer instance shared by the whole notebook; the transcription
# loop calls its record() and recognize_google() methods.
recognizer = sr.Recognizer()

### Create empty raw text list

In [8]:
# Accumulates one transcript string per video that is transcribed without
# error; the speech-to-text cell appends to it.
raw_text = []

### Do speech-to-text

In [9]:
# Transcribe every converted .wav file with Google Speech Recognition.
#
# Each file is read in consecutive CHUNK_SECONDS-long slices and every slice is
# sent to the recognizer separately. A video contributes one entry to
# `raw_text` only if all of its slices are transcribed; otherwise its row
# index is recorded in `drop_index`, so that `raw_text` stays aligned with the
# titles that remain after those rows are dropped.
CHUNK_SECONDS = 30  # length of each audio slice sent to the recognizer

drop_index = []
# raw_text is created in an earlier cell; empty it so that re-running this
# cell does not append a second copy of every transcript.
raw_text.clear()

for i in range(len(links)):
    audio_title = titles_list[i]
    wav_path = f'{AUDIO_PATH}/{audio_title}.wav'
    audio_file = sr.AudioFile(wav_path)
    # Number of CHUNK_SECONDS-long slices needed to cover the whole file.
    audio_chunk_size = int(np.ceil(WAVE(wav_path).info.length / CHUNK_SECONDS))
    chunk_texts = []
    with audio_file as source:

        try:
            for _ in range(audio_chunk_size):
                # record() continues from where the previous call stopped.
                audio = recognizer.record(source, duration=CHUNK_SECONDS)
                chunk_texts.append(recognizer.recognize_google(audio))
            # Join with a space so the last word of one slice is not glued to
            # the first word of the next.
            text = ' '.join(chunk_texts)
            raw_text.append(text)

        # NOTE(review): one unintelligible slice discards the whole video;
        # skipping only that slice would keep more data — confirm intent.
        except sr.UnknownValueError:
            print(f"Google Speech Recognition could not understand audio at {i}")
            drop_index.append(i)

        except sr.RequestError as e:
            print("Could not request results from Google Speech Recognition service; {0}".format(e))
            # No transcript was stored for this video, so its row has to be
            # dropped too; otherwise raw_text ends up shorter than the
            # remaining titles.
            drop_index.append(i)

        except TimeoutError:
            print(f"Timeout exceeded at {i}")
            drop_index.append(i)

Google Speech Recognition could not understand audio at 2
Google Speech Recognition could not understand audio at 11
Google Speech Recognition could not understand audio at 12
Google Speech Recognition could not understand audio at 13
Google Speech Recognition could not understand audio at 14
Google Speech Recognition could not understand audio at 19
Google Speech Recognition could not understand audio at 21


In [10]:
# One row per video that was transcribed: start from all cleansed titles,
# remove the rows recorded in drop_index, then attach the transcripts.
kept_titles = pd.DataFrame({'title': titles_list}).drop(drop_index)
raw_data = kept_titles.assign(raw_text=raw_text)

In [12]:
# Save the titles and their transcripts as a tab-separated file; the
# DataFrame index is not written.
raw_data.to_csv('../Data/rawtext.tsv', index=False, sep='\t')