In [2]:
!pip install youtube-transcript-api

Collecting youtube-transcript-api
  Downloading youtube_transcript_api-0.6.2-py3-none-any.whl.metadata (15 kB)
Downloading youtube_transcript_api-0.6.2-py3-none-any.whl (24 kB)
Installing collected packages: youtube-transcript-api
Successfully installed youtube-transcript-api-0.6.2


IMPORTS

In [3]:
import os
import polars as pl
from youtube_transcript_api import YouTubeTranscriptApi

SETTING PATH

In [4]:
# Change working directory to your specific folder.
# The relative 'data/...' paths used by the later read/write cells are resolved
# against this directory, so this cell has to run before them.
# NOTE(review): the path looks like a Google Drive mount inside Colab — presumably
# Drive must already be mounted for this to succeed; confirm.
os.chdir('/content/drive/MyDrive/YouTube_Semantic_Search_ML/Data Engineering')

FUNCTIONS

In [5]:
def extract_text(transcript: list) -> str:
    """
        Join the caption snippets of a transcript into one string.

        Parameters
        ----------
        transcript : list
            Entries of a transcript, each exposing its caption text under the
            'text' key. Any iterable of such entries is accepted, not only a list.

        Returns
        -------
        str
            The 'text' values of all entries, in their original order, separated
            by single spaces. An empty transcript yields an empty string.

        Raises
        ------
        KeyError
            If a dict entry has no 'text' key.
    """

    # walk the entries directly instead of indexing by position, so inputs
    # that do not support len() / integer indexing (iterators, generators) work too
    return ' '.join(segment['text'] for segment in transcript)

GET TRANSCRIPTS

In [6]:
# load data
# Reads the video table from a Parquet file (path is relative to the current
# working directory) and prints the first rows as a quick sanity check.
df = pl.read_parquet('data/video-ids.parquet')
print(df.head())

shape: (5, 4)
┌─────────────┬──────────────────────┬──────────────────────────────┬──────────────────────────────┐
│ video_id    ┆ datetime             ┆ title                        ┆ description                  │
│ ---         ┆ ---                  ┆ ---                          ┆ ---                          │
│ str         ┆ str                  ┆ str                          ┆ str                          │
╞═════════════╪══════════════════════╪══════════════════════════════╪══════════════════════════════╡
│ 7Oy2NmPwJXo ┆ 2024-09-26T23:24:35Z ┆ I Quit My Job… Here’s How    ┆ Here, I share all 9 ways I   │
│             ┆                      ┆ Much…                        ┆ scr…                         │
│ ZVVkdXHqEuM ┆ 2024-09-23T15:45:12Z ┆ Knowledge Distillation       ┆                              │
│             ┆                      ┆ Explain…                     ┆                              │
│ reXoKNC_Wx4 ┆ 2024-09-20T18:15:44Z ┆ Quantization Explained in 60 ┆        

In [7]:
%%time
transcript_text_list = []

for i in range(len(df)):

    # try to extract captions
    try:
        transcript = YouTubeTranscriptApi.get_transcript(df['video_id'][i])
        transcript_text = extract_text(transcript)
    # if not available set as n/a
    except:
        transcript_text = "n/a"

    transcript_text_list.append(transcript_text)

CPU times: user 4.26 s, sys: 363 ms, total: 4.62 s
Wall time: 1min 28s


In [8]:
# add transcripts to dataframe
# transcript_text_list holds one string per row of df (the joined caption text, or
# the "n/a" placeholder when fetching failed), in row order, so it is attached
# as a new "transcript" column. df is rebound to the extended frame.
df = df.with_columns(pl.Series(name="transcript", values=transcript_text_list))
print(df.head())

shape: (5, 5)
┌─────────────┬─────────────────────┬────────────────────┬────────────────────┬────────────────────┐
│ video_id    ┆ datetime            ┆ title              ┆ description        ┆ transcript         │
│ ---         ┆ ---                 ┆ ---                ┆ ---                ┆ ---                │
│ str         ┆ str                 ┆ str                ┆ str                ┆ str                │
╞═════════════╪═════════════════════╪════════════════════╪════════════════════╪════════════════════╡
│ 7Oy2NmPwJXo ┆ 2024-09-26T23:24:35 ┆ I Quit My Job…     ┆ Here, I share all  ┆ 14 months ago I    │
│             ┆ Z                   ┆ Here’s How Much…   ┆ 9 ways I scr…      ┆ made a big lif…    │
│ ZVVkdXHqEuM ┆ 2024-09-23T15:45:12 ┆ Knowledge          ┆                    ┆ knowledge          │
│             ┆ Z                   ┆ Distillation       ┆                    ┆ distillation       │
│             ┆                     ┆ Explain…           ┆                   

WRITE DATA TO FILE

In [9]:
# write data to file
# The same dataframe (including the "transcript" column) is saved twice, once as
# Parquet and once as CSV; both paths are relative to the current working directory.
df.write_parquet('data/video-transcripts.parquet')
df.write_csv('data/video-transcripts.csv')