In [1]:
import os
import sqlite3
from functools import reduce
from pathlib import Path

import pandas as pd
import srt

In [2]:
# Every location is derived from the parent of the current working directory,
# i.e. the notebook is expected to run from a direct subfolder of the project root.
ROOT_DIR = Path(os.path.abspath(os.getcwd())).parents[0]
# Kept as plain strings (not Path objects), normalised to absolute form.
RAW_DATA_DIR = os.path.abspath(os.path.join(ROOT_DIR, 'data', 'raw'))
# RAW_DATA_DIR is already absolute and normalised, so a plain join is enough here.
DEFAULT_DB = os.path.join(RAW_DATA_DIR, 'video-info.db')

In [3]:
# One row per caption track, joined to the channel id and title of its video.
SQL = '''select CaptionInfo.id, CaptionInfo.vidId, CaptionInfo.caption, VideoInfo.channelId, VideoInfo.title
 from CaptionInfo inner join VideoInfo on CaptionInfo.vidId = VideoInfo.id'''

conn = sqlite3.connect(DEFAULT_DB)
try:
    df = pd.read_sql(SQL, conn)
finally:
    # Close the connection even when the query raises, so the database file
    # is never left open by a failed run of this cell.
    conn.close()

# Show a bounded preview instead of rendering the whole frame.
df.head(10)

Unnamed: 0,id,vidId,caption,channelId,title
0,VVHUd5m5rTvmWDhATvUkjBUcGlE6F_t0,-0jeolifQPk,"b""1\n00:00:00,000 --> 00:00:04,160\n[Women sho...",UCDWIvJwLJsE4LG1Atne2blQ,curling exe [h3h3productions]
1,AjzxxTptahSTVTYnncipkCZcUJ8Dacba,-2fcSWvJ3wI,"b""1\n00:00:00,000 --> 00:00:02,100\nHere we go...",UCsvn_Po0SmunchJYOWpOxMg,Ultimate Smash Bruddas
2,vU5ONboXu19WX5MyS_22lOVd3RLHMvRI,-7AkcBWuTJ8,"b""1\n00:00:00,080 --> 00:00:01,280\nYou see th...",UCDWIvJwLJsE4LG1Atne2blQ,I'm the Best. PERIOD.
3,COR0deZYQbXqPLDXd1gfKM81aXqSQURy,-CrFHP19-S8,"b""1\n00:00:00,640 --> 00:00:02,480\nSpace Jam!...",UCsvn_Po0SmunchJYOWpOxMg,League of Legends : Fizz Jam
4,BIXAonnS7lmHBl0D3onHNsbASo8d_Ap8,-K1wsGeLSMM,"b'1\n00:00:00,000 --> 00:00:05,380\n[ raps ]\n...",UCsvn_Po0SmunchJYOWpOxMg,Playstation Allstars
5,n2szRfUoBECWv_Do0mWYb8Ni8ayCGq8b,-OProdXkbYQ,"b'1\n00:00:00,039 --> 00:00:04,079\nWelcome ev...",UCDWIvJwLJsE4LG1Atne2blQ,WE'RE IN PAYDAY 2!!
6,W5tK38foKHjJDjZWnm0YVgWJwYY3i5yr,-Wv6okd57t8,"b'1\n00:00:00,000 --> 00:00:03,380\n- [Draemag...",UCsvn_Po0SmunchJYOWpOxMg,Hamston Checks Out
7,WkwEdsqdTwLQWy-HB3fCamZbaZ8XWib7,-_ZZAX_zt7Y,"b'1\n00:00:00,100 --> 00:00:01,060\nIt\'s real...",UCsvn_Po0SmunchJYOWpOxMg,Banjo Kazooie (dunkview)
8,PKg4R-lu25QloCxv0DBJ_IE3JsHrBf-L,-w_Pvsz89wM,"b'1\n00:00:01,660 --> 00:00:03,580\nThere\'s s...",UCDWIvJwLJsE4LG1Atne2blQ,college kid | blaze it | 420
9,mfTyn3UrjHt8qXoeFZtElZxgf4dE7Z6T,0Eq8gBI5riU,"b'1\n00:00:00,000 --> 00:00:00,960\n \n\n2\n00...",UCDWIvJwLJsE4LG1Atne2blQ,100 LAYERS OF SHIRTS


In [4]:
# captionBytes = df['caption'].iloc[7]
# print(captionBytes.decode('unicode_escape').replace(r"\\'", r"'"))

Many captions contain line breaks, so replace each one with a space.

In [5]:
def bytesToSubtitle(captionBytes):
    """Lazily yield the subtitles parsed from UTF-8 encoded SRT bytes.

    Each subtitle produced by ``srt.parse`` has the newlines inside its
    ``content`` replaced by single spaces before it is yielded. Nothing is
    decoded or parsed until the returned generator is iterated.
    """
    decoded = captionBytes.decode('utf-8')
    for cue in srt.parse(decoded):
        # Collapse multi-line cues onto one line (one space per newline).
        cue.content = ' '.join(cue.content.split('\n'))
        yield cue

Let's just join all the text together, ignoring the timing information.

In [6]:
def subToText(subtitles):
    """Flatten subtitles into one string, discarding all timing information.

    Parameters
    ----------
    subtitles : iterable
        Objects exposing a string ``content`` attribute. The iterable is
        consumed exactly once, so a generator is fine.

    Returns
    -------
    str
        The contents in order, separated by single spaces; ``''`` when there
        are no subtitles. Leading subtitles whose content is empty are
        skipped, while empty contents after the first non-empty one are kept
        (and therefore produce consecutive spaces).
    """
    contents = [subtitle.content for subtitle in subtitles]
    # Position of the first non-empty cue; everything before it is dropped so
    # the result never starts with a separator caused by empty leading cues.
    first = next(
        (index for index, text in enumerate(contents) if text != ''),
        len(contents),
    )
    # A single join builds the result in one pass; folding with string
    # formatting re-copies the accumulated text for every subtitle.
    return ' '.join(contents[first:])

In [7]:
def bytesToText(captionBytes):
    """Convert raw caption bytes straight to flattened text.

    Feeds the subtitles produced by ``bytesToSubtitle`` into ``subToText``
    and returns its result.
    """
    subtitles = bytesToSubtitle(captionBytes)
    return subToText(subtitles)

In [8]:
# Convert only values that are still raw bytes. Values that already hold text
# pass through unchanged, so this cell can be re-run safely (a second pass
# would otherwise call .decode on str and raise).
df['caption'] = df['caption'].apply(
    lambda caption: bytesToText(caption)
    if isinstance(caption, (bytes, bytearray))
    else caption
)

# Show a bounded preview instead of rendering the whole frame.
df.head(10)

Unnamed: 0,id,vidId,caption,channelId,title
0,VVHUd5m5rTvmWDhATvUkjBUcGlE6F_t0,-0jeolifQPk,[Women shouting] [Narrator] Today one coveted ...,UCDWIvJwLJsE4LG1Atne2blQ,curling exe [h3h3productions]
1,AjzxxTptahSTVTYnncipkCZcUJ8Dacba,-2fcSWvJ3wI,"Here we go, here we go, come on! KABOWSH! HA T...",UCsvn_Po0SmunchJYOWpOxMg,Ultimate Smash Bruddas
2,vU5ONboXu19WX5MyS_22lOVd3RLHMvRI,-7AkcBWuTJ8,You see this body? It's the best body! You kno...,UCDWIvJwLJsE4LG1Atne2blQ,I'm the Best. PERIOD.
3,COR0deZYQbXqPLDXd1gfKM81aXqSQURy,-CrFHP19-S8,"Space Jam! Here we go. Now, the- He's a great ...",UCsvn_Po0SmunchJYOWpOxMg,League of Legends : Fizz Jam
4,BIXAonnS7lmHBl0D3onHNsbASo8d_Ap8,-K1wsGeLSMM,[ raps ] Super Smash Bros. Brawl is the third ...,UCsvn_Po0SmunchJYOWpOxMg,Playstation Allstars
5,n2szRfUoBECWv_Do0mWYb8Ni8ayCGq8b,-OProdXkbYQ,Welcome everybody to an h3h3 special announcem...,UCDWIvJwLJsE4LG1Atne2blQ,WE'RE IN PAYDAY 2!!
6,W5tK38foKHjJDjZWnm0YVgWJwYY3i5yr,-Wv6okd57t8,- [Draemagnce] You can get free charge. Outsid...,UCsvn_Po0SmunchJYOWpOxMg,Hamston Checks Out
7,WkwEdsqdTwLQWy-HB3fCamZbaZ8XWib7,-_ZZAX_zt7Y,It's real good. [ banjo music ] Banjo Kazooie ...,UCsvn_Po0SmunchJYOWpOxMg,Banjo Kazooie (dunkview)
8,PKg4R-lu25QloCxv0DBJ_IE3JsHrBf-L,-w_Pvsz89wM,"There's some other like, uh, Uh, guitars out t...",UCDWIvJwLJsE4LG1Atne2blQ,college kid | blaze it | 420
9,mfTyn3UrjHt8qXoeFZtElZxgf4dE7Z6T,0Eq8gBI5riU,Totally calm right now this is just a comple...,UCDWIvJwLJsE4LG1Atne2blQ,100 LAYERS OF SHIRTS


In [9]:
# Spot-check one flattened transcript: positional row 7 of the caption column.
dunk = df['caption'].iat[7]
print(dunk)

It's real good. [ banjo music ] Banjo Kazooie is one of those rare titles that was actually made with a glowing spark of inspiration and passion for gaming. It's not one of these soulless, droning cash grabs. It's an illustrious, unforgettable experience. These games are older than dinosaur dirt and yet people still fondly remember Banjo. Even if you never played the games, you know about 'em. It's a bear wearing a backpack that he jammed a little bird into. Who comes up with this? [ shotgun pump ] It's amazing! You're dropped right into a cheery, colorful world bursting with characters and liveliness. There's hundreds of things to collect and discover. You start humming along to that classic music and boom: You've been playing for five hours and it went by just like that [ snaps fingers ]. The game is fun as hell because it offers up a lot of simple pleasures. Like the little jinjos. You walk into them and it goes [ imitating jinjo fanfare ]. It's just so immediate and satisfying. You