In [35]:
import io
import requests
from datetime import datetime

import numpy as np
import pandas as pd
import webvtt
import pyktok as pyk

# Tell pyktok which local browser to use. Presumably this is the browser whose
# TikTok cookies pyktok reuses for its requests — TODO confirm against pyktok docs.
pyk.specify_browser("chrome")

In [8]:
# Desktop Chrome user-agent string, sent with the direct HTTP request made
# later in the notebook.
user_agent = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/123.0.0.0 Safari/537.36"
)
headers = {"User-Agent": user_agent}

In [9]:
# TikTok video page whose metadata and captions the rest of the notebook scrapes.
url = "https://www.tiktok.com/@tiktok/video/7361448925972155679?lang=en"

In [37]:
# Fetch the JSON blob for the video page and drill down to the per-video record.
tt_json = pyk.alt_get_tiktok_json(url)

video_details = tt_json["__DEFAULT_SCOPE__"]["webapp.video-detail"]["itemInfo"]["itemStruct"]

# Pick the US-English caption track. `next(..., None)` always (re)binds
# `vtt_link`, so a video without that track fails loudly here instead of
# raising NameError later or silently reusing a link left over from a
# previous run of this cell.
vtt_link = next(
    (
        info["Url"]
        for info in (video_details["video"].get("subtitleInfos") or [])
        if info.get("LanguageCodeName") == "eng-US"
    ),
    None,
)
if vtt_link is None:
    raise LookupError(f"No eng-US subtitle track found for {url}")

vtt_link

'https://v16-webapp.tiktok.com/6e771abe71f56aa5a03f1f9fa5963abf/662e68e0/video/tos/maliva/tos-maliva-v-0068c799-us/48ce321d98024465bddf55106de28219/?a=1988&bti=ODszNWYuMDE6&ch=0&cr=3&dr=0&lr=unwatermarked&cd=0%7C0%7C0%7C&cv=1&br=9982&bt=4991&cs=0&ds=6&ft=4b~OyMFx8Zmo0_-rd-4jV5B1upWrKsd.&mime_type=video_mp4&qs=13&rc=Mzx3aWw5cjtrcjMzaTgzNEBpMzx3aWw5cjtrcjMzaTgzNEAvcnBrMmQ0b2hgLS1kLzJzYSMvcnBrMmQ0b2hgLS1kLzJzcw%3D%3D&l=202404280916435F1B3660E96D00234FC0&btag=e00050000'

In [29]:
# Download the caption file. requests applies no timeout by default, so set
# one to keep a stalled connection from hanging the kernel indefinitely.
result = requests.get(vtt_link, headers=headers, timeout=30)
# Fail here on 4xx/5xx rather than handing an error page to the VTT parser.
result.raise_for_status()
# WebVTT is UTF-8 by specification; spell the codec out instead of relying on
# the default of bytes.decode().
vtt = result.content.decode("utf-8")
vtt

"WEBVTT\n\n\n00:00:00.080 --> 00:00:01.920\nHi, everyone. It's show here.\n\n00:00:01.921 --> 00:00:02.801\nAs you may have heard,\n\n00:00:02.802 --> 00:00:05.641\nCongress passed the Bill that the president signed into law\n\n00:00:05.642 --> 00:00:08.721\nthat is designed to ban TikTok in the United States.\n\n00:00:08.722 --> 00:00:13.001\nThat will take TikTok away from you and 170 million Americans\n\n00:00:13.002 --> 00:00:16.581\nwho find community and connection on all platform.\n\n00:00:16.720 --> 00:00:17.880\nMake no mistake,\n\n00:00:17.881 --> 00:00:19.421\nthis is a ban.\n\n00:00:19.520 --> 00:00:23.060\nA ban on TikTok and a ban on you and your voice.\n\n00:00:23.080 --> 00:00:24.760\nPoliticians may see otherwise,\n\n00:00:24.761 --> 00:00:25.841\nbut don't get confused.\n\n00:00:25.842 --> 00:00:30.581\nMany who sponsored the Bill admit a TikTok ban is the ultimate goal.\n\n00:00:30.880 --> 00:00:32.600\nIt's obviously a disappointing moment,\n\n00:00:32.601 --> 00:00

In [32]:
# Flatten the caption cues into one string. English text needs a space after
# each cue so the words of adjacent cues do not run together (this also leaves
# a single trailing space, as before).
caption_stream = io.StringIO(vtt)
transcript = "".join(f"{cue.text} " for cue in webvtt.read_buffer(caption_stream))

transcript

"Hi, everyone. It's show here. As you may have heard, Congress passed the Bill that the president signed into law that is designed to ban TikTok in the United States. That will take TikTok away from you and 170 million Americans who find community and connection on all platform. Make no mistake, this is a ban. A ban on TikTok and a ban on you and your voice. Politicians may see otherwise, but don't get confused. Many who sponsored the Bill admit a TikTok ban is the ultimate goal. It's obviously a disappointing moment, but it does not need to be a defining one. It's actually ironic because the freedom of expression on TikTok reflects the same American values that make the United States a beacon of freedom. TikTok gives everyday Americans a powerful way to be seen and heard. And that's why so many people have been TikTok part of their daily lives. Rest assured, we aren't going anywhere. We are confident and we will keep fighting for your rights in the courts. The facts and the Constituti

In [55]:
def _dig(container, *path, default):
    """Walk ``path`` through nested containers, returning ``default`` on a miss.

    A miss is any step that raises KeyError, IndexError or TypeError, i.e. an
    absent key/index or a non-subscriptable value (such as ``None``) part-way
    down the path.
    """
    try:
        for key in path:
            container = container[key]
    except (KeyError, IndexError, TypeError):
        return default
    return container


def generate_data_row(video_obj, transcript=None):
    """Flatten one video record into a single-row DataFrame.

    Parameters
    ----------
    video_obj : dict
        Per-video record (the notebook passes the ``itemStruct`` dict). Only
        ``video_obj['id']`` is mandatory; every other field is best-effort and
        replaced by a placeholder when absent: ``''`` for text columns,
        ``np.nan`` for counts, duration and ``suggested_words``, and ``False``
        for the ``video_is_ad`` / ``author_verified`` flags.
    transcript : str, optional
        Transcript text stored in the ``transcript`` column. When omitted, the
        module/notebook-level variable ``transcript`` is used if it exists
        (the historical behaviour), otherwise ``''``. Pass it explicitly to
        avoid depending on which cells have been run.

    Returns
    -------
    pandas.DataFrame
        One row with index ``0`` and 21 columns, ``video_id`` through
        ``transcript``.

    Raises
    ------
    KeyError
        If ``video_obj`` has no ``'id'`` entry.
    """
    if transcript is None:
        # Backward compatibility: earlier callers relied on a global transcript.
        transcript = globals().get('transcript', '')

    # NOTE: fromtimestamp() without a tz yields naive *local* time, so the
    # value depends on the machine's timezone (kept as-is for compatibility).
    try:
        created = datetime.fromtimestamp(int(video_obj['createTime'])).isoformat()
    except (KeyError, IndexError, TypeError, ValueError, OverflowError, OSError):
        created = ''

    # Every sticker's texts, flattened and ';'-separated.
    try:
        stickers = ';'.join(
            text
            for sticker in video_obj['stickersOnItem']
            for text in sticker['stickerText']
        )
    except (KeyError, IndexError, TypeError):
        stickers = ''

    # `author` is normally a dict; when it is some other value (e.g. a bare
    # username string) that value is passed through, as before. A dict that
    # lacks 'uniqueId' now gives '' instead of leaking the whole dict into
    # the cell.
    author = _dig(video_obj, 'author', default='')
    if isinstance(author, dict):
        username = author.get('uniqueId', '')
    else:
        username = author

    try:
        suggested = " / ".join(video_obj['suggestedWords'])
    except (KeyError, IndexError, TypeError):
        suggested = np.nan

    # Insertion order of this dict defines the column order of the result.
    row = {
        'video_id': video_obj['id'],
        'video_timestamp': created,
        'video_duration': _dig(video_obj, 'video', 'duration', default=np.nan),
        'video_locationcreated': _dig(video_obj, 'locationCreated', default=''),
        'video_diggcount': _dig(video_obj, 'stats', 'diggCount', default=np.nan),
        'video_sharecount': _dig(video_obj, 'stats', 'shareCount', default=np.nan),
        'video_commentcount': _dig(video_obj, 'stats', 'commentCount', default=np.nan),
        'video_playcount': _dig(video_obj, 'stats', 'playCount', default=np.nan),
        'video_description': _dig(video_obj, 'desc', default=''),
        'video_is_ad': _dig(video_obj, 'isAd', default=False),
        'video_stickers': stickers,
        'author_username': username,
        # Fall back to a top-level 'nickname' when `author` has none.
        'author_name': _dig(video_obj, 'author', 'nickname',
                            default=_dig(video_obj, 'nickname', default='')),
        'author_followercount': _dig(video_obj, 'authorStats', 'followerCount', default=np.nan),
        'author_followingcount': _dig(video_obj, 'authorStats', 'followingCount', default=np.nan),
        'author_heartcount': _dig(video_obj, 'authorStats', 'heartCount', default=np.nan),
        'author_videocount': _dig(video_obj, 'authorStats', 'videoCount', default=np.nan),
        'author_diggcount': _dig(video_obj, 'authorStats', 'diggCount', default=np.nan),
        'author_verified': _dig(video_obj, 'author', 'verified', default=False),
        'suggested_words': suggested,
        'transcript': transcript,
    }

    return pd.DataFrame(row, index=[0])

In [56]:
# Build the one-row metadata table for the video fetched above. Note that
# generate_data_row also picks up the notebook-level `transcript`, so the
# transcript cell must have been run first.
meta_data = generate_data_row(video_obj=video_details)

# Save the row to meta.csv (relative path). With pandas' default settings the
# DataFrame index is written as the first, unnamed CSV column.
meta_data.to_csv("meta.csv")