In [499]:
import io
import os
import requests

import numpy as np
import pandas as pd
import webvtt
import pyktok as pyk

# Configure pyktok to use Chrome.
# NOTE(review): presumably this selects the browser whose TikTok cookies pyktok
# reuses for its requests — confirm against the pyktok documentation.
pyk.specify_browser("chrome")

### Establish variables

In [500]:
# Desktop-Chrome User-Agent string; it is wrapped in `headers` so it can be
# passed to `requests` calls made later in the notebook.
user_agent = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/123.0.0.0 Safari/537.36"
)
headers = {"User-Agent": user_agent}

In [502]:
# URL of the TikTok video to process; it is passed to pyktok below and stored
# in the output CSV.
# NOTE(review): this is a bare annotation — it records the intended type but
# does NOT bind `url`. On a fresh kernel the next cell raises NameError unless
# a value is assigned first (e.g. `url = "https://..."`).
url: str

### Get the TikTok's JSON and narrow it down to the necessary info

In [503]:
# Fetch the JSON payload for the video at `url` through pyktok.
tt_json = pyk.alt_get_tiktok_json(url)

# Walk down the nested payload to the per-video record; everything the later
# cells need (subtitles, suggested words, metadata) is read from this object.
item_info = tt_json["__DEFAULT_SCOPE__"]["webapp.video-detail"]["itemInfo"]
video_details = item_info["itemStruct"]

### Obtain the transcriptions (if present)

In [504]:
# Collect the subtitle (WebVTT) links for the languages of interest, download
# each one, and flatten its captions into a single transcript string stored in
# `transcriptions` under the language code.
vtt_links = {}
transcriptions = {}

# Read `subtitleInfos` defensively: a video without subtitles should produce
# no links (and reach the "no transcriptions" branch below) rather than raise
# KeyError.
for info in video_details["video"].get("subtitleInfos") or []:
    if (language := info["LanguageCodeName"]) in ("eng-US", "deu-DE"):
        vtt_links[language] = info["Url"]

for lang, link in vtt_links.items():
    # The timeout keeps a stalled download from hanging the notebook, and
    # raise_for_status() surfaces HTTP errors directly instead of letting an
    # error page be parsed as WebVTT.
    result = requests.get(link, headers=headers, timeout=30)
    result.raise_for_status()
    vtt = result.content.decode()
    if not vtt:
        # Empty subtitle file: record nothing for this language. (Previously
        # the assignment ran regardless, which raised NameError or reused the
        # previous language's transcript.)
        continue
    # Eng requires extra space between caption texts, so every caption is
    # followed by a single space.
    transcript = "".join(
        f"{caption.text} " for caption in webvtt.read_buffer(io.StringIO(vtt))
    )
    transcriptions[lang] = transcript

if not transcriptions:
    # TODO: save audio/video and perform speech to text
    pass

### Get the list of suggested related searches from TikTok's algorithm

In [505]:
# Join the video's suggested search terms into one " / "-separated string;
# fall back to NaN when the key is missing or its value is empty.
suggested_words = video_details.get("suggestedWords")
suggested_words = " / ".join(suggested_words) if suggested_words else np.nan

### Generate the metadata row and append it to the CSV file

In [506]:
# Build the metadata for this video with pyktok.
meta_data = pyk.generate_data_row(video_obj=video_details)

# Extra columns gathered earlier in the notebook; a missing transcript for a
# language is stored as NaN.
extra_columns = {
    "suggested_words": suggested_words,
    "url": url,
    "english_transcript": transcriptions.get("eng-US", np.nan),
    "german_transcript": transcriptions.get("deu-DE", np.nan),
}
for column, value in extra_columns.items():
    meta_data[column] = value

# Append to the rows already saved in meta.csv when the file exists; otherwise
# this video becomes the first row of a new file.
if not os.path.exists("./meta.csv"):
    print("New file")
else:
    df = pd.read_csv("meta.csv", index_col=0)
    meta_data = pd.concat([df, meta_data], ignore_index=True)

meta_data.to_csv("meta.csv")