# TikTok Productive Video Classification
Hengbin Fang <br>
Credits to Adam Omarali for inspiration on the idea.

If you want to replicate it, the process is:
1. Download a ton of videos along with their descriptions.
2. Label them.
3. Transcribe them.
4. Put it all in a CSV file, then use it to construct a .jsonl
5. Prepare the data, then fine-tune the GPT model.
6. Test them out


It took a lot of iterations from scratch to get the data and train the model.

In [4]:
from deepgram import Deepgram
import requests
import subprocess
import openai
import os

# Credentials are read from the environment instead of being written into the
# notebook, so they are never committed or shared along with the code.
# Set OPENAI_API_KEY and DEEPGRAM_API_KEY before running this cell; a missing
# variable raises KeyError here rather than failing later inside an API call.
# NOTE(review): any key that was previously committed in this file should be
# revoked and regenerated.
openai.api_key = os.environ["OPENAI_API_KEY"] # For Whisper API + GPT
dg_key = os.environ["DEEPGRAM_API_KEY"] # They give $200 USD free credits on the sign up.
dg = Deepgram(dg_key)

# Shared HTTP session that sends a browser-like User-Agent (used by get_desc).
session = requests.Session()
session.headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}
ffmpeg_dir = "C:/ffmpeg/bin/ffmpeg.exe" # I chose to do it like this since it was faster than adding it to PATH, but you could, doesn't matter.

# After downloading the TikTok Video, Convert it to Wav.
def conversion(target, destination):
    """Convert ``the_videos/<target>`` into ``the_videos/<destination>`` with ffmpeg.

    The audio is encoded at 160k bitrate, 2 channels, 44100 Hz sample rate,
    and the video stream is dropped (``-vn``).

    Args:
        target: File name of the input inside ``the_videos/``.
        destination: File name of the output inside ``the_videos/``.

    Returns:
        The path of the converted file, ``"the_videos/" + destination``.

    Raises:
        Exception: If ffmpeg exits with a non-zero status; the message is
            ffmpeg's stderr output.
        OSError: If the ffmpeg executable at ``ffmpeg_dir`` cannot be started.
    """
    output_path = "the_videos/" + destination
    # Pass the arguments as a list and skip the shell: file names are handed
    # to ffmpeg verbatim, so quotes, spaces or shell metacharacters in them
    # can neither break the command nor inject another one.
    command = [
        ffmpeg_dir,
        "-i", f"the_videos/{target}",
        "-ab", "160k",
        "-ac", "2",
        "-ar", "44100",
        "-vn",
        output_path,
    ]
    process = subprocess.run(command, capture_output=True)
    if process.returncode != 0: # If it didn't work
        # errors="replace" keeps a non-UTF-8 byte in ffmpeg's output from
        # raising UnicodeDecodeError and hiding the real failure.
        raise Exception(process.stderr.decode(errors="replace"))
    return output_path
        
# Download the video from TikTok through a proxy since it has zero API restrictions.
# Tried out TikTok's API (security issues), TikAPI (needs money), then I stumbled across ProxiTok
def download_audio(id):
    """Download a TikTok video through ProxiTok and convert it to WAV.

    Args:
        id: The TikTok video ID.

    Returns:
        The path of the WAV file, or ``None`` if the conversion failed (the
        conversion error is printed).

    Raises:
        IndexError: If the ProxiTok page has no ``<source src="/stream?url=``
            tag to extract the stream URL from.
        Exception: If the request for the MP4 stream fails.
    """
    # A fresh session per call ("faster from my experience"); the context
    # manager closes its connections when the download is done.
    with requests.Session() as session:
        html_text = session.get("https://proxitok.pussthecat.org/@placeholder/video/%s" % id)

        # Extracting the URL. If the HTML changes on this specifically, this will break.
        # NOTE(review): the HTTP status is not checked, so an error page
        # surfaces as the IndexError raised by this split.
        video_url = html_text.text.split('<source src="/stream?url=')[1].split('"')[0]
        try:
            data = session.get("https://proxitok.pussthecat.org/stream?url=" + video_url) # Raw MP4 download
        except Exception as error:
            raise Exception("Failed to download video: %s, id: %s, url: %s" % (error, id, video_url)) from error

    mp4_path = f"the_videos/{id}.mp4"
    with open(mp4_path, "wb") as f: # Save MP4
        f.write(data.content)

    try:
        return conversion(f"{id}.mp4", f"{id}.wav") # MP4 -> Wav
    except Exception as error:
        print(error)
        return None
    finally:
        # Delete the MP4 whether or not the conversion worked: a leftover
        # file in the_videos/ would make this ID look already downloaded.
        os.remove(mp4_path)

def predict(subtitles, description):
    """Classify a video with the fine-tuned GPT model.

    The prompt is ``"<subtitles> | <description>   ->"``. The completion is
    searched, case-insensitively, for "true" first and then "false".

    Returns:
        ``True`` or ``False`` according to the word found in the completion.

    Raises:
        Exception: If the completion contains neither word.
    """
    completion = openai.Completion.create(
        model="curie:ft-personal-2023-07-15-23-43-03",
        prompt=f"{subtitles} | {description}   ->",
        temperature=1,        # randomness of the sampling
        max_tokens=256,       # upper bound on generated tokens
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,   # penalty based on what has been said so far
        stop=["."],           # stop after the first prediction
    )
    prediction = completion.choices[0].text
    lowered = prediction.lower()

    if "true" in lowered:
        return True
    if "false" in lowered:
        return False
    raise Exception("Prediction not found: %s" % prediction)

# Whisper cannot transcribe files bigger than 25 MB. I used DeepGram for bigger ones. Though It takes a while to transcribe. 
def transcribe_large(audio_file):
    """Transcribe an open WAV file with Deepgram and return the transcript text.

    Args:
        audio_file: The audio, passed to Deepgram as the ``buffer`` of a
            source with mimetype ``audio/wav``.

    Returns:
        The transcript of the first alternative of the first channel.
    """
    request_options = {
        "punctuate": True,
        "model": 'general',
        "tier": 'enhanced',
    }
    payload = {"buffer": audio_file, "mimetype": 'audio/wav'}

    result = dg.transcription.sync_prerecorded(payload, options=request_options)
    first_alternative = result['results']['channels'][0]['alternatives'][0]
    return first_alternative['transcript']

# Transcribe the audio file using OpenAI's API. If the file is bigger than 25 MB, use `transcribe_large`.
def transcribe(audio_dir):
    """Transcribe the audio file at ``audio_dir``.

    Files larger than 25000000 bytes go to ``transcribe_large`` (Deepgram);
    everything else goes to OpenAI's ``whisper-1``.

    Returns:
        The transcript, or a single space when the transcript is empty.
    """
    with open(audio_dir, "rb") as audio_file:
        fits_whisper = not os.path.getsize(audio_dir) > 25000000
        if fits_whisper:
            response = openai.Audio.transcribe("whisper-1", audio_file)
            transcript = response['text']
        else:
            print("Large file detected, transcribing with deepgram")
            transcript = transcribe_large(audio_file)

    # Never hand back an empty value; callers get ' ' instead.
    return transcript if transcript else ' '

# Scrape tiktoks description through the Website
def get_desc(id):
    """Return the ``og:description`` content scraped from the video's TikTok page.

    Raises:
        IndexError: If the page has no ``property="og:description" content="``
            marker.
    """
    page = session.get("https://www.tiktok.com/@./video/%s" % id).text
    # Everything after the opening of the og:description attribute ...
    after_marker = page.split('property="og:description" content="')[1]
    # ... up to the end of that meta tag.
    return after_marker.split('"/>')[0]

# Putting all the functions together to get a prediction
def video_id_to_prediction(id):
    """Run the whole pipeline for one video and return the model's prediction.

    Steps: scrape the description, download and convert the audio,
    transcribe it, then classify transcript + description. The description,
    WAV location and transcript are printed along the way.

    Args:
        id: The TikTok video ID.

    Returns:
        The boolean returned by ``predict``.

    Raises:
        Exception: If the audio could not be converted (``download_audio``
            returned ``None``), plus anything raised by the individual steps.
    """
    description = get_desc(id)
    print(description)
    location = download_audio(id)
    print(location)
    if location is None:
        # download_audio returns None when the conversion failed; stop here
        # with a clear message instead of letting open(None) raise a
        # TypeError inside transcribe().
        raise Exception("Failed to convert audio for video: %s" % id)
    transcript = transcribe(location)
    print(transcript)
    return predict(transcript, description)

# Example run of the full pipeline on one video ID; the printed description,
# WAV location, transcript and the returned prediction follow as cell output.
video_id_to_prediction("7248945204588203306")

Lake Day Punch 
the_videos/7248945204588203306.wav
Large file detected, transcribing with deepgram
This one's wild. It's called late day Punch. Anybody know what this is? Sugar, vodka. Okay. Never seen nothing like this before. I need a little bottle to make this work right. Look at the sugar. Casey Cures, guys. This is two cups of sugar. I'm sorry. Two and a third cups of sugar. I've just been told by our mathematician here. We gotta get our kool Aid in. We use in blue raspberry lemonade. Do you wanna seen a real blue raspberry in the wild. Just curious. Okay. Now we come behind with our vodka, juice of life. You can smell a lot of my food and all over this. Alright. Beautiful. That's the real deal. Control your whole sky. You must be smooth. This is what you do. You're spilling. You should not spill. Almost be continuously smooth. You'll see him as five boy up right. And you just let it sit, play it though. You gotta get this to dissolve as much as possible Hey, there's no way to dis

False

In [None]:
# IDs for which download_audio() returned no WAV location; add_data() appends
# to this list so those videos can be processed again later.
skipped = [] # re-do these

In [None]:
# Adding new data to the csv file
def add_data(d_vid, label):
    """Append one ``~``-delimited row per video to ``new_data.csv``.

    Each row is ``id, label, description, transcript``. Videos whose audio
    could not be converted are appended to the module-level ``skipped`` list
    instead of being written.

    Args:
        d_vid: Iterable of TikTok video IDs (strings).
        label: Label written for every video in ``d_vid`` (string).

    Raises:
        Exception: Any download error whose message does not contain "404"
            is re-raised unchanged; errors from scraping the description or
            transcribing also propagate.
    """
    import csv  # local import: the notebook only imports csv in a later cell

    # newline="" is what the csv module expects; the explicit encoding
    # matches how label_data.csv is read and written elsewhere in this file.
    with open("new_data.csv", "a", newline="", encoding="UTF-8") as file:
        # csv.writer quotes fields containing "~", quotes or newlines, so a
        # description or transcript cannot split into extra columns or rows
        # when the file is read back with csv.reader(delimiter="~").
        writer = csv.writer(file, delimiter="~")
        for x in d_vid:
            print("attempting", x)
            description = get_desc(x)
            print("got desc")
            try:
                location = download_audio(x)
            except Exception as err:
                if "404" in str(err):
                    print("Video not found")
                    continue
                raise  # keep the original exception type and traceback

            if not location:
                # Conversion failed; remember the ID so it can be re-done.
                skipped.append(x)
                continue

            print("transcribing")
            transcript = transcribe(location)
            row = [x, label, description, transcript]
            print("~".join(row))
            writer.writerow(row)
        
# Label every ID pasted into the triple-quoted block below (one per line).
# With the block left empty, as here, the list of IDs is empty.
# NOTE(review): .strip() only removes whitespace at the two ends of the whole
# string, so IDs pasted with leading indentation keep it on every line after
# the first — paste them flush-left.
add_data(
    """
    
    """.strip().splitlines(),
    "1" # 1 for educational 0 for not
)

In [22]:
import csv
import os

# Grab data from label_data.csv, all the IDS, compare them with the videos in the_videos. Output the ones that aren't there.
# IDs that already have a file in the_videos/ (file name without extension).
already = [x.split(".")[0] for x in os.listdir("the_videos")]
already_ids = set(already)  # O(1) membership tests in the filter below

with open('label_data.csv', 'r', encoding="UTF-8", newline='') as file:
    whole_list = list(csv.reader(file, delimiter = '~'))

# Drop the header row (guarded so an empty file does not raise IndexError).
if whole_list:
    whole_list.pop(0)

# The ID is the first column of each row. Comparing the whole row (a list)
# against the ID strings never matches, which would mark every video as
# missing; compare and collect the ID itself. Blank rows are skipped.
to_download = [
    video[0]
    for video in whole_list
    if video and video[0] not in already_ids
]


In [152]:
# write list to csv

# Overwrite label_data.csv with the rows currently held in `whole_list`,
# using "~" as the column delimiter.
with open('label_data.csv', 'w', newline='', encoding="UTF-8") as out_file:
    csv.writer(out_file, delimiter = '~').writerows(whole_list)

<h1>Other stuff I did to code it</h1>
I won't bother to explain everything; this is just the other stuff I did along the way. Feel free to take a look.

In [29]:
print(len(to_download))

# Keep only the rows of label_data.csv whose first column (the ID) is not
# listed in `to_download` ...
with open('label_data.csv', 'r', encoding="UTF-8") as file:
    whole_list = [
        row
        for row in csv.reader(file, delimiter = '~')
        if not row[0] in to_download
    ]

# ... and write those rows back over label_data.csv.
with open('label_data.csv', 'w', newline='', encoding="UTF-8") as file:
    csv.writer(file, delimiter = '~').writerows(whole_list)

17


In [19]:
# Queue size before pruning.
count_before = len(to_download)
print(count_before)

# Take each ID recorded in `deleted` out of the download queue (in place).
for gone_id in deleted:
    to_download.remove(gone_id)

# Queue size after pruning.
count_after = len(to_download)
print(count_after)


18
3


In [20]:
# Download All videos in to_download. With 5 threads.

import threading

def downloader():
    """Worker loop: pop IDs off ``to_download`` and download each one.

    Runs until the queue is empty or a download fails with an unexpected
    error. An ``IndexError`` from ``download_audio`` is treated as a deleted
    video: the ID is appended to ``deleted`` and the loop moves on.
    NOTE(review): ``deleted`` is assumed to be defined in another cell.
    """
    while True:
        try:
            video_id = to_download.pop(0)
        except IndexError:
            # Queue is empty. Only this case ends the worker quietly; a bare
            # `except:` would also swallow KeyboardInterrupt and real bugs.
            break

        try:
            print(f"attempting to download: {video_id}, {len(to_download)}")
            download_audio(video_id)
        except IndexError:
            print("vid got deleted? moving on")
            deleted.append(video_id)
        except Exception as error:
            # Any other failure stops this worker.
            print(error)
            break

# Create five downloader workers, start them all, then wait for each to finish.
workers = [threading.Thread(target=downloader) for _ in range(5)]
for worker in workers:
    worker.start()
for worker in workers:
    worker.join()


attempting to download: 7227955368297647402, 2attempting to download: 6857520648491732229, 1

attempting to download: 7057915158139702575, 0


ffmpeg version N-110185-gb564ad8eac-20230406 Copyright (c) 2000-2023 the FFmpeg developers
  built with gcc 12.2.0 (crosstool-NG 1.25.0.152_89671bf)
  configuration: --prefix=/ffbuild/prefix --pkg-config-flags=--static --pkg-config=pkg-config --cross-prefix=x86_64-w64-mingw32- --arch=x86_64 --target-os=mingw32 --enable-gpl --enable-version3 --disable-debug --disable-w32threads --enable-pthreads --enable-iconv --enable-libxml2 --enable-zlib --enable-libfreetype --enable-libfribidi --enable-gmp --enable-lzma --enable-fontconfig --enable-libvorbis --enable-opencl --disable-libpulse --enable-libvmaf --disable-libxcb --disable-xlib --enable-amf --enable-libaom --enable-libaribb24 --enable-avisynth --enable-chromaprint --enable-libdav1d --enable-libdavs2 --disable-libfdk-aac --enable-ffnvcodec --enable-cuda-llvm --enable-frei0r --enable-libgme --enable-libkvazaar --enable-libass --enable-libbluray --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librist --enable-libssh --enable-

In [1]:
from tikapi import TikAPI, ValidationException, ResponseException

# TikAPI client created with an empty API key.
api = TikAPI("")


# Fetch the public info of one hard-coded video and print the JSON payload;
# validation and response errors are printed instead of raised.
#for download in to_download:
try:
    response = api.public.video(id="6880508779109125378")
    json: dict = response.json()
    print(json)
    #response.save_video(json['itemInfo']['itemStruct']['video']['downloadAddr'], 'video.mp4')
except ValidationException as err:
    print(err, err.field)
except ResponseException as err:
    print(err, err.response.status_code)

Subscription has expired. 403


In [14]:
import os


# Rename files in the_videos/ that are named after a video's description so
# that they are named after the video's ID instead.
files = os.listdir("the_videos")

with open("label_data.csv", "r", encoding="UTF-8") as f:
    # The lines were read into `lines`, but the following code used an
    # undefined `data`; work on what was actually read.
    lines = f.readlines()

# Map ID (first column) -> text the file is expected to be named after
# (second column, or "TikTok video #<id>" when that column is empty).
# NOTE(review): the second column is assumed to hold the description here —
# confirm against the CSV layout in use.
descs = {}
for domo in lines[1:]:  # skip the header line
    domo = domo.rstrip("\n").split("~")
    if len(domo) < 2:
        continue  # blank or malformed line
    desc = domo[1] if domo[1] else f"TikTok video #{domo[0]}"

    descs[domo[0]] = desc

for file in files:
    # Normalise the file name once instead of once per description.
    normalized_name = file.replace(".WAV", "").replace("'", "").lower().strip()
    for k, v in descs.items():
        if normalized_name == v.strip().lower():
            os.rename(f"the_videos/{file}", f"the_videos/{k}.WAV")
            # The file no longer exists under its old name; stop here so a
            # second matching description cannot trigger another rename.
            break
    
    

Down here is me transcribing the pre-downloaded videos and saving the results. It won't be needed now, but I don't want to waste it either.

In [75]:
# One entry per file in the_videos/, keyed by the file name up to its first
# "." and initialised to an empty transcript.
data = dict.fromkeys(
    (file_name.split(".")[0] for file_name in os.listdir("the_videos")),
    "",
)

print(data)

{'6746476868196715781': '', '6779602446974078213': '', '6780835401406958853': '', '6787606017942260998': '', '6790406969837489413': '', '6794437525353123074': '', '6794872180212534534': '', '6800429398639021317': '', '6804835814162435329': '', '6812424785067248902': '', '6813390055818759430': '', '6819374495724227846': '', '6821944087726296326': '', '6824907801429609734': '', '6831106353516367109': '', '6834264071320194309': '', '6835738604531191046': '', '6837225180876524806': '', '6837530628792077574': '', '6838217251586444549': '', '6840431053056789762': '', '6843822091402267910': '', '6844695381167607046': '', '6858750920516193541': '', '6862901727251811590': '', '6864366630923717893': '', '6866927712745901318': '', '6868752143575256325': '', '6869149203290000645': '', '6869605206066351365': '', '6870237697009569030': '', '6870292446677404934': '', '6871383968294489346': '', '6875342937002085633': '', '6875394675985992966': '', '6875405441472498949': '', '6875872124968439046': '', 

In [62]:
# Shallow snapshot of `data`; a following cell copies its entries back into `data`.
gdata = data.copy()

In [76]:
# Copy every entry saved in `gdata` back into `data`.
data.update(gdata)

In [90]:
# Keys of `data` that already hold a non-empty transcript.
dupe = [key for key, text in data.items() if text]

In [91]:
# Number of entries in `data` not listed in `dupe`, i.e. still without a transcript.
len(data) - len(dupe)

11

In [92]:
import openai
import threading


# My process for adding transcriptions to the data
def transcribe_large(audio_file):
    """Transcribe an open WAV file with Deepgram and return the transcript text.

    Args:
        audio_file: The audio, passed to Deepgram as the ``buffer`` of a
            source with mimetype ``audio/wav``.

    Returns:
        The transcript of the first alternative of the first channel.
    """
    request_options = {
        "punctuate": True,
        "model": 'general',
        "tier": 'enhanced',
    }
    payload = {"buffer": audio_file, "mimetype": 'audio/wav'}

    result = dg.transcription.sync_prerecorded(payload, options=request_options)
    first_alternative = result['results']['channels'][0]['alternatives'][0]
    return first_alternative['transcript']

# Guards the "not claimed yet? then claim it" step on `dupe`, which main()
# performs from several threads at once.
_claim_lock = threading.Lock()


def main():
    """Worker: transcribe every entry of ``data`` that has no transcript yet.

    Each key is claimed by appending it to ``dupe`` before any work is done,
    so the worker threads running this function split the files between
    them. Files larger than 25000000 bytes go to ``transcribe_large``;
    smaller ones go to OpenAI's ``whisper-1``. A failed Whisper call is
    printed and the entry is left empty (it stays in ``dupe``).
    """
    for k, v in data.items():
        # Check and claim under one lock: without it two threads can both
        # see `k` as unclaimed and transcribe (and pay for) the same file.
        with _claim_lock:
            if v or k in dupe:
                continue
            dupe.append(k)

        wav_path = f"the_videos/{k}.wav"
        with open(wav_path, "rb") as audio_file:
            if os.path.getsize(wav_path) > 25000000:
                print("Large file detected:", k)
                transcript = transcribe_large(audio_file)
            else:
                try:
                    transcript = openai.Audio.transcribe("whisper-1", audio_file)['text']
                except Exception as error:
                    print(error)
                    continue

        # Store ' ' rather than an empty value so the entry counts as done.
        data[k] = transcript or ' '
        print("Transcribed: %s, %s left" % (k, len(data) - len(dupe)))

# Five workers run main() side by side to make it faster: start them all,
# then wait for each one to finish.
workers = [threading.Thread(target=main) for _ in range(5)]
for worker in workers:
    worker.start()
for worker in workers:
    worker.join()


Large file: 6993052633963531526
Large file: 7004427997189065990
Large file: 7081667401523367214
Large file: 7171529138329718058
Large file: 7216898268062633259
Transcribed: 7171529138329718058, 6 left
Large file: 7218868017982229765
Transcribed: 7216898268062633259, 5 left
Large file: 7219710115157413125
Transcribed: 7004427997189065990, 4 left
Large file: 7222073255807454469
Transcribed: 6993052633963531526, 3 left
Large file: 7224559608373841178
Transcribed: 7081667401523367214, 2 left
Large file: 7225787531923623195
Transcribed: 7222073255807454469, 1 left
Large file: 7227886983039814939
Transcribed: 7225787531923623195, 0 left
Transcribed: 7224559608373841178, 0 left
Transcribed: 7218868017982229765, 0 left
Transcribed: 7227886983039814939, 0 left
Transcribed: 7219710115157413125, 0 left
