# Data Collection

In [1]:
import logging

from dionysus.nodes.project_logging import default_logging

# Call the project's logging helper (presumably installs handlers/levels —
# see dionysus.nodes.project_logging to confirm).
default_logging()

# Logger used by the cells below; named after the current module (__name__).
logger = logging.getLogger(__name__)


In [44]:
import numpy as np

def int_from_discrete_gaussian_dist(mean: float = 3.0, std: float = 2.0) -> int:
    """Draw a non-negative integer from a rounded normal distribution.

    One sample is taken from a normal distribution via NumPy's global random
    state, rounded to the nearest integer, and its absolute value is returned.

    Args:
        mean: Mean of the underlying normal distribution.
        std: Standard deviation of the underlying normal distribution
            (NumPy requires it to be non-negative).

    Returns:
        A built-in ``int`` greater than or equal to zero.
    """
    sample = np.random.normal(mean, std, 1)[0]
    # ``.round()`` returns a numpy.float64; convert explicitly so the result
    # really is the ``int`` promised by the annotation.
    return int(abs(sample.round()))

In [34]:
from playwright.async_api import async_playwright, Browser, Page, BrowserContext

async def get_browser_params():
    """Collect browser parameters for an emulated 'iPhone 13 Pro Max'.

    Starts a non-headless WebKit browser with the device descriptor applied,
    opens the TikTok mobile site, logs the page title at DEBUG level and
    closes the context and browser again.

    Returns:
        A dict with the single key ``'user_agent'`` holding the user agent
        string taken from the Playwright device descriptor.
    """
    async with async_playwright() as playwright:
        iphone = playwright.devices['iPhone 13 Pro Max']
        # Only the user agent of the descriptor is handed back to the caller.
        params = {'user_agent': iphone['user_agent']}

        webkit_browser: Browser = await playwright.webkit.launch(headless=False)
        mobile_context: BrowserContext = await webkit_browser.new_context(**iphone)
        tab: Page = await mobile_context.new_page()

        await tab.goto('https://m.tiktok.com/')

        page_title = await tab.title()
        logger.debug(f'Opened a page titled {page_title}')

        # Tear down in reverse order of creation.
        await mobile_context.close()
        await webkit_browser.close()

        return params

# Top-level ``await`` relies on the notebook kernel's support for awaiting
# coroutines directly in a cell.
browser_params = await get_browser_params()
# Kept as a separate name; it is sent as the "user-agent" request header below.
user_agent = browser_params['user_agent']

2022-11-06 16:16:30,881 __main__ - DEBUG:Opened a page titled <bound method Page.title of <Page url='https://www.tiktok.com/'>>


In [39]:
import os
import time
from pprint import pformat
from urllib.parse import urlencode

import requests
from playwright.async_api import async_playwright

#
# Define variables
#

root_url = 'https://m.tiktok.com/'

# Allow the token to be supplied through the TIKTOK_MS_TOKEN environment
# variable so it does not have to be edited in the notebook.
# NOTE(review): the literal fallback is kept only so existing runs behave as
# before; a credential should not live in source control — remove it once the
# environment variable is in use.
ms_token = os.environ.get(
    'TIKTOK_MS_TOKEN',
    'tams4V3e-ehAaA1tnMqlcfg0bL_XKcI3adjCKcwzR4v6AnovTRpMWz5bSWqhPiZqxeBAmIa3nhnboOOOh-AkuVqGB9C-y97l5jqPe_Rdxgxzc-m0eVP0zp9pA0QCwm03wSYU2-4=',
)

# Hashtag ("challenge") to look up.
challenge_name = 'zoukbrasileiro'

hashtag_query = {'challengeName': challenge_name, 'msToken': ms_token}

# Paging of the video list stops once the cursor reaches this value.
n_video = 500

#
# Form the hashtag URL
#

# urlencode percent-escapes the query values (e.g. the trailing '=' of the token).
hashtag_sub_url = 'api/challenge/detail/?{}'.format(urlencode(hashtag_query))

hashtag_url = root_url + hashtag_sub_url


In [37]:
#
# Execute delay
#

# Sleep for a randomly drawn number of seconds before issuing the request.
request_delay = int_from_discrete_gaussian_dist()
time.sleep(request_delay)

#
# Form headers
#

# NOTE(review): "authority", "method", "path" and "scheme" look like HTTP/2
# pseudo-header names sent as ordinary headers — confirm they are required.
headers = {
    "authority": "m.tiktok.com",
    "method": "GET",
    "path": hashtag_url.split("tiktok.com")[1],
    "scheme": "https",
    "accept": "application/json, text/plain, */*",
    "accept-encoding": "gzip",
    "accept-language": "en-US,en;q=0.9",
    "origin": root_url,
    "referer": root_url,
    "user-agent": user_agent,
}

#
# Execute the request for the hashtag
#

# Without a timeout, requests waits indefinitely on a stalled connection.
request_timeout = 30  # seconds

r = requests.get(
    hashtag_url,
    headers=headers,
    timeout=request_timeout,
)

# Log before validating so the request/response details are available even
# when one of the checks below raises.
logger.debug(f'The request to path {hashtag_url} with the following headers:\n'
             f'{pformat(headers)}\nreturns the following content:\n'
             f'{pformat(r.text)}')

# Fail with a clear HTTP error on 4xx/5xx instead of a JSON decoding error.
r.raise_for_status()

hashtag_response = r.json()

if hashtag_response['status_code'] != 0:
    raise ValueError(f"JSON query returned non-zero status code:\n{hashtag_response['status_code']}")

# Identifier used as "challengeID" when listing the hashtag's videos.
hashtag_id = hashtag_response['challengeInfo']['challenge']['id']

2022-11-06 21:30:25,541 urllib3.connectionpool - DEBUG:Starting new HTTPS connection (1): m.tiktok.com:443
2022-11-06 21:30:25,818 urllib3.connectionpool - DEBUG:https://m.tiktok.com:443 "GET /api/challenge/detail/?challengeName=zoukbrasileiro&msToken=tams4V3e-ehAaA1tnMqlcfg0bL_XKcI3adjCKcwzR4v6AnovTRpMWz5bSWqhPiZqxeBAmIa3nhnboOOOh-AkuVqGB9C-y97l5jqPe_Rdxgxzc-m0eVP0zp9pA0QCwm03wSYU2-4%3D HTTP/1.1" 200 299
2022-11-06 21:30:25,830 __main__ - DEBUG:The request to path https://m.tiktok.com/api/challenge/detail/?challengeName=zoukbrasileiro&msToken=tams4V3e-ehAaA1tnMqlcfg0bL_XKcI3adjCKcwzR4v6AnovTRpMWz5bSWqhPiZqxeBAmIa3nhnboOOOh-AkuVqGB9C-y97l5jqPe_Rdxgxzc-m0eVP0zp9pA0QCwm03wSYU2-4%3D with the following headers:
{'accept': 'application/json, text/plain, */*',
 'accept-encoding': 'gzip',
 'accept-language': 'en-US,en;q=0.9',
 'authority': 'm.tiktok.com',
 'method': 'GET',
 'origin': 'https://m.tiktok.com/',
 'path': '/api/challenge/detail/?challengeName=zoukbrasileiro&msToken=tams4V3e-ehAaA1

In [45]:
# Accumulates the raw item dicts of every fetched page.
list_video_response = []

#
# Form video URLs
#

# Without a timeout, requests waits indefinitely on a stalled connection.
video_request_timeout = 30  # seconds

cursor = 0
while cursor < n_video:
    #
    # Execute delay
    #

    # Sleep for a randomly drawn number of seconds between page requests.
    request_delay = int_from_discrete_gaussian_dist()
    time.sleep(request_delay)

    query = {
        "aid": 1988,
        "count": 30,
        "challengeID": hashtag_id,
        "cursor": cursor,
        "msToken": ms_token,
    }
    video_sub_url = "api/challenge/item_list/?{}".format(urlencode(query))
    video_url = root_url + video_sub_url

    r = requests.get(
        url=video_url,
        headers=headers,
        timeout=video_request_timeout,
    )
    # Fail with a clear HTTP error on 4xx/5xx instead of a JSON decoding error.
    r.raise_for_status()
    videos_response = r.json()

    # A page may come without 'itemList'; treat that as an empty page
    # instead of raising KeyError.
    page_items = videos_response.get('itemList') or []
    list_video_response.extend(page_items)

    next_cursor = int(videos_response.get('cursor', cursor))

    # Stop as soon as the API reports no further pages OR the page is empty;
    # requiring both conditions could keep the loop requesting forever.
    if not videos_response.get('hasMore') or not page_items:
        cursor = next_cursor
        logger.info(f'Reached the end of video list when cursor is at position {cursor}')
        break

    # Guard against an endless loop if the server does not advance the cursor.
    if next_cursor <= cursor:
        logger.warning(f'Cursor did not advance past position {cursor}; stopping pagination')
        break

    cursor = next_cursor

2022-11-06 21:34:19,921 urllib3.connectionpool - DEBUG:Starting new HTTPS connection (1): m.tiktok.com:443
2022-11-06 21:34:20,575 urllib3.connectionpool - DEBUG:https://m.tiktok.com:443 "GET /api/challenge/item_list/?aid=1988&count=30&challengeID=74915315&cursor=0&msToken=tams4V3e-ehAaA1tnMqlcfg0bL_XKcI3adjCKcwzR4v6AnovTRpMWz5bSWqhPiZqxeBAmIa3nhnboOOOh-AkuVqGB9C-y97l5jqPe_Rdxgxzc-m0eVP0zp9pA0QCwm03wSYU2-4%3D HTTP/1.1" 200 41266
2022-11-06 21:34:24,586 urllib3.connectionpool - DEBUG:Starting new HTTPS connection (1): m.tiktok.com:443
2022-11-06 21:34:25,100 urllib3.connectionpool - DEBUG:https://m.tiktok.com:443 "GET /api/challenge/item_list/?aid=1988&count=30&challengeID=74915315&cursor=30&msToken=tams4V3e-ehAaA1tnMqlcfg0bL_XKcI3adjCKcwzR4v6AnovTRpMWz5bSWqhPiZqxeBAmIa3nhnboOOOh-AkuVqGB9C-y97l5jqPe_Rdxgxzc-m0eVP0zp9pA0QCwm03wSYU2-4%3D HTTP/1.1" 200 37406
2022-11-06 21:34:28,110 urllib3.connectionpool - DEBUG:Starting new HTTPS connection (1): m.tiktok.com:443
2022-11-06 21:34:28,660 ur

KeyboardInterrupt: 