In [None]:
from util import get_engine, get_upload_url
import pandas as pd
from selenium.webdriver import Chrome, ChromeOptions
from pyvirtualdisplay import Display
import os
import requests
from shutil import unpack_archive, rmtree
import json
from datetime import datetime
from tqdm.auto import tqdm
from time import sleep

In [None]:
# Load the `crawls` table into a DataFrame; later cells read its
# `Filename` and `Category` columns.
crawls = pd.read_sql(sql='crawls', con=get_engine())

In [None]:
def download_profile(profile_name, bp='profiles'):
    """Download a profile archive from the upload server and unpack it locally.

    Args:
        profile_name: File name of the archive on the server. Its extension
            is stripped to name the extraction directory.
        bp: Base directory that receives both the downloaded archive and the
            extracted profile directory. Created if missing.

    Returns:
        Path of the directory the archive was extracted into
        (``<bp>/<profile_name without extension>``).

    Raises:
        requests.HTTPError: If the server answers with a non-success status.
    """
    # determine output directory
    name, _ext = os.path.splitext(profile_name)
    profile_path = os.path.join(bp, name)

    # drop any previous extraction so the profile always starts clean
    if os.path.exists(profile_path):
        rmtree(profile_path)

    # make sure the base directory exists
    os.makedirs(bp, exist_ok=True)

    # download the profile archive
    url = f'{get_upload_url()}/{profile_name}'
    out = os.path.join(bp, profile_name)
    r = requests.get(url)
    # Fail here on 4xx/5xx instead of saving an error page to disk and
    # letting unpack_archive choke on it later.
    r.raise_for_status()
    with open(out, 'wb') as f:
        f.write(r.content)

    # extract the archive (format is inferred from the file name)
    unpack_archive(out, profile_path)

    # return path to profile
    return profile_path

def load_driver(profile_name=None, headless=False):
    """Create a Chrome WebDriver, optionally backed by a downloaded profile.

    Args:
        profile_name: Archive name passed to ``download_profile``; the
            extracted directory is used as Chrome's ``--user-data-dir``.
            When ``None`` no profile is downloaded.
        headless: When true, Chrome is started with ``--headless``.

    Returns:
        A ``Chrome`` driver with a 30 second implicit wait configured.
    """
    options = ChromeOptions()

    # flags that are always applied, followed by the optional ones
    chrome_flags = ['--window-size=1920,1080', '--no-sandbox']
    if headless:
        chrome_flags.append('--headless')
    if profile_name is not None:
        # download profile from server and point Chrome at it
        chrome_flags.append(f'--user-data-dir={download_profile(profile_name)}')

    for flag in chrome_flags:
        options.add_argument(flag)

    driver = Chrome(options=options)
    driver.implicitly_wait(30)
    return driver

## Interest Segments

In [None]:
def get_interest_categories(profile_name):
    """Fetch the interest categories the BlueKai registry reports for a profile.

    Starts headless Chrome with the given profile, opens
    ``https://registry.bluekai.com/get_categories`` and parses the text of the
    first ``<pre>`` element on the page as JSON.

    Args:
        profile_name: Profile archive name, forwarded to ``load_driver``.

    Returns:
        The decoded JSON payload. Its exact structure is not visible here;
        the caller feeds it into a one-column DataFrame.

    Raises:
        IndexError: If the page contains no ``<pre>`` element.
        json.JSONDecodeError: If the element's text is not valid JSON.
    """
    # Local import keeps this cell self-contained; the locator-based
    # find_elements() works on Selenium 3 and 4, whereas
    # find_elements_by_tag_name() was removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    driver = load_driver(profile_name, headless=True)
    try:
        driver.get('https://registry.bluekai.com/get_categories')
        # find tag containing response
        body = driver.find_elements(By.TAG_NAME, 'pre')[0]
        return json.loads(body.text)
    finally:
        # quit() (not close()) ends the whole browser session, and the
        # finally guarantees that happens even when lookup or parsing fails.
        driver.quit()

In [None]:
def save_interest_categories(profile_name):
    """Fetch one profile's interest categories and append them to the database.

    Args:
        profile_name: Profile archive name (the crawl's ``Filename``).

    Returns:
        Number of entries returned by ``get_interest_categories``.
    """
    ic = get_interest_categories(profile_name)
    # write to database
    df = pd.DataFrame(ic, columns=['Segment'])
    df['Profile'] = profile_name
    df['Time'] = datetime.now()
    df.to_sql('interest-categories', con=get_engine(), index=False, if_exists='append')
    return len(ic)


# First pass: remember every profile that failed so it can be retried below.
fails = []
for crawl in tqdm(crawls.itertuples(), total=crawls.shape[0]):
    try:
        n_segments = save_interest_categories(crawl.Filename)
        print(crawl.Category, n_segments)
    except Exception as exc:
        # Report the exception itself: the fetched payload may not exist
        # yet (or may belong to the previous crawl) when the failure occurs.
        fails.append(crawl.Filename)
        print("Error while processing", crawl.Filename, repr(exc))

# Second pass: retry the failures once. Rows are attributed to the profile
# being retried; a second failure is allowed to propagate so it gets noticed.
for profile_name in tqdm(fails):
    save_interest_categories(profile_name)

## Bid Values

In [None]:
def getHBInfo(profile_name, site=None):
    """Collect header-bidding state from a site while browsing with a profile.

    Opens ``https://<site>`` in non-headless Chrome, reads the first entry of
    the page's ``_pbjsGlobals`` array as the name of a global object
    (presumably the Prebid.js instance - confirm), and calls every
    function-valued property of that object with no arguments. Each return
    value that can be obtained is stored JSON-encoded under the method's name.

    Args:
        profile_name: Profile archive name; loaded into the browser and
            recorded in the ``Profile`` field of the result.
        site: Host (without scheme) to visit. Required despite the default.

    Returns:
        dict with ``Profile``, ``HB_URL`` and one key per method that
        returned successfully.

    Raises:
        ValueError: If ``site`` is ``None``.
    """
    if site is None:
        # Otherwise the browser would be pointed at the literal "https://None".
        raise ValueError('site must be provided')

    row = {
        'Profile': profile_name,
        'HB_URL': site
    }

    # Browse with the requested profile. Passing None here would measure a
    # blank browser while still labelling the row with `profile_name`.
    driver = load_driver(profile_name, headless=False)
    try:
        driver.get('https://%s' % site)

        # find pbjs var
        pbjsGlobal = driver.execute_script('return _pbjsGlobals')[0]
        # find methods for pbjs
        methods = driver.execute_script('return Object.keys(%s).filter(x => typeof(%s[x]) == "function")' % (pbjsGlobal, pbjsGlobal))
        # store each methods response
        for method in methods:
            try:
                row[method] = json.dumps(driver.execute_script('return %s.%s()' % (pbjsGlobal, method)))
            except Exception:
                # Best effort: methods that throw when called without
                # arguments, or return non-serialisable values, are skipped.
                continue
    finally:
        # Always end the browser session, even if the page has no
        # _pbjsGlobals or a script call fails.
        driver.quit()

    return row

In [None]:
# Header-bidding sites joined with their SimilarWeb category
# (one row per distinct URL/Category pair).
hb_sites = pd.read_sql(
    'SELECT DISTINCT sw.URL, Category '
    'FROM similarweb sw '
    'JOIN `header-bidding-sites` hb ON sw.URL = hb.URL',
    con=get_engine(),
)

In [None]:
# For every non-adult crawl, visit the header-bidding sites (those in the
# crawl's own category first) and store what each site's bidder object returns.
for crawl in tqdm(crawls.itertuples(), total=crawls.shape[0]):
    if crawl.Category == 'Adult':
        continue
    # for each crawl, check its category
    crawl_category = crawl.Category

    # organize sites by category: matching category first, then the rest
    same_category = hb_sites['Category'] == crawl_category
    sites = (
        hb_sites.loc[same_category, 'URL'].to_list()
        + hb_sites.loc[~same_category, 'URL'].to_list()
    )
    # print the count rather than the whole list to keep cell output readable
    print(crawl_category, len(sites))

    hb_responses = []
    for site in sites:
        try:
            hb_responses.append(getHBInfo(crawl.Filename, site))
        except Exception as exc:
            # One unreachable site should not discard the responses
            # already collected for this crawl.
            print("Error while processing", crawl.Filename, site, repr(exc))

    # DataFrame() only builds the frame; to_sql() is what takes the table
    # name and connection and appends the rows.
    if hb_responses:
        pd.DataFrame(hb_responses).to_sql(
            'header-bidding-responses',
            con=get_engine(),
            index=False,
            if_exists='append',
        )

In [None]:
# Preview the visiting order for one category: matching sites first,
# then every other site.
crawl_category = 'Vehicles'
in_category = hb_sites['Category'] == crawl_category
hb_sites[in_category]['URL'].to_list() + hb_sites[~in_category]['URL'].to_list()