In [None]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from dateutil import parser
import time
import json
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from IPython.display import clear_output
from dotenv import dotenv_values
import selenium.webdriver as webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [None]:
# --- Configuration ----------------------------------------------------------
main_url = 'https://www.behance.net/'
search_url = main_url + 'search/projects?search=financial+dashboard'

# Parse the .env file once and read both credentials from the same mapping
# (previously the file was parsed separately for each key).
env = dotenv_values('.env')
username = env['BEHANCE_USERNAME']
password = env['BEHANCE_PASSWORD']

opts = ChromeOptions()
opts.add_argument("--window-size=1800,1000")

# --- Sign-in, part 1: open the search page and submit the e-mail address -----
browser = webdriver.Chrome(options=opts)
browser.get(search_url)
# Each wait blocks until the element is clickable (or raises after the timeout).
WebDriverWait(browser, 20).until(EC.element_to_be_clickable((By.CLASS_NAME, 'e2e-PrimaryNav-Signin'))).click()
WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.ID, 'EmailPage-EmailField'))).send_keys(username)
browser.find_element(By.XPATH, '//*[@data-id="EmailPage-ContinueButton"]').click()
# NOTE(review): presumably this button requests the one-time code that the
# next cell expects to be entered by hand — confirm.
WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[@data-id="Page-PrimaryButton"]'))).click()

In [None]:
# Manual step before running this cell: enter the OTC (presumably the one-time
# code) from the e-mail in the browser window.
# The wait allows up to 50 seconds for the password field to become clickable.
password_field = WebDriverWait(browser, 50).until(
    EC.element_to_be_clickable((By.ID, 'PasswordPage-PasswordField'))
)
password_field.send_keys(password)
continue_button = browser.find_element(By.XPATH, '//*[@data-id="PasswordPage-ContinueButton"]')
continue_button.click()
# Manual step afterwards: click the cookies prompt in the browser window.

In [None]:
# Open the sort dropdown and pick the "Most Recent" entry.
browser.find_element(By.CLASS_NAME, 'BasicDropdown-dropdownButton-i1Z').click()
# Wait until the menu entries exist in the DOM: querying them immediately after
# the click can return an empty list, which made the lookup below fail.
sort_choice = WebDriverWait(browser, 10).until(
    EC.presence_of_all_elements_located((By.CLASS_NAME, 'SortMenu-typeItem-WWW'))
)
text = [s.text.strip() for s in sort_choice]
if 'Most Recent' not in text:
    # Same exception type as list.index would raise, but with the options listed.
    raise ValueError(f"'Most Recent' not found in sort menu; available options: {text}")
sort_choice[text.index('Most Recent')].click()

In [None]:
# Load the search results page again in the same browser session.
# NOTE(review): a reload may discard the sort order selected in the previous
# cell — confirm that the order is still applied after this call.
browser.get(search_url)

In [None]:
# Collect the project link of every element carrying the
# 'Projects-firstSectionCover-pNF' class, with the query string cut off.
pages = browser.find_elements(By.CLASS_NAME, 'Projects-firstSectionCover-pNF')
hrefs = []
for page in pages:
    link = page.find_element(By.CLASS_NAME, 'js-project-link')
    url = link.get_attribute('href')
    hrefs.append(url.split('?')[0])

In [None]:
# Collect the project link of every currently loaded grid item
# ('ContentGrid-gridItem-VXS'), with the query string cut off.
pages_ = browser.find_elements(By.CLASS_NAME, 'ContentGrid-gridItem-VXS')
hrefs_ = []
for page in pages_:
    anchor = page.find_element(By.CLASS_NAME, 'js-project-link')
    target = anchor.get_attribute('href')
    hrefs_.append(target.split('?')[0])

In [None]:
# Scroll the results page until the end-of-results marker shows up, collecting
# every distinct project URL from the result grid into `hrefs_`.
SCROLL_PAUSE = 0.5      # seconds to wait after each scroll so new cards can load
MAX_IDLE_SCROLLS = 60   # stop after this many consecutive scrolls with no new cards

# The total only scales the progress bar; it does not limit the collection.
pbar = tqdm(total = 10000)
hrefs_ = []
seen_hrefs = set()      # O(1) duplicate check instead of scanning the list
processed = 0           # number of grid cards already inspected
idle_scrolls = 0

while True:
    # Look for the marker *before* reading the grid: cards that appear together
    # with the marker are then still picked up in this same pass.
    end = browser.find_elements(By.CLASS_NAME, 'Search-resultsEnd-UqF')
    pages_ = browser.find_elements(By.CLASS_NAME, 'ContentGrid-gridItem-VXS')
    new_cards = pages_[processed:]
    for card in new_cards:
        h = card.find_element(By.CLASS_NAME, 'js-project-link').get_attribute('href').split('?')[0]
        if h not in seen_hrefs:
            seen_hrefs.add(h)
            hrefs_.append(h)
            pbar.update(1)
    processed = len(pages_)

    if end:
        break

    # Guard against spinning forever when the marker never appears (for
    # example if its class name changes) and nothing new is being loaded.
    idle_scrolls = 0 if new_cards else idle_scrolls + 1
    if idle_scrolls >= MAX_IDLE_SCROLLS:
        print(f'stopped: no new results after {MAX_IDLE_SCROLLS} scrolls')
        break

    browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    clear_output(wait=True)
    time.sleep(SCROLL_PAUSE)
pbar.close()

In [None]:
def str2int(x):
    """Convert an abbreviated counter string to an integer.

    Accepts plain integers (``'12'``, ``'1,234'``) and numbers with a
    case-insensitive ``k`` / ``m`` / ``b`` suffix for thousand / million /
    billion (``'1.5k'`` -> 1500). Surrounding whitespace and thousands
    separators (commas) are ignored.

    Parameters
    ----------
    x : str
        Counter text.

    Returns
    -------
    int

    Raises
    ------
    ValueError
        If the text is empty, has an unknown suffix, or its numeric part
        cannot be parsed.
    """
    multipliers = {'k': 1e3, 'm': 1e6, 'b': 1e9}
    cleaned = x.strip().lower().replace(',', '')
    if not cleaned:
        # Previously an empty string crashed with IndexError on x[-1].
        raise ValueError('unknown format')
    suffix = cleaned[-1]
    if suffix.isnumeric():
        return int(cleaned)
    if suffix in multipliers:
        # round() rather than int(): the float product can land just below
        # the intended integer and truncation would then be off by one.
        return round(multipliers[suffix] * float(cleaned[:-1]))
    raise ValueError('unknown format')
    
def _image_sources():
    """Return the ``src`` of every image container on the page loaded in ``browser``."""
    sources = []
    for element in browser.find_elements(By.CLASS_NAME, 'ImageElement-root-kir'):
        img = BeautifulSoup(element.get_attribute('outerHTML'), 'lxml').find('img')
        # Skip containers without an <img src=...> instead of letting a single
        # one raise and discard the whole project record.
        if img is not None and img.get('src'):
            sources.append(img['src'])
    return sources


def get_stats(hr):
    """Visit each project URL and scrape its title, owner, images and counters.

    Uses the module-level Selenium driver ``browser``.

    Parameters
    ----------
    hr : iterable of str
        Project page URLs.

    Returns
    -------
    list of dict
        One entry per URL, in input order. A successful entry has the keys
        ``href``, ``projectTitle``, ``ownerName``, ``imageSource`` (list of
        image URLs), ``published`` and those of ``likes`` / ``views`` /
        ``comments`` that could be paired with a counter. An entry whose
        scrape failed is an empty dict.
    """
    project_stats = []
    for h in tqdm(hr):
        try:
            browser.get(h)
            # Fixed pause before querying the page (presumably to let it
            # finish rendering).
            time.sleep(1)
            stats = {
                'href': h,
                'projectTitle': browser.find_element(By.CLASS_NAME, 'Project-title-Q6Q').text,
                # Only the first text line of the owner block is kept.
                'ownerName': browser.find_element(By.CLASS_NAME, 'Project-ownerItems-qza').text.split('\n')[0],
                'imageSource': _image_sources(),
            }
            info = BeautifulSoup(
                browser.find_element(
                    By.CLASS_NAME, 'e2e-Project-infoSection'
                ).get_attribute('outerHTML'), 'lxml'
            )
            # Counters are paired with the <span> texts in document order.
            # NOTE(review): assumes the spans are likes, views, comments in
            # that order — confirm against the page markup.
            counters = [str2int(span.text) for span in info.find_all('span')]
            stats.update(zip(['likes', 'views', 'comments'], counters))
            stats['published'] = info.find('time').text
        except Exception as exc:
            # Exception rather than a bare except, so Ctrl-C / kernel interrupt
            # still stops a long scrape; the failure cause is reported.
            stats = {}
            print(f'fail {h} ({type(exc).__name__})')

        project_stats.append(stats)
    return project_stats

In [None]:
def _image_file_name(href, source):
    """Build the local path ``behance/<project id>_<image name><ext>`` for an image URL."""
    project_id = href.split('/')[-2]
    # Drop any query string so it cannot end up in the file name.
    base = source.split('/')[-1].split('?')[0]
    # splitext copes with extensions of any length (.jpeg, .webp); slicing off
    # a fixed three characters mangled those names.
    stem, ext = os.path.splitext(base)
    return 'behance/' + project_id + '_' + stem.replace('.', '_') + ext


def load_images(ps):
    """Download every image listed in the project records into ``behance/``.

    Parameters
    ----------
    ps : iterable of dict
        Project records as produced by ``get_stats``. Records without an
        ``imageSource`` key (the empty dicts of failed scrapes) are skipped.

    Returns
    -------
    None
        Images are written to disk. Downloads that failed are printed, once
        when they happen and again in a summary at the end.
    """
    os.makedirs('behance', exist_ok=True)
    failures = []
    for stats in tqdm(ps):
        for source in stats.get('imageSource', []):
            problem = None
            try:
                # A timeout keeps one stalled connection from hanging the run.
                response = requests.get(source, timeout=30)
            except requests.RequestException as exc:
                problem = type(exc).__name__
            else:
                if response.status_code == 200:
                    with open(_image_file_name(stats['href'], source), "wb") as f:
                        f.write(response.content)
                else:
                    problem = response.status_code
            if problem is not None:
                message = f'{stats["href"]}: {problem}'
                failures.append(message)
                print(message)
            clear_output(wait=True)
            time.sleep(1)
    # clear_output may already have erased the per-image messages, so they are
    # repeated here once the loop is done.
    for message in failures:
        print(message)

In [None]:
# Merge the two URL lists into the final list of pages to scrape.
# NOTE(review): the last two entries of `hrefs` are dropped; the reason is not
# visible in this notebook — confirm it is still intended.
# dict.fromkeys removes repeated URLs while keeping first-seen order, so a
# project present in both lists is scraped once and re-running this cell does
# not pile up further copies of `hrefs_`.
hrefs = list(dict.fromkeys(hrefs[:-2] + hrefs_))

In [None]:
# Scrape every collected project page. Each URL costs one page load plus a
# one-second pause inside get_stats; pages that fail yield an empty dict.
behance_stats = get_stats(hrefs)

In [None]:
# Persist the scraped statistics as JSON. The text is serialized before the
# file is opened, so a serialization error cannot leave a truncated file.
jo = json.dumps(behance_stats)
with open('bs.json', mode='w') as stats_file:
    stats_file.write(jo)

In [None]:
# Read the saved project statistics back from disk.
with open('bs.json', 'r') as stats_file:
    bs = json.loads(stats_file.read())

In [None]:
# Build two parallel lists: the image URLs and the local path for each one,
# 'behance/<project id>_<image name><ext>'.
image_names = []
sources = []
for stats in tqdm(bs):
    # Records of failed scrapes are empty dicts; treat them as having no images
    # instead of raising KeyError.
    for source in stats.get('imageSource', []):
        project_id = stats['href'].split('/')[-2]
        # Drop any query string so it cannot end up in the file name.
        base = source.split('/')[-1].split('?')[0]
        # splitext copes with extensions of any length (.jpeg, .webp); slicing
        # off a fixed three characters mangled those names.
        stem, ext = os.path.splitext(base)
        image_names.append('behance/' + '_'.join([project_id, stem.replace('.', '_')]) + ext)
        sources.append(source)

In [None]:
# Download every image in `sources` to the matching path in `image_names`.
i0 = 0    # index to start from; raise it to resume after an interrupted run
bad = []  # indices whose download failed (network error or unexpected status)

# Make sure every target directory exists before the first write.
for directory in {os.path.dirname(name) for name in image_names}:
    if directory:
        os.makedirs(directory, exist_ok=True)

for i in tqdm(range(i0, len(sources))):
    time.sleep(0.5)
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(sources[i], timeout=30)
    except requests.RequestException as exc:
        bad.append(i)
        print(i, type(exc).__name__)
        continue
    if response.status_code == 200:
        with open(image_names[i], "wb") as f:
            f.write(response.content)
    elif response.status_code == 400:
        # NOTE(review): a 400 stops the whole run; presumably it signals that
        # the server is rejecting requests — confirm.
        print(f'error at index {i}; set i0 = {i} to resume')
        break
    else:
        bad.append(i)
        print(i)