In [105]:
!pip install -q bs4 numpy

In [106]:
def get_market_info(market_file_path):
    with open(market_file_path) as cookie_file:
        def get_valuepair_from_line(line):
            idx = line.find(' ')
            key = line[0:idx].strip()
            value = line[idx:].strip()
            return [key,value]

        pairs = [get_valuepair_from_line(line) for line in cookie_file.readlines()]
        file_info = {key_value[0]: key_value[1] for key_value in pairs}

        print(f"Loaded market with url '{file_info['Url']}' and cookie '{file_info['Cookie']}'")
        return file_info

In [107]:
from requests_tor import RequestsTor
from bs4 import BeautifulSoup

def get_page_list(url, cookie):
    headers = {
        "Cookie": cookie,
    }

    html = rt.get(url, headers=headers).text
    soup = BeautifulSoup(html)
    pagination_elements = soup.find_all('a',{'class': 'pagination_link'})
    if len(pagination_elements) < 1:
        print('No items returned, please check if session is not expired')
        # print(str(soup))
        return []

    href_lastpage = pagination_elements[-1].get('href')
    last_page_nr = int(href_lastpage.split('pg=')[1])

    urls_to_scrape = [f"{url}?category=0&pg={page_nr}" for page_nr in range(1,last_page_nr+1)]

    return urls_to_scrape

def scrape_page(url, cookie):
    headers = {
        "Cookie": cookie,
    }

    html = rt.get(url, headers=headers).text
    soup = BeautifulSoup(html)
    return soup

def get_products_from_page(soup):
    products = soup.find_all('div', {'class':'wLf'})

    return products

def get_profile_info_from_product(product):
    user_name = product.find('div',{'class':'wLfVendor'}).find('a').text.strip().split('    ')[0].strip()
    href = product.find('div',{'class':'wLfVendor'}).find('a').get('href')
    return user_name, href

def scrape_profile_url(url, cookie, username):
    headers = {
        "Cookie": cookie,
    }

    html = rt.get(f"{url}?page=profile&amp;user={username}", headers=headers).text
    soup = BeautifulSoup(html)
    return soup

In [108]:
rt = RequestsTor()
market_info = get_market_info('./marketinfo_vice.txt')

Loaded market with url 'http://52qlucglu6fuaqist2herssakipapig2higaaayu7446n55xw4ylxqid.onion/' and cookie 'loomer323232=ad53a936897cc2f3a8f6156d60dc47f9; vicecityprotect=5edc3bed517dbf9865fb815552c1eb21; prtector25edc3bed517dbf9865fb815552c1eb2160=lol; cookiejar=IZSWBCYULP1FWGPCRHXHTLE07RSRCU; dcap=60C2435026F12898F37ED4B41F9F5371E4B2E1CF624111244A12CDDE31A544070996539749BC8E36CDA63505E1ABE1BB; PHPSESSID=55hvg9oti1ikcod8dji1c19gv0'


In [109]:
urls_to_scrape = get_page_list(market_info['Url'], market_info['Cookie'])

KeyboardInterrupt: 

In [111]:
def get_current_page_nr():
    with open('./vice_products/current_page_number.txt', 'r') as page_file:
        return int(page_file.read().strip())
def set_current_page_nr(page_number):
    with open('./vice_products/current_page_number.txt', 'w') as page_file:
        page_file.write(str(page_number))

In [112]:
import os
def extract_id_from_product(product):
    href = product.find('div',{'class':'wLfLeft'}).find('a').get('href')
    split = href.split('lid=')
    return split[1]

page_nr = get_current_page_nr()

while page_nr <= len(urls_to_scrape):
    url_to_scrape = urls_to_scrape[page_nr]
    print()
    print('Scraping page:', url_to_scrape)

    page = scrape_page(url_to_scrape, market_info['Cookie'])
    products = get_products_from_page(page)

    if len(products) == 0:
        print('There are no products on url:', url_to_scrape, 'maybe the session has expired?')
        print('Please update the cookie in the cookie file before contineouing')
        input()
        market_info = get_market_info('./marketinfo_vice.txt')
        continue
    for product in products:
        product_id = extract_id_from_product(product)
        print('Product with id:', product_id)
        
        file_name_to_write = f'./vice_products/product_{product_id}.html'
        
        if os.path.exists(file_name_to_write):
            print('product with ID', product_id, 'already exists!') # why is this the case?

        with open(file_name_to_write,'w') as product_file:
            html = product.prettify()
            product_file.write(html)

    page_nr += 1
    set_current_page_nr(page_nr)


Scraping page: http://52qlucglu6fuaqist2herssakipapig2higaaayu7446n55xw4ylxqid.onion?category=0&pg=8
Product with id: 1RkYPSlc5yV0A3QW
Product with id: 1JqIiALtwB3xES5f
Product with id: 1T94BMmcZ01PJCLE
Product with id: 1KGmRff8U5VcxSU4
Product with id: 1ULTBTSxOmc2OhwZ
Product with id: 1RMU2pEeHwF0L5ZV
Product with id: 1Jy9k9B3dz9eANBB
Product with id: 1RKjuY5kAhQVU12f
Product with id: 1Mhbd6a6BjNPXas8
Product with id: 1SnSGLAaZDOmblAN
Product with id: 1IvMsR8vE2JWgnWE
Product with id: 1KxRz3b9EyJp3rUY
Product with id: 1V8EBHnKcRMT97ZZ
Product with id: 1RIRr0nKiJ501hdM
Product with id: 1VPxeQtsp4dY2Z82
Product with id: 1URSwKLfVKMfAhlk
Product with id: 1Njptb5wdqQvAluW
Product with id: 1Q1L9HhuV3RezzoL
Product with id: 1OoyWplQK4H5wQBH
Product with id: 1SINcsDztv7GmxI5
Product with id: 1PhkOwPCx1b4lJmq
Product with id: 1RReuoLCihOoDMt6
Product with id: 1JGtX3Z197yAhRao
Product with id: 1NEaG4zucvgv3BuO
Product with id: 1PUVWU4FI39Pudl0
Product with id: 1QkCii7c4qYcb0lL
Product with i

KeyboardInterrupt: Interrupted by user

In [None]:
with open('./vice_products/product_13AXrv622x2iQAVJ.html', 'r') as prod_file:
    html = prod_file.read()
soup = BeautifulSoup(html)



In [114]:
import os
import numpy as np

market_info = get_market_info('./marketinfo_vice.txt')

usernames = []
for product_filename in os.listdir('./vice_products/'):
     if not product_filename.startswith('product_'):
          print('Skipping', product_filename)
          continue

     with open('./vice_products/'+product_filename, 'r') as product_file:
          product = BeautifulSoup(product_file.read())
          username, link = get_profile_info_from_product(product)
          usernames.append(username)

usernames = np.array(usernames)
usernames = np.unique(usernames)

len(usernames)

Loaded market with url 'http://52qlucglu6fuaqist2herssakipapig2higaaayu7446n55xw4ylxqid.onion/' and cookie 'loomer323232=ad53a936897cc2f3a8f6156d60dc47f9; vicecityprotect=5edc3bed517dbf9865fb815552c1eb21; prtector25edc3bed517dbf9865fb815552c1eb2160=lol; cookiejar=IZSWBCYULP1FWGPCRHXHTLE07RSRCU; dcap=60C2435026F12898F37ED4B41F9F5371C1E5343B790440E2B4B951FBE21E10C47B20CC5A9B43F25C1B360FCB821CE66E; PHPSESSID=ejlk4kc7jv0he7p4ehuva84lk3'
Skipping current_page_number.txt


281

In [None]:
for username_i in range(len(usernames)):
     username = usernames[username_i]

     userprofile_filepath = f'./vice_profiles/{username}.html'
     if os.path.exists(userprofile_filepath):
          print('Userprofile of', username, 'is already scraped, skipping...')
          continue

     print('Scraping profile of', username)
     profile_soup = scrape_profile_url(market_info['Url'], market_info['Cookie'], username)

     profile_html = profile_soup.prettify()

     with open(userprofile_filepath, 'w') as userprofile_file:
          userprofile_file.write(profile_html)

'?page=profile&user=tomandjerry'