In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

import stable_get_html
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
# Request headers that make a request look like it comes from a desktop browser.
# Meant to be passed to direct calls such as
# requests.get('https://www.yachtworld.com', headers=headers).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.36'
    ),
}

In [2]:
def kv_attributes(basic, name='Basic'):
    """Flatten a title/value data table into a ``{key: value}`` dict.

    Args:
        basic: Element exposing ``select`` (e.g. a BeautifulSoup tag) that
            contains ``td.datatable-title`` and ``td.datatable-value`` cells.
        name (str): Prefix placed in front of every generated key.

    Returns:
        dict: Keys have the form ``"<name>.<TitleWords>"`` where each word of
        the title is capitalized and the words are joined without spaces;
        values are the stripped value-cell texts. Titles and values are
        paired purely by position.
    """
    def camel(label):
        # "hull material" -> "HullMaterial"
        return ''.join(word.capitalize() for word in label.strip().split())

    keys = []
    for cell in basic.select('td.datatable-title'):
        keys.append(f'{name}.{camel(cell.text)}')

    vals = []
    for cell in basic.select('td.datatable-value'):
        vals.append(cell.text.strip())

    return dict(zip(keys, vals))
def specification_attributes(specification, name='Specification'):
    """Flatten a table that is split into named sub-categories.

    Args:
        specification: Element exposing ``select`` (e.g. a BeautifulSoup tag)
            that contains ``div.datatable-category`` groups. Each group must
            hold an element with class ``sub-title`` plus
            ``td.datatable-title`` / ``td.datatable-value`` cells.
        name (str): Prefix placed in front of every generated key.

    Returns:
        dict: Keys have the form ``"<name>.<Category>.<Title>"`` with the
        words of category and title capitalized and joined without spaces;
        values are the stripped value-cell texts. Titles and values are
        paired by position across all categories.
    """
    def camel(label):
        # "speed & distance" -> "Speed&Distance"
        return ''.join(word.capitalize() for word in label.split())

    flat_keys, flat_values = [], []
    for group in specification.select('div.datatable-category'):
        # A non-dict ``attrs`` value is matched against the CSS class by bs4.
        group_name = camel(group.find(attrs='sub-title').text)
        prefix = f'{name}.{group_name}.'
        flat_keys.extend(
            prefix + camel(cell.text.strip())
            for cell in group.select('td.datatable-title')
        )
        flat_values.extend(
            cell.text.strip() for cell in group.select('td.datatable-value')
        )
    return dict(zip(flat_keys, flat_values))
def get_price(soup):
    """Extract the listing price in US dollars from a parsed detail page.

    The previous implementation joined *every* digit found after ``US$``,
    which dropped a decimal point ("1,234.50" -> 123450.0) and merged digits
    from any trailing text into the amount. Only the first numeric token
    after the marker is parsed now.

    Args:
        soup: Parsed page exposing ``select_one`` (e.g. a BeautifulSoup object).

    Returns:
        float: The first amount following ``US$`` in the price element, with
        comma thousands separators removed.

    Raises:
        ValueError: If the price element is missing, its text contains no
            ``US$`` marker, or no number follows the marker.
    """
    import re  # function-scope import so the block does not depend on another cell

    price = soup.select_one('#BoatDetails > div.content.nav-slide.with-sticky-contact.home-block > div.boat-details > div.body > div > div.summary > div > span.payment-total')
    if price is None:
        raise ValueError('price element not found on page')

    _, marker, after_marker = price.text.partition('US$')
    if not marker:
        raise ValueError(f'no US$ amount in price text: {price.text!r}')

    # Digits with optional comma grouping and an optional decimal fraction.
    amount = re.search(r'[0-9][0-9,]*(?:\.[0-9]+)?', after_marker)
    if amount is None:
        raise ValueError(f'no number after US$ in price text: {price.text!r}')
    return float(amount.group().replace(',', ''))

In [3]:
# sn = requests.Session()

In [4]:
import warnings
def get_yacht_data(url):
    """Fetch one listing page and return its attributes as a dict.

    Args:
        url (str): Address of the listing page to download.

    Returns:
        dict: Whatever ``get_yacht_data_req_res`` extracts from the response.
    """
    # The page is fetched through stable_get_html.get_html rather than a
    # direct requests call.
    response = stable_get_html.get_html(url)
    parsed = get_yacht_data_req_res(response)
    return parsed
def get_yacht_data_req_res(req_res):
    """Parse a fetched listing page into a flat attribute dict.

    Args:
        req_res: Response-like object whose ``text`` attribute holds the
            page HTML.

    Returns:
        dict: Attributes of the "Basics" / "Propulsion" sections (via
        ``kv_attributes``) and of the "Specifications" / "Features" sections
        (via ``specification_attributes``), keyed with the capitalized
        section name as prefix. A ``'Price'`` entry is added when
        ``get_price`` succeeds and is omitted otherwise.

    Raises:
        AssertionError: If the details container or its section headers are
            not present in the page.
    """
    soup = BeautifulSoup(req_res.text, 'html.parser')
    detail = soup.select_one('#BoatDetails > div.content.nav-slide.with-sticky-contact.home-block > div.boat-details > div.body > div.boat-details-content > div.details')
    assert detail is not None, 'boat details container not found in page'
    items = detail.select('div.header')
    assert len(items) > 0, 'no section headers found in boat details'

    flat_sections = ('basics', 'propulsion')
    grouped_sections = ('specifications', 'features')

    result = {}
    for item in items:
        # Strip first so whitespace around the header text cannot hide a section.
        section = item.text.strip().lower()
        if section in flat_sections:
            parse = kv_attributes
        elif section in grouped_sections:
            parse = specification_attributes
        else:
            continue
        # find_next_sibling() returns the next *tag*; .next_sibling may be a
        # whitespace text node, which has no select().
        section_body = item.find_next_sibling()
        result.update(parse(section_body, name=section.capitalize()))

    try:
        result['Price'] = get_price(soup)
    except Exception:
        # Best effort: a listing without a readable price is still returned,
        # just without the 'Price' key.
        pass
    return result


In [5]:
# Read all_links.csv, using its first column as the row index, and preview it.
links_table = pd.read_csv(filepath_or_buffer='all_links.csv', index_col=0)
links_table.head(n=6)

Unnamed: 0,Link,Price,Basics.Location,New
0,https://www.yachtworld.com/yacht/2023-jeanneau...,329795.0,"Seattle, Washington, United States",True
1,https://www.yachtworld.com/yacht/2023-dufour-3...,364999.0,"Racine, Wisconsin, United States",True
2,https://www.yachtworld.com/yacht/2023-bavaria-...,432319.0,"San Diego, California, United States",True
3,https://www.yachtworld.com/yacht/1984-union-po...,31000.0,"Emeryville, California, United States",True
4,https://www.yachtworld.com/yacht/2023-dufour-4...,850000.0,"Racine, Wisconsin, United States",True
5,https://www.yachtworld.com/yacht/2023-jeanneau...,519685.0,"San Diego, California, United States",True


In [6]:
# links_table.loc[4:6, 'Link']
# Plain Python list built from the 'Link' column, followed by a short preview.
links = links_table.loc[:, 'Link'].tolist()
links[0:5]

['https://www.yachtworld.com/yacht/2023-jeanneau-380-8171644/',
 'https://www.yachtworld.com/yacht/2023-dufour-37-8587334/',
 'https://www.yachtworld.com/yacht/2023-bavaria-c42-8710488/',
 'https://www.yachtworld.com/yacht/1984-union-polaris-8602417/',
 'https://www.yachtworld.com/yacht/2023-dufour-470-8577883/']

In [7]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import numpy as np
from tqdm import tqdm
# One slot per link. Each slot starts out holding its own index and is
# overwritten by set_data with the dict scraped for that link.
yacht_data_list = [*range(len(links))]
# Maps a link index to the exception recorded for it by set_data.
error_list = dict()
def set_data(i, retry_delay=60):
    """Scrape ``links[i]`` into ``yacht_data_list[i]``, retrying once on failure.

    Args:
        i (int): Position in ``links`` / ``yacht_data_list`` to process.
        retry_delay (float): Seconds to wait before the single retry.
            Defaults to the previously hard-coded 60.

    Returns:
        None. On success the scraped dict is stored in ``yacht_data_list[i]``
        and any earlier entry for ``i`` is removed from ``error_list``. If
        both attempts fail, the retry's exception is stored in
        ``error_list[i]`` and a warning is issued; the first attempt's error
        stays reachable through that exception's ``__context__``.
    """
    try:
        yacht_data_list[i] = get_yacht_data(links[i])
    except Exception:
        time.sleep(retry_delay)
        try:
            yacht_data_list[i] = get_yacht_data(links[i])
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not swallowed.
        except Exception as retry_error:
            error_list[i] = retry_error
            warnings.warn(f"error for {i}: {retry_error}")
            return
    # Reached after either attempt succeeded: clear any stale failure record.
    error_list.pop(i, None)

In [8]:
# Run set_data concurrently on a random subset of at most 64 links.
# With replace=False the sample cannot be larger than the population, so the
# size is capped at len(links) to avoid a ValueError on short link lists.
sample_size = min(64, len(links))
sample_indices = np.random.choice(len(links), sample_size, replace=False)
with ThreadPoolExecutor(max_workers=32) as t:
    # int(...) turns the NumPy integers into plain ints before they are used
    # as list indices and error_list keys.
    tasks = [t.submit(set_data, int(i)) for i in sample_indices]
    for future in tqdm(as_completed(tasks), total=len(tasks)):
        # Re-raise anything set_data did not handle itself instead of
        # silently dropping it with the future.
        future.result()

100%|██████████| 64/64 [00:25<00:00,  2.52it/s]


In [9]:
# Show the recorded failures together with how many there are.
(error_list, len(error_list))

({}, 0)

In [12]:
# Rename the stored 'Price' column to 'Price-1'.
# NOTE(review): presumably this keeps it apart from the 'Price' key produced
# by the scraper - confirm against the cells that combine the two.
links_table = links_table.rename({'Price': 'Price-1'}, axis='columns')

In [13]:
# Preview the first five rows after the column rename.
links_table.head(n=5)

Unnamed: 0,Link,Price-1,Basics.Location,New
0,https://www.yachtworld.com/yacht/2023-jeanneau...,329795.0,"Seattle, Washington, United States",True
1,https://www.yachtworld.com/yacht/2023-dufour-3...,364999.0,"Racine, Wisconsin, United States",True
2,https://www.yachtworld.com/yacht/2023-bavaria-...,432319.0,"San Diego, California, United States",True
3,https://www.yachtworld.com/yacht/1984-union-po...,31000.0,"Emeryville, California, United States",True
4,https://www.yachtworld.com/yacht/2023-dufour-4...,850000.0,"Racine, Wisconsin, United States",True
