In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

import stable_get_html
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
# 设置请求头部信息，模拟浏览器请求
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
# base_url = 'https://www.yachtworld.com'
# requests.get(base_url, 
#     headers=headers)

In [2]:
def kv_attributes(basic, name='Basic'):
    titles = [f'{name}.{"".join(map(lambda x:x.capitalize(),t.text.strip().split()))}' for t in basic.select('td.datatable-title')]
    values = [t.text.strip() for t in basic.select('td.datatable-value')]
    return {k:v for k,v in zip(titles, values)}
def specification_attributes(specification, name='Specification'):
    sub_categories = specification.select('div.datatable-category')
    titles = []
    values = []
    for category in sub_categories:
        cate_name = category.find(attrs='sub-title').text
        cate_name = ''.join(map(lambda x:x.capitalize(), cate_name.split()))
        titles += [f'{name}.{cate_name}.{"".join(map(lambda x:x.capitalize(),t.text.strip().split()))}' for t in category.select('td.datatable-title')]
        values += [t.text.strip() for t in category.select('td.datatable-value')]
    return {k:v for k,v in zip(titles, values)}
def get_price(soup):

    price = soup.select_one('#BoatDetails > div.content.nav-slide.with-sticky-contact.home-block > div.boat-details > div.body > div > div.summary > div > span.payment-total')

    # price.text.strip().split('$')[1]
    x = float("".join(list(filter(str.isdigit,price.text.split('US$')[1]))))
    return x

In [3]:
# sn = requests.Session()

In [4]:
import warnings
def get_yacht_data(url):
    """输入url返回一个字典

    Args:
        url (str): _description_

    Returns:
        dict: _description_
    """
    # res = requests.get(url, headers=headers)
    # res = sn.get(url, headers=headers)
    res = stable_get_html.get_html(url)
    return get_yacht_data_req_res(res)
def get_yacht_data_req_res(req_res):
    soup = BeautifulSoup(req_res.text, 'html.parser')
    detail = soup.select_one('#BoatDetails > div.content.nav-slide.with-sticky-contact.home-block > div.boat-details > div.body > div.boat-details-content > div.details')
    assert detail is not None
    items = detail.select('div.header')
    assert len(items)>0
    result = {}
    for i, item in enumerate(items):
            if item.text.lower()=='BASICS'.lower() or item.text.lower()=='PROPULSION'.lower():
                result.update(kv_attributes(items[i].next_sibling, name=item.text.lower().capitalize()))
            elif item.text.lower()=='SPECIFICATIONS'.lower() or item.text.lower()=='FEATURES'.lower():
                result.update(specification_attributes(items[i].next_sibling, name=item.text.lower().capitalize()))
    try:
        result['Price'] = get_price(soup)
    except Exception as e:
        # warnings.warn(f"no price for {url}")
        pass
    return result


In [5]:
links_table = pd.read_csv('all_links.csv', index_col=0)
links_table.head(6)

Unnamed: 0,Link,Price,Basics.Location,New
0,https://www.yachtworld.com/yacht/2023-jeanneau...,329795.0,"Seattle, Washington, United States",True
1,https://www.yachtworld.com/yacht/2023-dufour-3...,364999.0,"Racine, Wisconsin, United States",True
2,https://www.yachtworld.com/yacht/2023-bavaria-...,432319.0,"San Diego, California, United States",True
3,https://www.yachtworld.com/yacht/1984-union-po...,31000.0,"Emeryville, California, United States",True
4,https://www.yachtworld.com/yacht/2023-dufour-4...,850000.0,"Racine, Wisconsin, United States",True
5,https://www.yachtworld.com/yacht/2023-jeanneau...,519685.0,"San Diego, California, United States",True


In [6]:
# links_table.loc[4:6, 'Link']
links = links_table['Link'].to_list()
links[:5]

['https://www.yachtworld.com/yacht/2023-jeanneau-380-8171644/',
 'https://www.yachtworld.com/yacht/2023-dufour-37-8587334/',
 'https://www.yachtworld.com/yacht/2023-bavaria-c42-8710488/',
 'https://www.yachtworld.com/yacht/1984-union-polaris-8602417/',
 'https://www.yachtworld.com/yacht/2023-dufour-470-8577883/']

In [7]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import numpy as np
from tqdm import tqdm
yacht_data_list = list(range(len(links)))
error_list = {}
def set_data(i):
    # print(i)
    try:
        yacht_data_list[i] = get_yacht_data(links[i])
    except Exception as e:
        # time.sleep(60)
        time.sleep(0.3)
        try:
            yacht_data_list[i] = get_yacht_data(links[i])
        except:
            error_list[i]=e
            warnings.warn(f"error for {i}: {e}")
            raise
            # return False
        else:
            error_list.pop(i, None)
            # return True
    else:
        error_list.pop(i, None)
        # return True

In [8]:
with ThreadPoolExecutor(max_workers=32) as t:
    tasks = [t.submit(set_data, i) for i in range(len(links))]
    # tasks = [t.submit(set_data, i) for i in np.random.choice(len(links), 64, replace=False)]
    # tasks = [t.submit(set_data, i) for i in range(64)]
    for future in tqdm(as_completed(tasks), total=len(tasks)):
        pass    

100%|██████████| 12475/12475 [12:05<00:00, 17.20it/s]


In [22]:
error_list, len(error_list)
links[8]

'https://www.yachtworld.com/yacht/1990-catalina-42-mk-i-3-sr-8617140/'

In [21]:
with ThreadPoolExecutor(max_workers=32) as t:
    tasks = [t.submit(set_data, k) for k in error_list.keys()]
    # tasks = [t.submit(set_data, i) for i in np.random.choice(len(links), 64, replace=False)]
    # tasks = [t.submit(set_data, i) for i in range(64)]
    for future in tqdm(as_completed(tasks), total=len(tasks)):
        pass   

100%|██████████| 18/18 [03:02<00:00, 10.13s/it]


In [20]:
import pickle
with open('data.pickle', 'wb') as f:
    pickle.dump({"error_list":error_list, "yacht_data_list":yacht_data_list}, f)

In [43]:
with open('data.pickle', 'wb') as f:
    data = pickle.load(f)
    error_list = data['error_list']
    yacht_data_list = data['yacht_data_list']

UnsupportedOperation: read

In [44]:
yacht_data_list[9]
yacht_data_list[8]

8

In [45]:
final_yacht_data_list = list(filter(lambda x: not isinstance(x, int), yacht_data_list))
len(final_yacht_data_list)

12457

In [55]:
error_list.keys()
len(error_list)

18

In [57]:
links_table.index[list(error_list.keys())]
len(links_table.index[list(error_list.keys())])

18

In [63]:
links_table.index.unique()

RangeIndex(start=0, stop=12475, step=1)

In [62]:
links_table = links_table.reset_index(drop=True)

In [91]:
links_table_new = links_table.drop(links_table.index[list(error_list.keys())])
len(links_table)-len(links_table_new)
len(links_table),len(links_table_new)

(12475, 12457)

In [33]:
df = pd.DataFrame(final_yacht_data_list)
df.head()

Unnamed: 0,Basics.Year,Basics.Make,Basics.Model,Basics.Class,Basics.Length,Basics.FuelType,Basics.HullMaterial,Basics.HullShape,Basics.HullWarranty,Basics.OfferedBy,...,Features.InsideEquipment.Seakeeper,Features.InsideEquipment.AirCompressor,Specifications.Speed&Distance.Range,Features.OutsideEquipment.WindGenerator,Specifications.Dimensions.Freeboard,Specifications.Miscellaneous.LiferaftCapacity,Propulsion.RopeCutter,Specifications.Miscellaneous.DeadriseAtTransom,Features.AdditionalEquipment.LaunchingTrailer,Features.AdditionalEquipment.BeachingLegs
0,2023,Jeanneau,380,Cruiser,38ft,Diesel,Fiberglass,Monohull,7 years,Marine Servicenter - Seattle,...,,,,,,,,,,
1,2023,Dufour,37,Racer/Cruiser,35.33ft,Diesel,Fiberglass,,,"Racine Riverside Marine, Inc.",...,,,,,,,,,,
2,2023,Bavaria,C42,Racer/Cruiser,42ft,Diesel,Fiberglass,,,Cruising Yachts - San Diego,...,,,,,,,,,,
3,1984,Union,Polaris,Cutter,36ft,Diesel,Fiberglass,,,Rubicon Yachts,...,,,,,,,,,,
4,2023,Dufour,470,Racer/Cruiser,48.75ft,Diesel,Fiberglass,,,"Racine Riverside Marine, Inc.",...,,,,,,,,,,


In [49]:
links_table=links_table.rename(columns={'Price':'Price-1'})


In [89]:
links_table_new.shape, df.shape

((12457, 4), (12457, 124))

In [90]:
df.index, links_table_new.index

(RangeIndex(start=0, stop=12457, step=1),
 Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     9,
                10,
             ...
             12465, 12466, 12467, 12468, 12469, 12470, 12471, 12472, 12473,
             12474],
            dtype='int64', length=12457))

In [92]:
links_table_new = links_table_new.reset_index(drop=True)

In [79]:
links_table_new.columns

Index(['Link', 'Price-1', 'Basics.Location', 'New'], dtype='object')

In [75]:
df = df.reset_index(drop=True)

In [93]:
total = pd.concat([links_table_new, df], axis=1)
total.shape

(12457, 128)

In [103]:
# (total['Price']-total['Price-1']).sum()/len(total)
# ((total['Price']-total['Price-1'])**2).sum()/len(total)
# (total['Price']==total['Price-1']).sum()/len(total)
((total['Price']-total['Price-1'])<1000).sum()/len(total)

0.8225897085975756

In [105]:
total.to_excel('../../data/包含区域信息.xlsx')