In [66]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

# 设置请求头部信息，模拟浏览器请求
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
# 设置YachtWorld网站上的搜索条件
search_params = {
    'class': 'sail',
    # 'price': 'USD100000-500000',
    # 'length': '40-50ft'
}
base_url = 'https://www.yachtworld.com'

requests.get(base_url, 
    headers=headers)

<Response [200]>

## 并发获得所有页上的船的url，形成一个url列表

In [67]:
# 使用 Concurrent.futures 模块中的 ThreadPoolExecutor 类来实现异步请求
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
# pages = 1
pages = 147
req_list = list(range(pages))
# sn = requests.Session()
def transform(i):
    # temp = sn.get(f'https://www.yachtworld.com/boats-for-sale/condition-new/type-sail/?page={i+1}', headers=headers).text
    temp = requests.get(f'https://www.yachtworld.com/boats-for-sale/condition-new/type-sail/?page={i+1}', headers=headers).text
    # temp = requests.get(f'https://www.yachtworld.com/boats-for-sale/condition-used/type-sail/sort-recommended:desc/?page={i+1}', headers=headers).text
    req_list[i] = BeautifulSoup(temp, 'html.parser')
with ThreadPoolExecutor(max_workers=64) as t:
    tasks = [t.submit(transform, i) for i in range(pages)]
    for future in tqdm(as_completed(tasks), total=len(tasks)):
        pass

100%|██████████| 147/147 [00:37<00:00,  3.97it/s]


In [58]:
soups = req_list
all(map(lambda x: isinstance(x, BeautifulSoup), soups))

True

In [63]:
from typing import List

def get_urls_from_search_page(soup:List[BeautifulSoup]):
    a = soup.select('#root > div.search > div.flex.flex-row > div.search-right-col > div.pagination-and-results-container.search-display > div.listings-container')
    a = a[0]
    links = [link.get('href') for link  in a.find_all(name='a')]
    return links

links = sum(map(get_urls_from_search_page, soups), [])

In [64]:
all(map(lambda x: x.startswith('https://'), links)) # 合法性
len(links) # 数量
# unique的数量
len(set(links))


2188

In [61]:
with open('links.txt', 'w') as f:
    f.write('\n'.join(links))

## 探索性分析每一个具体url的页面结构

In [8]:
a = {'a':1}
a.update({'b':2})
a

{'a': 1, 'b': 2}

In [9]:
link = links[0]
res = requests.get(link, headers=headers)
soup = BeautifulSoup(res.text, 'html.parser')
soup.text

'2023 Jeanneau Sun Odyssey 490 Cruiser for sale - YachtWorld✕YachtWorld AppMake your dream come true todayOpenBoatsPower BoatsSailboatsSell Your BoatPersonal Boat ShopperResearchServicesYacht BrokersBoat DealersSurveyorsYacht LoansInsuranceBoat TransportDirectoryToggle NavigationUnited States (English) ▼ Danmark - Dansk Deutschland - Deutsch Australia - English United Kingdom - English España - Español Suomi - suomi France - Français Italia - Italiano Nederland - Nederlands Norge - Norsk Sverige - SvenskaLogin  CloseMarine Servicenter - Seattle2442 Westlake Ave North, Seattle, 98109, United StatesView phone numberCloseContact the Seller* Please correct highlighted errors.NamePhone (optional)EmailMessageI recently viewed your listing on YachtWorld and I am interested in more details. Thank you.NameContact BrokerHome⁄/Boats for sale⁄/Sail⁄/Cruiser⁄/Jeanneau⁄/2023 Jeanneau Sun Odyssey 4902023 Jeanneau Sun Odyssey 490US$654,896Seattle, Washington1/57Image coming soonImage coming soonImage 

In [10]:
link

'https://www.yachtworld.com/yacht/2023-jeanneau-sun-odyssey-490-8592752/'

In [43]:
def basic_attributes(soup):
    basic = soup.select('#BoatDetails > div.content.nav-slide.with-sticky-contact.home-block > div.boat-details > div.body > div.boat-details-content > div.details > div:nth-child(1) > div.collapsible.no-pad')
    assert len(basic)==1
    basic = basic[0]
    titles = [f'Basic.{"".join(map(lambda x:x.capitalize(),t.text.strip().split()))}' for t in basic.select('td.datatable-title')]
    values = [t.text.strip() for t in basic.select('td.datatable-value')]
    return {k:v for k,v in zip(titles, values)}
basic_attributes(soup)

{'Basic.Year': '2023',
 'Basic.Make': 'Beneteau',
 'Basic.Model': 'Oceanis Yacht 60',
 'Basic.Class': 'Cruiser',
 'Basic.Length': '60ft',
 'Basic.FuelType': 'Gas',
 'Basic.HullMaterial': 'Fiberglass',
 'Basic.OfferedBy': 'Cape Yachts at South Wharf - South Dartmouth, MA'}

In [47]:
def propulsion_attributes(soup):
    title = soup.select_one("#BoatDetails > div.content.nav-slide.with-sticky-contact.home-block > div.boat-details > div.body > div.boat-details-content > div.details > div:nth-child(4) > div.header > h4 > a").text
    if title.lower()!='propulsion':
        return {}
    propulsion = soup.select('#BoatDetails > div.content.nav-slide.with-sticky-contact.home-block > div.boat-details > div.body > div.boat-details-content > div.details > div:nth-child(4) > div.collapsible.no-pad')
    # propulsion = soup.select('#BoatDetails > div.content.nav-slide.with-sticky-contact.home-block > div.boat-details > div.body > div.boat-details-content > div.details')
    assert len(propulsion)==1
    propulsion = propulsion[0]
    titles = [f'Propulsion.{"".join(map(lambda x:x.capitalize(),t.text.strip().split()))}' for t in propulsion.select('td.datatable-title')]
    values = [t.text.strip() for t in propulsion.select('td.datatable-value')]
    return {k:v for k,v in zip(titles, values)}
propulsion_attributes(soup)

{}

In [36]:
# specification = soup.select('#BoatDetails > div.content.nav-slide.with-sticky-contact.home-block > div.boat-details > div.body > div.boat-details-content > div.details > div:nth-child(5) > div.collapsible.no-pad > div')
specification = soup.select('#BoatDetails > div.content.nav-slide.with-sticky-contact.home-block > div.boat-details > div.body > div.boat-details-content > div.details')
len(specification)

1

In [37]:
# assert len(specification)==1
specification = specification[0]
sub_categories = specification.select('div.datatable-category')
category = sub_categories[1]

cate_name = category.find(attrs='sub-title').text
titles = [f'{cate_name}.{"".join(t.text.strip().split())}' for t in category.select('td.datatable-title')]
values = [t.text.strip() for t in category.select('td.datatable-value')]
titles, values

(['Dimensions.LengthOverall',
  'Dimensions.MaxDraft',
  'Dimensions.Beam',
  'Dimensions.LengthatWaterline'],
 ['48.5ft', '7.33ft', '14.67ft', '43.42ft'])

In [14]:
category

<div class="datatable-category"><h3 class="sub-title">Dimensions</h3><table class="datatable-section"><tbody class="datatable-section"><tr class="datatable-item"><td class="datatable-title">Length Overall</td><td class="datatable-value">48.5ft</td></tr><tr class="datatable-item"><td class="datatable-title">Max Draft</td><td class="datatable-value">7.33ft</td></tr><tr class="datatable-item"><td class="datatable-title">Beam</td><td class="datatable-value">14.67ft</td></tr><tr class="datatable-item"><td class="datatable-title">Length at Waterline</td><td class="datatable-value">43.42ft</td></tr></tbody></table></div>

In [15]:
' sdf  '.strip().capitalize()

'Sdf'

In [48]:
def specification_attributes(soup):
    title = soup.select_one("#BoatDetails > div.content.nav-slide.with-sticky-contact.home-block > div.boat-details > div.body > div.boat-details-content > div.details > div:nth-child(4) > div.header > h4 > a").text
    if title.lower()!='propulsion':
        specification = soup.select('#BoatDetails > div.content.nav-slide.with-sticky-contact.home-block > div.boat-details > div.body > div.boat-details-content > div.details > div:nth-child(4) > div.collapsible.no-pad > div')
    else:
        specification = soup.select('#BoatDetails > div.content.nav-slide.with-sticky-contact.home-block > div.boat-details > div.body > div.boat-details-content > div.details > div:nth-child(5) > div.collapsible.no-pad > div')
    assert len(specification)==1
    specification = specification[0]
    sub_categories = specification.select('div.datatable-category')
    titles = []
    values = []
    for category in sub_categories:
        cate_name = category.find(attrs='sub-title').text
        cate_name = ''.join(map(lambda x:x.capitalize(), cate_name.split()))
        titles += [f'Specification.{cate_name}.{"".join(map(lambda x:x.capitalize(),t.text.strip().split()))}' for t in category.select('td.datatable-title')]
        values += [t.text.strip() for t in category.select('td.datatable-value')]
    return {k:v for k,v in zip(titles, values)}
specification_attributes(soup)

{'Specification.Dimensions.LengthOverall': '62.17ft',
 'Specification.Dimensions.Beam': '17.42ft',
 'Specification.Tanks.FreshWaterTank': '1 x 211 gal ()',
 'Specification.Tanks.FuelTank': '1 x 132 gal ()',
 'Specification.Tanks.HoldingTank': '',
 'Specification.Accommodations.DoubleBerths': '3',
 'Specification.Accommodations.Cabins': '3',
 'Specification.Accommodations.Heads': '3'}

In [49]:
price = soup.select_one('#BoatDetails > div.content.nav-slide.with-sticky-contact.home-block > div.boat-details > div.body > div > div.summary > div > span.payment-total')

# price.text.strip().split('$')[1]
x = float("".join(list(filter(str.isdigit,price.text))))
x

ValueError: could not convert string to float: ''

In [50]:
sn = requests.Session()

In [51]:
# 爬取所有帆船的year属性和价格数据，并将它们存储到CSV文件中
def get_yacht_data(url):
    # res = requests.get(url, headers=headers)
    res = sn.get(url, headers=headers)
    soup = BeautifulSoup(res.text, 'html.parser')
    result = {}
    result.update(basic_attributes(soup))
    result.update(propulsion_attributes(soup))
    result.update(specification_attributes(soup))
    try:
        price = soup.select_one('#BoatDetails > div.content.nav-slide.with-sticky-contact.home-block > div.boat-details > div.body > div > div.summary > div > span.payment-total')
        result['Price'] = float("".join(list(filter(str.isdigit,price.text))))
    except Exception as e:
        pass
    return result
    
get_yacht_data(link)

{'Basic.Year': '2023',
 'Basic.Make': 'Beneteau',
 'Basic.Model': 'Oceanis Yacht 60',
 'Basic.Class': 'Cruiser',
 'Basic.Length': '60ft',
 'Basic.FuelType': 'Gas',
 'Basic.HullMaterial': 'Fiberglass',
 'Basic.OfferedBy': 'Cape Yachts at South Wharf - South Dartmouth, MA',
 'Specification.Dimensions.LengthOverall': '62.17ft',
 'Specification.Dimensions.Beam': '17.42ft',
 'Specification.Tanks.FreshWaterTank': '1 x 211 gal ()',
 'Specification.Tanks.FuelTank': '1 x 132 gal ()',
 'Specification.Tanks.HoldingTank': '',
 'Specification.Accommodations.DoubleBerths': '3',
 'Specification.Accommodations.Cabins': '3',
 'Specification.Accommodations.Heads': '3'}

In [52]:
import numpy as np


yacht_data_list = list(range(len(links)))
error_list = {}
def set_data(i):
    # print(i)
    try:
        yacht_data_list[i] = get_yacht_data(links[i])
    except Exception as e:
        error_list[i]=e
        print(e)
    
# with ThreadPoolExecutor(max_workers=64) as t:
with ThreadPoolExecutor(max_workers=32) as t:
    # tasks = [t.submit(set_data, i) for i in range(len(links))]
    tasks = [t.submit(set_data, i) for i in np.random.choice(len(links), 64, replace=False)]
    # tasks = [t.submit(set_data, i) for i in range(64)]
    for future in tqdm(as_completed(tasks), total=len(tasks)):
        pass    

  2%|▏         | 1/64 [00:01<01:32,  1.47s/it]

'NoneType' object has no attribute 'text'


  3%|▎         | 2/64 [00:02<01:28,  1.43s/it]

'NoneType' object has no attribute 'text'


  6%|▋         | 4/64 [00:10<02:50,  2.84s/it]

'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'


  9%|▉         | 6/64 [00:14<01:52,  1.94s/it]

'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'


 11%|█         | 7/64 [00:14<01:38,  1.73s/it]

'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'


 38%|███▊      | 24/64 [00:16<00:10,  3.82it/s]

'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'


 39%|███▉      | 25/64 [00:17<00:10,  3.63it/s]

'NoneType' object has no attribute 'text'


 41%|████      | 26/64 [00:18<00:14,  2.67it/s]

'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'


 44%|████▍     | 28/64 [00:19<00:17,  2.11it/s]

'NoneType' object has no attribute 'text'


 47%|████▋     | 30/64 [00:20<00:16,  2.01it/s]

'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'


 48%|████▊     | 31/64 [00:23<00:30,  1.07it/s]

'NoneType' object has no attribute 'text'


 50%|█████     | 32/64 [00:25<00:36,  1.14s/it]

'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'


 53%|█████▎    | 34/64 [00:27<00:34,  1.17s/it]

'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'


 59%|█████▉    | 38/64 [00:28<00:12,  2.04it/s]

'NoneType' object has no attribute 'text'


 88%|████████▊ | 56/64 [00:29<00:00, 11.28it/s]

'NoneType' object has no attribute 'text'


 94%|█████████▍| 60/64 [00:47<00:07,  1.92s/it]

'NoneType' object has no attribute 'text'


 95%|█████████▌| 61/64 [00:47<00:05,  1.70s/it]

'NoneType' object has no attribute 'text'


 97%|█████████▋| 62/64 [00:48<00:03,  1.65s/it]

'NoneType' object has no attribute 'text'


 98%|█████████▊| 63/64 [00:51<00:01,  1.89s/it]

'NoneType' object has no attribute 'text'


100%|██████████| 64/64 [01:20<00:00,  1.26s/it]


In [53]:
error_list
# links[31]

{212: AttributeError("'NoneType' object has no attribute 'text'"),
 885: AttributeError("'NoneType' object has no attribute 'text'"),
 1822: AttributeError("'NoneType' object has no attribute 'text'"),
 1866: AttributeError("'NoneType' object has no attribute 'text'"),
 2081: AttributeError("'NoneType' object has no attribute 'text'"),
 1548: AttributeError("'NoneType' object has no attribute 'text'"),
 337: AttributeError("'NoneType' object has no attribute 'text'"),
 273: AttributeError("'NoneType' object has no attribute 'text'"),
 1539: AttributeError("'NoneType' object has no attribute 'text'"),
 2078: AttributeError("'NoneType' object has no attribute 'text'"),
 87: AttributeError("'NoneType' object has no attribute 'text'"),
 1097: AttributeError("'NoneType' object has no attribute 'text'"),
 1976: AttributeError("'NoneType' object has no attribute 'text'"),
 1798: AttributeError("'NoneType' object has no attribute 'text'"),
 1684: AttributeError("'NoneType' object has no attrib

In [54]:
# link= links[566]
link= links[212]
soup = BeautifulSoup(sn.get(link, headers=headers).text, 'html.parser')
get_yacht_data(link)

AttributeError: 'NoneType' object has no attribute 'text'

In [55]:
link

'https://www.yachtworld.com/yacht/1985-catalina-capri-22-8493121/'

In [36]:
len(list(filter(lambda x:not isinstance(x, int), yacht_data_list)))


46

In [None]:

df = pd.DataFrame(yacht_data_list)
df.to_csv('yacht_data.csv', index=False)
